2003-07-24 23:23:19 +00:00
|
|
|
/*
|
|
|
|
*******************************************************************************
|
|
|
|
*
|
2012-02-06 19:57:08 +00:00
|
|
|
* Copyright (C) 1999-2012, International Business Machines
|
2003-07-24 23:23:19 +00:00
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
*
|
|
|
|
*******************************************************************************
|
|
|
|
* file name: store.c
|
|
|
|
* encoding: US-ASCII
|
|
|
|
* tab size: 8 (not used)
|
|
|
|
* indentation:4
|
|
|
|
*
|
|
|
|
* created on: 2003-02-06
|
|
|
|
* created by: Ram Viswanadha
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "cmemory.h"
|
|
|
|
#include "cstring.h"
|
|
|
|
#include "filestrm.h"
|
|
|
|
#include "unicode/udata.h"
|
2011-07-27 05:53:56 +00:00
|
|
|
#include "unicode/utf16.h"
|
2003-07-24 23:23:19 +00:00
|
|
|
#include "utrie.h"
|
|
|
|
#include "unewdata.h"
|
|
|
|
#include "gensprep.h"
|
|
|
|
#include "uhash.h"
|
|
|
|
|
|
|
|
|
|
|
|
#define DO_DEBUG_OUT 0
|
|
|
|
|
|
|
|
|
2003-08-27 00:02:02 +00:00
|
|
|
/*
|
|
|
|
* StringPrep profile file format ------------------------------------
|
|
|
|
*
|
|
|
|
* The file format prepared and written here contains a 16-bit trie and a mapping table.
|
|
|
|
*
|
|
|
|
* Before the data contents described below, there are the headers required by
|
|
|
|
* the udata API for loading ICU data. Especially, a UDataInfo structure
|
|
|
|
* precedes the actual data. It contains platform properties values and the
|
|
|
|
* file format version.
|
|
|
|
*
|
|
|
|
* The following is a description of format version 2.
|
|
|
|
*
|
|
|
|
* Data contents:
|
|
|
|
*
|
|
|
|
* The contents is a parsed, binary form of RFC3454 and possibly
|
|
|
|
* NormalizationCorrections.txt depending on the options specified on the profile.
|
|
|
|
*
|
|
|
|
* Any Unicode code point from 0 to 0x10ffff can be looked up to get
|
|
|
|
* the trie-word, if any, for that code point. This means that the input
|
|
|
|
* to the lookup are 21-bit unsigned integers, with not all of the
|
|
|
|
* 21-bit range used.
|
|
|
|
*
|
|
|
|
* *.spp files customarily begin with a UDataInfo structure, see udata.h and .c.
|
|
|
|
* After that there are the following structures:
|
|
|
|
*
|
2003-09-23 21:07:16 +00:00
|
|
|
* int32_t indexes[_SPREP_INDEX_TOP]; -- _SPREP_INDEX_TOP=16, see enum in sprpimpl.h file
|
2003-08-27 00:02:02 +00:00
|
|
|
*
|
|
|
|
* UTrie stringPrepTrie; -- size in bytes=indexes[_SPREP_INDEX_TRIE_SIZE]
|
|
|
|
*
|
|
|
|
* uint16_t mappingTable[]; -- Contains the sequence of code units that the code point maps to
|
|
|
|
* size in bytes = indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]
|
|
|
|
*
|
2004-03-04 23:12:36 +00:00
|
|
|
* The indexes array contains the following values:
|
|
|
|
* indexes[_SPREP_INDEX_TRIE_SIZE] -- The size of the StringPrep trie in bytes
|
|
|
|
* indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] -- The size of the mappingTable in bytes
|
2003-08-27 00:02:02 +00:00
|
|
|
* indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] -- The index of Unicode version of last entry in NormalizationCorrections.txt
|
|
|
|
* indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] -- The starting index of 1 UChar mapping index in the mapping table
|
|
|
|
* indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] -- The starting index of 2 UChars mapping index in the mapping table
|
|
|
|
* indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] -- The starting index of 3 UChars mapping index in the mapping table
|
|
|
|
* indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] -- The starting index of 4 UChars mapping index in the mapping table
|
|
|
|
* indexes[_SPREP_OPTIONS] -- Bit set of options to turn on in the profile, e.g: USPREP_NORMALIZATION_ON, USPREP_CHECK_BIDI_ON
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* StringPrep Trie :
|
|
|
|
*
|
|
|
|
* The StringPrep trie is a 16-bit trie that contains data for the profile.
|
|
|
|
* Each code point is associated with a value (trie-word) in the trie.
|
|
|
|
*
|
|
|
|
* - structure of data words from the trie
|
|
|
|
*
|
2004-03-04 23:12:36 +00:00
|
|
|
* i) A value greater than or equal to _SPREP_TYPE_THRESHOLD (0xFFF0)
|
|
|
|
* represents the type associated with the code point
|
|
|
|
* if(trieWord >= _SPREP_TYPE_THRESHOLD){
|
2003-08-27 00:02:02 +00:00
|
|
|
* type = trieWord - 0xFFF0;
|
|
|
|
* }
|
|
|
|
* The type can be :
|
2004-03-04 23:12:36 +00:00
|
|
|
* USPREP_UNASSIGNED
|
|
|
|
* USPREP_PROHIBITED
|
|
|
|
* USPREP_DELETE
|
|
|
|
*
|
2003-08-27 00:02:02 +00:00
|
|
|
* ii) A value less than _SPREP_TYPE_THRESHOLD means the type is USPREP_MAP and
|
|
|
|
* contains distribution described below
|
|
|
|
*
|
2004-03-04 23:12:36 +00:00
|
|
|
* 0 - ON : The code point is prohibited (USPREP_PROHIBITED). This is to allow for codepoint that are both prohibited and mapped.
|
2003-08-27 00:02:02 +00:00
|
|
|
* 1 - ON : The value in the next 14 bits is an index into the mapping table
|
|
|
|
* OFF: The value in the next 14 bits is a delta value from the code point
|
|
|
|
* 2..15 - Contains data as described by bit 1. If all bits are set
|
|
|
|
* (value = _SPREP_MAX_INDEX_VALUE) then the type is USPREP_DELETE
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Mapping Table:
|
|
|
|
* The data in mapping table is sorted according to the length of the mapping sequence.
|
|
|
|
* If the type of the code point is USPREP_MAP and value in trie word is an index, the index
|
|
|
|
* is compared with start indexes of sequence length start to figure out the length according to
|
|
|
|
* the following algorithm:
|
|
|
|
*
|
|
|
|
* if( index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] &&
|
|
|
|
* index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){
|
|
|
|
* length = 1;
|
|
|
|
* }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] &&
|
|
|
|
* index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){
|
|
|
|
* length = 2;
|
|
|
|
* }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] &&
|
|
|
|
* index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){
|
|
|
|
* length = 3;
|
|
|
|
* }else{
|
|
|
|
* // The first position in the mapping table contains the length
|
|
|
|
* // of the sequence
|
|
|
|
* length = mappingTable[index++];
|
|
|
|
*
|
|
|
|
* }
|
|
|
|
*
|
|
|
|
*/
|
2003-07-24 23:23:19 +00:00
|
|
|
|
|
|
|
/* file data ---------------------------------------------------------------- */
|
|
|
|
/* indexes[] value names */
|
|
|
|
|
|
|
|
#if UCONFIG_NO_IDNA
|
|
|
|
|
|
|
|
/* dummy UDataInfo cf. udata.h */
|
|
|
|
static UDataInfo dataInfo = {
|
|
|
|
sizeof(UDataInfo),
|
|
|
|
0,
|
|
|
|
|
|
|
|
U_IS_BIG_ENDIAN,
|
|
|
|
U_CHARSET_FAMILY,
|
|
|
|
U_SIZEOF_UCHAR,
|
|
|
|
0,
|
|
|
|
|
|
|
|
{ 0, 0, 0, 0 }, /* dummy dataFormat */
|
|
|
|
{ 0, 0, 0, 0 }, /* dummy formatVersion */
|
|
|
|
{ 0, 0, 0, 0 } /* dummy dataVersion */
|
|
|
|
};
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
/* Header index array written in front of the trie; every slot starts as 0. */
static int32_t indexes[_SPREP_INDEX_TOP] = { 0 };

/* Mapping table of UTF-16 code units; allocated in storeMappingData(). */
static uint16_t *mappingData = NULL;

/* Number of code units the mapping table needs; accumulated by storeMapping(). */
static int32_t mappingDataCapacity = 0;

/* Next free slot in mappingData while storeMappingData() fills the table. */
static int16_t currentIndex = 0;

/* Longest mapping, in code units, that storeMapping() has queued so far. */
static int32_t maxLength = 0;
|
|
|
|
|
|
|
|
|
|
|
|
/* UDataInfo cf. udata.h */
|
|
|
|
static UDataInfo dataInfo={
|
|
|
|
sizeof(UDataInfo),
|
|
|
|
0,
|
|
|
|
|
|
|
|
U_IS_BIG_ENDIAN,
|
|
|
|
U_CHARSET_FAMILY,
|
|
|
|
U_SIZEOF_UCHAR,
|
|
|
|
0,
|
|
|
|
|
|
|
|
{ 0x53, 0x50, 0x52, 0x50 }, /* dataFormat="SPRP" */
|
|
|
|
{ 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
|
|
|
|
{ 3, 2, 0, 0 } /* dataVersion (Unicode version) */
|
|
|
|
};
|
|
|
|
void
|
|
|
|
setUnicodeVersion(const char *v) {
|
|
|
|
UVersionInfo version;
|
|
|
|
u_versionFromString(version, v);
|
|
|
|
uprv_memcpy(dataInfo.dataVersion, version, 4);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
setUnicodeVersionNC(UVersionInfo version){
|
|
|
|
uint32_t univer = version[0] << 24;
|
|
|
|
univer += version[1] << 16;
|
|
|
|
univer += version[2] << 8;
|
|
|
|
univer += version[3];
|
|
|
|
indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] = univer;
|
|
|
|
}
|
|
|
|
/* Build-time trie holding the trie word of each code point; allocated by init(). */
static UNewTrie *sprepTrie = NULL;
|
|
|
|
|
|
|
|
/* Maximum data length passed to utrie_open() for the build-time trie. */
#define MAX_DATA_LENGTH 11500

/*
 * Inclusive range of (code point - mapped code point) differences that
 * storeMapping() stores directly in the trie word as a delta.
 * The negative limit is parenthesized so the macro expands safely inside
 * any expression.
 */
#define SPREP_DELTA_RANGE_POSITIVE_LIMIT 8191
#define SPREP_DELTA_RANGE_NEGATIVE_LIMIT (-8192)
|
|
|
|
|
|
|
|
|
|
|
|
extern void
|
|
|
|
init() {
|
|
|
|
|
2011-11-30 17:52:09 +00:00
|
|
|
sprepTrie = (UNewTrie *)uprv_calloc(1, sizeof(UNewTrie));
|
2003-07-24 23:23:19 +00:00
|
|
|
|
|
|
|
/* initialize the two tries */
|
|
|
|
if(NULL==utrie_open(sprepTrie, NULL, MAX_DATA_LENGTH, 0, 0, FALSE)) {
|
|
|
|
fprintf(stderr, "error: failed to initialize tries\n");
|
|
|
|
exit(U_MEMORY_ALLOCATION_ERROR);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Code point -> ValueStruct* table of the mappings that go into the mapping
 * table; created on first use by storeMapping().
 */
static UHashtable *hashTable = NULL;
|
|
|
|
|
|
|
|
|
2003-12-05 22:41:54 +00:00
|
|
|
typedef struct ValueStruct {
|
2003-07-24 23:23:19 +00:00
|
|
|
UChar* mapping;
|
|
|
|
int16_t length;
|
|
|
|
UStringPrepType type;
|
2003-12-05 22:41:54 +00:00
|
|
|
} ValueStruct;
|
2003-07-24 23:23:19 +00:00
|
|
|
|
|
|
|
/* Callback for deleting the value from the hashtable */
|
2003-12-05 22:41:54 +00:00
|
|
|
static void U_CALLCONV valueDeleter(void* obj){
|
2003-07-24 23:23:19 +00:00
|
|
|
ValueStruct* value = (ValueStruct*) obj;
|
|
|
|
uprv_free(value->mapping);
|
|
|
|
uprv_free(value);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Callback for hashing the entry */
|
|
|
|
static int32_t U_CALLCONV hashEntry(const UHashTok parm) {
|
|
|
|
return parm.integer;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Callback for comparing two entries */
|
|
|
|
static UBool U_CALLCONV compareEntries(const UHashTok p1, const UHashTok p2) {
|
|
|
|
return (UBool)(p1.integer != p2.integer);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
storeMappingData(){
|
|
|
|
|
|
|
|
int32_t pos = -1;
|
|
|
|
const UHashElement* element = NULL;
|
|
|
|
ValueStruct* value = NULL;
|
|
|
|
int32_t codepoint = 0;
|
2009-02-02 16:16:07 +00:00
|
|
|
int32_t elementCount = 0;
|
2003-07-24 23:23:19 +00:00
|
|
|
int32_t writtenElementCount = 0;
|
|
|
|
int32_t mappingLength = 1; /* minimum mapping length */
|
|
|
|
int32_t oldMappingLength = 0;
|
|
|
|
uint16_t trieWord =0;
|
|
|
|
int32_t limitIndex = 0;
|
|
|
|
|
2009-02-02 16:16:07 +00:00
|
|
|
if (hashTable == NULL) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
elementCount = uhash_count(hashTable);
|
|
|
|
|
|
|
|
/*initialize the mapping data */
|
2011-11-30 17:52:09 +00:00
|
|
|
mappingData = (uint16_t*) uprv_calloc(mappingDataCapacity, U_SIZEOF_UCHAR);
|
2003-07-24 23:23:19 +00:00
|
|
|
|
|
|
|
while(writtenElementCount < elementCount){
|
|
|
|
|
|
|
|
while( (element = uhash_nextElement(hashTable, &pos))!=NULL){
|
|
|
|
|
|
|
|
codepoint = element->key.integer;
|
|
|
|
value = (ValueStruct*)element->value.pointer;
|
|
|
|
|
|
|
|
/* store the start of indexes */
|
|
|
|
if(oldMappingLength != mappingLength){
|
|
|
|
/* Assume that index[] is used according to the enums defined */
|
|
|
|
if(oldMappingLength <=_SPREP_MAX_INDEX_TOP_LENGTH){
|
|
|
|
indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex;
|
|
|
|
}
|
|
|
|
if(oldMappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH &&
|
|
|
|
mappingLength == _SPREP_MAX_INDEX_TOP_LENGTH +1){
|
|
|
|
|
|
|
|
limitIndex = currentIndex;
|
|
|
|
|
|
|
|
}
|
|
|
|
oldMappingLength = mappingLength;
|
|
|
|
}
|
|
|
|
|
|
|
|
if(value->length == mappingLength){
|
|
|
|
uint32_t savedTrieWord = 0;
|
|
|
|
trieWord = currentIndex << 2;
|
|
|
|
/* turn on the 2nd bit to signal that the following bits contain an index */
|
|
|
|
trieWord += 0x02;
|
|
|
|
|
|
|
|
if(trieWord > _SPREP_TYPE_THRESHOLD){
|
|
|
|
fprintf(stderr,"trieWord cannot contain value greater than 0x%04X.\n",_SPREP_TYPE_THRESHOLD);
|
|
|
|
exit(U_ILLEGAL_CHAR_FOUND);
|
|
|
|
}
|
|
|
|
/* figure out if the code point has type already stored */
|
|
|
|
savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL);
|
|
|
|
if(savedTrieWord!=0){
|
|
|
|
if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){
|
|
|
|
/* turn on the first bit in trie word */
|
|
|
|
trieWord += 0x01;
|
|
|
|
}else{
|
|
|
|
/*
|
|
|
|
* the codepoint has value something other than prohibited
|
|
|
|
* and a mapping .. error!
|
|
|
|
*/
|
2004-05-30 09:03:43 +00:00
|
|
|
fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint);
|
2003-07-24 23:23:19 +00:00
|
|
|
exit(U_ILLEGAL_ARGUMENT_ERROR);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* now set the value in the trie */
|
|
|
|
if(!utrie_set32(sprepTrie,codepoint,trieWord)){
|
|
|
|
fprintf(stderr,"Could not set the value for code point.\n");
|
|
|
|
exit(U_ILLEGAL_ARGUMENT_ERROR);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* written the trie word for the codepoint... increment the count*/
|
|
|
|
writtenElementCount++;
|
|
|
|
|
|
|
|
/* sanity check are we exceeding the max number allowed */
|
|
|
|
if(currentIndex+value->length+1 > _SPREP_MAX_INDEX_VALUE){
|
2012-05-25 15:45:35 +00:00
|
|
|
fprintf(stderr, "Too many entries in the mapping table %i. Maximum allowed is %i\n",
|
|
|
|
currentIndex+value->length, _SPREP_MAX_INDEX_VALUE);
|
2003-07-24 23:23:19 +00:00
|
|
|
exit(U_INDEX_OUTOFBOUNDS_ERROR);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* copy the mapping data */
|
2012-05-25 15:45:35 +00:00
|
|
|
/* write the length */
|
|
|
|
if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){
|
|
|
|
/* the cast here is safe since we donot expect the length to be > 65535 */
|
|
|
|
mappingData[currentIndex++] = (uint16_t) mappingLength;
|
|
|
|
}
|
|
|
|
/* copy the contents to mappindData array */
|
|
|
|
uprv_memmove(mappingData+currentIndex, value->mapping, value->length*U_SIZEOF_UCHAR);
|
|
|
|
currentIndex += value->length;
|
|
|
|
if (currentIndex > mappingDataCapacity) {
|
|
|
|
/* If this happens there is a bug in the computation of the mapping data size in storeMapping() */
|
|
|
|
fprintf(stderr, "gensprep, fatal error at %s, %d. Aborting.\n", __FILE__, __LINE__);
|
2012-05-25 18:49:04 +00:00
|
|
|
exit(U_INTERNAL_PROGRAM_ERROR);
|
2003-07-24 23:23:19 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mappingLength++;
|
|
|
|
pos = -1;
|
|
|
|
}
|
|
|
|
/* set the last length for range check */
|
|
|
|
if(mappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH){
|
|
|
|
indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex+1;
|
|
|
|
}else{
|
|
|
|
indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] = limitIndex;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
extern void setOptions(int32_t options){
|
|
|
|
indexes[_SPREP_OPTIONS] = options;
|
|
|
|
}
|
|
|
|
extern void
|
|
|
|
storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length,
|
|
|
|
UStringPrepType type, UErrorCode* status){
|
|
|
|
|
|
|
|
|
|
|
|
UChar* map = NULL;
|
2012-05-25 18:49:04 +00:00
|
|
|
int16_t adjustedLen=0, i;
|
2003-07-24 23:23:19 +00:00
|
|
|
uint16_t trieWord = 0;
|
|
|
|
ValueStruct *value = NULL;
|
|
|
|
uint32_t savedTrieWord = 0;
|
|
|
|
|
|
|
|
/* initialize the hashtable */
|
|
|
|
if(hashTable==NULL){
|
2005-11-11 19:23:09 +00:00
|
|
|
hashTable = uhash_open(hashEntry, compareEntries, NULL, status);
|
2003-07-24 23:23:19 +00:00
|
|
|
uhash_setValueDeleter(hashTable, valueDeleter);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* figure out if the code point has type already stored */
|
|
|
|
savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL);
|
|
|
|
if(savedTrieWord!=0){
|
|
|
|
if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){
|
|
|
|
/* turn on the first bit in trie word */
|
|
|
|
trieWord += 0x01;
|
|
|
|
}else{
|
|
|
|
/*
|
|
|
|
* the codepoint has value something other than prohibited
|
|
|
|
* and a mapping .. error!
|
|
|
|
*/
|
2004-05-30 09:03:43 +00:00
|
|
|
fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint);
|
2003-07-24 23:23:19 +00:00
|
|
|
exit(U_ILLEGAL_ARGUMENT_ERROR);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* figure out the real length */
|
|
|
|
for(i=0; i<length; i++){
|
2012-05-25 18:49:04 +00:00
|
|
|
adjustedLen += U16_LENGTH(mapping[i]);
|
2003-07-24 23:23:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if(adjustedLen == 0){
|
|
|
|
trieWord = (uint16_t)(_SPREP_MAX_INDEX_VALUE << 2);
|
|
|
|
/* make sure that the value of trieWord is less than the threshold */
|
|
|
|
if(trieWord < _SPREP_TYPE_THRESHOLD){
|
|
|
|
/* now set the value in the trie */
|
|
|
|
if(!utrie_set32(sprepTrie,codepoint,trieWord)){
|
|
|
|
fprintf(stderr,"Could not set the value for code point.\n");
|
|
|
|
exit(U_ILLEGAL_ARGUMENT_ERROR);
|
|
|
|
}
|
|
|
|
/* value is set so just return */
|
|
|
|
return;
|
|
|
|
}else{
|
|
|
|
fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD);
|
|
|
|
exit(U_ILLEGAL_CHAR_FOUND);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if(adjustedLen == 1){
|
|
|
|
/* calculate the delta */
|
2003-08-17 08:04:31 +00:00
|
|
|
int16_t delta = (int16_t)((int32_t)codepoint - (int16_t) mapping[0]);
|
2003-07-24 23:23:19 +00:00
|
|
|
if(delta >= SPREP_DELTA_RANGE_NEGATIVE_LIMIT && delta <= SPREP_DELTA_RANGE_POSITIVE_LIMIT){
|
|
|
|
|
|
|
|
trieWord = delta << 2;
|
|
|
|
|
|
|
|
|
|
|
|
/* make sure that the second bit is OFF */
|
|
|
|
if((trieWord & 0x02) != 0 ){
|
|
|
|
fprintf(stderr,"The second bit in the trie word is not zero while storing a delta.\n");
|
|
|
|
exit(U_INTERNAL_PROGRAM_ERROR);
|
|
|
|
}
|
|
|
|
/* make sure that the value of trieWord is less than the threshold */
|
|
|
|
if(trieWord < _SPREP_TYPE_THRESHOLD){
|
|
|
|
/* now set the value in the trie */
|
|
|
|
if(!utrie_set32(sprepTrie,codepoint,trieWord)){
|
|
|
|
fprintf(stderr,"Could not set the value for code point.\n");
|
|
|
|
exit(U_ILLEGAL_ARGUMENT_ERROR);
|
|
|
|
}
|
|
|
|
/* value is set so just return */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* if the delta is not in the given range or if the trieWord is larger than the threshold
|
|
|
|
* just fall through for storing the mapping in the mapping table
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
|
2011-11-30 17:52:09 +00:00
|
|
|
map = (UChar*) uprv_calloc(adjustedLen + 1, U_SIZEOF_UCHAR);
|
2003-07-24 23:23:19 +00:00
|
|
|
|
2012-05-25 18:49:04 +00:00
|
|
|
for (i=0; i<length;) {
|
|
|
|
UChar32 c = mapping[i];
|
|
|
|
U16_APPEND_UNSAFE(map, i, c);
|
2003-07-24 23:23:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
value = (ValueStruct*) uprv_malloc(sizeof(ValueStruct));
|
|
|
|
value->mapping = map;
|
2012-05-25 15:45:35 +00:00
|
|
|
value->type = type;
|
2003-07-24 23:23:19 +00:00
|
|
|
value->length = adjustedLen;
|
|
|
|
if(value->length > _SPREP_MAX_INDEX_TOP_LENGTH){
|
|
|
|
mappingDataCapacity++;
|
|
|
|
}
|
|
|
|
if(maxLength < value->length){
|
|
|
|
maxLength = value->length;
|
|
|
|
}
|
|
|
|
uhash_iput(hashTable,codepoint,value,status);
|
|
|
|
mappingDataCapacity += adjustedLen;
|
|
|
|
|
|
|
|
if(U_FAILURE(*status)){
|
|
|
|
fprintf(stderr, "Failed to put entries into the hastable. Error: %s\n", u_errorName(*status));
|
|
|
|
exit(*status);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
extern void
|
|
|
|
storeRange(uint32_t start, uint32_t end, UStringPrepType type,UErrorCode* status){
|
|
|
|
uint16_t trieWord = 0;
|
|
|
|
|
2005-09-16 18:16:14 +00:00
|
|
|
if((int)(_SPREP_TYPE_THRESHOLD + type) > 0xFFFF){
|
2003-07-24 23:23:19 +00:00
|
|
|
fprintf(stderr,"trieWord cannot contain value greater than 0xFFFF.\n");
|
|
|
|
exit(U_ILLEGAL_CHAR_FOUND);
|
|
|
|
}
|
2005-09-16 18:16:14 +00:00
|
|
|
trieWord = (_SPREP_TYPE_THRESHOLD + type); /* the top 4 bits contain the value */
|
2003-07-24 23:23:19 +00:00
|
|
|
if(start == end){
|
|
|
|
uint32_t savedTrieWord = utrie_get32(sprepTrie, start, NULL);
|
|
|
|
if(savedTrieWord>0){
|
|
|
|
if(savedTrieWord < _SPREP_TYPE_THRESHOLD && type == USPREP_PROHIBITED){
|
|
|
|
/*
|
|
|
|
* A mapping is stored in the trie word
|
|
|
|
* and the only other possible type that a
|
|
|
|
* code point can have is USPREP_PROHIBITED
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* turn on the 0th bit in the savedTrieWord */
|
|
|
|
savedTrieWord += 0x01;
|
|
|
|
|
|
|
|
/* the downcast is safe since we only save 16 bit values */
|
|
|
|
trieWord = (uint16_t)savedTrieWord;
|
|
|
|
|
|
|
|
/* make sure that the value of trieWord is less than the threshold */
|
|
|
|
if(trieWord < _SPREP_TYPE_THRESHOLD){
|
|
|
|
/* now set the value in the trie */
|
|
|
|
if(!utrie_set32(sprepTrie,start,trieWord)){
|
|
|
|
fprintf(stderr,"Could not set the value for code point.\n");
|
|
|
|
exit(U_ILLEGAL_ARGUMENT_ERROR);
|
|
|
|
}
|
|
|
|
/* value is set so just return */
|
|
|
|
return;
|
|
|
|
}else{
|
|
|
|
fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD);
|
|
|
|
exit(U_ILLEGAL_CHAR_FOUND);
|
|
|
|
}
|
|
|
|
|
|
|
|
}else if(savedTrieWord != trieWord){
|
2004-05-30 09:03:43 +00:00
|
|
|
fprintf(stderr,"Value for codepoint \\U%08X already set!.\n", (int)start);
|
2003-07-24 23:23:19 +00:00
|
|
|
exit(U_ILLEGAL_ARGUMENT_ERROR);
|
|
|
|
}
|
|
|
|
/* if savedTrieWord == trieWord .. fall through and set the value */
|
|
|
|
}
|
|
|
|
if(!utrie_set32(sprepTrie,start,trieWord)){
|
2004-05-30 09:03:43 +00:00
|
|
|
fprintf(stderr,"Could not set the value for code point \\U%08X.\n", (int)start);
|
2003-07-24 23:23:19 +00:00
|
|
|
exit(U_ILLEGAL_ARGUMENT_ERROR);
|
|
|
|
}
|
|
|
|
}else{
|
|
|
|
if(!utrie_setRange32(sprepTrie, start, end+1, trieWord, FALSE)){
|
|
|
|
fprintf(stderr,"Value for certain codepoint already set.\n");
|
|
|
|
exit(U_ILLEGAL_CHAR_FOUND);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
/* folding value: just store the offset (16 bits) if there is any non-0 entry */
|
|
|
|
static uint32_t U_CALLCONV
|
|
|
|
getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) {
|
2012-02-06 19:57:08 +00:00
|
|
|
uint32_t value;
|
2003-07-24 23:23:19 +00:00
|
|
|
UChar32 limit=0;
|
|
|
|
UBool inBlockZero;
|
|
|
|
|
|
|
|
limit=start+0x400;
|
|
|
|
while(start<limit) {
|
|
|
|
value=utrie_get32(trie, start, &inBlockZero);
|
|
|
|
if(inBlockZero) {
|
|
|
|
start+=UTRIE_DATA_BLOCK_LENGTH;
|
|
|
|
} else if(value!=0) {
|
|
|
|
return (uint32_t)offset;
|
|
|
|
} else {
|
|
|
|
++start;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* #if !UCONFIG_NO_IDNA */
|
|
|
|
|
|
|
|
extern void
|
2006-02-24 19:47:47 +00:00
|
|
|
generateData(const char *dataDir, const char* bundleName) {
|
2003-07-24 23:23:19 +00:00
|
|
|
static uint8_t sprepTrieBlock[100000];
|
|
|
|
|
|
|
|
UNewDataMemory *pData;
|
|
|
|
UErrorCode errorCode=U_ZERO_ERROR;
|
|
|
|
int32_t size, dataLength;
|
|
|
|
char* fileName = (char*) uprv_malloc(uprv_strlen(bundleName) +100);
|
|
|
|
|
|
|
|
#if UCONFIG_NO_IDNA
|
|
|
|
|
|
|
|
size=0;
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
int32_t sprepTrieSize;
|
|
|
|
|
|
|
|
/* sort and add mapping data */
|
|
|
|
storeMappingData();
|
|
|
|
|
|
|
|
sprepTrieSize=utrie_serialize(sprepTrie, sprepTrieBlock, sizeof(sprepTrieBlock), getFoldedValue, TRUE, &errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
fprintf(stderr, "error: utrie_serialize(sprep trie) failed, %s\n", u_errorName(errorCode));
|
|
|
|
exit(errorCode);
|
|
|
|
}
|
|
|
|
|
|
|
|
size = sprepTrieSize + mappingDataCapacity*U_SIZEOF_UCHAR + sizeof(indexes);
|
|
|
|
if(beVerbose) {
|
2004-05-30 09:03:43 +00:00
|
|
|
printf("size of sprep trie %5u bytes\n", (int)sprepTrieSize);
|
2003-07-24 23:23:19 +00:00
|
|
|
printf("size of " U_ICUDATA_NAME "_%s." DATA_TYPE " contents: %ld bytes\n", bundleName,(long)size);
|
2004-05-30 09:03:43 +00:00
|
|
|
printf("size of mapping data array %5u bytes\n",(int)mappingDataCapacity * U_SIZEOF_UCHAR);
|
2003-07-24 23:23:19 +00:00
|
|
|
printf("Number of code units in mappingData (currentIndex) are: %i \n", currentIndex);
|
2004-05-30 09:03:43 +00:00
|
|
|
printf("Maximum length of the mapping string is : %i \n", (int)maxLength);
|
2003-07-24 23:23:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
2004-04-14 20:08:16 +00:00
|
|
|
|
2006-02-24 19:47:47 +00:00
|
|
|
fileName[0]=0;
|
2003-07-24 23:23:19 +00:00
|
|
|
uprv_strcat(fileName,bundleName);
|
|
|
|
/* write the data */
|
|
|
|
pData=udata_create(dataDir, DATA_TYPE, fileName, &dataInfo,
|
|
|
|
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
fprintf(stderr, "gensprep: unable to create the output file, error %d\n", errorCode);
|
|
|
|
exit(errorCode);
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !UCONFIG_NO_IDNA
|
|
|
|
|
|
|
|
indexes[_SPREP_INDEX_TRIE_SIZE]=sprepTrieSize;
|
|
|
|
indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]=mappingDataCapacity*U_SIZEOF_UCHAR;
|
|
|
|
|
|
|
|
udata_writeBlock(pData, indexes, sizeof(indexes));
|
|
|
|
udata_writeBlock(pData, sprepTrieBlock, sprepTrieSize);
|
|
|
|
udata_writeBlock(pData, mappingData, indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]);
|
|
|
|
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* finish up */
|
|
|
|
dataLength=udata_finish(pData, &errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
fprintf(stderr, "gensprep: error %d writing the output file\n", errorCode);
|
|
|
|
exit(errorCode);
|
|
|
|
}
|
|
|
|
|
|
|
|
if(dataLength!=size) {
|
|
|
|
fprintf(stderr, "gensprep error: data length %ld != calculated size %ld\n",
|
|
|
|
(long)dataLength, (long)size);
|
|
|
|
exit(U_INTERNAL_PROGRAM_ERROR);
|
|
|
|
}
|
|
|
|
|
2003-12-11 01:47:03 +00:00
|
|
|
#if !UCONFIG_NO_IDNA
|
2003-07-24 23:23:19 +00:00
|
|
|
/* done with writing the data .. close the hashtable */
|
2009-02-02 16:16:07 +00:00
|
|
|
if (hashTable != NULL) {
|
|
|
|
uhash_close(hashTable);
|
|
|
|
}
|
2003-12-11 01:47:03 +00:00
|
|
|
#endif
|
2012-05-17 19:58:22 +00:00
|
|
|
|
|
|
|
uprv_free(fileName);
|
2003-07-24 23:23:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#if !UCONFIG_NO_IDNA
|
|
|
|
|
|
|
|
extern void
|
|
|
|
cleanUpData(void) {
|
2012-05-17 19:58:22 +00:00
|
|
|
uprv_free(mappingData);
|
2003-07-24 23:23:19 +00:00
|
|
|
utrie_close(sprepTrie);
|
|
|
|
uprv_free(sprepTrie);
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* #if !UCONFIG_NO_IDNA */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Hey, Emacs, please set the following:
|
|
|
|
*
|
|
|
|
* Local Variables:
|
|
|
|
* indent-tabs-mode: nil
|
|
|
|
* End:
|
|
|
|
*
|
|
|
|
*/
|