09af16f87e
X-SVN-Rev: 24702
860 lines
26 KiB
C++
860 lines
26 KiB
C++
/*
|
|
*******************************************************************************
|
|
*
|
|
* Copyright (C) 2008, International Business Machines
|
|
* Corporation, Google and others. All Rights Reserved.
|
|
*
|
|
*******************************************************************************
|
|
*/
|
|
// Author : eldawy@google.com (Mohamed Eldawy)
|
|
// ucnvsel.cpp
|
|
//
|
|
// Purpose: To generate a list of encodings capable of handling
|
|
// a given Unicode text
|
|
//
|
|
// Started 09-April-2008
|
|
|
|
/**
|
|
* \file
|
|
*
|
|
* This is an implementation of an encoding selector.
|
|
* The goal is, given a unicode string, find the encodings
|
|
* this string can be mapped to. To make processing faster
|
|
* a trie is built when you call ucnvsel_open() that
|
|
* stores all encodings a codepoint can map to
|
|
*/
|
|
|
|
#include "unicode/ucnvsel.h"
|
|
|
|
#include <string.h>
|
|
|
|
#include "unicode/uchar.h"
|
|
#include "unicode/uniset.h"
|
|
#include "unicode/ucnv.h"
|
|
#include "unicode/ustring.h"
|
|
#include "unicode/uchriter.h"
|
|
#include "utrie.h"
|
|
#include "propsvec.h"
|
|
#include "uenumimp.h"
|
|
#include "cmemory.h"
|
|
#include "cstring.h"
|
|
|
|
|
|
U_NAMESPACE_USE
|
|
|
|
// Upper bound used when opening the trie builder (see utrie_open() in
// generateSelectorData()).
// The bound was found by serializing a trie for all fallback mappings and for
// all roundtrip mappings and taking the larger of the two results. The largest
// size actually observed was around 220000, so this value leaves roughly 30KB
// of headroom; the intent is that things keep working even if many more
// converters are added to ICU.
#define CAPACITY 250000
|
|
|
|
|
|
struct UConverterSelector {
|
|
uint8_t* serializedTrie;
|
|
uint32_t serializedTrieSize;
|
|
UTrie constructedTrie; // 16 bit trie containing offsets into pv
|
|
uint32_t* pv; // table of bits!
|
|
int32_t pvCount;
|
|
char** encodings; // which encodings did user ask to use?
|
|
int32_t encodingsCount;
|
|
};
|
|
|
|
|
|
/* internal function */
|
|
void generateSelectorData(UConverterSelector* result,
|
|
const USet* excludedEncodings,
|
|
const UConverterUnicodeSet whichSet,
|
|
UErrorCode* status);
|
|
|
|
|
|
U_CAPI int32_t ucnvsel_swap(const UDataSwapper *ds,
|
|
const void *inData,
|
|
int32_t length,
|
|
void *outData,
|
|
UErrorCode *status);
|
|
|
|
|
|
/* open a selector. If converterList is NULL, build for all converters.
|
|
If excludedCodePoints is NULL, don't exclude any codepoints */
|
|
U_CAPI UConverterSelector* ucnvsel_open(const char* const* converterList,
|
|
int32_t converterListSize,
|
|
const USet* excludedCodePoints,
|
|
const UConverterUnicodeSet whichSet,
|
|
UErrorCode* status ) {
|
|
// allocate a new converter
|
|
UConverterSelector* newSelector;
|
|
int32_t i; // for loop counter
|
|
|
|
// the compiler should realize the tail recursion here and optimize
|
|
// accordingly. This call is to get around the constness of
|
|
// converterList by smallest amount of code modification
|
|
if(converterListSize == 0 && converterList != NULL) {
|
|
return ucnvsel_open(NULL, 0, excludedCodePoints, whichSet, status);
|
|
}
|
|
|
|
// check if already failed
|
|
if (U_FAILURE(*status)) {
|
|
return NULL;
|
|
}
|
|
// ensure args make sense!
|
|
if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) {
|
|
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
newSelector = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector));
|
|
if (!newSelector) {
|
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
|
return NULL;
|
|
}
|
|
uprv_memset(newSelector, 0, sizeof(UConverterSelector));
|
|
|
|
// make a backup copy of the list of converters
|
|
if (converterList != NULL && converterListSize > 0) {
|
|
newSelector->encodings =
|
|
(char**)uprv_malloc(converterListSize*sizeof(char*));
|
|
// out of memory. Give user back the 100 bytes or so
|
|
// we allocated earlier, and wish them good luck ;)
|
|
if (!newSelector->encodings) {
|
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
|
uprv_free(newSelector);
|
|
return NULL;
|
|
}
|
|
|
|
char* allStrings = NULL;
|
|
int32_t totalSize = 0;
|
|
for (i = 0 ; i < converterListSize ; i++) {
|
|
totalSize += uprv_strlen(converterList[i])+1;
|
|
}
|
|
allStrings = (char*) uprv_malloc(totalSize);
|
|
//out of memory :(
|
|
if (!allStrings) {
|
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
|
uprv_free(newSelector->encodings);
|
|
uprv_free(newSelector);
|
|
return NULL;
|
|
}
|
|
|
|
for (i = 0 ; i < converterListSize ; i++) {
|
|
newSelector->encodings[i] = allStrings;
|
|
uprv_strcpy(newSelector->encodings[i], converterList[i]);
|
|
allStrings += uprv_strlen(newSelector->encodings[i]) + 1; // calling strlen
|
|
// twice per string is probably faster than allocating memory to
|
|
// cache the lengths!
|
|
}
|
|
} else {
|
|
int32_t count = ucnv_countAvailable();
|
|
newSelector->encodings =
|
|
(char**)uprv_malloc(ucnv_countAvailable()*sizeof(char*));
|
|
// out of memory. Give user back the 100 bytes or so
|
|
// we allocated earlier, and wish them good luck ;)
|
|
if (!newSelector->encodings) {
|
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
|
uprv_free(newSelector);
|
|
return NULL;
|
|
}
|
|
char* allStrings = NULL;
|
|
int32_t totalSize = 0;
|
|
for (i = 0 ; i < count ; i++) {
|
|
const char* conv_moniker = ucnv_getAvailableName(i);
|
|
totalSize += uprv_strlen(conv_moniker)+1;
|
|
}
|
|
allStrings = (char*) uprv_malloc(totalSize);
|
|
//out of memory :(
|
|
if (!allStrings) {
|
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
|
uprv_free(newSelector->encodings);
|
|
uprv_free(newSelector);
|
|
return NULL;
|
|
}
|
|
for (i = 0 ; i < count ; i++) {
|
|
const char* conv_moniker = ucnv_getAvailableName(i);
|
|
newSelector->encodings[i] = allStrings;
|
|
uprv_strcpy(newSelector->encodings[i], conv_moniker);
|
|
allStrings += uprv_strlen(conv_moniker) + 1; // calling strlen twice per
|
|
// string is probably faster than allocating memory to cache the
|
|
// lengths!
|
|
}
|
|
converterListSize = ucnv_countAvailable();
|
|
}
|
|
|
|
newSelector->encodingsCount = converterListSize;
|
|
generateSelectorData(newSelector, excludedCodePoints, whichSet, status);
|
|
|
|
if (U_FAILURE(*status)) {
|
|
// at this point, we know pv and encodings have been allocated. No harm in
|
|
// calling ucnv_closeSelector()
|
|
ucnvsel_close(newSelector);
|
|
return NULL;
|
|
}
|
|
|
|
return newSelector;
|
|
}
|
|
|
|
|
|
/* close opened selector */
|
|
U_CAPI void ucnvsel_close(UConverterSelector *sel) {
|
|
if (!sel) {
|
|
return;
|
|
}
|
|
uprv_free(sel->encodings[0]);
|
|
uprv_free(sel->encodings);
|
|
upvec_close(sel->pv);
|
|
if (sel->serializedTrie) { // this can be reached when
|
|
// generateSelectorData() has failed, and
|
|
// the trie is not serialized yet!
|
|
uprv_free(sel->serializedTrie);
|
|
}
|
|
uprv_free(sel);
|
|
}
|
|
|
|
/* unserialize a selector */
|
|
U_CAPI UConverterSelector* ucnvsel_unserialize(const char* buffer,
|
|
int32_t length,
|
|
UErrorCode* status) {
|
|
// check if already failed
|
|
if (U_FAILURE(*status)) {
|
|
return NULL;
|
|
}
|
|
// ensure args make sense!
|
|
if (buffer == NULL || length <= 0) {
|
|
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return NULL;
|
|
}
|
|
|
|
UConverterSelector* sel;
|
|
int32_t i = 0; // for the for loop
|
|
// check length!
|
|
if (length < sizeof(int32_t) * 3) {
|
|
*status = U_INVALID_FORMAT_ERROR;
|
|
return NULL;
|
|
}
|
|
uint32_t sig, ASCIIness;
|
|
|
|
memcpy(&sig, buffer, sizeof(int32_t));
|
|
buffer += sizeof(uint32_t);
|
|
memcpy(&ASCIIness, buffer, sizeof(int32_t));
|
|
buffer += sizeof(uint32_t);
|
|
// at this point, we don't know what the endianness or Asciiness of
|
|
// our system or data is. Detect everything!
|
|
// notice that a little trick is used here to save work. We don't actually
|
|
// detect endianness of the machine or of the data. We simply detect
|
|
// if the 2 are reversed. If they are, we send flags to udata_openSwapper()
|
|
// to indicate we need endian swapping. Those params are not REALLY
|
|
// the machine and data endianness
|
|
UBool dataEndianness = FALSE;
|
|
//if endianness need to be reversed
|
|
if (sig == 0x99887766) {
|
|
dataEndianness = TRUE;
|
|
} else if (sig != 0x66778899) {
|
|
*status = U_INVALID_FORMAT_ERROR;
|
|
return NULL;
|
|
}
|
|
|
|
int32_t dataASCIIness = ASCIIness;
|
|
if(dataEndianness) {
|
|
//need to convert ASCIIness before using it!
|
|
dataASCIIness = ((char*)&ASCIIness)[3];
|
|
}
|
|
int32_t machineASCIIness = U_CHARSET_FAMILY;
|
|
|
|
//now, we have everything!!
|
|
if(dataEndianness ||
|
|
dataASCIIness != machineASCIIness) {
|
|
//construct a data swapper!
|
|
UDataSwapper *ds;
|
|
|
|
ds=udata_openSwapper(dataEndianness, dataASCIIness, FALSE, machineASCIIness, status);
|
|
char* newBuffer = (char*)uprv_malloc(length);
|
|
if(!newBuffer) {
|
|
udata_closeSwapper(ds);
|
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
|
return NULL;
|
|
}
|
|
//can we pass buffer twice to swap in place?
|
|
ucnvsel_swap(ds, buffer, length, newBuffer, status);
|
|
buffer = newBuffer;
|
|
udata_closeSwapper(ds);
|
|
}
|
|
|
|
length -= 3 * sizeof(int32_t); //sig, Asciiness, and pvCount
|
|
// end of check length!
|
|
|
|
sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector));
|
|
//out of memory :(
|
|
if (!sel) {
|
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
|
return NULL;
|
|
}
|
|
uprv_memset(sel, 0, sizeof(UConverterSelector));
|
|
|
|
memcpy(&sel->pvCount, buffer, sizeof(int32_t));
|
|
buffer+=sizeof(int32_t);
|
|
|
|
// check length
|
|
if (length < (sel->pvCount+1)*sizeof(uint32_t)) {
|
|
uprv_free(sel);
|
|
*status = U_INVALID_FORMAT_ERROR;
|
|
return NULL;
|
|
}
|
|
length -= (sel->pvCount+1)*sizeof(uint32_t);
|
|
// end of check length
|
|
|
|
sel->pv = (uint32_t*)uprv_malloc(sel->pvCount*sizeof(uint32_t));
|
|
if(!sel->pv) {
|
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
|
uprv_free(sel);
|
|
return NULL;
|
|
}
|
|
|
|
memcpy(sel->pv, buffer, sel->pvCount*sizeof(uint32_t));
|
|
buffer += sel->pvCount*sizeof(uint32_t);
|
|
|
|
int32_t encodingsLength;
|
|
memcpy(&encodingsLength, buffer, sizeof(int32_t));
|
|
buffer += sizeof(int32_t);
|
|
char* tempEncodings = (char*) uprv_malloc(encodingsLength+1);
|
|
if(!tempEncodings) {
|
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
|
uprv_free(sel);
|
|
uprv_free(sel->pv);
|
|
return NULL;
|
|
}
|
|
|
|
memcpy(tempEncodings, buffer, encodingsLength);
|
|
tempEncodings[encodingsLength] = 0;
|
|
buffer += encodingsLength;
|
|
// count how many strings are there!
|
|
int32_t numStrings = 0;
|
|
for (int32_t i = 0 ; i < encodingsLength + 1 ; i++) {
|
|
if (tempEncodings[i] == 0) {
|
|
numStrings++;
|
|
}
|
|
}
|
|
sel->encodingsCount = numStrings;
|
|
sel->encodings = (char**) uprv_malloc(numStrings * sizeof(char*));
|
|
if(!sel->encodings) {
|
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
|
uprv_free(sel);
|
|
uprv_free(sel->pv);
|
|
uprv_free(tempEncodings);
|
|
return NULL;
|
|
}
|
|
|
|
int32_t curString = 0;
|
|
sel->encodings[0] = tempEncodings;
|
|
for (i = 0 ; i < encodingsLength ; i++) {
|
|
if (tempEncodings[i] == 0) {
|
|
sel->encodings[++curString] = tempEncodings+i+1;
|
|
}
|
|
}
|
|
|
|
// check length
|
|
if (length < sizeof(uint32_t)) {
|
|
uprv_free(sel->pv);
|
|
uprv_free(tempEncodings);
|
|
uprv_free(sel->encodings);
|
|
uprv_free(sel);
|
|
*status = U_INVALID_FORMAT_ERROR;
|
|
return NULL;
|
|
}
|
|
length -= sizeof(uint32_t);
|
|
// end of check length
|
|
|
|
// the trie
|
|
memcpy(&sel->serializedTrieSize, buffer, sizeof(uint32_t));
|
|
buffer += sizeof(uint32_t);
|
|
|
|
// check length
|
|
if (length < sel->serializedTrieSize) {
|
|
uprv_free(sel->pv);
|
|
uprv_free(tempEncodings);
|
|
uprv_free(sel->encodings);
|
|
uprv_free(sel);
|
|
*status = U_INVALID_FORMAT_ERROR;
|
|
return NULL;
|
|
}
|
|
length -= sizeof(uint32_t);
|
|
// end of check length
|
|
|
|
sel->serializedTrie = (uint8_t*) uprv_malloc(sel->serializedTrieSize);
|
|
if(!sel->serializedTrie) {
|
|
uprv_free(sel->pv);
|
|
uprv_free(tempEncodings);
|
|
uprv_free(sel->encodings);
|
|
uprv_free(sel);
|
|
*status = U_MEMORY_ALLOCATION_ERROR;
|
|
return NULL;
|
|
}
|
|
memcpy(sel->serializedTrie, buffer, sel->serializedTrieSize);
|
|
// unserialize!
|
|
utrie_unserialize(&sel->constructedTrie, sel->serializedTrie,
|
|
sel->serializedTrieSize, status);
|
|
|
|
return sel;
|
|
}
|
|
|
|
/* serialize a selector */
|
|
U_CAPI int32_t ucnvsel_serialize(const UConverterSelector* sel,
|
|
char* buffer,
|
|
int32_t bufferCapacity,
|
|
UErrorCode* status) {
|
|
// compute size and make sure it fits
|
|
int32_t totalSize;
|
|
int32_t encodingStrLength = 0;
|
|
|
|
// check if already failed
|
|
if (U_FAILURE(*status)) {
|
|
return 0;
|
|
}
|
|
// ensure args make sense!
|
|
if (sel == NULL || bufferCapacity < 0) {
|
|
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return 0;
|
|
}
|
|
//utrie_swap(ds, inDa
|
|
totalSize = sizeof(uint32_t) /*signature*/+sizeof(uint32_t) /*ASCIIness*/+
|
|
sizeof(uint32_t)*sel->pvCount /*pv*/+ sizeof(uint32_t) /*pvCount*/+
|
|
sizeof(uint32_t) /*serializedTrieSize*/+ sel->serializedTrieSize /*trie*/;
|
|
|
|
// this is a multi-string! strlen() will stop at the first one
|
|
encodingStrLength =
|
|
uprv_strlen(sel->encodings[sel->encodingsCount-1]) +
|
|
(sel->encodings[sel->encodingsCount-1] - sel->encodings[0]);
|
|
|
|
totalSize += encodingStrLength + sizeof(uint32_t);
|
|
|
|
if (totalSize > bufferCapacity) {
|
|
*status = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
return totalSize;
|
|
}
|
|
// ok, save!
|
|
// 0a. the signature
|
|
uint32_t sig = 0x66778899;
|
|
memcpy(buffer, &sig, sizeof(uint32_t));
|
|
buffer+=sizeof(uint32_t);
|
|
// 0b. ASCIIness
|
|
uint32_t ASCIIness = U_CHARSET_FAMILY;
|
|
memcpy(buffer, &ASCIIness, sizeof(uint32_t));
|
|
buffer+=sizeof(uint32_t);
|
|
|
|
// 1. the array
|
|
memcpy(buffer, &sel->pvCount, sizeof(int32_t));
|
|
buffer+=sizeof(int32_t);
|
|
memcpy(buffer, sel->pv, sel->pvCount*sizeof(int32_t));
|
|
buffer+=sel->pvCount*sizeof(int32_t);
|
|
memcpy(buffer, &encodingStrLength, sizeof(int32_t));
|
|
buffer+=sizeof(int32_t);
|
|
memcpy(buffer, sel->encodings[0], encodingStrLength);
|
|
buffer += encodingStrLength;
|
|
|
|
// the trie
|
|
memcpy(buffer, &sel->serializedTrieSize, sizeof(uint32_t));
|
|
buffer+=sizeof(uint32_t);
|
|
memcpy(buffer, sel->serializedTrie, sel->serializedTrieSize);
|
|
return totalSize;
|
|
}
|
|
|
|
/* internal function! */
|
|
void generateSelectorData(UConverterSelector* result,
|
|
const USet* excludedEncodings,
|
|
const UConverterUnicodeSet whichSet,
|
|
UErrorCode* status) {
|
|
const uint32_t encodingsSize = result->encodingsCount;
|
|
|
|
// 66000 as suggested by Markus [I suggest something like 66000 which
|
|
// exceeds the number of BMP code points. There will be fewer ranges of
|
|
// combinations of encodings. (I believe there are no encodings that have
|
|
// interesting mappings for supplementary code points. All encodings either
|
|
// support all of them or none of them.)]
|
|
result->pv = upvec_open((encodingsSize+31)/32, 66000); // create for all
|
|
// unicode codepoints, and have space for all those bits needed!
|
|
|
|
for (uint32_t i = 0; i < encodingsSize; ++i) {
|
|
uint32_t mask;
|
|
uint32_t column;
|
|
int32_t item_count;
|
|
int32_t j;
|
|
UConverter* test_converter = ucnv_open(result->encodings[i], status);
|
|
if (U_FAILURE(*status)) {
|
|
// status will propagate back to user
|
|
return;
|
|
}
|
|
USet* unicode_point_set;
|
|
unicode_point_set = uset_open(1, 0); // empty set
|
|
|
|
ucnv_getUnicodeSet(test_converter, unicode_point_set,
|
|
whichSet, status);
|
|
|
|
column = i / 32;
|
|
mask = 1 << (i%32);
|
|
// now iterate over intervals on set i!
|
|
item_count = uset_getItemCount(unicode_point_set);
|
|
|
|
for (j = 0; j < item_count; ++j) {
|
|
UChar32 start_char;
|
|
UChar32 end_char;
|
|
UErrorCode smallStatus = U_ZERO_ERROR;
|
|
uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0,
|
|
&smallStatus);
|
|
if (U_FAILURE(smallStatus)) {
|
|
// this will be reached for the converters that fill the set with
|
|
// strings. Those should be ignored by our system
|
|
} else {
|
|
// IMPORTANT: the intervals for usets are INCLUSIVE. However, the
|
|
// intervals for upvec are NOT INCLUSIVE. This is why we need
|
|
// end_char+1 here!
|
|
upvec_setValue(result->pv, start_char, end_char + 1, column, ~0, mask,
|
|
status);
|
|
if (U_FAILURE(*status)) {
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
ucnv_close(test_converter);
|
|
uset_close(unicode_point_set);
|
|
}
|
|
|
|
|
|
// handle excluded encodings! Simply set their values to all 1's in the upvec
|
|
if (excludedEncodings) {
|
|
int32_t item_count = uset_getItemCount(excludedEncodings);
|
|
for (int32_t j = 0; j < item_count; ++j) {
|
|
UChar32 start_char;
|
|
UChar32 end_char;
|
|
|
|
uset_getItem(excludedEncodings, j, &start_char, &end_char, NULL, 0,
|
|
status);
|
|
if (U_FAILURE(*status)) {
|
|
return;
|
|
} else {
|
|
for (uint32_t col = 0 ; col < (encodingsSize+31)/32 ; col++) {
|
|
upvec_setValue(result->pv, start_char, end_char + 1, col, ~0, ~0,
|
|
status);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// alright. Now, let's put things in the same exact form you'd get when you
|
|
// unserialize things.
|
|
UNewTrie* trie = utrie_open(NULL, NULL, CAPACITY, 0, 0, TRUE);
|
|
result->pvCount = upvec_compact(result->pv, upvec_compactToTrieHandler,
|
|
trie, status);
|
|
uint32_t length = utrie_serialize(trie, NULL, 0, NULL, TRUE, status);
|
|
result->serializedTrie = (uint8_t*) uprv_malloc(length);
|
|
length = utrie_serialize(trie, result->serializedTrie, length, NULL, TRUE,
|
|
status);
|
|
result->serializedTrieSize = length;
|
|
utrie_unserialize(&result->constructedTrie, result->serializedTrie, length,
|
|
status);
|
|
utrie_close(trie);
|
|
}
|
|
|
|
|
|
|
|
// a bunch of functions for the enumeration thingie! Nothing fancy here. Just
|
|
// iterate over the selected encodings
|
|
struct Enumerator {
|
|
int16_t* index;
|
|
int16_t length;
|
|
int16_t cur;
|
|
const UConverterSelector* sel;
|
|
};
|
|
|
|
U_CDECL_BEGIN
|
|
|
|
static void U_CALLCONV
|
|
ucnvsel_close_selector_iterator(UEnumeration *enumerator) {
|
|
uprv_free(((Enumerator*)(enumerator->context))->index);
|
|
uprv_free(enumerator->context);
|
|
uprv_free(enumerator);
|
|
}
|
|
|
|
|
|
static int32_t U_CALLCONV
|
|
ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) {
|
|
// check if already failed
|
|
if (U_FAILURE(*status)) {
|
|
return 0;
|
|
}
|
|
return ((Enumerator*)(enumerator->context))->length;
|
|
}
|
|
|
|
|
|
// next callback: returns the name of the next selected encoding and advances,
// or returns NULL once all entries have been returned (or if status already
// indicates a failure). If resultLength is not NULL it receives the length
// of the returned name.
static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator,
                                                    int32_t* resultLength,
                                                    UErrorCode* status) {
  // check if already failed
  if (U_FAILURE(*status)) {
    return NULL;
  }

  Enumerator* ctx = (Enumerator*)(enumerator->context);
  if (ctx->cur >= ctx->length) {
    return NULL;  // nothing left
  }

  // index[] maps the enumeration position to a slot in the selector's names
  const char* name = ctx->sel->encodings[ctx->index[ctx->cur]];
  ctx->cur++;
  if (resultLength != NULL) {
    *resultLength = uprv_strlen(name);
  }
  return name;
}
|
|
|
|
// reset callback: rewinds the enumeration to its first entry; does nothing
// if status already indicates a failure.
static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator,
                                              UErrorCode* status) {
  if (!U_FAILURE(*status)) {
    Enumerator* ctx = (Enumerator*)(enumerator->context);
    ctx->cur = 0;
  }
}
|
|
|
|
U_CDECL_END
|
|
|
|
|
|
// Template that ucnvsel_select() copies into every UEnumeration it creates;
// the copy then gets its own context pointer.
// NOTE(review): the two leading NULL slots are presumably baseContext and
// context, and uenum_unextDefault the UChar-returning "next" -- confirm the
// slot order against uenumimp.h.
static const UEnumeration defaultEncodings = {
  NULL,
  NULL,
  ucnvsel_close_selector_iterator,
  ucnvsel_count_encodings,
  uenum_unextDefault,
  ucnvsel_next_encoding,
  ucnvsel_reset_iterator
};
|
|
|
|
|
|
// internal fn to intersect two sets of masks
|
|
// returns whether the mask has reduced to all zeros
|
|
UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) {
|
|
int32_t i;
|
|
uint32_t oredDest = 0;
|
|
for (i = 0 ; i < len ; ++i) {
|
|
oredDest |= (dest[i] &= source1[i]);
|
|
}
|
|
return oredDest == 0;
|
|
}
|
|
|
|
// Internal helper: returns the total number of 1 bits in the first `len`
// words of mask.
// Algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html
int16_t countOnes(uint32_t* mask, int32_t len) {
  int32_t ones = 0;
  for (int32_t word = 0; word < len; ++word) {
    uint32_t bits = mask[word];
    while (bits != 0) {
      bits &= bits - 1;  // clear the least significant bit set
      ++ones;
    }
  }
  return (int16_t)ones;
}
|
|
|
|
|
|
/* internal function! */
|
|
UEnumeration *ucnvsel_select(const UConverterSelector* sel, const void *s,
|
|
int32_t length, UErrorCode *status, UBool isUTF16) {
|
|
const UChar* utf16buffer = (UChar*) s;
|
|
const char* utf8buffer = (char*) s;
|
|
|
|
UEnumeration *en = NULL;
|
|
uint32_t* mask;
|
|
UChar32 next = 0;
|
|
int32_t offset = 0;
|
|
int32_t i, j;
|
|
|
|
// check if already failed
|
|
if (U_FAILURE(*status)) {
|
|
return NULL;
|
|
}
|
|
// ensure args make sense!
|
|
if (sel == NULL || (s == NULL && length != 0)) {
|
|
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return NULL;
|
|
}
|
|
|
|
// this is the context we will use. Store a table of indices to which
|
|
// encodings are legit.
|
|
struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator));
|
|
result->index = NULL; // this will be allocated later!
|
|
result->length = result->cur = 0;
|
|
result->sel = sel;
|
|
|
|
en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
|
|
memcpy(en, &defaultEncodings, sizeof(UEnumeration));
|
|
en->context = result;
|
|
|
|
mask = (uint32_t*) uprv_malloc((sel->encodingsCount+31)/32 *
|
|
sizeof(uint32_t));
|
|
uprv_memset(mask, ~0, (sel->encodingsCount+31)/32 * sizeof(uint32_t));
|
|
|
|
if(length == -1) {
|
|
if(isUTF16)
|
|
length = u_strlen(utf16buffer);
|
|
else
|
|
length = uprv_strlen(utf8buffer);
|
|
}
|
|
|
|
if(s) {
|
|
while (offset < length) {
|
|
uint16_t result = 0;
|
|
if (isUTF16)
|
|
U16_NEXT(utf16buffer, offset, length, next)
|
|
else
|
|
U8_NEXT(utf8buffer, offset, length, next)
|
|
|
|
if (next != -1) {
|
|
UTRIE_GET16((&sel->constructedTrie), next, result)
|
|
|
|
if (intersectMasks(mask, sel->pv+result, (sel->encodingsCount+31)/32)) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
int16_t numOnes = countOnes(mask, (sel->encodingsCount+31)/32);
|
|
// now, we know the exact space we need for index
|
|
if (numOnes > 0) {
|
|
result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t));
|
|
} //otherwise, index will remain NULL (and will never be touched by
|
|
//the enumerator code anyway)
|
|
|
|
for (j = 0 ; j < (sel->encodingsCount+31)/32 ; j++) {
|
|
for (i = 0 ; i < 32 ; i++) {
|
|
uint32_t v = mask[j] & 1;
|
|
if (v && j*32+i < sel->encodingsCount) {
|
|
result->index[result->length++] = j*32+i;
|
|
}
|
|
mask[j] >>= 1;
|
|
}
|
|
}
|
|
uprv_free(mask);
|
|
return en;
|
|
}
|
|
|
|
/* check a string against the selector - UTF16 version */
|
|
U_CAPI UEnumeration *ucnvsel_selectForString(const UConverterSelector* sel,
|
|
const UChar *s,
|
|
int32_t length,
|
|
UErrorCode *status) {
|
|
return ucnvsel_select(sel, s, length, status, TRUE);
|
|
}
|
|
|
|
/* check a string against the selector - UTF8 version */
|
|
U_CAPI UEnumeration *ucnvsel_selectForUTF8(const UConverterSelector* sel,
|
|
const char *utf8str,
|
|
int32_t length,
|
|
UErrorCode *status) {
|
|
return ucnvsel_select(sel, utf8str, length, status, FALSE);
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
* swap a selector into the desired Endianness and Asciiness of
|
|
* the system. Just as FYI, selectors are always saved in the format
|
|
* of the system that created them. They are only converted if used
|
|
* on another system. In other words, selectors created on different
|
|
* system can be different even if the params are identical (endianness
|
|
* and Asciiness differences only)
|
|
*
|
|
* @param ds pointer to data swapper containing swapping info
|
|
* @param inData pointer to incoming data
|
|
* @param length length of inData in bytes
|
|
* @param outData pointer to output data. Capacity should
|
|
* be at least equal to capacity of inData
|
|
* @param status an in/out ICU UErrorCode
|
|
* @return 0 on failure, number of bytes swapped on success
|
|
* number of bytes swapped can be smaller than length
|
|
*
|
|
*/
|
|
U_CAPI int32_t ucnvsel_swap(const UDataSwapper *ds,
|
|
const void *inData,
|
|
int32_t length,
|
|
void *outData,
|
|
UErrorCode *status) {
|
|
const char* inDataC = (const char*) inData;
|
|
char * outDataC = (char*) outData;
|
|
int32_t passedLength = length;
|
|
//args check
|
|
if(U_FAILURE(*status)) {
|
|
return 0;
|
|
}
|
|
if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
|
|
*status=U_ILLEGAL_ARGUMENT_ERROR;
|
|
return 0;
|
|
}
|
|
|
|
if(length < 3 * sizeof(uint32_t)) {
|
|
* status = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
return 0;
|
|
}
|
|
|
|
ds->swapArray32(ds, inDataC, 3, outDataC, status);
|
|
int32_t pvCount = ((int32_t*)outData)[2];
|
|
|
|
if(((int32_t*)outData)[0] != 0x66778899)
|
|
return 0;
|
|
|
|
length -= 3 * sizeof(uint32_t);
|
|
inDataC += 3 * sizeof(uint32_t);
|
|
outDataC += 3 * sizeof(uint32_t);
|
|
|
|
|
|
if(length < pvCount * sizeof(uint32_t)) {
|
|
* status = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
return 0;
|
|
}
|
|
ds->swapArray32(ds, inDataC, pvCount, outDataC, status);
|
|
length -= pvCount * sizeof(uint32_t);
|
|
inDataC += pvCount * sizeof(uint32_t);
|
|
outDataC += pvCount * sizeof(uint32_t);
|
|
|
|
if(length < 1 * sizeof(uint32_t)) {
|
|
* status = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
return 0;
|
|
}
|
|
ds->swapArray32(ds, inDataC, 1, outDataC, status);
|
|
int32_t encodingStrLength = ((int32_t*)outData)[0];
|
|
length -= sizeof(uint32_t);
|
|
inDataC += sizeof(uint32_t);
|
|
outDataC += sizeof(uint32_t);
|
|
|
|
if(length < encodingStrLength) {
|
|
* status = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
return 0;
|
|
}
|
|
ds->swapInvChars(ds, inDataC, encodingStrLength, outDataC, status);
|
|
length -= encodingStrLength;
|
|
inDataC += encodingStrLength;
|
|
outDataC += encodingStrLength;
|
|
|
|
if(length < 1 * sizeof(uint32_t)) {
|
|
* status = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
return 0;
|
|
}
|
|
ds->swapArray32(ds, inDataC, 1, outDataC, status);
|
|
int32_t trieSize = ((int32_t*)outData)[0];
|
|
length -= sizeof(uint32_t);
|
|
inDataC += sizeof(uint32_t);
|
|
outDataC += sizeof(uint32_t);
|
|
|
|
if(length < trieSize) {
|
|
* status = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
return 0;
|
|
}
|
|
utrie_swap(ds, inDataC, trieSize, outDataC, status);
|
|
length -= trieSize;
|
|
return passedLength - length;
|
|
}
|