ICU-6453 Encoding Selection moved from development branch.

X-SVN-Rev: 24438
This commit is contained in:
Andy Heninger 2008-08-04 21:56:02 +00:00
parent 4cad45383c
commit ecdc0463b8
13 changed files with 2088 additions and 8 deletions

1
.gitattributes vendored
View File

@ -108,6 +108,7 @@ icu4c/source/test/perf/strsrchperf/StrSrchPerf_r.pl -text
icu4c/source/test/perf/strsrchperf/strsrchperf.cpp -text icu4c/source/test/perf/strsrchperf/strsrchperf.cpp -text
icu4c/source/test/perf/strsrchperf/strsrchperf.h -text icu4c/source/test/perf/strsrchperf/strsrchperf.h -text
icu4c/source/test/perf/strsrchperf/strsrchperf.vcproj -text icu4c/source/test/perf/strsrchperf/strsrchperf.vcproj -text
icu4c/source/test/testdata/ConverterSelectorTestUTF16.txt -text
icu4c/source/test/testdata/TestFont1.otf -text icu4c/source/test/testdata/TestFont1.otf -text
icu4c/source/test/testdata/icu26_testtypes.res -text icu4c/source/test/testdata/icu26_testtypes.res -text
icu4c/source/test/testdata/icu26e_testtypes.res -text icu4c/source/test/testdata/icu26e_testtypes.res -text

View File

@ -85,7 +85,7 @@ uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o triedict.o \
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \ rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o \ serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o \
uidna.o usprep.o punycode.o \ uidna.o usprep.o punycode.o \
util.o util_props.o parsepos.o locbased.o cwchar.o wintz.o mutex.o dtintrv.o util.o util_props.o parsepos.o locbased.o cwchar.o wintz.o mutex.o dtintrv.o ucnvsel.o propsvec.o
## Header files to install ## Header files to install
HEADERS = $(srcdir)/unicode/*.h unicode/*.h HEADERS = $(srcdir)/unicode/*.h unicode/*.h

View File

@ -1,7 +1,7 @@
/* /*
******************************************************************************* *******************************************************************************
* *
* Copyright (C) 2002-2005, International Business Machines * Copyright (C) 2002-2008, International Business Machines
* Corporation and others. All Rights Reserved. * Corporation and others. All Rights Reserved.
* *
******************************************************************************* *******************************************************************************

View File

@ -1,7 +1,7 @@
/* /*
******************************************************************************* *******************************************************************************
* *
* Copyright (C) 2002-2005, International Business Machines * Copyright (C) 2002-2008, International Business Machines
* Corporation and others. All Rights Reserved. * Corporation and others. All Rights Reserved.
* *
******************************************************************************* *******************************************************************************

View File

@ -0,0 +1,859 @@
/*
*******************************************************************************
*
* Copyright (C) 2008, International Business Machines
* Corporation, Google and others. All Rights Reserved.
*
*******************************************************************************
*/
// Author : eldawy@google.com (Mohamed Eldawy)
// ucnvsel.cpp
//
// Purpose: To generate a list of encodings capable of handling
// a given Unicode text
//
// Started 09-April-2008
/**
* \file
*
* This is an implementation of an encoding selector.
* The goal is, given a unicode string, find the encodings
* this string can be mapped to. To make processing faster
* a trie is built when you call ucnvsel_open() that
* stores all encodings a codepoint can map to
*/
#include "unicode/ucnvsel.h"
#include <string.h>
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/ucnv.h"
#include "unicode/ustring.h"
#include "unicode/uchriter.h"
#include "utrie.h"
#include "propsvec.h"
#include "uenumimp.h"
#include "cmemory.h"
#include "cstring.h"
U_NAMESPACE_USE
// maximum possible serialized trie that can ever be reached
// this was obtained by attempting to serialize a trie for all fallback mapping
// and for all roundtrip mappings and then selecting the maximum
// this value actually adds around 30KB of unneeded extra space (the actual
// maximum space is around 220000).
// the reasoning is to make it still work if lots of other converters were
// added to ICU
#define CAPACITY 250000
// State of one encoding selector.
// The trie maps a code point to an offset into pv (see ucnvsel_select());
// the words starting at that offset hold one bit per encoding, with encoding
// number i stored in word i/32, bit i%32 (see generateSelectorData()).
struct UConverterSelector {
uint8_t* serializedTrie; // heap buffer holding the serialized form of the trie
uint32_t serializedTrieSize; // size of serializedTrie in bytes
UTrie constructedTrie; // 16 bit trie containing offsets into pv; unserialized from serializedTrie
uint32_t* pv; // table of bits! (encodingsCount+31)/32 words per entry
int32_t pvCount; // number of uint32_t words stored in pv
char** encodings; // which encodings did user ask to use? All names live in one
// allocation whose start is encodings[0]
int32_t encodingsCount; // number of entries in encodings
};
/* internal function */
// Forward declaration: fills in result->pv, result->pvCount, the serialized
// trie and the constructed trie for the encodings already stored in result.
// Despite its name, excludedEncodings is the set of excluded code points that
// ucnvsel_open() passes through. Defined further down in this file.
void generateSelectorData(UConverterSelector* result,
const USet* excludedEncodings,
const UConverterUnicodeSet whichSet,
UErrorCode* status);
// Forward declaration: ucnvsel_unserialize() needs the swapper before its
// definition at the end of this file. Converts a serialized selector from
// inData into outData using the given data swapper.
U_CAPI int32_t ucnvsel_swap(const UDataSwapper *ds,
const void *inData,
int32_t length,
void *outData,
UErrorCode *status);
/**
 * Open a selector.
 *
 * If converterList is NULL or converterListSize is 0, the selector is built
 * for every converter reported by ucnv_countAvailable()/ucnv_getAvailableName().
 * If excludedCodePoints is NULL, no code points are excluded.
 *
 * The names are copied: the pointer array lives in one allocation and all
 * name strings live in a second allocation whose start is encodings[0]
 * (this is what ucnvsel_close() and ucnvsel_serialize() rely on).
 *
 * @param converterList      names of the encodings to consider, or NULL
 * @param converterListSize  number of entries in converterList
 * @param excludedCodePoints code points to ignore, or NULL
 * @param whichSet           which converter set to query (passed to
 *                           ucnv_getUnicodeSet())
 * @param status             in/out ICU error code. Set to
 *                           U_ILLEGAL_ARGUMENT_ERROR for a negative size, a
 *                           NULL list with a non-zero size, or a NULL name in
 *                           the list; U_MEMORY_ALLOCATION_ERROR when out of
 *                           memory.
 * @return the new selector, or NULL on failure
 */
U_CAPI UConverterSelector* ucnvsel_open(const char* const* converterList,
                                        int32_t converterListSize,
                                        const USet* excludedCodePoints,
                                        const UConverterUnicodeSet whichSet,
                                        UErrorCode* status ) {
    UConverterSelector* newSelector;
    char* allStrings;
    int32_t totalSize = 0;
    int32_t i;

    // check if already failed
    if (U_FAILURE(*status)) {
        return NULL;
    }
    // ensure args make sense!
    if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    // An empty list means "all available converters", exactly like a NULL
    // list. Only the local copy of the pointer is reset, so the constness of
    // the caller's array is not an obstacle. The count is queried once so
    // that every loop below agrees on the number of names.
    if (converterListSize == 0) {
        converterList = NULL;
        converterListSize = ucnv_countAvailable();
    }

    newSelector = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector));
    if (!newSelector) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    uprv_memset(newSelector, 0, sizeof(UConverterSelector));

    // make a backup copy of the list of converters
    newSelector->encodings =
        (char**)uprv_malloc(converterListSize * sizeof(char*));
    if (!newSelector->encodings) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        uprv_free(newSelector);
        return NULL;
    }

    // first pass: validate the names and measure the string storage
    for (i = 0; i < converterListSize; i++) {
        const char* name = (converterList != NULL) ? converterList[i]
                                                   : ucnv_getAvailableName(i);
        if (name == NULL) {
            // strlen(NULL) would be undefined behavior
            *status = U_ILLEGAL_ARGUMENT_ERROR;
            uprv_free(newSelector->encodings);
            uprv_free(newSelector);
            return NULL;
        }
        totalSize += (int32_t)uprv_strlen(name) + 1;
    }

    allStrings = (char*)uprv_malloc(totalSize);
    if (!allStrings) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        uprv_free(newSelector->encodings);
        uprv_free(newSelector);
        return NULL;
    }

    // second pass: pack the names back to back, each with its terminator
    for (i = 0; i < converterListSize; i++) {
        const char* name = (converterList != NULL) ? converterList[i]
                                                   : ucnv_getAvailableName(i);
        newSelector->encodings[i] = allStrings;
        uprv_strcpy(allStrings, name);
        allStrings += uprv_strlen(name) + 1;
    }
    newSelector->encodingsCount = converterListSize;

    generateSelectorData(newSelector, excludedCodePoints, whichSet, status);
    if (U_FAILURE(*status)) {
        // encodings has been allocated at this point, so ucnvsel_close()
        // can release everything that was built so far
        ucnvsel_close(newSelector);
        return NULL;
    }
    return newSelector;
}
/**
 * Close a selector and release the memory it owns.
 *
 * Safe to call with NULL, and with a selector whose encodings array was
 * never allocated (the array is no longer dereferenced in that case).
 *
 * @param sel selector to close, may be NULL
 */
U_CAPI void ucnvsel_close(UConverterSelector *sel) {
    if (!sel) {
        return;
    }
    if (sel->encodings) {
        // all names share one allocation that starts at encodings[0]
        uprv_free(sel->encodings[0]);
        uprv_free(sel->encodings);
    }
    upvec_close(sel->pv);
    if (sel->serializedTrie) {  // still NULL when generateSelectorData()
                                // failed before the trie was serialized
        uprv_free(sel->serializedTrie);
    }
    uprv_free(sel);
}
/**
 * Unserialize a selector from a buffer written by ucnvsel_serialize().
 *
 * Layout read here (every integer is 32 bits wide):
 *   signature 0x66778899 | charset family | pvCount | pv[pvCount] |
 *   encodingsLength | encoding names | serializedTrieSize | serialized trie
 * The names are separated by NUL bytes; the last one carries no terminator.
 *
 * If the signature reads byte-reversed, or the stored charset family differs
 * from U_CHARSET_FAMILY, the data is first converted with ucnvsel_swap() into
 * a temporary copy. Every field is bounds-checked against length, and all
 * data is copied, so the caller keeps ownership of buffer.
 *
 * @param buffer serialized data; no alignment required
 * @param length number of readable bytes at buffer
 * @param status in/out ICU error code: U_ILLEGAL_ARGUMENT_ERROR for a NULL
 *               buffer or non-positive length, U_INVALID_FORMAT_ERROR for
 *               malformed or truncated data, U_MEMORY_ALLOCATION_ERROR when
 *               out of memory
 * @return the new selector, or NULL on any failure
 */
U_CAPI UConverterSelector* ucnvsel_unserialize(const char* buffer,
                                               int32_t length,
                                               UErrorCode* status) {
    // Every local is declared before the first jump to the cleanup label so
    // that no jump crosses an initialization.
    UConverterSelector* sel = NULL;
    char* swappedBuffer = NULL;   // temporary converted copy, if one was needed
    char* tempEncodings = NULL;   // owns the name block (also sel->encodings[0])
    const char* cursor;
    const int32_t wordSize = (int32_t)sizeof(uint32_t);
    int32_t remaining;
    int32_t pvBytes;
    int32_t encodingsLength;
    int32_t numStrings;
    int32_t curString;
    int32_t i;
    uint32_t sig;
    uint32_t ASCIIness;
    uint32_t dataEndianness = 0;
    int32_t dataASCIIness;
    int32_t machineASCIIness = U_CHARSET_FAMILY;

    // check if already failed
    if (U_FAILURE(*status)) {
        return NULL;
    }
    // ensure args make sense!
    if (buffer == NULL || length <= 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    // signature, ASCIIness and pvCount must be present
    if (length < 3 * wordSize) {
        *status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }
    memcpy(&sig, buffer, sizeof(uint32_t));
    memcpy(&ASCIIness, buffer + sizeof(uint32_t), sizeof(uint32_t));

    // We do not detect the endianness of the machine or of the data, only
    // whether the two are reversed relative to each other.
    if (sig == 0x99887766) {
        dataEndianness = 1;
    } else if (sig != 0x66778899) {
        *status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }
    dataASCIIness = (int32_t)ASCIIness;
    if (dataEndianness) {
        // A small value stored byte-reversed shows up in the most significant
        // byte; the shift works whatever the byte order of this machine is.
        dataASCIIness = (int32_t)(ASCIIness >> 24);
    }
    if (dataASCIIness < 0 || dataASCIIness > 0xff) {
        *status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }

    if (dataEndianness || dataASCIIness != machineASCIIness) {
        // NOTE(review): the first and third arguments are "reversed?" and 0
        // rather than the true endianness of data and machine -- confirm
        // that this is adequate for utrie_swap() on big-endian hosts.
        UDataSwapper* ds = udata_openSwapper(dataEndianness, dataASCIIness,
                                             0, machineASCIIness, status);
        int32_t swappedLength;
        if (U_FAILURE(*status)) {
            if (ds != NULL) {
                udata_closeSwapper(ds);
            }
            return NULL;
        }
        swappedBuffer = (char*)uprv_malloc(length);
        if (!swappedBuffer) {
            udata_closeSwapper(ds);
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        // convert starting at the signature: buffer has not been advanced
        swappedLength = ucnvsel_swap(ds, buffer, length, swappedBuffer, status);
        udata_closeSwapper(ds);
        if (U_SUCCESS(*status) && swappedLength <= 0) {
            *status = U_INVALID_FORMAT_ERROR;
        }
        if (U_FAILURE(*status)) {
            goto fail;
        }
        buffer = swappedBuffer;
    }

    // skip signature and ASCIIness
    cursor = buffer + 2 * wordSize;
    remaining = length - 2 * wordSize;

    sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector));
    if (!sel) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        goto fail;
    }
    uprv_memset(sel, 0, sizeof(UConverterSelector));

    // the bit table
    memcpy(&sel->pvCount, cursor, sizeof(int32_t));
    cursor += wordSize;
    remaining -= wordSize;
    if (sel->pvCount < 0 || sel->pvCount > remaining / wordSize) {
        *status = U_INVALID_FORMAT_ERROR;
        goto fail;
    }
    pvBytes = sel->pvCount * wordSize;
    sel->pv = (uint32_t*)uprv_malloc(pvBytes);
    if (!sel->pv) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        goto fail;
    }
    memcpy(sel->pv, cursor, pvBytes);
    cursor += pvBytes;
    remaining -= pvBytes;

    // the encoding names
    if (remaining < wordSize) {
        *status = U_INVALID_FORMAT_ERROR;
        goto fail;
    }
    memcpy(&encodingsLength, cursor, sizeof(int32_t));
    cursor += wordSize;
    remaining -= wordSize;
    if (encodingsLength < 0 || encodingsLength > remaining) {
        *status = U_INVALID_FORMAT_ERROR;
        goto fail;
    }
    tempEncodings = (char*)uprv_malloc(encodingsLength + 1);
    if (!tempEncodings) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        goto fail;
    }
    memcpy(tempEncodings, cursor, encodingsLength);
    tempEncodings[encodingsLength] = 0;   // terminate the last name
    cursor += encodingsLength;
    remaining -= encodingsLength;

    // one name per embedded NUL, plus the last one terminated just above
    numStrings = 1;
    for (i = 0; i < encodingsLength; i++) {
        if (tempEncodings[i] == 0) {
            numStrings++;
        }
    }
    sel->encodings = (char**)uprv_malloc(numStrings * sizeof(char*));
    if (!sel->encodings) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        goto fail;
    }
    sel->encodingsCount = numStrings;
    curString = 0;
    sel->encodings[0] = tempEncodings;
    for (i = 0; i < encodingsLength; i++) {
        if (tempEncodings[i] == 0) {
            sel->encodings[++curString] = tempEncodings + i + 1;
        }
    }

    // the trie
    if (remaining < wordSize) {
        *status = U_INVALID_FORMAT_ERROR;
        goto fail;
    }
    memcpy(&sel->serializedTrieSize, cursor, sizeof(uint32_t));
    cursor += wordSize;
    remaining -= wordSize;
    if (sel->serializedTrieSize > (uint32_t)remaining) {
        *status = U_INVALID_FORMAT_ERROR;
        goto fail;
    }
    sel->serializedTrie = (uint8_t*)uprv_malloc(sel->serializedTrieSize);
    if (!sel->serializedTrie) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        goto fail;
    }
    memcpy(sel->serializedTrie, cursor, sel->serializedTrieSize);

    // unserialize!
    utrie_unserialize(&sel->constructedTrie, sel->serializedTrie,
                      sel->serializedTrieSize, status);
    if (U_FAILURE(*status)) {
        goto fail;
    }

    // everything has been copied out of the converted buffer
    if (swappedBuffer) {
        uprv_free(swappedBuffer);
    }
    return sel;

fail:
    if (sel) {
        if (sel->serializedTrie) {
            uprv_free(sel->serializedTrie);
        }
        if (sel->encodings) {
            uprv_free(sel->encodings);
        }
        if (sel->pv) {
            uprv_free(sel->pv);
        }
        uprv_free(sel);
    }
    if (tempEncodings) {
        uprv_free(tempEncodings);
    }
    if (swappedBuffer) {
        uprv_free(swappedBuffer);
    }
    return NULL;
}
/**
 * Serialize a selector into a linear buffer.
 *
 * Layout written (every integer is 32 bits wide, native byte order):
 *   signature 0x66778899 | U_CHARSET_FAMILY | pvCount | pv[pvCount] |
 *   encodingStrLength | encoding names | serializedTrieSize | serialized trie
 * The names are written as one block: NUL-separated, without the terminator
 * of the last name.
 *
 * @param sel            selector to serialize
 * @param buffer         destination; may be NULL only when bufferCapacity is
 *                       too small (preflighting)
 * @param bufferCapacity capacity of buffer in bytes
 * @param status         in/out ICU error code: U_BUFFER_OVERFLOW_ERROR when
 *                       the data does not fit (as documented in ucnvsel.h),
 *                       U_ILLEGAL_ARGUMENT_ERROR for a NULL selector, a
 *                       selector without encodings, a negative capacity, or a
 *                       NULL buffer that would have to be written to
 * @return the number of bytes required, also when the buffer is too small;
 *         0 for the other failures
 */
U_CAPI int32_t ucnvsel_serialize(const UConverterSelector* sel,
                                 char* buffer,
                                 int32_t bufferCapacity,
                                 UErrorCode* status) {
    int32_t totalSize;
    int32_t encodingStrLength;
    const char* lastName;
    uint32_t sig = 0x66778899;
    uint32_t ASCIIness = U_CHARSET_FAMILY;

    // check if already failed
    if (U_FAILURE(*status)) {
        return 0;
    }
    // ensure args make sense!
    if (sel == NULL || bufferCapacity < 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    // the length computation below reads encodings[encodingsCount-1]
    if (sel->encodings == NULL || sel->encodingsCount <= 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // this is a multi-string! strlen() would stop at the first name, so the
    // length is the offset of the last name plus the length of the last name
    lastName = sel->encodings[sel->encodingsCount - 1];
    encodingStrLength =
        (int32_t)(uprv_strlen(lastName) + (lastName - sel->encodings[0]));

    // compute size and make sure it fits
    totalSize = (int32_t)(sizeof(uint32_t) /*signature*/ +
                          sizeof(uint32_t) /*ASCIIness*/ +
                          sizeof(uint32_t) /*pvCount*/ +
                          sizeof(uint32_t) * sel->pvCount /*pv*/ +
                          sizeof(uint32_t) /*encodingStrLength*/ +
                          encodingStrLength /*names*/ +
                          sizeof(uint32_t) /*serializedTrieSize*/ +
                          sel->serializedTrieSize /*trie*/);
    if (totalSize > bufferCapacity) {
        *status = U_BUFFER_OVERFLOW_ERROR;
        return totalSize;
    }
    if (buffer == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // ok, save!
    // 0a. the signature
    memcpy(buffer, &sig, sizeof(uint32_t));
    buffer += sizeof(uint32_t);
    // 0b. ASCIIness
    memcpy(buffer, &ASCIIness, sizeof(uint32_t));
    buffer += sizeof(uint32_t);
    // 1. the bit table, preceded by its word count
    memcpy(buffer, &sel->pvCount, sizeof(int32_t));
    buffer += sizeof(int32_t);
    memcpy(buffer, sel->pv, sel->pvCount * sizeof(int32_t));
    buffer += sel->pvCount * sizeof(int32_t);
    // 2. the encoding names, preceded by their total length
    memcpy(buffer, &encodingStrLength, sizeof(int32_t));
    buffer += sizeof(int32_t);
    memcpy(buffer, sel->encodings[0], encodingStrLength);
    buffer += encodingStrLength;
    // 3. the trie, preceded by its size
    memcpy(buffer, &sel->serializedTrieSize, sizeof(uint32_t));
    buffer += sizeof(uint32_t);
    memcpy(buffer, sel->serializedTrie, sel->serializedTrieSize);
    return totalSize;
}
/* internal function! */
/**
 * Build the lookup data of a selector whose encodings/encodingsCount are
 * already filled in: the bit table (pv, pvCount), the serialized trie and
 * the constructed trie.
 *
 * Encoding number i owns bit i%32 of column i/32. For every code point range
 * a converter reports through ucnv_getUnicodeSet(), that bit is set. Code
 * points in excludedEncodings get all bits set in every column, so they never
 * rule out an encoding.
 *
 * On failure, whatever was stored in result so far stays there for the
 * caller to release; the converter, set and trie builder opened here are
 * always closed.
 *
 * @param result            selector to fill in
 * @param excludedEncodings set of excluded code points (despite the name),
 *                          or NULL
 * @param whichSet          which converter set to query
 * @param status            in/out ICU error code
 */
void generateSelectorData(UConverterSelector* result,
                          const USet* excludedEncodings,
                          const UConverterUnicodeSet whichSet,
                          UErrorCode* status) {
    const uint32_t encodingsSize = result->encodingsCount;
    const uint32_t columns = (encodingsSize + 31) / 32;
    uint32_t i;
    UNewTrie* trie;

    // 66000 as suggested by Markus [I suggest something like 66000 which
    // exceeds the number of BMP code points. There will be fewer ranges of
    // combinations of encodings. (I believe there are no encodings that have
    // interesting mappings for supplementary code points. All encodings either
    // support all of them or none of them.)]
    result->pv = upvec_open(columns, 66000);
    if (result->pv == NULL) {
        // presumably upvec_open() reports failure by returning NULL -- it
        // takes no error code here; confirm against propsvec.h
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    for (i = 0; i < encodingsSize; ++i) {
        const uint32_t column = i / 32;
        const uint32_t mask = (uint32_t)1 << (i % 32);  // unsigned: bit 31 is valid
        int32_t item_count;
        int32_t j;
        USet* unicode_point_set;
        UConverter* test_converter = ucnv_open(result->encodings[i], status);
        if (U_FAILURE(*status)) {
            // status will propagate back to user
            return;
        }
        unicode_point_set = uset_open(1, 0);  // empty set
        if (unicode_point_set == NULL) {
            ucnv_close(test_converter);
            *status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        ucnv_getUnicodeSet(test_converter, unicode_point_set, whichSet, status);

        // now iterate over intervals on set i!
        item_count = U_SUCCESS(*status) ? uset_getItemCount(unicode_point_set) : 0;
        for (j = 0; j < item_count && U_SUCCESS(*status); ++j) {
            UChar32 start_char;
            UChar32 end_char;
            UErrorCode smallStatus = U_ZERO_ERROR;
            uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0,
                         &smallStatus);
            if (U_SUCCESS(smallStatus)) {
                // IMPORTANT: the intervals for usets are INCLUSIVE. However,
                // the intervals for upvec are NOT INCLUSIVE. This is why we
                // need end_char+1 here!
                upvec_setValue(result->pv, start_char, end_char + 1, column,
                               ~(uint32_t)0, mask, status);
            }
            // otherwise the item is a string, which some converters put into
            // the set; those are ignored by our system
        }

        // release both objects on every path before looking at the status
        ucnv_close(test_converter);
        uset_close(unicode_point_set);
        if (U_FAILURE(*status)) {
            return;
        }
    }

    // handle excluded code points! Simply set their values to all 1's in the
    // upvec
    if (excludedEncodings) {
        const int32_t item_count = uset_getItemCount(excludedEncodings);
        int32_t j;
        for (j = 0; j < item_count; ++j) {
            UChar32 start_char;
            UChar32 end_char;
            uint32_t col;
            uset_getItem(excludedEncodings, j, &start_char, &end_char, NULL, 0,
                         status);
            if (U_FAILURE(*status)) {
                return;
            }
            for (col = 0; col < columns; col++) {
                upvec_setValue(result->pv, start_char, end_char + 1, col,
                               ~(uint32_t)0, ~(uint32_t)0, status);
                if (U_FAILURE(*status)) {
                    return;
                }
            }
        }
    }

    // alright. Now, let's put things in the same exact form you'd get when
    // you unserialize things.
    trie = utrie_open(NULL, NULL, CAPACITY, 0, 0, TRUE);
    if (trie == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    result->pvCount = upvec_compact(result->pv, upvec_compactToTrieHandler,
                                    trie, status);
    if (U_SUCCESS(*status)) {
        // first call only measures, second call writes
        int32_t length = utrie_serialize(trie, NULL, 0, NULL, TRUE, status);
        if (U_SUCCESS(*status)) {
            result->serializedTrie = (uint8_t*)uprv_malloc(length);
            if (result->serializedTrie == NULL) {
                *status = U_MEMORY_ALLOCATION_ERROR;
            } else {
                length = utrie_serialize(trie, result->serializedTrie, length,
                                         NULL, TRUE, status);
                result->serializedTrieSize = length;
                utrie_unserialize(&result->constructedTrie,
                                  result->serializedTrie, length, status);
            }
        }
    }
    utrie_close(trie);
}
// The functions below implement the UEnumeration that ucnvsel_select()
// returns. Nothing fancy here: they just iterate over the selected encodings.
// Enumerator is the context object stored in UEnumeration.context.
struct Enumerator {
int16_t* index; // positions into sel->encodings of the selected encodings;
// stays NULL when nothing was selected
int16_t length; // number of valid entries in index
int16_t cur; // position of the next entry to hand out
const UConverterSelector* sel; // selector whose encodings array is indexed; not owned
};
// Close callback of the enumeration created by ucnvsel_select().
// Releases the index table, the Enumerator context and the UEnumeration
// itself: ucnvsel_select() allocates the UEnumeration with uprv_malloc()
// and nothing else in this file frees it.
static void U_CALLCONV
ucnvsel_close_selector_iterator(UEnumeration *enumerator) {
    uprv_free(((Enumerator*)(enumerator->context))->index);
    uprv_free(enumerator->context);
    uprv_free(enumerator);
}
// Count callback: reports how many encodings were selected.
// Yields 0 without touching the enumeration when status already holds a
// failure.
static int32_t U_CALLCONV
ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return 0;
    }
    const Enumerator* state = (const Enumerator*)(enumerator->context);
    return state->length;
}
// Next callback: hands out the name of the next selected encoding and
// advances the cursor. Returns NULL once all names have been handed out, or
// when status already holds a failure. *resultLength is only written when a
// name is returned and resultLength is not NULL.
static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator,
                                                    int32_t* resultLength,
                                                    UErrorCode* status) {
    if (U_FAILURE(*status)) {
        return NULL;
    }
    Enumerator* state = (Enumerator*)(enumerator->context);
    const int16_t position = state->cur;
    if (position >= state->length) {
        return NULL;
    }
    // index[] maps the cursor to a slot of the selector's encodings array
    const char* name = state->sel->encodings[state->index[position]];
    state->cur++;
    if (resultLength) {
        *resultLength = uprv_strlen(name);
    }
    return name;
}
// Reset callback: rewinds the cursor so the next call to the next-callback
// starts again at the first selected encoding. Does nothing when status
// already holds a failure.
static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator,
                                              UErrorCode* status) {
    if (!U_FAILURE(*status)) {
        Enumerator* state = (Enumerator*)(enumerator->context);
        state->cur = 0;
    }
}
// Template for the enumerations returned by ucnvsel_select(): it is copied
// with memcpy() into each new UEnumeration, whose context member is then
// pointed at a fresh Enumerator.
// NOTE(review): the initializer is positional; the member names are declared
// in uenumimp.h -- confirm the order there before reordering anything.
static const UEnumeration defaultEncodings = {
NULL, // not used by this file
NULL, // presumably the context member, replaced in every copy -- TODO confirm
ucnvsel_close_selector_iterator,
ucnvsel_count_encodings,
uenum_unextDefault,
ucnvsel_next_encoding,
ucnvsel_reset_iterator
};
// Internal helper: ANDs source1 into dest, word by word, over len words.
// Returns TRUE when no bit survives in dest (the mask has become all zeros),
// FALSE otherwise. With len <= 0 nothing is touched and TRUE is returned.
UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) {
    uint32_t survivors = 0;
    int word = 0;
    while (word < len) {
        dest[word] &= source1[word];
        survivors |= dest[word];
        ++word;
    }
    return survivors == 0;
}
// Internal helper: returns the total number of set bits in the first len
// words of mask.
// Each pass of the inner loop clears the lowest set bit, so it runs once per
// set bit (technique from http://graphics.stanford.edu/~seander/bithacks.html).
int16_t countOnes(uint32_t* mask, int32_t len) {
    int ones = 0;
    int word = 0;
    while (word < len) {
        uint32_t bits = mask[word++];
        while (bits != 0) {
            bits &= bits - 1;  // drop the least significant set bit
            ++ones;
        }
    }
    return ones;
}
/* internal function! */
/**
 * Shared implementation of ucnvsel_selectForString() and
 * ucnvsel_selectForUTF8().
 *
 * Starts with a mask that has every encoding bit set and, for each code
 * point decoded from the text, ANDs in the bit row the trie points to.
 * Ill-formed sequences (decoded as -1) are skipped. Decoding stops early
 * once no encoding is left. The surviving bits are turned into a table of
 * positions into sel->encodings, which the returned enumeration walks.
 *
 * @param sel     selector to query
 * @param s       UTF-16 text (UChar) if isUTF16, otherwise UTF-8 text (char)
 * @param length  number of code units, or -1 if s is NUL-terminated
 * @param status  in/out ICU error code: U_ILLEGAL_ARGUMENT_ERROR for a NULL
 *                selector or NULL text with non-zero length,
 *                U_MEMORY_ALLOCATION_ERROR when out of memory
 * @param isUTF16 TRUE to read s as UTF-16, FALSE to read it as UTF-8
 * @return a new enumeration of encoding names, or NULL on failure
 */
UEnumeration *ucnvsel_select(const UConverterSelector* sel, const void *s,
                             int32_t length, UErrorCode *status, UBool isUTF16) {
    const UChar* utf16buffer = (const UChar*) s;
    const char* utf8buffer = (const char*) s;
    UEnumeration* en;
    Enumerator* context;
    uint32_t* mask;
    UChar32 next = 0;
    int offset = 0;
    int i, j;
    int32_t columns;
    int16_t numOnes;

    // check if already failed
    if (U_FAILURE(*status)) {
        return NULL;
    }
    // ensure args make sense!
    if (sel == NULL || (s == NULL && length != 0)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    columns = (sel->encodingsCount + 31) / 32;

    // The context stores a table of indices of the encodings that are legit.
    context = (Enumerator*)uprv_malloc(sizeof(Enumerator));
    en = (UEnumeration*)uprv_malloc(sizeof(UEnumeration));
    mask = (uint32_t*)uprv_malloc(columns * sizeof(uint32_t));
    if (context == NULL || en == NULL || mask == NULL) {
        if (context) {
            uprv_free(context);
        }
        if (en) {
            uprv_free(en);
        }
        if (mask) {
            uprv_free(mask);
        }
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    context->index = NULL;  // allocated below, once the size is known
    context->length = context->cur = 0;
    context->sel = sel;
    memcpy(en, &defaultEncodings, sizeof(UEnumeration));
    en->context = context;
    uprv_memset(mask, ~0, columns * sizeof(uint32_t));

    if (length == -1) {
        if (isUTF16) {
            length = u_strlen(utf16buffer);
        } else {
            length = uprv_strlen(utf8buffer);
        }
    }
    if (s) {
        while (offset < length) {
            uint16_t pvOffset = 0;
            // the decoding macros are wrapped in braces so they stay valid
            // statements inside the if/else
            if (isUTF16) {
                U16_NEXT(utf16buffer, offset, length, next);
            } else {
                U8_NEXT(utf8buffer, offset, length, next);
            }
            if (next != -1) {
                UTRIE_GET16((&sel->constructedTrie), next, pvOffset);
                if (intersectMasks(mask, sel->pv + pvOffset, columns)) {
                    break;  // no encoding can represent the text any more
                }
            }
        }
    }

    numOnes = countOnes(mask, columns);
    // now, we know the exact space we need for index
    if (numOnes > 0) {
        context->index = (int16_t*)uprv_malloc(numOnes * sizeof(int16_t));
        if (context->index == NULL) {
            uprv_free(mask);
            uprv_free(en);
            uprv_free(context);
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
    }  // otherwise index stays NULL and no bit is set, so it is never touched

    for (j = 0; j < columns; j++) {
        for (i = 0; i < 32; i++) {
            uint32_t v = mask[j] & 1;
            // padding bits past encodingsCount are still set; skip them
            if (v && j * 32 + i < sel->encodingsCount) {
                context->index[context->length++] = j * 32 + i;
            }
            mask[j] >>= 1;
        }
    }
    uprv_free(mask);
    return en;
}
/* UTF-16 entry point: forwards to ucnvsel_select() with the text flagged
   as UTF-16 */
U_CAPI UEnumeration *ucnvsel_selectForString(const UConverterSelector* sel,
                                             const UChar *s,
                                             int32_t length,
                                             UErrorCode *status) {
    const UBool textIsUTF16 = TRUE;
    UEnumeration* selected = ucnvsel_select(sel, s, length, status, textIsUTF16);
    return selected;
}
/* UTF-8 entry point: forwards to ucnvsel_select() with the text flagged
   as not UTF-16 */
U_CAPI UEnumeration *ucnvsel_selectForUTF8(const UConverterSelector* sel,
                                           const char *utf8str,
                                           int32_t length,
                                           UErrorCode *status) {
    const UBool textIsUTF16 = FALSE;
    UEnumeration* selected = ucnvsel_select(sel, utf8str, length, status, textIsUTF16);
    return selected;
}
/**
 * Swap a serialized selector into the endianness and charset family
 * described by the data swapper. Just as FYI, selectors are always saved in
 * the format of the system that created them. They are only converted if
 * used on another system. In other words, selectors created on different
 * systems can differ even if the params are identical (endianness and
 * charset family differences only).
 *
 * The sections are converted in the order they were written: the three
 * header words (signature, charset family, pvCount), the pv words, the
 * length and bytes of the encoding names, then the size and bytes of the
 * trie. Every length handed to the swapper functions is a byte count, and
 * each section length is read back from the position just converted.
 *
 * @param ds pointer to data swapper containing swapping info
 * @param inData pointer to incoming data
 * @param length length of inData in bytes
 * @param outData pointer to output data. Capacity should
 *                be at least equal to capacity of inData
 * @param status an in/out ICU UErrorCode: U_ILLEGAL_ARGUMENT_ERROR for bad
 *               arguments, U_INDEX_OUTOFBOUNDS_ERROR when the data is
 *               shorter than its own length fields claim,
 *               U_INVALID_FORMAT_ERROR when the converted signature is wrong
 * @return 0 on failure, number of bytes swapped on success
 *         number of bytes swapped can be smaller than length
 *
 */
U_CAPI int32_t ucnvsel_swap(const UDataSwapper *ds,
                            const void *inData,
                            int32_t length,
                            void *outData,
                            UErrorCode *status) {
    const char* inDataC = (const char*) inData;
    char* outDataC = (char*) outData;
    const int32_t passedLength = length;
    const int32_t wordSize = (int32_t)sizeof(uint32_t);
    int32_t header[3];
    int32_t pvCount;
    int32_t encodingStrLength;
    int32_t trieSize;

    //args check
    if(U_FAILURE(*status)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
        *status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    // signed comparison: a length of -1 or 0 is too short as well
    if(length < 3 * wordSize) {
        *status = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    // header: signature, charset family, pvCount
    ds->swapArray32(ds, inDataC, 3 * wordSize, outDataC, status);
    if(U_FAILURE(*status)) {
        return 0;
    }
    memcpy(header, outDataC, sizeof(header));  // outData may be unaligned
    if(header[0] != 0x66778899) {
        *status = U_INVALID_FORMAT_ERROR;
        return 0;
    }
    pvCount = header[2];
    length -= 3 * wordSize;
    inDataC += 3 * wordSize;
    outDataC += 3 * wordSize;

    // the bit table
    if(pvCount < 0 || pvCount > length / wordSize) {
        *status = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }
    ds->swapArray32(ds, inDataC, pvCount * wordSize, outDataC, status);
    if(U_FAILURE(*status)) {
        return 0;
    }
    length -= pvCount * wordSize;
    inDataC += pvCount * wordSize;
    outDataC += pvCount * wordSize;

    // length of the encoding names
    if(length < wordSize) {
        *status = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }
    ds->swapArray32(ds, inDataC, wordSize, outDataC, status);
    if(U_FAILURE(*status)) {
        return 0;
    }
    memcpy(&encodingStrLength, outDataC, sizeof(int32_t));
    length -= wordSize;
    inDataC += wordSize;
    outDataC += wordSize;

    // the encoding names
    if(encodingStrLength < 0 || length < encodingStrLength) {
        *status = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }
    ds->swapInvChars(ds, inDataC, encodingStrLength, outDataC, status);
    if(U_FAILURE(*status)) {
        return 0;
    }
    length -= encodingStrLength;
    inDataC += encodingStrLength;
    outDataC += encodingStrLength;

    // size of the trie
    if(length < wordSize) {
        *status = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }
    ds->swapArray32(ds, inDataC, wordSize, outDataC, status);
    if(U_FAILURE(*status)) {
        return 0;
    }
    memcpy(&trieSize, outDataC, sizeof(int32_t));
    length -= wordSize;
    inDataC += wordSize;
    outDataC += wordSize;

    // the trie
    if(trieSize < 0 || length < trieSize) {
        *status = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }
    utrie_swap(ds, inDataC, trieSize, outDataC, status);
    if(U_FAILURE(*status)) {
        return 0;
    }
    length -= trieSize;
    return passedLength - length;
}

View File

@ -0,0 +1,169 @@
/*
*******************************************************************************
*
* Copyright (C) 2008, International Business Machines
* Corporation, Google and others. All Rights Reserved.
*
*******************************************************************************
*/
// Author : eldawy@google.com (Mohamed Eldawy)
// ucnvsel.h
//
// Purpose: To generate a list of encodings capable of handling
// a given Unicode text
//
// Started 09-April-2008
#ifndef __ICU_UCNV_SEL_H__
#define __ICU_UCNV_SEL_H__
#include "unicode/uset.h"
#include "unicode/utypes.h"
#include "unicode/utf16.h"
#include "unicode/uenum.h"
#include "unicode/ucnv.h"
/**
* \file
*
* This is the declarations for the encoding selector.
* The goal is, given a unicode string, find the encodings
* this string can be mapped to.
*
*/
/**
 * The selector data structure.
 * Only declared here; the type is opaque to users of this header, who work
 * with pointers obtained from ucnvsel_open() or ucnvsel_unserialize().
 */
struct UConverterSelector;
typedef struct UConverterSelector UConverterSelector;
/**
 * Open a selector. If converterList is NULL, build for all converters. If
 * excludedCodePoints is NULL, don't exclude any code points.
 *
 * @param converterList a pointer to the names of the encodings to involve.
 *        NULL means build a selector for all possible converters.
 *        ucnvsel_open() does not take ownership of this array. Once
 *        ucnvsel_open() returns, the caller is free to reuse/destroy
 *        the array.
 * @param converterListSize number of encodings in the above list.
 *        Setting converterListSize to 0 builds a selector for all
 *        converters.
 * @param excludedCodePoints a set of code points to be excluded from
 *        consideration. Set to NULL to exclude nothing.
 * @param whichSet what converter set to use? Use this to determine whether
 *        to construct the selector for fallback or for roundtrip-only mappings
 * @param status an in/out ICU UErrorCode
 * @return a pointer to the created selector
 *
 * @draft ICU 4.2
 */
U_CAPI UConverterSelector* ucnvsel_open(const char* const* converterList,
int32_t converterListSize,
const USet* excludedCodePoints,
const UConverterUnicodeSet whichSet,
UErrorCode* status);
/* close opened selector */
/**
 * Closes a selector and releases the memory allocated for it.
 * If any enumerations were returned by ucnvsel_selectForString() or
 * ucnvsel_selectForUTF8(), they become invalid. They can be closed before
 * or after calling ucnvsel_close(), but must never be used after the
 * selector is closed.
 *
 * @see ucnvsel_selectForString
 * @see ucnvsel_selectForUTF8
 *
 * @param sel selector to close
 *
 * @draft ICU 4.2
 */
U_CAPI void ucnvsel_close(UConverterSelector *sel);
/**
 * Unserialize a selector from a linear buffer. No alignment necessary.
 * The function does NOT take ownership of the given buffer. The caller is
 * free to reuse/destroy the buffer immediately after calling this function.
 * Unserializing a selector is much faster than creating it from scratch
 * and is nicer on the heap (not as many allocations and frees).
 * ucnvsel_open() is expensive. Therefore, it is desirable to unserialize the
 * data structure rather than building it from scratch.
 *
 * @param buffer pointer to a linear buffer containing serialized data
 * @param length the capacity of this buffer (can be equal to or larger than
 *        the actual data length)
 * @param status an in/out ICU UErrorCode
 * @return a pointer to the created selector
 *
 * @draft ICU 4.2
 */
U_CAPI UConverterSelector* ucnvsel_unserialize(const char* buffer,
int32_t length,
UErrorCode* status);
/**
 * Serialize a selector into a linear buffer. No alignment necessary.
 * The current serialized form is portable to different endianness, and can
 * travel between ASCII and EBCDIC systems.
 *
 * @param sel selector to consider
 * @param buffer pointer to a linear buffer to receive data
 * @param bufferCapacity the capacity of this buffer, in bytes
 * @param status an in/out ICU UErrorCode
 * @return the required buffer capacity to hold the serialized data (even if
 *         the call fails with a U_BUFFER_OVERFLOW_ERROR, it will return the
 *         required capacity)
 *
 * @draft ICU 4.2
 */
U_CAPI int32_t ucnvsel_serialize(const UConverterSelector* sel,
char* buffer,
int32_t bufferCapacity,
UErrorCode* status);
/**
 * Check a UTF-16 string using the selector. Find out what encodings it can
 * be mapped to.
 *
 * The first parameter (unnamed in this declaration) is the built selector.
 *
 * @param s pointer to UTF-16 string
 * @param length length of the UTF-16 string in UChars, or -1 if
 *        NUL-terminated
 * @param status an in/out ICU UErrorCode
 * @return an enumeration containing encoding names. Returned encoding names
 *         will be the same as supplied to ucnvsel_open(), or will be the
 *         names reported by the converter API if the selector was built for
 *         all encodings.
 *         The order of encodings will be the same as supplied by the call to
 *         ucnvsel_open() (if encodings were supplied)
 *
 * @draft ICU 4.2
 */
U_CAPI UEnumeration *ucnvsel_selectForString(const UConverterSelector*, const UChar *s,
int32_t length, UErrorCode *status);
/**
 * Check a UTF-8 string using the selector. Find out what encodings it can be
 * mapped to. Illegal sequences are ignored by this function! Only legal
 * code points are considered for conversion.
 *
 * The first parameter (unnamed in this declaration) is the built selector.
 *
 * @param s pointer to UTF-8 string
 * @param length length of the UTF-8 string (in chars), or -1 if
 *        NUL-terminated
 * @param status an in/out ICU UErrorCode
 * @return an enumeration containing encoding names. Returned encoding names
 *         will be the same as supplied to ucnvsel_open(), or will be the
 *         names reported by the converter API if the selector was built for
 *         all encodings.
 *         The order of encodings will be the same as supplied by the call to
 *         ucnvsel_open() (if encodings were supplied)
 *
 * @draft ICU 4.2
 */
U_CAPI UEnumeration *ucnvsel_selectForUTF8(const UConverterSelector*,
const char *s,
int32_t length,
UErrorCode *status);
#endif // __ICU_UCNV_SEL_H__

View File

@ -41,7 +41,7 @@ DEFS += -D'ICU_UNICODE_VERSION="$(UNICODE_VERSION)"' -D'ICU_VERSION="@VERSION@"'
LIBS = $(LIBCTESTFW) $(LIBICUI18N) $(LIBICUUC) $(LIBICUTOOLUTIL) $(DEFAULT_LIBS) $(LIB_M) LIBS = $(LIBCTESTFW) $(LIBICUI18N) $(LIBICUUC) $(LIBICUTOOLUTIL) $(DEFAULT_LIBS) $(LIB_M)
OBJECTS = callcoll.o calltest.o colutil.o capitst.o cbiapts.o cbkittst.o \ OBJECTS = callcoll.o calltest.o colutil.o capitst.o cbiapts.o cbkittst.o \
ccaltst.o cctest.o ccapitst.o ccolltst.o encoll.o cconvtst.o ccurrtst.o \ ccaltst.o ucnvseltst.o cctest.o ccapitst.o ccolltst.o encoll.o cconvtst.o ccurrtst.o \
cdattst.o cdetst.o cdtdptst.o cdtrgtst.o cestst.o cfintst.o cformtst.o \ cdattst.o cdetst.o cdtdptst.o cdtrgtst.o cestst.o cfintst.o cformtst.o \
cfrtst.o cg7coll.o chashtst.o cintltst.o citertst.o cjaptst.o cloctst.o \ cfrtst.o cg7coll.o chashtst.o cintltst.o citertst.o cjaptst.o cloctst.o \
cmsccoll.o cmsgtst.o cposxtst.o cldrtest.o \ cmsccoll.o cmsgtst.o cposxtst.o cldrtest.o \

View File

@ -1,6 +1,6 @@
/******************************************************************** /********************************************************************
* COPYRIGHT: * COPYRIGHT:
* Copyright (c) 1996-2006, International Business Machines Corporation and * Copyright (c) 1996-2008, International Business Machines Corporation and
* others. All Rights Reserved. * others. All Rights Reserved.
********************************************************************/ ********************************************************************/
/******************************************************************************** /********************************************************************************
@ -37,10 +37,11 @@ void addUTraceTest(TestNode** root);
void addURegexTest(TestNode** root); void addURegexTest(TestNode** root);
void addUTextTest(TestNode** root); void addUTextTest(TestNode** root);
void addUCsdetTest(TestNode** root); void addUCsdetTest(TestNode** root);
void addCnvSelTest(TestNode** root);
void addAllTests(TestNode** root) void addAllTests(TestNode** root)
{ {
addCnvSelTest(root);
addUDataTest(root); addUDataTest(root);
addHeapMutexTest(root); addHeapMutexTest(root);
addPUtilTest(root); addPUtilTest(root);
@ -74,5 +75,6 @@ void addAllTests(TestNode** root)
#if !UCONFIG_NO_TRANSLITERATION #if !UCONFIG_NO_TRANSLITERATION
addUTransTest(root); addUTransTest(root);
#endif #endif
} }

View File

@ -0,0 +1,768 @@
/********************************************************************
* Copyright (c) 1997-2008, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************
*
* File UCNVSELTST.C
*
* Modification History:
* Name Description
* MOHAMED ELDAWY Creation
********************************************************************
*/
/* C API AND FUNCTIONALITY TEST FOR CONVERTER SELECTOR (ucnvsel.h)*/
#include "ucnvseltst.h"
#include <stdio.h>
#include "unicode/utypes.h"
#include "unicode/ucnvsel.h"
#include "cmemory.h"
#include "cstring.h"
/* Registers every converter selector test under the "ucnv/ucnvseltst" node. */
void addCnvSelTest(TestNode** root)
{
    /* Test entry points paired with their paths in the test tree,
       listed in registration order. */
    static const struct {
        void (*testFn)(void);
        const char* testPath;
    } selectorTests[] = {
        { &TestConversionUTF16, "ucnv/ucnvseltst/TestConversionUTF16" },
        { &TestConversionUTF8, "ucnv/ucnvseltst/TestConversionUTF8" },
        { &TestSerializationAndUnserialization, "ucnv/ucnvseltst/TestSerializationAndUnserialization" }
    };
    int t;
    const int numTests = (int)(sizeof(selectorTests) / sizeof(selectorTests[0]));

    for (t = 0; t < numTests; t++) {
        addTest(root, selectorTests[t].testFn, selectorTests[t].testPath);
    }
}
// warning: this function is retarded!
// there doesn't seem to be a fn in ucnv to get the index of a converter
// given one of its aliases!
int findIndex (const char* converterName) {
UErrorCode status = U_ZERO_ERROR;
int i;
for (i = 0 ; i < ucnv_countAvailable() ; i++) {
int alias_index;
const char* convName = ucnv_getAvailableName(i);
if(ucnv_compareNames(convName, converterName) == 0) {
return i;
}
for (alias_index = 0 ; alias_index < ucnv_countAliases(convName, & status) ; alias_index++) {
const char* aliasName = ucnv_getAlias(convName, alias_index, & status);
if(ucnv_compareNames(aliasName, converterName) == 0) {
return i;
}
}
}
return -1;
}
/*
 * Converts an enumeration of converter names into a boolean array indexed by
 * the converter's position in the list of available converters.
 *
 * @param res       enumeration of converter names (as returned by the selector)
 * @param toFill    output array; all entries are first cleared to FALSE, then
 *                  toFill[findIndex(name)] is set to TRUE for every name in res
 * @param toFillLen number of entries in toFill
 *
 * Names that cannot be mapped to an index inside [0, toFillLen) are reported
 * with log_err() instead of being written outside of the array.
 */
void fillBool(UEnumeration* res, UBool* toFill, int toFillLen) {
    UErrorCode status = U_ZERO_ERROR;
    int numNames;
    int i;

    for (i = 0 ; i < toFillLen ; i++) {
        toFill[i] = FALSE;
    }
    if (res == NULL) {
        return;
    }
    numNames = uenum_count(res, &status);
    if (U_FAILURE(status)) {
        log_err("fillBool: uenum_count failed: %s\n", u_errorName(status));
        return;
    }
    for (i = 0 ; i < numNames ; i++) {
        int idx;
        const char* name = uenum_next(res, NULL, &status);

        if (U_FAILURE(status) || name == NULL) {
            break;
        }
        idx = findIndex(name);
        if (idx >= 0 && idx < toFillLen) {
            toFill[idx] = TRUE;
        } else {
            /* findIndex() returns -1 for unknown names; never index with it. */
            log_err("fillBool: selector returned unknown converter name %s\n", name);
        }
    }
}
/*
 * Cross-checks the result of ucnvsel_selectForUTF8() against a brute-force
 * computation.
 *
 * For every encoding in `encodings` the converter's Unicode set is fetched
 * with ucnv_getUnicodeSet(); the encoding is expected to be selected exactly
 * when every legal code point of `s` is either in `excludedEncodings` or in
 * that Unicode set. Illegal UTF-8 sequences (U8_NEXT yields a negative value)
 * are ignored. Any disagreement with `res` is reported via log_err().
 *
 * @param s                 NUL-terminated UTF-8 string that was checked
 * @param encodings         converter names the selector was built with
 * @param num_encodings     number of entries in encodings
 * @param res               enumeration returned by the selector (not closed here)
 * @param excludedEncodings set of code points the selector was told to ignore
 * @param whichSet          which converter Unicode set to compare against
 */
void verifyResultUTF8(const char* const s, const char** encodings, int num_encodings, UEnumeration* res, const USet* excludedEncodings, const UConverterUnicodeSet whichSet) {
    UBool* resultsFromSystem;
    UBool* resultsManually;
    const int numAvailable = ucnv_countAvailable();
    const int length = (int)uprv_strlen(s);
    int i;

    resultsFromSystem = (UBool*) uprv_malloc(numAvailable * sizeof(UBool));
    resultsManually = (UBool*) uprv_malloc(numAvailable * sizeof(UBool));
    if (resultsFromSystem == NULL || resultsManually == NULL) {
        log_err("verifyResultUTF8: out of memory\n");
        uprv_free(resultsFromSystem);
        uprv_free(resultsManually);
        return;
    }
    for (i = 0 ; i < numAvailable ; i++) {
        resultsFromSystem[i] = resultsManually[i] = FALSE;
    }

    for (i = 0 ; i < num_encodings ; i++) {
        UErrorCode status = U_ZERO_ERROR;
        USet* unicode_point_set;
        UConverter* test_converter;
        UBool convertible = TRUE;
        int offset = 0;
        const int convIndex = findIndex(encodings[i]);

        if (convIndex < 0 || convIndex >= numAvailable) {
            /* Never index the result arrays with findIndex()'s -1. */
            log_err("verifyResultUTF8: unknown converter %s\n", encodings[i]);
            continue;
        }

        /* get unicode set for that converter */
        unicode_point_set = uset_open(1, 0);
        test_converter = ucnv_open(encodings[i], &status);
        ucnv_getUnicodeSet(test_converter, unicode_point_set, whichSet, &status);
        if (U_FAILURE(status)) {
            log_err("verifyResultUTF8: could not get unicode set for %s: %s\n", encodings[i], u_errorName(status));
            uset_close(unicode_point_set);
            ucnv_close(test_converter);
            continue;
        }

        while (offset < length) {
            UChar32 next = 0;
            U8_NEXT(s, offset, length, next);
            if (next >= 0 && !uset_contains(excludedEncodings, next) && !uset_contains(unicode_point_set, next)) {
                convertible = FALSE;
                break;
            }
        }
        resultsManually[convIndex] = convertible;

        uset_close(unicode_point_set);
        ucnv_close(test_converter);
    }

    /* fill the bool for the selector results! */
    fillBool(res, resultsFromSystem, numAvailable);
    for (i = 0 ; i < numAvailable ; i++) {
        if (resultsManually[i] != resultsFromSystem[i]) {
            /* Report and keep going (like verifyResultUTF16) rather than
               terminating the whole test program. */
            log_err("failure in converter selector converter %s had conflicting results manual: %d, system %d\n",ucnv_getAvailableName(i), resultsManually[i], resultsFromSystem[i]);
        }
    }
    uprv_free(resultsFromSystem);
    uprv_free(resultsManually);
}
static void TestConversionUTF8()
{
//test cases are separated by a -1
//each line is one test case including encodings to check for
//I'd like to generate this array randomly but not sure if this is an allowed practice in ICU
int encodingsTestCases[] = { 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, -1,
1, 3, 7, 9, 11, 13, 12, 15, 19, 20, 22, 24, -1,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1,
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, -1,
1, 5, 9, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, -1,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, -1, 1, -1};
USet* excluded_sets[3];
int i;
excluded_sets[0] = uset_open(1,0);
for(i = 1 ; i < 3 ; i++)
excluded_sets[i] = uset_open(i*30, i*30+500);
int prev, testCaseIdx;
int excluded_set_id;
int curCase = 0;
for(excluded_set_id = 0 ; excluded_set_id < 3 ; excluded_set_id++)
for(testCaseIdx = 0, prev=0, curCase=0 ; testCaseIdx < sizeof(encodingsTestCases) / sizeof(int) ; testCaseIdx++)
{
if(encodingsTestCases[testCaseIdx] != -1) continue;
curCase++;
if(QUICK && curCase > 2)
break;
UErrorCode status = U_ZERO_ERROR;
UEnumeration* res1;
int i;
USet* partial_set = NULL;
UConverterSelector* sel;
char** encodings = (char**) uprv_malloc((testCaseIdx - prev) * sizeof(char*));
int num_rndm_encodings = testCaseIdx - prev;
int totalStrLen = 0;
for(i = prev ; i < testCaseIdx ; i++) {
totalStrLen += uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i]))+1;
}
char* names = (char*)uprv_malloc(totalStrLen);
uprv_memset(names, 0, totalStrLen);
for(i = prev ; i < testCaseIdx ; i++) {
uprv_memcpy(names, ucnv_getAvailableName(encodingsTestCases[i]), uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i])));
encodings[i-prev] = names;
names+=uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i]))+1;
}
status = U_ZERO_ERROR;
sel = ucnvsel_open((const char**)encodings, testCaseIdx-prev, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_SET, &status);
//count how many bytes (Is there a portable function that is more efficient than this?)
FILE* f1 = fopen("../testdata/ConverterSelectorTestUTF8.txt","rb");
if(!f1) log_err("Couldn't find ConverterSelectorTestUTF8.txt");
int counter = 0;
char c;
while(fread(&c, 1, 1, f1) > 0) counter++;
fclose(f1);
char* text = (char*)uprv_malloc((counter+1));
f1 = fopen("../testdata/ConverterSelectorTestUTF8.txt","rb");
if(!f1) log_err("Couldn't find ConverterSelectorTestUTF8.txt");
fread(text,1, counter,f1);
fclose(f1);
for (i = 0 ; i < counter ; i++) {
if(text[i] == '#')
text[i] = 0;
}
text[counter] = 0;
int curTestCase=0;
for (i = 0 ; i < counter ; i++) {
if(i==0 || text[i-1] == 0) {
curTestCase++;
if(curTestCase > 2 && QUICK)
break;
//test, both with length, and NULL terminated
res1 = ucnvsel_selectForUTF8(sel, text+i, -1, &status);
//make sure result is correct!
verifyResultUTF8(text+i, (const char**) encodings, num_rndm_encodings, res1, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_SET);
res1 = ucnvsel_selectForUTF8(sel, text+i, uprv_strlen(text+i), &status);
//make sure result is correct!
verifyResultUTF8(text+i, (const char**)encodings, num_rndm_encodings, res1, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_SET);
}
}
uprv_free(text);
uprv_free(encodings[0]);
uprv_free(encodings);
ucnvsel_close(sel);
prev = testCaseIdx + 1;
}
//////////////////////////////////////////////////////////////////////////
//try fallback mapping!
curCase = 0;
for(excluded_set_id = 0 ; excluded_set_id < 3 ; excluded_set_id++)
for(testCaseIdx = 0, prev=0, curCase=0 ; testCaseIdx < sizeof(encodingsTestCases) / sizeof(int) ; testCaseIdx++)
{
if(encodingsTestCases[testCaseIdx] != -1) continue;
curCase++;
if(QUICK && curCase > 2)
break;
UErrorCode status = U_ZERO_ERROR;
UEnumeration* res1;
int i;
USet* partial_set = NULL;
UConverterSelector* sel;
char** encodings = (char**)uprv_malloc((testCaseIdx - prev) * sizeof(char*));
int num_rndm_encodings = testCaseIdx - prev;
int totalStrLen = 0;
for(i = prev ; i < testCaseIdx ; i++) {
totalStrLen += uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i]))+1;
}
char* names = (char*)uprv_malloc(totalStrLen);
uprv_memset(names, 0, totalStrLen);
for(i = prev ; i < testCaseIdx ; i++) {
uprv_memcpy(names, ucnv_getAvailableName(encodingsTestCases[i]), uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i])));
encodings[i-prev] = names;
names+=uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i]))+1;
}
status = U_ZERO_ERROR;
sel = ucnvsel_open((const char**)encodings, testCaseIdx-prev, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
//count how many bytes (Is there a portable function that is more efficient than this?)
FILE* f1 = fopen("../testdata/ConverterSelectorTestUTF8.txt","rb");
if(!f1) log_err("Couldn't find ConverterSelectorTestUTF8.txt");
int counter = 0;
char c;
while(fread(&c, 1, 1, f1) > 0) counter++;
fclose(f1);
char* text = (char*)uprv_malloc(counter+1);
f1 = fopen("../testdata/ConverterSelectorTestUTF8.txt","rb");
if(!f1) log_err("Couldn't find ConverterSelectorTestUTF8.txt");
fread(text,1, counter,f1);
fclose(f1);
for (i = 0 ; i < counter ; i++) {
if(text[i] == '#')
text[i] = 0;
}
text[counter] = 0;
int curTestCase=0;
for (i = 0 ; i < counter ; i++) {
if(i==0 || text[i-1] == 0) {
curTestCase++;
if(curTestCase > 2 && QUICK)
break;
//test, both with length, and NULL terminated
res1 = ucnvsel_selectForUTF8(sel, text+i, -1, &status);
//make sure result is correct!
verifyResultUTF8(text+i, (const char**)encodings, num_rndm_encodings, res1,excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_AND_FALLBACK_SET);
res1 = ucnvsel_selectForUTF8(sel, text+i, uprv_strlen(text+i), &status);
//make sure result is correct!
verifyResultUTF8(text+i, (const char**)encodings, num_rndm_encodings, res1,excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_AND_FALLBACK_SET);
}
}
uprv_free(text);
ucnvsel_close(sel);
prev = testCaseIdx + 1;
}
for(i = 0 ; i < 3 ; i++)
uset_close(excluded_sets[i]);
}
/*
 * Cross-checks the result of ucnvsel_selectForString() against a brute-force
 * computation.
 *
 * For every encoding in `encodings` the converter's Unicode set is fetched
 * with ucnv_getUnicodeSet(); the encoding is expected to be selected exactly
 * when every code point of `s` is either in `excludedEncodings` or in that
 * Unicode set. Any disagreement with `res` is reported via log_err().
 *
 * @param s                 NUL-terminated UTF-16 string that was checked
 * @param encodings         converter names the selector was built with
 * @param num_encodings     number of entries in encodings
 * @param res               enumeration returned by the selector (not closed here)
 * @param excludedEncodings set of code points the selector was told to ignore
 * @param whichSet          which converter Unicode set to compare against
 */
void verifyResultUTF16(const UChar* const s, const char** encodings, int num_encodings, UEnumeration* res, const USet* excludedEncodings, const UConverterUnicodeSet whichSet) {
    UBool* resultsFromSystem;
    UBool* resultsManually;
    const int numAvailable = ucnv_countAvailable();
    const int length = u_strlen(s);
    int i;

    resultsFromSystem = (UBool*) uprv_malloc(numAvailable * sizeof(UBool));
    resultsManually = (UBool*) uprv_malloc(numAvailable * sizeof(UBool));
    if (resultsFromSystem == NULL || resultsManually == NULL) {
        log_err("verifyResultUTF16: out of memory\n");
        uprv_free(resultsFromSystem);
        uprv_free(resultsManually);
        return;
    }
    for (i = 0 ; i < numAvailable ; i++) {
        resultsFromSystem[i] = resultsManually[i] = FALSE;
    }

    for (i = 0 ; i < num_encodings ; i++) {
        UErrorCode status = U_ZERO_ERROR;
        USet* unicode_point_set;
        UConverter* test_converter;
        UBool convertible = TRUE;
        int offset = 0;
        const int convIndex = findIndex(encodings[i]);

        if (convIndex < 0 || convIndex >= numAvailable) {
            /* Never index the result arrays with findIndex()'s -1. */
            log_err("verifyResultUTF16: unknown converter %s\n", encodings[i]);
            continue;
        }

        /* get unicode set for that converter */
        unicode_point_set = uset_open(1, 0);
        test_converter = ucnv_open(encodings[i], &status);
        ucnv_getUnicodeSet(test_converter, unicode_point_set, whichSet, &status);
        if (U_FAILURE(status)) {
            log_err("verifyResultUTF16: could not get unicode set for %s: %s\n", encodings[i], u_errorName(status));
            uset_close(unicode_point_set);
            ucnv_close(test_converter);
            continue;
        }

        /* loop over string, one code point at a time */
        while (offset < length) {
            UChar32 next = 0;
            U16_NEXT(s, offset, length, next);
            if (!uset_contains(excludedEncodings, next) && !uset_contains(unicode_point_set, next)) {
                convertible = FALSE;
                break;
            }
        }
        resultsManually[convIndex] = convertible;

        uset_close(unicode_point_set);
        ucnv_close(test_converter);
    }

    /* fill the bool for the selector results! */
    fillBool(res, resultsFromSystem, numAvailable);
    for (i = 0 ; i < numAvailable ; i++) {
        if (resultsManually[i] != resultsFromSystem[i]) {
            log_err("failure in converter selector converter %s had conflicting results manual: %d, system %d\n",ucnv_getAvailableName(i), resultsManually[i], resultsFromSystem[i]);
        }
    }
    uprv_free(resultsFromSystem);
    uprv_free(resultsManually);
}
//does selectForUTF16() work well?
static void TestConversionUTF16()
{
//test cases are separated by a -1
//each line is one test case including encodings to check for
//I'd like to generate this array randomly but not sure if this is an allowed practice in ICU
int encodingsTestCases[] = { 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, -1,
1, 3, 7, 9, 11, 13, 12, 15, 19, 20, 22, 24, -1,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1,
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, -1,
1, 5, 9, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, -1,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, -1, 1, -1};
USet* excluded_sets[3];
int i;
excluded_sets[0] = uset_open(1,0);
for(i = 1 ; i < 3 ; i++)
excluded_sets[i] = uset_open(i*30, i*30+500);
int prev, testCaseIdx;
//try roundtrip mapping
int excluded_set_id;
int curCase = 0;
for(excluded_set_id = 0 ; excluded_set_id < 3 ; excluded_set_id++)
for(testCaseIdx = 0, prev=0, curCase=0 ; testCaseIdx < sizeof(encodingsTestCases) / sizeof(int) ; testCaseIdx++)
{
if(encodingsTestCases[testCaseIdx] != -1) continue;
curCase++;
if(QUICK && curCase > 2)
break;
UErrorCode status = U_ZERO_ERROR;
UEnumeration* res1;
int i;
USet* partial_set = NULL;
UConverterSelector* sel;
char** encodings = (char**) uprv_malloc((testCaseIdx - prev) * sizeof(char*));
int num_rndm_encodings = testCaseIdx - prev;
int totalStrLen = 0;
for(i = prev ; i < testCaseIdx ; i++) {
totalStrLen += uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i]))+1;
}
char* names = (char*)uprv_malloc(totalStrLen);
uprv_memset(names, 0, totalStrLen);
for(i = prev ; i < testCaseIdx ; i++) {
uprv_memcpy(names, ucnv_getAvailableName(encodingsTestCases[i]), uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i])));
encodings[i-prev] = names;
names+=uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i]))+1;
}
status = U_ZERO_ERROR;
sel = ucnvsel_open((const char**)encodings, testCaseIdx-prev, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_SET, &status);
//count how many bytes (Is there a portable function that is more efficient than this?)
FILE* f1 = fopen("../testdata/ConverterSelectorTestUTF16.txt","rb");
if(!f1) log_err("Couldn't find ConverterSelectorTestUTF16.txt");
int counter = 0;
UChar c;
while(fread(&c, sizeof(UChar), 1, f1) > 0) counter++;
fclose(f1);
UChar* text = (UChar*)uprv_malloc((counter+1)*sizeof(UChar));
f1 = fopen("../testdata/ConverterSelectorTestUTF16.txt","rb");
if(!f1) log_err("Couldn't find ConverterSelectorTestUTF16.txt");
fread(text,sizeof(UChar), counter,f1);
fclose(f1);
for (i = 0 ; i < counter ; i++) {
if(text[i] == (UChar)'#')
text[i] = 0;
}
text[counter] = 0;
int curTestCase=0;
for (i = 0 ; i < counter ; i++) {
if(i==0 || text[i-1] == 0) {
curTestCase++;
if(curTestCase > 2 && QUICK)
break;
//test, both with length, and NULL terminated
res1 = ucnvsel_selectForString(sel, text+i, -1, &status);
//make sure result is correct!
verifyResultUTF16(text+i, (const char**) encodings, num_rndm_encodings, res1, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_SET);
res1 = ucnvsel_selectForString(sel, text+i, u_strlen(text+i), &status);
//make sure result is correct!
verifyResultUTF16(text+i, (const char**)encodings, num_rndm_encodings, res1, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_SET);
}
}
uprv_free(text);
uprv_free(encodings[0]);
uprv_free(encodings);
ucnvsel_close(sel);
prev = testCaseIdx + 1;
}
//////////////////////////////////////////////////////////////////////////
//try fallback mapping!
for(excluded_set_id = 0 ; excluded_set_id < 3 ; excluded_set_id++)
for(testCaseIdx = 0, prev=0, curCase=0 ; testCaseIdx < sizeof(encodingsTestCases) / sizeof(int) ; testCaseIdx++)
{
if(encodingsTestCases[testCaseIdx] != -1) continue;
curCase++;
if(QUICK && curCase > 2)
break;
UErrorCode status = U_ZERO_ERROR;
UEnumeration* res1;
int i;
USet* partial_set = NULL;
UConverterSelector* sel;
char** encodings = (char**)uprv_malloc((testCaseIdx - prev) * sizeof(char*));
int num_rndm_encodings = testCaseIdx - prev;
int totalStrLen = 0;
for(i = prev ; i < testCaseIdx ; i++) {
totalStrLen += uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i]))+1;
}
char* names = (char*)uprv_malloc(totalStrLen);
uprv_memset(names, 0, totalStrLen);
for(i = prev ; i < testCaseIdx ; i++) {
uprv_memcpy(names, ucnv_getAvailableName(encodingsTestCases[i]), uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i])));
encodings[i-prev] = names;
names+=uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i]))+1;
}
//first time
status = U_ZERO_ERROR;
sel = ucnvsel_open((const char**)encodings, testCaseIdx-prev, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
//count how many bytes (Is there a portable function that is more efficient than this?)
FILE* f1 = fopen("../testdata/ConverterSelectorTestUTF16.txt","rb");
if(!f1) log_err("Couldn't find ConverterSelectorTestUTF16.txt");
int counter = 0;
UChar c;
while(fread(&c, sizeof(UChar), 1, f1) > 0) counter++;
fclose(f1);
UChar* text = (UChar*)uprv_malloc((counter+1)*sizeof(UChar));
f1 = fopen("../testdata/ConverterSelectorTestUTF16.txt","rb");
if(!f1) log_err("Couldn't find ConverterSelectorTestUTF16.txt");
fread(text,sizeof(UChar), counter,f1);
fclose(f1);
for (i = 0 ; i < counter ; i++) {
if(text[i] == (UChar)'#')
text[i] = 0;
}
text[counter] = 0;
int curTestCase=0;
for (i = 0 ; i < counter ; i++) {
if(i==0 || text[i-1] == 0) {
curTestCase++;
if(curTestCase > 2 && QUICK)
break;
//test, both with length, and NULL terminated
res1 = ucnvsel_selectForString(sel, text+i, -1, &status);
//make sure result is correct!
verifyResultUTF16(text+i, (const char**)encodings, num_rndm_encodings, res1,excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_AND_FALLBACK_SET);
res1 = ucnvsel_selectForString(sel, text+i, u_strlen(text+i), &status);
//make sure result is correct!
verifyResultUTF16(text+i, (const char**)encodings, num_rndm_encodings, res1,excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_AND_FALLBACK_SET);
}
}
uprv_free(text);
ucnvsel_close(sel);
prev = testCaseIdx + 1;
}
for(i = 0 ; i < 3 ; i++)
uset_close(excluded_sets[i]);
}
//does selectForUTF16() work well?
static void TestSerializationAndUnserialization()
{
//test cases are separated by a -1
//each line is one test case including encodings to check for
//I'd like to generate this array randomly but not sure if this is an allowed practice in ICU
int encodingsTestCases[] = { 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, -1,
1, 3, 7, 9, 11, 13, 12, 15, 19, 20, 22, 24, -1,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1,
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, -1,
1, 5, 9, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, -1,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, -1, 1, -1};
USet* excluded_sets[3];
int i;
excluded_sets[0] = uset_open(1,0);
for(i = 1 ; i < 3 ; i++)
excluded_sets[i] = uset_open(i*30, i*30+500);
int prev, testCaseIdx;
//try roundtrip mapping
int excluded_set_id;
int curCase = 0;
for(excluded_set_id = 0 ; excluded_set_id < 3 ; excluded_set_id++)
for(testCaseIdx = 0, prev=0, curCase =0 ; testCaseIdx < sizeof(encodingsTestCases) / sizeof(int) ; testCaseIdx++)
{
if(encodingsTestCases[testCaseIdx] != -1) continue;
curCase++;
if(QUICK && curCase > 2)
break;
UErrorCode status = U_ZERO_ERROR;
UEnumeration* res1;
int i;
USet* partial_set = NULL;
UConverterSelector* sel;
char** encodings = (char**) uprv_malloc((testCaseIdx - prev) * sizeof(char*));
int num_rndm_encodings = testCaseIdx - prev;
int totalStrLen = 0;
for(i = prev ; i < testCaseIdx ; i++) {
totalStrLen += uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i]))+1;
}
char* names = (char*)uprv_malloc(totalStrLen);
uprv_memset(names, 0, totalStrLen);
for(i = prev ; i < testCaseIdx ; i++) {
uprv_memcpy(names, ucnv_getAvailableName(encodingsTestCases[i]), uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i])));
encodings[i-prev] = names;
names+=uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i]))+1;
}
//first time
status = U_ZERO_ERROR;
sel = ucnvsel_open((const char**)encodings, testCaseIdx-prev, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_SET, &status);
char *buffer = NULL;
unsigned int ser_len = ucnvsel_serialize(sel, NULL, 0, &status);
status = U_ZERO_ERROR;
buffer = uprv_malloc(ser_len);
ucnvsel_serialize(sel, buffer, ser_len, &status);
ucnvsel_close(sel);
sel = ucnvsel_unserialize( buffer, ser_len,&status);
//count how many bytes (Is there a portable function that is more efficient than this?)
FILE* f1 = fopen("../testdata/ConverterSelectorTestUTF16.txt","rb");
if(!f1) log_err("Couldn't find ConverterSelectorTestUTF16.txt");
int counter = 0;
UChar c;
while(fread(&c, sizeof(UChar), 1, f1) > 0) counter++;
fclose(f1);
UChar* text = (UChar*)uprv_malloc((counter+1)*sizeof(UChar));
f1 = fopen("../testdata/ConverterSelectorTestUTF16.txt","rb");
if(!f1) log_err("Couldn't find ConverterSelectorTestUTF16.txt");
fread(text,sizeof(UChar), counter,f1);
fclose(f1);
for (i = 0 ; i < counter ; i++) {
if(text[i] == (UChar)'#')
text[i] = 0;
}
text[counter] = 0;
int curTestCase=0;
for (i = 0 ; i < counter ; i++) {
if(i==0 || text[i-1] == 0) {
curTestCase++;
if(curTestCase > 2 && QUICK)
break;
//test, both with length, and NULL terminated
res1 = ucnvsel_selectForString(sel, text+i, -1, &status);
//make sure result is correct!
verifyResultUTF16(text+i, (const char**) encodings, num_rndm_encodings, res1, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_SET);
res1 = ucnvsel_selectForString(sel, text+i, u_strlen(text+i), &status);
//make sure result is correct!
verifyResultUTF16(text+i, (const char**)encodings, num_rndm_encodings, res1, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_SET);
}
}
uprv_free(text);
uprv_free(encodings[0]);
uprv_free(encodings);
ucnvsel_close(sel);
prev = testCaseIdx + 1;
uprv_free(buffer);
}
//////////////////////////////////////////////////////////////////////////
//try fallback mapping!
for(excluded_set_id = 0 ; excluded_set_id < 3 ; excluded_set_id++)
for(testCaseIdx = 0, prev=0, curCase=0 ; testCaseIdx < sizeof(encodingsTestCases) / sizeof(int) ; testCaseIdx++)
{
if(encodingsTestCases[testCaseIdx] != -1) continue;
curCase++;
if(QUICK && curCase > 2)
break;
UErrorCode status = U_ZERO_ERROR;
UEnumeration* res1;
int i;
USet* partial_set = NULL;
UConverterSelector* sel;
char** encodings = (char**)uprv_malloc((testCaseIdx - prev) * sizeof(char*));
int num_rndm_encodings = testCaseIdx - prev;
int totalStrLen = 0;
for(i = prev ; i < testCaseIdx ; i++) {
totalStrLen += uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i]))+1;
}
char* names = (char*)uprv_malloc(totalStrLen);
uprv_memset(names, 0, totalStrLen);
for(i = prev ; i < testCaseIdx ; i++) {
uprv_memcpy(names, ucnv_getAvailableName(encodingsTestCases[i]), uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i])));
encodings[i-prev] = names;
names+=uprv_strlen(ucnv_getAvailableName(encodingsTestCases[i]))+1;
}
//first time
status = U_ZERO_ERROR;
sel = ucnvsel_open((const char**)encodings, testCaseIdx-prev, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
char *buffer = NULL;
unsigned int ser_len = ucnvsel_serialize(sel, NULL, 0, &status);
buffer = uprv_malloc(ser_len);
status = U_ZERO_ERROR;
ucnvsel_serialize(sel, buffer, ser_len, &status);
ucnvsel_close(sel);
sel = ucnvsel_unserialize( buffer, ser_len,&status);
//count how many bytes (Is there a portable function that is more efficient than this?)
FILE* f1 = fopen("../testdata/ConverterSelectorTestUTF16.txt","rb");
if(!f1) log_err("Couldn't find ConverterSelectorTestUTF16.txt");
int counter = 0;
char c;
while(fread(&c, 2, 1, f1) > 0) counter++;
fclose(f1);
UChar* text = (UChar*)uprv_malloc((counter+1)*sizeof(UChar));
f1 = fopen("../testdata/ConverterSelectorTestUTF16.txt","rb");
if(!f1) log_err("Couldn't find ConverterSelectorTestUTF16.txt");
fread(text,2, counter,f1);
fclose(f1);
for (i = 0 ; i < counter ; i++) {
if(text[i] == (UChar)'#')
text[i] = 0;
}
text[counter] = 0;
int curTestCase=0;
for (i = 0 ; i < counter ; i++) {
if(i==0 || text[i-1] == 0) {
curTestCase++;
if(curTestCase > 2 && QUICK)
break;
//test, both with length, and NULL terminated
res1 = ucnvsel_selectForString(sel, text+i, -1, &status);
//make sure result is correct!
verifyResultUTF16(text+i, (const char**)encodings, num_rndm_encodings, res1,excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_AND_FALLBACK_SET);
res1 = ucnvsel_selectForString(sel, text+i, u_strlen(text+i), &status);
//make sure result is correct!
verifyResultUTF16(text+i, (const char**)encodings, num_rndm_encodings, res1,excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_AND_FALLBACK_SET);
}
}
uprv_free(text);
ucnvsel_close(sel);
prev = testCaseIdx + 1;
uprv_free(buffer);
}
for(i = 0 ; i < 3 ; i++)
uset_close(excluded_sets[i]);
}

View File

@ -0,0 +1,33 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2008, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
*
* File UCNVSELTST.H
*
* Modification History:
* Name Description
* Mohamed Eldawy Creation
*********************************************************************************/
/* C API TEST FOR CONVERTER SELECTOR */
#ifndef _UCNVSELTST
#define _UCNVSELTST
#include "unicode/utypes.h"
#include "cintltst.h"
/**
 * Tests ucnvsel_selectForUTF8(): selectors are built for several lists of
 * converters and their results for the UTF-8 test text are compared with a
 * brute-force check against each converter's Unicode set.
 **/
static void TestConversionUTF8(void);
/**
 * Tests ucnvsel_selectForString(): selectors are built for several lists of
 * converters and their results for the UTF-16 test text are compared with a
 * brute-force check against each converter's Unicode set.
 **/
static void TestConversionUTF16(void);
/**
 * Tests ucnvsel_serialize() and ucnvsel_unserialize(): a selector is
 * serialized, re-created from the serialized bytes, and the re-created
 * selector is then verified against the UTF-16 test text.
 **/
static void TestSerializationAndUnserialization(void);
#endif

Binary file not shown.

View File

@ -0,0 +1,248 @@
Что такое Unicode?
Unicode - это уникальный код для любого символа,
независимо от платформы,
независимо от программы,
независимо от языка.
По своей природе компьютеры могут работать лишь с числами. И для того, чтобы они могли хранить в памяти буквы или другие символы, каждому такому символу должно быть поставлено в соответствие число. До того, как появился Unicode, в мире имели хождение сотни различных схем подобного кодирования символов. Но ни одна из этих схем не была столь универсальной, чтобы описать все необходимые символы: например, только для кодирования букв, входящих в алфавиты языков Европейского Сообщества, необходимо было использовать несколько различных кодировок. По большому счёту даже и для отдельного языка, скажем, английского, не существовало единой системы кодирования, включавшей в себя все обычно используемые буквы, знаки пунктуации и технические символы.
Более того, все эти схемы кодирования часто даже не были совместимы друг с другом. К примеру, две разные кодировки могли использовать один и тот же код для представления двух разных символов или присваивать разные коды одной и той же букве. В этой ситуации для любого компьютера, а особенно сервера, приходилось поддерживать несколько разных кодировок, которые могли понадобиться, но даже и тогда при передаче данных на другую платформу или при их преобразовании в другую кодировку всегда оставался риск, что эти данные окажутся повреждёнными.
Unicode изменяет такое положение вещей!
Система Unicode присваивает уникальный код любому символу, независимо от платформы, независимо от программы, независимо от языка. Unicode был принят как стандарт такими лидерами компьютерной индустрии, как Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys да и многими другими. Именно эта схема кодирования используется такими современными технологиями и стандартами, как например XML, Java, ECMAScript (JavaScript), LDAP, CORBA 3.0, WML и так далее. Именно Unicode является официальной схемой реализации ISO/IEC 10646. Наконец, эта кодировка поддерживается во множестве операционных систем, во всех современных браузерах Интернет и в большом количестве других программ. Повсеместное распространение стандарта Unicode ровно как и доступность поддерживающих его средств в настоящее время являются одними из наиболее важных направлений развития индустрии программного обеспечения.
Использование Unicode в многоуровневых приложениях или программных комплексах, построенных в рамках архитектуры клиент-сервер, а также при представлении данных в сети Интернет, приводит к значительному снижению расходов на поддержку этих продуктов или сервисов по сравнению со случаем использования старых схем кодирования. Действительно, Unicode позволяет создавать единый программный продукт или сайт Интернет для множества платформ, языков и стран без каких-либо переделок. А его использование при передаче данных между различными системами предохраняет эти данные от поврежде
#
ما هي الشفرة الموحدة "يونِكود" ؟
أساسًا، تتعامل الحواسيب فقط مع الأرقام، وتقوم بتخزين الأحرف والمحارف الأخرى بعد أن تُعطي رقما معينا لكل واحد منها. وقبل اختراع "يونِكود"، كان هناك مئات الأنظمة للتشفير وتخصيص هذه الأرقام للمحارف، ولم يوجد نظام تشفير واحد يحتوي على جميع المحارف الضرورية. وعلى سبيل المثال، فإن الاتحاد الأوروبي لوحده، احتوى العديد من الشفرات المختلفة ليغطي جميع اللغات المستخدمة في الاتحاد. وحتى لو اعتبرنا لغة واحدة، كاللغة الإنجليزية، فإن جدول شفرة واحد لم يكف لاستيعاب جميع الأحرف وعلامات الترقيم والرموز الفنية والعلمية الشائعة الاستعمال.
وتجدر الملاحظة أن أنظمة التشفير المختلفة تتعارض مع بعضها البعض. وبعبارة أخرى، يمكن أن يستخدِم جدولي شفرة نفس الرقم لتمثيل محرفين مختلفين، أو رقمين مختلفين لتمثيل نفس المحرف. ولو أخذنا أي جهاز حاسوب، وبخاصة جهاز النادل (server)، فيجب أن تكون لديه القدرة على التعامل مع عدد كبير من الشفرات المختلفة، ويتم تصميمه على هذا الأساس. ومع ذلك، فعندما تمر البيانات عبر أنظمة مختلفة، توجد هناك خطورة لضياع أو تحريف بعض هذه البيانات.
"يونِكود" تغير هذا كليـا !
تخصص الشفرة الموحدة "يونِكود" رقما وحيدا لكل محرف في جميع اللغات العالمية، وذلك بغض النظر عن نوع الحاسوب أو البرامج المستخدمة. وقد تـم تبني مواصفة "يونِكود" مــن قبـل قادة الصانعين لأنظمة الحواسيب فـي العالم، مثل شركات آي.بي.إم. (IBM)، أبـل (APPLE)، هِيـْولِـت بـاكـرد (Hewlett-Packard) ، مايكروسوفت (Microsoft)، أوراكِـل (Oracle) ، صن (Sun) وغيرها. كما أن المواصفات والمقاييس الحديثة (مثل لغة البرمجة "جافا" "JAVA" ولغة "إكس إم إل" "XML" التي تستخدم لبرمجة الانترنيت) تتطلب استخدام "يونِكود". علاوة على ذلك ، فإن "يونِكود" هي الطـريـقـة الرسـمية لتطبيق المقيـاس الـعـالـمي إيزو ١٠٦٤٦ (ISO 10646) .
إن بزوغ مواصفة "يونِكود" وتوفُّر الأنظمة التي تستخدمه وتدعمه، يعتبر من أهم الاختراعات الحديثة في عولمة البرمجيات لجميع اللغات في العالم. وإن استخدام "يونِكود" في عالم الانترنيت سيؤدي إلى توفير كبير مقارنة مع استخدام المجموعات التقليدية للمحارف المشفرة. كما أن استخدام "يونِكود" سيُمكِّن المبرمج من كتابة البرنامج مرة واحدة، واستخدامه على أي نوع من الأجهزة
#
什麽是Unicode(統一碼/標準萬國碼)?
Unicode給每個字元提供了一個唯一的數位
不論是什麽平臺,
不論是什麽程式,
不論是什麽語言。
基本上電腦只是處理數位。它們指定一個數位來儲存字母或其他字元。在創造Unicode之前有數百種指定這些數位的編碼系統。沒有一個編碼可以包含足夠的字元例如單單歐州共同體就需要好幾種不同的編碼來包括所有的語言。即使是單一種語言例如英語也沒有哪一個編碼可以適用於所有的字母標點符號和常用的技術符號。
這些編碼系統也會互相衝突。也就是說,兩種編碼可能使用相同的數位代表兩個不同的字元,或使用不同的數位代表相同的字元。任何一台特定的電腦 (特別是伺服器都需要支援許多不同的編碼,但是,不論什麽時候資料通過不同的編碼或平臺之間,那些資料總會有損壞的危險。
Unicode正在改變所有這一切
Unicode給每個字元提供了一個唯一的數位不論是什麽平臺不論是什麽程式不論什麽語言。Unicode標準已經被這些工業界的領導們所採用例如Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys 和其他許多公司。最新的標準都需要Unicode例如XML, Java, ECMAScript (JavaScript), LDAP, CORBA 3.0, WML等等並且Unicode是實現ISO/IEC 10646的正規方式。許多作業系統所有最新的瀏覽器和 許多其他産品都支援它。Unicode標準的出現和支援它工具的存在是近來全球軟體技術最重要的發展趨勢。
將Unicode與客戶伺服器或多層應用程式和網際網路結合比使用傳統字元集節省費用。Unicode使單一軟體産品或單一網站能夠貫穿多個平臺語言和國家而不需要重建。它可將資料傳輸到許多不同的系統而無損壞。
關於Unicode學術學會
Unicode學術學會
#
Ç'është UNICODE?
Unicode siguron një numër të vetëm për çdo gërmë,
për cilëndo platformë,
për cilindo program,
për cilëndo gjuhë.
Në themel, kompjuterat veprojnë me anën e numrave. Ata ruajnë gërmat dhe shënjat (karakteret) e tjera duke u caktuar nga një numër (kod). Para zbulimit të Unicode-s, kishte qindra sisteme kodimi të ndryshëm për të caktuar këta numra. Por asnjë kodim i vetëm nuk mund të përmbante mjaft shënja: për shembull, vetëm Bashkimi Europian kërkon disa kodime të ndryshme për të mbuluar gjithë gjuhët e tij. Edhe për një gjuhë të vetme si Anglishtja asnjë kodim nuk ishte i mjaftueshëm për të gjitha gërmat, shënjat e pikësimit, dhe simbolet teknike në përdorim të zakonshëm.
Këta sisteme kodimi gjithashtu përplasen me njëri-tjetrin. Dmth, dy kodime mundet që të përdorin të njëjtin numër për dy shënja krejt të ndryshme, ose dy numra të ndryshëm për të njëjtën shënjë. Secili kompjuter (sidomos shërbyesit - serverat) duhet të jetë i aftë të mbështesë shumë kodime, dhe megjithatë kur të dhënat kalohen përmes kodimeve të ndryshme ose platformave, është gjithmonë rreziku i korruptimit të tyre.
Unicode po e ndryshon gjithë këtë rrëmujë!
Unicode siguron një numër të vetëm për çdo gërmë, për cilëndo platformë, për cilindo program, për cilëndo gjuhë. Standarti Unicode është fëmijëruar (adaptuar) nga udhëheqës të tillë të industrisë si Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys dhe shumë të tjerë. Unicode kërkohet nga standarte moderne si XML, Java, ECMAScript (JavaScript), LDAP, CORBA 3.0, WML, etj., dhe është mënyra zyrtare për të zbatuar ISO/IEC 10646. Unicode është i mbështetur nga shumë sisteme vepruese (operativë), gjithë shfletuesat (brauzerat) modernë, dhe shumë produkte të tjera. Dalja në dritë e Standartit Unicode, dhe pasja e veglave që e mbështesin, janë midis faktorëve më domethënës të kohëve të fundit të drejtimeve të zhvillimit të përgjithëshme të teknologjive të softuerëve.
Përtrupëzimi i Unicode në zbatimet klient-server apo shumë-shkallësh si dhe në faqet internet, krijon mundësinë për kursime kostoje të ndjeshme në krahasim me përdorimin e kodimeve të mëparshme. Unicode bën të mundur që një produkt softuer ose një faqe interneti të përdoret për shumë platforma, gjuhë dhe vende pa re-inxhinierim. Ai lejon të dhënat të kalohen përmes shumë sistemeve të ndryshme pa korruptim.
#
यूनिकोड क्या है?
यूनिकोड प्रत्येक अक्षर के लिए एक विशेष नम्बर प्रदान करता है,
चाहे कोई भी प्लैटफॉर्म हो,
चाहे कोई भी प्रोग्राम हो,
चाहे कोई भी भाषा हो।
कम्प्यूटर, मूल रूप से, नंबरों से सम्बंध रखते हैं। ये प्रत्येक अक्षर और वर्ण के लिए एक नंबर निर्धारित करके अक्षर और वर्ण संग्रहित करते हैं। यूनिकोड का आविष्कार होने से पहले, ऐसे नंबर देने के लिए सैंकडों विभिन्न संकेत लिपि प्रणालियां थीं। किसी एक संकेत लिपि में पर्याप्त अक्षर नहीं हो सकते हैं : उदाहरण के लिए, यूरोपिय संघ को अकेले ही, अपनी सभी भाषाऒं को कवर करने के लिए अनेक विभिन्न संकेत लिपियों की आवश्यकता होती है। अंग्रेजी जैसी भाषा के लिए भी, सभी अक्षरों, विरामचिन्हों और सामान्य प्रयोग के तकनीकी प्रतीकों हेतु एक ही संकेत लिपि पर्याप्त नहीं थी।
ये संकेत लिपि प्रणालियां परस्पर विरोधी भी हैं। इसीलिए, दो संकेत लिपियां दो विभिन्न अक्षरों के लिए, एक ही नंबर प्रयोग कर सकती हैं, अथवा समान अक्षर के लिए विभिन्न नम्बरों का प्रयोग कर सकती हैं। किसी भी कम्प्यूटर (विशेष रूप से सर्वर) को विभिन्न संकेत लिपियां संभालनी पड़ती है; फिर भी जब दो विभिन्न संकेत लिपियों अथवा प्लैटफॉर्मों के बीच डाटा भेजा जाता है तो उस डाटा के हमेशा खराब होने का जोखिम रहता है।
यूनिकोड से यह सब कुछ बदल रहा है!
यूनिकोड, प्रत्येक अक्षर के लिए एक विशेष नंबर प्रदान करता है, चाहे कोई भी प्लैटफॉर्म हो, चाहे कोई भी प्रोग्राम हो, चाहे कोई भी भाषा हो। यूनिकोड स्टैंडर्ड को ऐपल, एच.पी., आई.बी.एम., जस्ट सिस्टम, माईक्रोसॉफ्ट, औरेकल, सैप, सन, साईबेस, यूनिसिस जैसी उद्योग की प्रमुख कम्पनियों और कई अन्य ने अपनाया है। यूनिकोड की आवश्यकता आधुनिक मानदंडों, जैसे एक्स.एम.एल., जावा, एकमा स्क्रिप्ट (जावा स्क्रिप्ट), एल.डी.ए.पी., कोर्बा 3.0, डब्ल्यू.एम.एल. के लिए होती है और यह आई.एस.ओ./आई.ई.सी. 10646 को लागू करने का अधिकारिक तरीका है। यह कई संचालन प्रणालियों, सभी आधुनिक ब्राउजरों और कई अन्य उत्पादों में होता है। यूनिकोड स्टैंडर्ड की उत्पति और इसके सहायक उपकरणों की उपलब्धता, हाल ही के अति महत्वपूर्ण विश्वव्यापी सॉफ्टवेयर टेक्नोलॉजी रुझानों में से हैं।
यूनिकोड को ग्राहक-सर्वर अथवा बहु-आयामी उपकरणों और वेबसाइटों में शामिल करने से, परंपरागत उपकरणों के प्रयोग की अपेक्षा खर्च में अत्यधिक बचत होती है। यूनिकोड से एक ऐसा अकेला सॉफ्टवेयर उत्पाद अथवा अकेला वेबसाइट मिल जाता है, जिसे री-इंजीनियरिंग के बिना विभिन्न प्लैटफॉर्मों, भाषाओं और देशों में उपयोग किया जा सकता है। इससे डाटा को बिना किसी बाधा के विभिन्न प्रणालियों से होकर ले जाया जा सकता है।
#
บเรื่องของตัวเลข. คอมพิวเตอร์จัดเก็บตัวอักษรและอักขระอื่นๆ โดยการกำหนดหมายเลขให้สำหรับแต่ละตัว. ก่อนหน้าที่๊ Unicode จะถูกสร้างขึ้น, ได้มีระบบ encoding อยู่หลายร้อยระบบสำหรับการกำหนดหมายเลขเหล่านี้. ไม่มี encoding ใดที่มีจำนวนตัวอักขระมากเพียงพอ: ยกตัวอย่างเช่น, เฉพาะในกลุ่มสหภาพยุโรปเพียงแห่งเดียว ก็ต้องการหลาย encoding ในการครอบคลุมทุกภาษาในกลุ่ม. หรือแม้แต่ในภาษาเดี่ยว เช่น ภาษาอังกฤษ ก็ไม่มี encoding ใดที่เพียงพอสำหรับทุกตัวอักษร, เครื่องหมายวรรคตอน และสัญลักษณ์ทางเทคนิคที่ใช้กันอยู่ทั่วไป.
ระบบ encoding เหล่านี้ยังขัดแย้งซึ่งกันและกัน. นั่นก็คือ, ในสอง encoding สามารถใช้หมายเลขเดียวกันสำหรับตัวอักขระสองตัวที่แตกต่างกัน,หรือใช้หมายเลขต่างกันสำหรับอักขระตัวเดียวกัน. ในระบบคอมพิวเตอร์ (โดยเฉพาะเซิร์ฟเวอร์) ต้องมีการสนับสนุนหลาย encoding; และเมื่อข้อมูลที่ผ่านไปมาระหว่างการเข้ารหัสหรือแพล็ตฟอร์มที่ต่างกัน, ข้อมูลนั้นจะเสี่ยงต่อการผิดพลาดเสียหาย.
Unicode จะเปลี่ยนแปลงสิ่งเหล่านั้นทั้งหมด!
Unicode กำหนดหมายเลขเฉพาะสำหรับแต่ละอักขระ, โดยไม่สนใจว่าเป็นแพล็ตฟอร์มใด, ไม่ขึ้นกับว่าจะเป็นโปรแกรมใดและไม่ว่าจะเป็นภาษาใด. มาตรฐาน Unicode ได้ถูกนำไปใช้โดยผู้นำในอุตสาหกรรม เช่น Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys และอื่นๆ อีกมาก. Unicode เป็นสิ่งที่จำเป็นสำหรับมาตรฐานใหม่ๆ เช่น XML, Java, ECMAScript (JavaScript), LDAP, CORBA 3.0, WML ฯลฯ., และเป็นแนวทางอย่างเป็นทางการในการทำ ISO/IEC 10646. Unicode ได้รับการสนับสนุนในระบบปฏิบัติการจำนวนมาก, บราวเซอร์ใหม่ๆ ทกตัว, และผลิตภัณฑ์อื่นๆ อีกมาก. การเกิดขึ้นของ Unicode Standard และทูลส์ต่างๆ ที่มีในการสนับสนุน Unicode, เป็นหนึ่งในแนวโน้มทางเทคโนโลยีซอฟต์แวร์ระดับโลกที่มีความสำค
#
Cos'è Unicode?
Unicode assegna un numero univoco a ogni carattere,
indipendentemente dalla piattaforma,
indipendentemente dall'applicazione,
indipendentemente dalla lingua.
I computer, in buona sostanza, non sanno far altro che trattare numeri. Per immagazzinare in memoria lettere o altri segni è necessario che a ogni carattere venga assegnato un numero. Esistono centinaia di sistemi di codifica, preesistenti a Unicode, e ognuno di questi abbina i numeri ai caratteri in modo differente. Nessuna di queste codifiche comprende un numero di caratteri sufficiente per tutte le circostanze. Per le sole lingue dell'Unione Europea, ad esempio, è necessario utilizzare parecchi sistemi di codifica distinti. Anche considerando una solo lingua, come l'italiano, non esiste una codifica unica che comprenda tutte le lettere e tutti i segni di punteggiatura e simboli tecnici di uso comune.
Questi sistemi di codifica, inoltre, sono in contraddizione l'uno con l'altro. Succede che due codifiche utilizzino lo stesso numero per due caratteri diversi o che, viceversa, adottino numeri diversi per lo stesso carattere. Qualsiasi elaboratore, e a maggior ragione un server di rete, ha bisogno di utilizzare codifiche diverse. Il problema è che, quando i dati passano da una codifica a un'altra, o da una piattaforma a un'altra, si corre il serio rischio di perdere informazioni.
Unicode sta cambiando le cose!
Unicode attribuisce un numero univoco a ogni carattere, indipendentemente dalla piattaforma, dall'applicativo, dalla lingua. Lo standard Unicode è stato adottato da leader di mercato del calibro di Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys e molti altri. Unicode è alla base di molti moderni standard, come XML, Java, ECMAScript (JavaScript), LDAP, CORBA 3.0, WML eccetera, e costituisce l'implementazione ufficiale dello standard internazionale ISO/IEC 10646. Unicode è supportato da molti sistemi operativi, da tutti i più moderni web browser e da molti altri prodotti. L'emergere dello standard Unicode, unito alla recente disponibilità di strumenti che lo supportano, è fra i più significativi sviluppi della tecnologia della globalizzazione del software.
L'adozione di Unicode sui siti web e nelle applicazioni client/server o multi-tiered, rispetto all'utilizzo dei set di caratteri tradizionali, permette un significativo abbattimento dei costi di gestione. Unicode consente che un'unica versione di un software o di un sito web siano fruibili con piattaforme, lingue e paesi diversi, evitando la necessità di reingenierizzare il prodotto per ogni situazione specifica. Permette, inoltre, il trasporto del testo fra sistemi diversi senza che abbia luogo alcuna corruzione dei dati.
#
什么是Unicode(统一码)?
Unicode给每个字符提供了一个唯一的数字
不论是什么平台,
不论是什么程序,
不论是什么语言。
基本上计算机只是处理数字。它们指定一个数字来储存字母或其他字符。在创造Unicode之前有数百种指定这些数字的编码系统。没有一个编码可以包含足够的字符例如单单欧州共同体就需要好几种不同的编码来包括所有的语言。即使是单一种语言例如英语也没有哪一个编码可以适用于所有的字母标点符号和常用的技术符号。
这些编码系统也会互相冲突。也就是说,两种编码可能使用相同的数字代表两个不同的字符,或使用不同的数字代表相同的字符。任何一台特定的计算机(特别是服务器)都需要支持许多不同的编码,但是,不论什么时候数据通过不同的编码或平台之间,那些数据总会有损坏的危险。
Unicode正在改变所有这一切
Unicode给每个字符提供了一个唯一的数字不论是什么平台不论是什么程序不论什么语言。Unicode标准已经被这些工业界的领导们所采用例如Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys和其它许多公司。最新的标准都需要Unicode例如XML, Java, ECMAScript (JavaScript), LDAP, CORBA 3.0, WML等等并且Unicode是实现ISO/IEC 10646的正规方式。许多操作系统所有最新的浏览器和许多其他产品都支持它。Unicode标准的出现和支持它工具的存在是近来全球软件技术最重要的发展趋势。
将Unicode与客户服务器或多层应用程序和网站结合比使用传统字符集节省费用。Unicode使单一软件产品或单一网站能够贯穿多个平台语言和国家而不需要重建。它可将数据传输到许多不同的系统而无损坏。
关于Unicode学术学会
Unicode学术学会是一个非盈利的组织是为发展扩展和推广使用Unicode标准而建立的Unicode学术学会设立了现代软件产品和标准文本的表示法。学术学会的会员代表了广泛领域的计算机和资讯工业的公司和组织。学术学会只由会员提供资金。Unicode学术学会的会员资格开放给世界上任何支持Unicode标准和希望协助其扩展和执行的组织及个人。
#
Mikä Unicode on?
Unicode määrittää jokaiselle merkille yksilöllisen arvon.
Riippumatta käyttöjärjestelmästä
Riippumatta ohjelmistosta
Riippumatta kielestä
Tietokoneiden toiminta perustuu numeroiden käsittelylle. Myös kirjaimet ja muut merkit ovat tietokoneiden muistissa lukusarjoina. Ennen Unicodea oli käytössä satoja eri tapoja esittää merkkejä ja kirjaimia mutta yksikään näistä koodisivuista ei kyennyt määrittämään kovin kattavaa merkistöä. Jo yksinomaan Euroopan Unionissa käytetyille kielille tarvitaan useita erilaisia merkistöjä ja koodisivuja, eikä edes englannin kaltaisen, yksinkertaisen kielen kaikkien kirjaimien, välimerkkien ja yleisten teknisten symbolien esittäminen onnistu yhdellä näistä koodisivuista.
Lisäksi koodisivut ovat ristiriidassa keskenään. Eri koodisivuilla voidaan käyttää samaa arvoa kahdelle täysin erilaiselle merkille tai samalla merkillä voi olla eri arvo eri koodivulla. Jokaisen tietokoneen, varsinkin jos kyseessä on palvelin, pitää tukea lukuisia eri merkistöjä ja koodisivuja ja kun tietoa siirretään järjestelmien ja merkistöjen välillä on tiedon vääristymisen ja tuhoutumisen riski suuri.
Unicode on ratkaisu
Unicodessa jokaiselle merkille on määritetty yksilöllinen, riippumaton arvo. Alan johtavat yritykset, kuten Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys ja lukuisat muut toimijat käyttävät Unicodea. Unicode on sisällytetty vaatimuksena moniin nykyisiin standardeihin, kuten XML, Java, ECMAScript (JavaScript), LDAP, CORBA 3.0 ja WML. Unicode on virallinen keino toteuttaa ISO/IEC 10646 -standardi. Useat käyttöjärjestelmät, kaikki internet-selaimet ja monet muut tuotteet käyttävät Unicodea. Unicoden syntyminen ja sitä tukevien työkalujen suuri määrä on yksi viime aikojen merkittävimmistä maailmanlaajuisen ohjelmistotuotannon suuntauksista.
Unicoden käyttäminen palvelintekniikassa, vaativissa sovelluksissa ja internet-sivustoilla tuo merkittäviä kustannussäästöjä verrattuna usean eri koodisivun käytölle. Unicoden ansiosta yksittäinen sovellus tai sivusto voidaan tarjota useille eri käyttöjärjestelmille ja monilla eri kielillä maailmanlaajuisesti, ilman uudelleenohjelmoinnin tarvetta. Tieto kulkee järjestelmien välillä vailla tuhoutumisen tai vääristymisen vaaraa.
Unicode-konsortio
#
Co je Unicode?
Unicode přiřazuje každému znaku jedinečné číslo,
nezávisle na platformě,
nezávisle na programu,
nezávisle na jazyce.
Počítače, ze své podstaty, pracují pouze s čísly. Písmena a další znaky ukládají tak, že každému z nich přiřadí číslo. Před vznikem Unicode existovaly stovky rozdílných kódovacích systémů pro přiřazování těchto čísel. Žádné z těchto kódování nemohlo obsahovat dostatek znaků: například Evropská unie sama potřebuje několik různých kódování, aby pokryla všechny své jazyky. Dokonce i pro jeden jediný jazyk, jako je angličtina, nevyhovovalo žádné kódování pro všechny písmena, interpunkci a běžně používané technické symboly.
Tyto kódovací systémy také byly v konfliktu jeden s druhým. To znamená, že dvě kódování mohou používat stejné číslo pro dva různé znaky, nebo používat různá čísla pro stejný znak. Jakýkoli počítač (zvláště servery) musí podporovat mnoho různých kódování; přesto, kdykoli jsou data předávána mezi různými kódováními nebo platformami, hrozí, že tato data budou poškozena.
Unicode toto všechno mění!
Unicode přiřazuje každému znaku jedinečné číslo, nezávisle na platformě, nezávisle na programu, nezávisle na jazyku. Unicode Standard byl přijat takovými průmyslovými vůdci, jako jsou Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys a mnoha dalšími. Unicode je vyžadován moderními standardy, jako jsou XML, Java, ECMAScript (JavaScript), LDAP, CORBA 3.0, WML atd. a je oficiální formou implementace ISO/IEC 10646. Je podporován v mnoha operačních systémech, všech moderních prohlížečích a mnoha dalších produktech. To, že se objevil Unicode Standard a dostupnost nástrojů, které jej podporují, patří mezi nejvýznamnější nedávné trendy v globální technologii softwaru.
Začlenění Unicode do klient-server nebo vícevrstvých aplikací a webových stránek nabízí významné ušetření nákladů oproti dřívějším znakovým sadám. Unicode umožňuje, aby jediný softwarový produkt nebo jediná webová stránka byla zaměřena na mnoho platforem, jazyků a zemí beze změn návrhu. To dovoluje přenášet data přes mnoho různých systémů bez porušení.
#
?
יוניקוד מקצה מספר ייחודי לכל תו,
לא משנה על איזו פלטפורמה,
לא משנה באיזו תוכנית,
ולא משנה באיזו שפה.
באופן בסיסי, מחשבים עוסקים רק במספרים. הם מאחסנים אותיות ותווים אחרים על-ידי הקצאת מספר לכל אחד מהם. בטרם הומצא היוניקוד, היו מאות מערכות קידוד שונות להקצאת המספרים הללו. אף לא אחת מהן יכלה להכיל כמות תווים מספקת. לדוגמא: רק לאיחוד האירופאי נדרשים כמה סוגי קידודים שונים על מנת לכסות את כל השפות המדוברות בו. יתירה מזאת אף לשפה בודדת, כמו אנגלית למשל, לא היה די במערכת קידוד אחת בעבור כל האותיות, סימני הפיסוק והסמלים הטכניים שבשימוש שוטף.
מערכות קידוד אלו אף סותרות זו את זו. כלומר, שני קידודים יכולים להשתמש באותו מספר לשני תוים נבדלים, או להשתמש במספרים שונים לאותו תו. על כל מחשב (ובמיוחד שרתים) לתמוך במספר רב של מערכות קידוד שונות; אולם כל אימת שנתונים עוברים בין מערכות קידוד או פלטפורמות שונות קיים הסיכון שייפגמו.
יוניקוד משנה את כל זה!
יוניקוד מקצה מספר ייחודי לכל תו, ללא תלות בפלטפורמה, בתוכנית, או בשפה. תקן היוניקוד אומץ על-ידי המובילים בתעשייה כמו Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys ורבים אחרים. יוניקוד נדרש על-ידי תקנים מודרניים כמו XML, Java, ECMAScript (JavaScript), LDAP, CORBA 3.0, WML וכדומה, ומהווה למעשה את היישום הרשמי של תקן ISO/IEC 10646. הוא נתמך על ידי מערכות הפעלה רבות, כל הדפדפנים החדישים, ומוצרים רבים אחרים. הופעת תקן היוניקוד וזמינות הכלים התומכים בו נמנות עם המגמות הכלל-עולמיות החשובות ביותר, אשר מסתמנות לאחרונה בטכנולוגיית התוכנה.
שילוב יוניקוד ביישומי שרת-לקוח או ביישומים רבי-שכבות ובאתרי אינטרנט מאפשר חיסכון ניכר בעלויות לעומת השימוש בסדרות התווים המסורתיות. הודות ליוניקוד, מוצר תוכנה אחד או אתר יחיד ברשת יכול להרחיב את יעדיו למגוון פלטפורמות, ארצות ושפות ללא צורך בשינויים מרחיקים. יוניקוד מאפשר מעבר נתונים דרך מערכות רבות ושונות מבלי שייפגמו.
פרטים אודות הקונסורציום של יוניקוד (Unicode Consortium
#
Hvað er Unicode?
Unicode staðallinn úthlutar hverju skriftákni tölu,
sem er óháð tölvugerð,
sem er óháð forriti,
sem er óháð tungumáli.
Tölvur geta í eðli sínu aðeins unnið með tölur. Þær geyma bókstafi og önnur skriftákn með því að úthluta þeim tölu. Áður en Unicode kom til voru hundruð mismunandi túlkunarkerfa sem úthlutuðu þessum tölum. Ekkert eitt túlkunarkerfi gat innihaldið nægilegan fjölda skriftákna; t.d. þarfnast Evrópusambandið nokkurra mismunandi kerfa til að spanna öll tungumál þess. Jafnvel fyrir eitt tungumál, eins og ensku, var eitt túlkunarkerfi ekki nóg fyrir alla bókstafi, greinarmerki og algengustu einingatákn.
Túlkunarkerfin hafa einnig verið í andstöðu hvert við annað, þ.e. tvö kerfi geta notað sömu tölu fyrir tvö ólík skriftákn eða notað tvær mismunandi tölur fyrir sama táknið. Sérhver tölva þarf(sérstaklega miðlarar) að styðja margs konar túlkanir á stöfum; engu að síður er alltaf hætta á stafabrenglun þegar gögn fara á milli tölva og á milli mismunandi túlkunarkerfa.
Unicode breytir þessu öllu!
Unicode gefur hverju skriftákni eigin tölu sem breytist ekki eftir tölvugerð, forriti eða tungumáli. Unicode staðallinn hefur verið tekinn upp af forkólfum tölvuiðnaðarins; Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys og mörgum öðrum. Unicode er notað af nútímastöðlum eins og XML, Java, ECMAScript (JavaScript), LDAP, CORBA 3.0, WML, o.s.frv. og er hin opinbera leið til að útfæra ISO/IEC 10646. Unicode staðallinn er studdur af mörgum stýrikerfum, öllum nútímavöfrum og mörgum öðrum búnaði. Tilkoma Unicode staðalsins og búnaðar til að styðja hann eru veigamikil skref í þróun hnattrænnar tækni.
Að nota Unicode í notendamiðlurum eða í forritum og vefsíðum með mörgum notendaviðmótum býður upp á umtalsverðan sparnað, í samanburði við að nota eldri stafatöflur. Unicode leyfir einum forritapakka eða einni vefslóð að ná til margra tölvugerða, tungumála og landa án endurhönnunar. Unicode gerir gögnum kleift að ferðast gegnum mörg mismunandi kerfi án brenglunar.
Um Unicode samtökin (Consortium)
Unicode Consortium samtökin stefna ekki að hagnaði. Þau voru stofnuð til að þróa, útvíkka og koma á framfæri Unicode staðlinum, sem skilgreinir framsetningu á texta í nútímaforritapökkum og stöðlum. Meðlimir samtakanna eru fulltrúar margs konar fyrirtækja og stofnana í tölvu- og upplýsingaiðnaðinum. Samtökin hafa eingöngu tekjur af aðildargjöldum sínum. Fyrirtæki, stofnanir og einstaklingar hvar sem er í heiminum sem ó
#
!@()*&(&@(@*(@*)(@*$$$%^^
#
+=-=`'
#

View File

@ -1,6 +1,6 @@
#****************************************************************************** #******************************************************************************
# #
# Copyright (C) 1999-2007, International Business Machines # Copyright (C) 1999-2008, International Business Machines
# Corporation and others. All Rights Reserved. # Corporation and others. All Rights Reserved.
# #
#****************************************************************************** #******************************************************************************
@ -51,7 +51,7 @@ DEFS += -DU_TOOLUTIL_IMPLEMENTATION
LDFLAGS += $(LDFLAGSICUTOOLUTIL) LDFLAGS += $(LDFLAGSICUTOOLUTIL)
LIBS = $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) LIBS = $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS)
OBJECTS = filestrm.o package.o pkgitems.o propsvec.o swapimpl.o toolutil.o unewdata.o \ OBJECTS = filestrm.o package.o pkgitems.o swapimpl.o toolutil.o unewdata.o \
ucm.o ucmstate.o uoptions.o uparse.o \ ucm.o ucmstate.o uoptions.o uparse.o \
ucbuf.o xmlparser.o writesrc.o ucbuf.o xmlparser.o writesrc.o