ICU-2235 add swapping of Unicode character names (unames.icu)

X-SVN-Rev: 13114
This commit is contained in:
Markus Scherer 2003-09-16 00:30:57 +00:00
parent add1ce4e34
commit 4a782de375
3 changed files with 367 additions and 1 deletions

View File

@ -29,6 +29,7 @@
#include "cstring.h"
#include "ucln_cmn.h"
#include "uprops.h"
#include "udataswp.h"
/* prototypes ------------------------------------------------------------- */
@ -1778,6 +1779,361 @@ uprv_getISOCommentCharacters(USet* set) {
}
#endif
/* data swapping ------------------------------------------------------------ */
/*
 * Token table entries come in three flavors:
 *   >=0  the byte is a token (single-byte token),
 *   -1   the byte stands for itself in the data file's charset ("direct" byte),
 *   -2   the byte is a lead byte of a two-byte token.
 *
 * Direct bytes have to be converted from the input charset family to the
 * output charset family, which moves them to different byte values.
 * makeTokenMap() fills map[] with a permutation of byte values that
 * accounts for this: direct bytes go to their converted value, and all
 * other byte values below tokenCount are assigned the output values that
 * the direct bytes left free.
 *
 * Call it once for the single-/lead-byte tokens and once more for all
 * trail byte tokens. (';' is an unused trail byte marked with -1.)
 *
 * If *pErrorCode already indicates a failure, nothing is done.
 * If a direct byte cannot be converted, the error is reported via
 * udata_printError() and map[] is left partially filled.
 */
static void
makeTokenMap(const UDataSwapper *ds,
             int16_t tokens[], uint16_t tokenCount,
             uint8_t map[256],
             UErrorCode *pErrorCode) {
    UBool outByteTaken[256];
    uint16_t idx, nextFree;
    uint8_t inByte, outByte;

    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    if(ds->inCharset==ds->outCharset) {
        /* no charset conversion: every byte value keeps its place */
        idx=0;
        while(idx<256) {
            map[idx]=(uint8_t)idx;
            ++idx;
        }
        return;
    }

    uprv_memset(map, 0, 256);
    uprv_memset(outByteTaken, 0, 256);

    /* only one byte's worth of table entries is relevant per map */
    if(tokenCount>256) {
        tokenCount=256;
    }

    /* pass 1: place the direct bytes (byte 0 always maps to itself) */
    for(idx=1; idx<tokenCount; ++idx) {
        if(tokens[idx]!=-1) {
            continue;
        }

        /* convert the direct byte character */
        inByte=(uint8_t)idx;
        ds->swapInvChars(ds, &inByte, 1, &outByte, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d) - %s\n",
                             idx, ds->inCharset, u_errorName(*pErrorCode));
            return;
        }

        /* record the converted value and reserve it in the output space */
        map[inByte]=outByte;
        outByteTaken[outByte]=TRUE;
    }

    /* pass 2: give every still-unmapped byte the next unreserved output value */
    nextFree=1;
    for(idx=1; idx<tokenCount; ++idx) {
        if(map[idx]!=0) {
            continue; /* already set as a direct byte */
        }
        while(outByteTaken[nextFree]) {
            ++nextFree;
        }
        map[idx]=(uint8_t)nextFree;
        ++nextFree;
    }

    /*
     * Entries at tokenCount and above stay unset when tokenCount<256
     * because they won't be used.
     */
}
/**
 * Swap a unames.icu data image between byte orders and/or charset families.
 *
 * Layout as processed here (all offsets relative to the end of the header):
 *   - four uint32_t offsets: token strings, group table, group strings,
 *     algorithmic names
 *   - uint16_t tokenCount followed by tokenCount 16-bit tokens
 *   - token strings (invariant characters)
 *   - group table: uint16_t count followed by 3 uint16_t per group
 *   - group strings: nibble-encoded lengths followed by name bytes
 *   - algorithmic ranges: uint32_t count followed by variable-size ranges
 *
 * @param ds         swapper providing the read/swap functions and the
 *                   input/output charset families
 * @param inData     input data, beginning with the data header
 * @param length     number of bytes at inData; if negative, the function
 *                   only preflights (computes the size, writes nothing
 *                   beyond what udata_swapDataHeader() does)
 * @param outData    output buffer; may be the same as inData (the token
 *                   table goes through a temporary array for that reason)
 * @param pErrorCode in/out error code
 * @return header size plus the size of the names data through the last
 *         algorithmic range, or 0 if an error occurred
 */
U_CAPI int32_t U_EXPORT2
uchar_swapNames(const UDataSwapper *ds,
                const void *inData, int32_t length, void *outData,
                UErrorCode *pErrorCode) {
    const UDataInfo *pInfo;
    int32_t headerSize;

    const uint8_t *inBytes;
    uint8_t *outBytes;

    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
             offset, i, count, stringsCount;

    const AlgorithmicRange *inRange;
    AlgorithmicRange *outRange;

    /* udata_swapDataHeader checks the arguments */
    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* check data format and format version */
    pInfo=(const UDataInfo *)((const char *)inData+4);
    if(!(
        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
        pInfo->dataFormat[1]==0x6e &&
        pInfo->dataFormat[2]==0x61 &&
        pInfo->dataFormat[3]==0x6d &&
        pInfo->formatVersion[0]==1
    )) {
        udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0]);
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return 0;
    }

    inBytes=(const uint8_t *)inData+headerSize;
    outBytes=(uint8_t *)outData+headerSize;
    if(length<0) {
        algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
    } else {
        /* need at least the 4 offsets and the algorithmic range count */
        length-=headerSize;
        if( length<20 ||
            (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
        ) {
            udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
                             length);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    if(length<0) {
        /* preflighting: iterate through algorithmic ranges */
        offset=algNamesOffset;
        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
        offset+=4;

        for(i=0; i<count; ++i) {
            inRange=(const AlgorithmicRange *)(inBytes+offset);
            offset+=ds->readUInt16(inRange->size);
        }
    } else {
        /* swap data */
        const uint16_t *p;
        uint16_t *q, *temp;

        int16_t tokens[512];
        uint16_t tokenCount;

        uint8_t map[256], trailMap[256];

        /*
         * Copy everything first when not swapping in place.
         * Several byte ranges are never written by the swapping code below:
         * the nibble-encoded group string lengths, padding bytes, and
         * all of the group strings when the charset family does not change.
         * Without this copy they would be missing from the output.
         */
        if(inBytes!=outBytes) {
            uprv_memcpy(outBytes, inBytes, length);
        }

        /* the initial 4 offsets first */
        tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
        groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
        groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
        ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);

        /*
         * now the tokens table
         * it needs to be permuted along with the compressed name strings
         */
        p=(const uint16_t *)(inBytes+16);
        q=(uint16_t *)(outBytes+16);

        /* read and swap the tokenCount */
        tokenCount=ds->readUInt16(*p);
        ds->swapArray16(ds, p, 2, q, pErrorCode);
        ++p;
        ++q;

        /* read the first 512 tokens and make the token maps */
        if(tokenCount<=512) {
            count=tokenCount;
        } else {
            count=512;
        }
        for(i=0; i<count; ++i) {
            tokens[i]=udata_readInt16(ds, p[i]);
        }
        for(; i<512; ++i) {
            tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
        }
        makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
        makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return 0;
        }

        /*
         * swap and permute the tokens
         * go through a temporary array to support in-place swapping
         */
        temp=(uint16_t *)uprv_malloc(tokenCount*2);
        if(temp==NULL) {
            udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
                             tokenCount);
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }

        /* swap and permute single-/lead-byte tokens */
        for(i=0; i<tokenCount && i<256; ++i) {
            ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
        }

        /* swap and permute trail-byte tokens: keep the lead-byte row, permute within it */
        for(; i<tokenCount; ++i) {
            ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
        }

        /* copy the result into the output and free the temporary array */
        uprv_memcpy(q, temp, tokenCount*2);
        uprv_free(temp);

        /* swap the token strings */
        count=groupsOffset-tokenStringOffset;
        if(count>0 && inBytes[groupsOffset-1]!=0) {
            /*
             * do not swap a possible padding byte after
             * the terminating NUL of the last string
             */
            --count;
        }
        ds->swapInvChars(ds, inBytes+tokenStringOffset, (int32_t)count,
                         outBytes+tokenStringOffset, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            udata_printError(ds, "uchar_swapNames(token strings) failed - %s\n",
                             u_errorName(*pErrorCode));
            return 0;
        }

        /*
         * swap the group table
         * the group count is stored in the input's byte order,
         * so it must be read through the swapper
         */
        count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
        ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
                        outBytes+groupsOffset, pErrorCode);

        /*
         * swap the group strings
         * swap the string bytes but not the nibble-encoded string lengths
         */
        if(ds->inCharset!=ds->outCharset) {
            uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];

            const uint8_t *inStrings, *nextInStrings;
            uint8_t *outStrings;

            uint8_t c;

            inStrings=inBytes+groupStringOffset;
            outStrings=outBytes+groupStringOffset;

            stringsCount=algNamesOffset-groupStringOffset;

            /* iterate through string groups until only a few padding bytes are left */
            while(stringsCount>32) {
                nextInStrings=expandGroupLengths(inStrings, offsets, lengths);

                /* move past the length bytes */
                stringsCount-=(uint32_t)(nextInStrings-inStrings);
                outStrings+=nextInStrings-inStrings;
                inStrings=nextInStrings;

                count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
                stringsCount-=count;

                /* swap the string bytes using map[] and trailMap[] */
                while(count>0) {
                    c=*inStrings++;
                    *outStrings++=map[c];
                    if(tokens[c]!=-2) {
                        --count;
                    } else {
                        /* token lead byte: swap the trail byte, too */
                        *outStrings++=trailMap[*inStrings++];
                        count-=2;
                    }
                }
            }
        }

        /* swap the algorithmic ranges */
        offset=algNamesOffset;
        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
        ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
        offset+=4;

        for(i=0; i<count; ++i) {
            if(offset>(uint32_t)length) {
                udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
                                 length, i);
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }

            inRange=(const AlgorithmicRange *)(inBytes+offset);
            outRange=(AlgorithmicRange *)(outBytes+offset);
            /* advance to the next range before swapping, in case of in-place swapping */
            offset+=ds->readUInt16(inRange->size);

            /* the first 8 bytes are swapped as 32-bit units, then the 16-bit size field */
            ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
            ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
            switch(inRange->type) {
            case 0:
                /* swap prefix string */
                ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
                                 outRange+1, pErrorCode);
                if(U_FAILURE(*pErrorCode)) {
                    udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed - %s\n",
                                     i, u_errorName(*pErrorCode));
                    return 0;
                }
                break;
            case 1:
                {
                    /* swap factors and the prefix and factor strings */
                    uint16_t factors[8];
                    uint32_t j, factorsCount;

                    factorsCount=inRange->variant;
                    if(factorsCount==0 || factorsCount>LENGTHOF(factors)) {
                        udata_printError(ds, "uchar_swapNames(): too many factors (%u) in algorithmic range %u\n",
                                         factorsCount, i);
                        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                        return 0;
                    }

                    /* read and swap the factors */
                    p=(const uint16_t *)(inRange+1);
                    q=(uint16_t *)(outRange+1);
                    for(j=0; j<factorsCount; ++j) {
                        factors[j]=ds->readUInt16(p[j]);
                    }
                    ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);

                    /* swap the strings, up to the last terminating NUL */
                    p+=factorsCount;
                    q+=factorsCount;
                    stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
                    while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
                        --stringsCount;
                    }
                    ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
                }
                break;
            default:
                udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
                                 inRange->type, i);
                *pErrorCode=U_UNSUPPORTED_ERROR;
                return 0;
            }
        }
    }

    return headerSize+(int32_t)offset;
}
/*
* Hey, Emacs, please set the following:
*

View File

@ -364,4 +364,13 @@ uprops_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
/**
* Swap the ICU Unicode character names file. See uchar.c.
*
* @param ds swapper that supplies the byte-order and charset-family conversion functions
* @param inData input data, starting with the data header
* @param length number of bytes at inData; a negative value requests preflighting
*               (the required size is computed without swapping the names data)
* @param outData output buffer for the swapped data
* @param pErrorCode in/out error code; nothing is swapped if it already indicates a failure
* @return the size of the header plus the names data, or 0 on failure
* @internal
*/
U_CAPI int32_t U_EXPORT2
uchar_swapNames(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
#endif

View File

@ -256,11 +256,12 @@ static const struct {
UDataSwapFn *swapFn;
} swapFns[]={
{ { 0x52, 0x65, 0x73, 0x42 }, ures_swap }, /* dataFormat="ResB" */
{ { 0x43, 0x76, 0x41, 0x6c }, ucnv_swapAliases }, /* dataFormat="CvAl" */
/* insert data formats here, descending by expected frequency of occurrence */
{ { 0x55, 0x50, 0x72, 0x6f }, uprops_swap }, /* dataFormat="UPro" */
{ { 0x55, 0x43, 0x6f, 0x6c }, ucol_swap }, /* dataFormat="UCol" */
{ { 0x49, 0x6e, 0x76, 0x43 }, ucol_swapInverseUCA },/* dataFormat="InvC" */
{ { 0x43, 0x76, 0x41, 0x6c }, ucnv_swapAliases } /* dataFormat="CvAl" */
{ { 0x75, 0x6e, 0x61, 0x6d }, uchar_swapNames } /* dataFormat="unam" */
};
static int32_t