ICU-484 add unicodeMask to UConverterStaticData for optimized implementations

X-SVN-Rev: 3280
This commit is contained in:
Markus Scherer 2000-12-19 23:07:50 +00:00
parent b2b6812d1e
commit 998f792a5a
11 changed files with 657 additions and 84 deletions

View File

@ -376,7 +376,8 @@ const UConverterStaticData _ISO2022StaticData={
1,
FALSE,
FALSE,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} /* reserved */
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};

View File

@ -64,6 +64,10 @@ typedef union UConverterTable UConverterTable;
struct UConverterImpl;
typedef struct UConverterImpl UConverterImpl;
/**
 * values for the unicodeMask
 * (bit flags stored in UConverterStaticData.unicodeMask; they may be combined with |)
 */
#define UCNV_HAS_SUPPLEMENTARY 1 /* bit 0: the mapping table contains supplementary code points (>=U+10000) */
#define UCNV_HAS_SURROGATES 2 /* bit 1: the mapping table contains single surrogate code points */
typedef struct UConverterStaticData { /* +offset: size */
uint32_t structSize; /* +0: 4 Size of this structure */
@ -83,7 +87,8 @@ typedef struct UConverterStaticData { /* +offset: size */
uint8_t hasToUnicodeFallback; /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
uint8_t hasFromUnicodeFallback; /* +78: 1 */
uint8_t reserved[21]; /* +79: 21 to round out the structure */
uint8_t unicodeMask; /* +79: 1 bit 0: has supplementary bit 1: has single surrogates */
uint8_t reserved[20]; /* +80: 20 to round out the structure */
/* total size: 100 */
} UConverterStaticData;

View File

@ -518,7 +518,7 @@ const UConverterStaticData _LMBCSStaticData##n={\
sizeof(UConverterStaticData),\
"LMBCS-" #n,\
0, UCNV_IBM, UCNV_LMBCS_##n, 1, 1,\
{ 0x3f, 0, 0, 0 },1,FALSE,FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} \
{ 0x3f, 0, 0, 0 },1,FALSE,FALSE,0,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} \
};\
const UConverterSharedData _LMBCSData##n={\
sizeof(UConverterSharedData), ~((uint32_t) 0),\

View File

@ -784,7 +784,9 @@ const UConverterStaticData _UTF8StaticData={
sizeof(UConverterStaticData),
"UTF8",
1208, UCNV_IBM, UCNV_UTF8, 1, 4,
{ 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
{ 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
@ -964,7 +966,9 @@ const UConverterStaticData _UTF16BEStaticData={
sizeof(UConverterStaticData),
"UTF16_BigEndian",
1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
{ 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
{ 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
@ -1154,7 +1158,9 @@ const UConverterStaticData _UTF16LEStaticData={
sizeof(UConverterStaticData),
"UTF16_LittleEndian",
1200, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
{ 0xfd, 0xff, 0, 0 },2,0,0,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
{ 0xfd, 0xff, 0, 0 },2,0,0,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
@ -1382,7 +1388,8 @@ const UConverterStaticData _UTF32BEStaticData = {
0, /* Should be the UTF-32 CCSID */
UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
{ 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
const UConverterSharedData _UTF32BEData = {
@ -1610,7 +1617,8 @@ const UConverterStaticData _UTF32LEStaticData = {
0, /* Should be the UTF-32 CCSID */
UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
{ 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};

View File

@ -85,7 +85,8 @@ const UConverterStaticData _HZStaticData={
"HZ",
2023, UCNV_IBM, UCNV_HZ, 1, 4,
{ 0x1a, 0, 0, 0 },1, FALSE, FALSE,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} /* reserved */
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};

View File

@ -195,10 +195,12 @@ static const UConverterImpl _Latin1Impl={
};
const UConverterStaticData _Latin1StaticData={
sizeof(UConverterStaticData),
"LATIN_1",
sizeof(UConverterStaticData),
"LATIN_1",
819, UCNV_IBM, UCNV_LATIN_1, 1, 1,
{ 0x1a, 0, 0, 0 },1,FALSE, FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
{ 0x1a, 0, 0, 0 },1,FALSE, FALSE,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};

View File

@ -33,6 +33,7 @@
#include "unicode/utypes.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_cb.h"
#include "unicode/udata.h"
#include "ucnv_bld.h"
#include "ucnvmbcs.h"
#include "ucnv_cnv.h"
@ -173,6 +174,10 @@ U_CFUNC void
_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode);
U_CFUNC void
_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode);
U_CFUNC UChar32
_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode);
@ -185,6 +190,10 @@ U_CFUNC void
_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
UErrorCode *pErrorCode);
U_CFUNC void
_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
UErrorCode *pErrorCode);
static void
fromUCallback(UConverter *cnv,
void *context, UConverterFromUnicodeArgs *pArgs,
@ -238,6 +247,7 @@ U_CFUNC void
_MBCSLoad(UConverterSharedData *sharedData,
const uint8_t *raw,
UErrorCode *pErrorCode) {
UDataInfo info;
UConverterMBCSTable *mbcsTable=&sharedData->table->mbcs;
_MBCSHeader *header=(_MBCSHeader *)raw;
@ -255,6 +265,20 @@ _MBCSLoad(UConverterSharedData *sharedData,
mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
mbcsTable->outputType=(uint8_t)header->flags;
/*
* converter versions 6.1 and up contain a unicodeMask that is
* used here to select the most efficient function implementations
*/
info.size=sizeof(UDataInfo);
udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
if(info.formatVersion[0]>6 || info.formatVersion[0]==6 && info.formatVersion[1]>=1) {
/* mask off possible future extensions to be safe */
mbcsTable->unicodeMask=sharedData->staticData->unicodeMask&3;
} else {
/* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
}
}
U_CFUNC void
@ -338,7 +362,11 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
/* use optimized function if possible */
cnv=pArgs->converter;
if(cnv->sharedData->table->mbcs.countStates==1) {
_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
if(!(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
} else {
_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
}
return;
}
@ -669,16 +697,16 @@ callback:
*/
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
break;
} else if(cnv->UCharErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
} else if(U_FAILURE(*pErrorCode)) {
/* break on error */
offset=0;
state=0;
byteIndex=0;
break;
} else if(cnv->UCharErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
}
/*
@ -717,7 +745,7 @@ endloop:
pArgs->offsets=offsets;
}
/* This version of _MBCSToUnicode() is optimized for single-byte, single-state codepages. */
/* This version of _MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
U_CFUNC void
_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
@ -875,13 +903,13 @@ callback:
*/
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
break;
} else if(U_FAILURE(*pErrorCode)) {
/* break on error */
break;
} else if(cnv->UCharErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
} else if(U_FAILURE(*pErrorCode)) {
/* break on error */
break;
}
/*
@ -903,6 +931,175 @@ endloop:
pArgs->offsets=offsets;
}
/*
 * This version of _MBCSSingleToUnicodeWithOffsets() is optimized for
 * single-byte, single-state codepages that only map to and from the BMP.
 * In addition to single-byte optimizations, the offset calculations
 * become much easier: every source byte yields at most one UChar, so
 * offsets are written in bulk instead of per output unit.
 *
 * pArgs      - in/out conversion arguments; source, target and offsets are
 *              read on entry and written back (advanced) on return.
 * pErrorCode - set to U_ILLEGAL_CHAR_FOUND or U_INVALID_CHAR_FOUND before the
 *              to-Unicode callback is invoked for a byte that cannot be
 *              mapped (the callback may change it), and to
 *              U_BUFFER_OVERFLOW_ERROR when input remains but the target is full.
 */
U_CFUNC void
_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
                            UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit, *lastSource;
    UChar *target;
    int32_t targetCapacity, length;
    int32_t *offsets;
    const int32_t (*stateTable)[256];
    int32_t sourceIndex;
    int32_t entry;
    uint8_t b;
    UConverterCallbackReason reason;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetCapacity=pArgs->targetLimit-pArgs->target;
    offsets=pArgs->offsets;
    stateTable=cnv->sharedData->table->mbcs.stateTable;

    /*
     * single-byte codepage: no character is carried over between buffers
     * in this function, so the index of the first byte is always 0
     */
    sourceIndex=0;
    /* lastSource marks the first byte for which no offset has been written yet */
    lastSource=source;

    /*
     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
     * for the minimum of the sourceLength and targetCapacity
     */
    length=sourceLimit-source;
    if(length<targetCapacity) {
        targetCapacity=length;
    }

    /* conversion loop */
    while(targetCapacity>0) {
        b=*source++;
        entry=stateTable[0][b];
        /* entry<0 */
        /*
         * bit 31 is set, bits:
         * 30..27 action code
         * (do not mask out bit 31 for speed, include it in action values)
         * 26..7 depend on the action code
         * 6..0 next state
         */
        /* switch per action code */
        switch((uint32_t)entry>>27U) {
        case 16|MBCS_STATE_ILLEGAL:
            /* bits 26..7 are not used, 0 */
            /* callback(illegal) */
            reason=UCNV_ILLEGAL;
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            break;
        case 16|MBCS_STATE_UNASSIGNED:
            /* bits 26..7 are not used, 0 */
            /* callback(unassigned) */
            reason=UCNV_UNASSIGNED;
            *pErrorCode=U_INVALID_CHAR_FOUND;
            break;
        case 16|MBCS_STATE_FALLBACK_DIRECT_16:
            /* bits 26..23 are not used, 0 */
            /* bits 22..7 contain the Unicode BMP code point */
            if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
                /* callback(unassigned) */
                reason=UCNV_UNASSIGNED;
                *pErrorCode=U_INVALID_CHAR_FOUND;
                break;
            }
            /* fall through to the MBCS_STATE_VALID_DIRECT_16 branch */
        case 16|MBCS_STATE_VALID_DIRECT_16:
            /* bits 26..23 are not used, 0 */
            /* bits 22..7 contain the Unicode BMP code point */
            /* output BMP code point */
            *target++=(UChar)(entry>>7);
            --targetCapacity;
            continue;
        default:
            /* reserved, must never occur */
            /* bits 26..7 are not used, 0 */
            /*
             * The byte was consumed without producing output, so the
             * combined counter may now exceed the remaining source length.
             * Re-clamp it so that a malformed state table cannot make
             * the loop read past sourceLimit.
             */
            length=sourceLimit-source;
            if(length<targetCapacity) {
                targetCapacity=length;
            }
            continue;
        }

        /* call the callback function with all the preparations and post-processing */
        /* set offsets since the start or the last callback */
        if(offsets!=NULL) {
            int32_t count=(int32_t)(source-lastSource);

            /* predecrement: do not set the offset for the callback-causing character */
            while(--count>0) {
                *offsets++=sourceIndex++;
            }
            /* offset and sourceIndex are now set for the current character */
        }

        /* update the arguments structure */
        pArgs->source=(const char *)source;
        pArgs->target=target;
        pArgs->offsets=offsets;

        /* copy the current bytes to invalidCharBuffer */
        cnv->invalidCharBuffer[0]=b;
        cnv->invalidCharLength=1;

        /* call the callback function */
        toUCallback(cnv, cnv->toUContext, pArgs, (const char *)&b, 1, reason, pErrorCode);

        /* update target and deal with offsets if necessary */
        offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
        target=pArgs->target;

        /* update the source pointer and index */
        sourceIndex+=1+((const uint8_t *)pArgs->source-source);
        source=lastSource=(const uint8_t *)pArgs->source;

        /* the callback may have moved source and target: recompute the combined counter */
        targetCapacity=pArgs->targetLimit-target;
        length=sourceLimit-source;
        if(length<targetCapacity) {
            targetCapacity=length;
        }

        /*
         * If the callback overflowed the target, then we need to
         * stop here with an overflow indication.
         */
        if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
            break;
        } else if(U_FAILURE(*pErrorCode)) {
            /* break on error */
            break;
        } else if(cnv->UCharErrorBufferLength>0) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
        /* target is full */
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }

    /* set offsets since the start or the last callback */
    if(offsets!=NULL) {
        size_t count=source-lastSource;
        while(count>0) {
            *offsets++=sourceIndex++;
            --count;
        }
    }

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
}
U_CFUNC UChar32
_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
@ -1521,7 +1718,11 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
cnv=pArgs->converter;
outputType=cnv->sharedData->table->mbcs.outputType;
if(outputType==MBCS_OUTPUT_1) {
_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
if(!(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
} else {
_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
}
return;
}
@ -1917,14 +2118,14 @@ callback:
*/
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
break;
} else if(cnv->charErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
} else if(U_FAILURE(*pErrorCode)) {
/* break on error */
c=0;
break;
} else if(cnv->charErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
}
/*
@ -1942,7 +2143,7 @@ callback:
if(pArgs->flush && source>=sourceLimit) {
/* reset the state for the next conversion */
if(c!=0 && U_SUCCESS(*pErrorCode)) {
/* a character byte sequence remains incomplete */
/* a Unicode code point remains incomplete (only a first surrogate) */
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
}
cnv->fromUSurrogateLead=0;
@ -1969,7 +2170,6 @@ _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
const uint16_t *table;
const uint8_t *bytes;
uint8_t outputType;
UChar32 c;
@ -1977,7 +2177,7 @@ _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
UConverterCallbackReason reason;
uint32_t i;
uint32_t value;
uint8_t value;
/* set up the local pointers */
cnv=pArgs->converter;
@ -1989,7 +2189,6 @@ _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
table=cnv->sharedData->table->mbcs.fromUnicodeTable;
bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes;
outputType=cnv->sharedData->table->mbcs.outputType;
/* get the converter state from UConverter */
c=cnv->fromUSurrogateLead;
@ -2064,7 +2263,21 @@ getTrail:
value=*p;
/* is the codepage value really an "unassigned" indicator? */
if(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0) {
if(!(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0)) {
/* assigned, write the output character bytes from value and length */
/* length==1 */
/* this is easy because we know that there is enough space */
*target++=value;
if(offsets!=NULL) {
*offsets++=sourceIndex;
}
--targetCapacity;
/* normal end of conversion: prepare for a new character */
c=0;
sourceIndex=nextSourceIndex;
continue;
} else { /* unassigned */
/*
* We allow a 0 byte output if the Unicode code point is
* U+0000 and also if the "assigned" bit is set for this entry.
@ -2074,29 +2287,13 @@ getTrail:
/* callback(unassigned) */
reason=UCNV_UNASSIGNED;
*pErrorCode=U_INVALID_CHAR_FOUND;
goto callback;
}
} else {
/* callback(unassigned) */
reason=UCNV_UNASSIGNED;
*pErrorCode=U_INVALID_CHAR_FOUND;
goto callback;
}
/* write the output character bytes from value and length */
/* length==1 */
/* this is easy because we know that there is enough space */
*target++=(uint8_t)value;
if(offsets!=NULL) {
*offsets++=sourceIndex;
}
--targetCapacity;
/* normal end of conversion: prepare for a new character */
c=0;
sourceIndex=nextSourceIndex;
continue;
callback:
/* call the callback function with all the preparations and post-processing */
/* update the arguments structure */
@ -2133,14 +2330,14 @@ callback:
*/
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
break;
} else if(cnv->charErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
} else if(U_FAILURE(*pErrorCode)) {
/* break on error */
c=0;
break;
} else if(cnv->charErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
}
/*
@ -2158,7 +2355,242 @@ callback:
if(pArgs->flush && source>=sourceLimit) {
/* reset the state for the next conversion */
if(c!=0 && U_SUCCESS(*pErrorCode)) {
/* a character byte sequence remains incomplete */
/* a Unicode code point remains incomplete (only a first surrogate) */
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
}
cnv->fromUSurrogateLead=0;
} else {
/* set the converter state back into UConverter */
cnv->fromUSurrogateLead=(UChar)c;
}
/* write back the updated pointers */
pArgs->source=source;
pArgs->target=(char *)target;
pArgs->offsets=offsets;
}
/*
* This version of _MBCSFromUnicode() is optimized for single-byte codepages
* that map only to and from the BMP.
* In addition to single-byte/state optimizations, the offset calculations
* become much easier.
*/
U_CFUNC void
_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
UConverter *cnv;
const UChar *source, *sourceLimit, *lastSource;
uint8_t *target;
int32_t targetCapacity, length;
int32_t *offsets;
const uint16_t *table;
const uint8_t *bytes;
UChar32 c;
int32_t sourceIndex;
UConverterCallbackReason reason;
uint32_t i;
uint8_t value;
/* set up the local pointers */
cnv=pArgs->converter;
source=pArgs->source;
sourceLimit=pArgs->sourceLimit;
target=(uint8_t *)pArgs->target;
targetCapacity=pArgs->targetLimit-pArgs->target;
offsets=pArgs->offsets;
table=cnv->sharedData->table->mbcs.fromUnicodeTable;
bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes;
/* get the converter state from UConverter */
c=cnv->fromUSurrogateLead;
/* sourceIndex=-1 if the current character began in the previous buffer */
sourceIndex= c==0 ? 0 : -1;
lastSource=source;
/*
* since the conversion here is 1:1 UChar:uint8_t, we need only one counter
* for the minimum of the sourceLength and targetCapacity
*/
length=sourceLimit-source;
if(length<targetCapacity) {
targetCapacity=length;
}
/* conversion loop */
if(c!=0 && targetCapacity>0) {
goto getTrail;
}
while(targetCapacity>0) {
/*
* Get a correct Unicode code point:
* a single UChar for a BMP code point or
* a matched surrogate pair for a "surrogate code point".
*/
c=*source++;
if(!UTF_IS_SURROGATE(c)) {
/* convert the Unicode code point in c into codepage bytes */
i=0x440+2*((uint32_t)table[c>>10]+((c>>4)&0x3f));
/* is this code point assigned, or do we use fallbacks? */
if((table[i++]&(1<<(c&0xf)))!=0 || UCNV_FROM_U_USE_FALLBACK(cnv, c)) {
const uint8_t *p=bytes;
/* MBCS_OUTPUT_1 */
p+=(16*(uint32_t)table[i]+(c&0xf));
value=*p;
/* is the codepage value really an "unassigned" indicator? */
if(!(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0)) {
/* assigned, write the output character bytes from value and length */
/* length==1 */
/* this is easy because we know that there is enough space */
*target++=value;
--targetCapacity;
/* normal end of conversion: prepare for a new character */
c=0;
continue;
} else { /* unassigned */
/*
* We allow a 0 byte output if the Unicode code point is
* U+0000 and also if the "assigned" bit is set for this entry.
* There is no way with this data structure for fallback output
* for other than U+0000 to be a zero byte.
*/
/* callback(unassigned) */
reason=UCNV_UNASSIGNED;
*pErrorCode=U_INVALID_CHAR_FOUND;
}
} else {
/* callback(unassigned) */
reason=UCNV_UNASSIGNED;
*pErrorCode=U_INVALID_CHAR_FOUND;
}
} else {
if(UTF_IS_SURROGATE_FIRST(c)) {
getTrail:
if(source<sourceLimit) {
/* test the following code unit */
UChar trail=*source;
if(UTF_IS_SECOND_SURROGATE(trail)) {
++source;
c=UTF16_GET_PAIR_VALUE(c, trail);
/* this codepage does not map supplementary code points */
/* callback(unassigned) */
reason=UCNV_UNASSIGNED;
*pErrorCode=U_INVALID_CHAR_FOUND;
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
}
} else {
/* no more input */
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
}
}
/* call the callback function with all the preparations and post-processing */
/* get the number of code units for c to correctly advance sourceIndex after the callback call */
length=UTF_CHAR_LENGTH(c);
/* set offsets since the start or the last callback */
if(offsets!=NULL) {
int32_t count=(int32_t)(source-lastSource);
/* do not set the offset for the callback-causing character */
count-=length;
while(count>0) {
*offsets++=sourceIndex++;
--count;
}
/* offset and sourceIndex are now set for the current character */
}
/* update the arguments structure */
pArgs->source=source;
pArgs->target=(char *)target;
pArgs->offsets=offsets;
/* set the converter state in UConverter to deal with the next character */
cnv->fromUSurrogateLead=0;
/* write the code point as code units */
i=0;
UTF_APPEND_CHAR_UNSAFE(cnv->invalidUCharBuffer, i, c);
cnv->invalidUCharLength=(int8_t)i;
/* i==length */
/* call the callback function */
fromUCallback(cnv, cnv->fromUContext, pArgs, cnv->invalidUCharBuffer, i, c, reason, pErrorCode);
/* get the converter state from UConverter */
c=cnv->fromUSurrogateLead;
/* update target and deal with offsets if necessary */
offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
target=(uint8_t *)pArgs->target;
/* update the source pointer and index */
sourceIndex+=length+(pArgs->source-source);
source=lastSource=pArgs->source;
targetCapacity=(uint8_t *)pArgs->targetLimit-target;
length=sourceLimit-source;
if(length<targetCapacity) {
targetCapacity=length;
}
/*
* If the callback overflowed the target, then we need to
* stop here with an overflow indication.
*/
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
break;
} else if(U_FAILURE(*pErrorCode)) {
/* break on error */
c=0;
break;
} else if(cnv->charErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
}
}
if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
/* set offsets since the start or the last callback */
if(offsets!=NULL) {
size_t count=source-lastSource;
while(count>0) {
*offsets++=sourceIndex++;
--count;
}
}
if(pArgs->flush && source>=sourceLimit) {
/* reset the state for the next conversion */
if(c!=0 && U_SUCCESS(*pErrorCode)) {
/* a Unicode code point remains incomplete (only a first surrogate) */
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
}
cnv->fromUSurrogateLead=0;
@ -2295,21 +2727,54 @@ _MBCSFromUChar32(UConverterSharedData *sharedData,
}
/* is the codepage value really an "unassigned" indicator? */
if(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0) {
/*
* We allow a 0 byte output if the Unicode code point is
* U+0000 and also if the "assigned" bit is set for this entry.
* There is no way with this data structure for fallback output
* for other than U+0000 to be a zero byte.
*/
return 0;
} else {
/*
* We allow a 0 byte output if the Unicode code point is
* U+0000 and also if the "assigned" bit is set for this entry.
* There is no way with this data structure for fallback output
* for other than U+0000 to be a zero byte.
*/
if(!(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0)) {
/* assigned */
*pValue=value;
return length;
}
} else {
return 0;
}
return 0;
}
/*
 * Single-byte lookup of one Unicode code point in the from-Unicode table.
 * Returns the codepage byte (0..0xff) for c, or -1 if c is unassigned
 * (or is only a fallback mapping while fallbacks are not to be used).
 * Like _MBCSFromUChar32(), this does not check the range of c.
 */
U_CFUNC int32_t
_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
                       UChar32 c,
                       UBool useFallback) {
    const uint16_t *stage=sharedData->table->mbcs.fromUnicodeTable;
    const uint8_t *results=sharedData->table->mbcs.fromUnicodeBytes;
    /* index of the entry pair for c (same computation as in _MBCSFromUnicodeWithOffsets) */
    const uint32_t pairIndex=0x440+2*((uint32_t)stage[c>>10]+((c>>4)&0x3f));
    /* the first item of the pair holds one "assigned" flag per code point in the block of 16 */
    const UBool isAssigned=(UBool)((stage[pairIndex]&(1<<(c&0xf)))!=0);
    int32_t byteValue;

    /* neither a regular mapping nor an acceptable fallback */
    if(!isAssigned && !FROM_U_USE_FALLBACK(useFallback, c)) {
        return -1;
    }

    /* MBCS_OUTPUT_1: the second item of the pair selects a block of 16 result bytes */
    byteValue=results[16*(uint32_t)stage[pairIndex+1]+(c&0xf)];

    /*
     * A 0 byte is a real result only for U+0000 or when the "assigned" flag
     * is set; otherwise it is the "unassigned" indicator.
     * With this data structure, fallback output for anything other than
     * U+0000 can never be a zero byte.
     */
    if(byteValue==0 && c!=0 && !isAssigned) {
        return -1;
    }
    return byteValue;
}
/* miscellaneous ------------------------------------------------------------ */
@ -2369,8 +2834,6 @@ const UConverterSharedData _MBCSData={
/* GB 18030 special handling ------------------------------------------------ */
/* ### IMPORTANT: THIS IS ALPHA-VERSION SUPPORT CODE FOR GB 18030 AND MAY CHANGE WITHOUT NOTICE */
/* definition of LINEAR macros and gb18030Ranges see near the beginning of the file */
/* the callback functions handle GB 18030 specially */

View File

@ -21,6 +21,10 @@
/* MBCS converter data and state -------------------------------------------- */
/**
* MBCS action codes for conversions to Unicode.
* These values are in bits 30..27 of the state table entries.
*/
enum {
MBCS_STATE_ILLEGAL,
MBCS_STATE_CHANGE_ONLY,
@ -36,6 +40,11 @@ enum {
MBCS_STATE_VALID_16_PAIR
};
/**
* MBCS output types for conversions from Unicode.
* These per-converter types determine the storage method in stage 3 of the lookup table,
* mostly how many bytes are stored per entry.
*/
enum {
MBCS_OUTPUT_1,
MBCS_OUTPUT_2,
@ -46,11 +55,19 @@ enum {
MBCS_OUTPUT_4_EUC
};
/**
* Fallbacks to Unicode are stored outside the normal state table and code point structures
* in a vector of items of this type. They are sorted by offset.
*/
typedef struct {
uint32_t offset; /* sort key of the fallback vector; NOTE(review): presumably the offset of the codepage byte sequence in the to-Unicode data - confirm */
UChar32 codePoint; /* Unicode code point that this fallback maps to */
} _MBCSToUFallback;
/**
* This is the MBCS part of the UConverterTable union (a runtime data structure).
* It keeps all the per-converter data and points into the loaded mapping tables.
*/
typedef struct UConverterMBCSTable {
/* toUnicode */
uint8_t countStates;
@ -63,10 +80,10 @@ typedef struct UConverterMBCSTable {
/* fromUnicode */
const uint16_t *fromUnicodeTable;
const uint8_t *fromUnicodeBytes;
uint8_t outputType;
uint8_t outputType, unicodeMask;
} UConverterMBCSTable;
/*
/**
* MBCS data structure as part of a .cnv file:
*
* uint32_t [8]; -- 8 values:
@ -105,20 +122,78 @@ typedef struct {
reserved;
} _MBCSHeader;
/** Forward declaration to enable the following function declarations. */
struct UConverterSharedData;
/** Forward declaration to enable the following function declarations. */
typedef struct UConverterSharedData UConverterSharedData;
/**
* This is a simple version of _MBCSGetNextUChar() that is used
* by other converter implementations.
* It does not use state from the converter, nor error codes.
*
* Return value:
* U+fffe unassigned
* U+ffff illegal
* otherwise the Unicode code point
*/
U_CFUNC UChar32
_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
const char **pSource, const char *sourceLimit,
UBool useFallback);
/** This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. */
U_CFUNC UChar32
_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
uint8_t b, UBool useFallback);
/**
 * This macro version of _MBCSSingleSimpleGetNextUChar() gets a code point from a byte.
 * It works for single-byte, single-state codepages that only map
 * to and from BMP code points, and it always
 * returns fallback values.
 *
 * The byte argument is cast to uint8_t before it indexes the state table,
 * like in _MBCS_IS_LEAD_BYTE(), so that passing a (possibly signed) char
 * with a value >=0x80 does not produce a negative index.
 * The result is bits 22..7 of the state table entry, as a UChar.
 */
#define _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(sharedData, b) \
((UChar)(((sharedData)->table->mbcs.stateTable[0][(uint8_t)(b)])>>7))
/**
* This is an internal function that allows other converter implementations
* to check whether a byte is a lead byte.
*/
U_CFUNC UBool
_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte);
/**
 * This is a macro version of _MBCSIsLeadByte().
 * It evaluates to TRUE if the state table entry for the byte in the
 * initial state (row 0) is non-negative, that is, if bit 31 of the entry is not set.
 * The byte is cast to uint8_t so that a signed char can be passed safely.
 */
#define _MBCS_IS_LEAD_BYTE(sharedData, byte) \
(UBool)((sharedData)->table->mbcs.stateTable[0][(uint8_t)(byte)]>=0)
/**
* This is another simple conversion function for internal use by other
* conversion implementations.
* It does not use the converter state nor call callbacks.
* It converts one single Unicode code point into codepage bytes, encoded
* as one 32-bit value. The function returns the number of bytes in *pValue:
* 1..4 the number of bytes in *pValue
* 0 unassigned (*pValue undefined)
* -1 illegal (currently not used, *pValue undefined)
*
* *pValue will contain the resulting bytes with the last byte in bits 7..0,
* the second to last byte in bits 15..8, etc.
* Currently, the function assumes but does not check that 0<=c<=0x10ffff.
*/
U_CFUNC int32_t
_MBCSFromUChar32(UConverterSharedData *sharedData,
UChar32 c, uint32_t *pValue,
UBool useFallback);
/**
* This version of _MBCSFromUChar32() is optimized for single-byte codepages.
* It returns the codepage byte for the code point, or -1 if it is unassigned.
*/
U_CFUNC int32_t
_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
UChar32 c,
UBool useFallback);
#endif

View File

@ -1328,11 +1328,12 @@ static const UConverterStaticData _SCSUStaticData={
1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
{ 0x0e, 0xff, 0xfd, 0 }, 3, /* ### the subchar really must be written by an SCSU function! */
FALSE, FALSE,
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
const UConverterSharedData _SCSUData={
sizeof(UConverterSharedData), 1,
sizeof(UConverterSharedData), ~((uint32_t)0),
NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl,
0
};

View File

@ -173,7 +173,7 @@ static UDataInfo dataInfo={
0,
0x63, 0x6e, 0x76, 0x74, /* dataFormat="cnvt" */
6, 0, 0, 0, /* formatVersion */
6, 1, 0, 0, /* formatVersion */
0, 0, 0, 0 /* dataVersion (calculated at runtime) */
};
@ -648,7 +648,7 @@ void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, U
int32_t mbcsLength;
char codepointBytes[20];
UBool isOK = TRUE;
uint8_t precisionMask = 0;
uint8_t precisionMask = 0, unicodeMask = 0;
char endOfLine;
if(cnvData->startMappings!=NULL)
@ -684,6 +684,13 @@ void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, U
/* End of line could be \0 or | (if fallback) */
endOfLine= line[nextTokenOffset(line, CODEPOINT_SEPARATORS)];
} while((endOfLine != '\0') && (endOfLine != FALLBACK_SEPARATOR));
if(unicodeValue>=0x10000) {
unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
} else if(UTF_IS_SURROGATE(unicodeValue)) {
unicodeMask|=UCNV_HAS_SURROGATES; /* there are single surrogates */
}
if((uint32_t)unicodeValue > 0x10ffff)
{
fprintf(stderr, "error: Unicode code point > U+10ffff in '%s'\n", storageLine);
@ -730,6 +737,12 @@ void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, U
}
}
if(unicodeMask == 3)
{
fprintf(stderr, "warning: contains mappings to both supplementary code points and single surrogates\n");
}
staticData->unicodeMask = unicodeMask;
if(cnvData->finishMappings!=NULL)
{
cnvData->finishMappings(cnvData, staticData);

View File

@ -17,36 +17,40 @@
static const UConverterStaticData _SBCSStaticData={
sizeof(UConverterStaticData),
"SBCS",
sizeof(UConverterStaticData),
"SBCS",
0, UCNV_IBM, UCNV_SBCS, 1, 1,
{ 0, 0, 0, 0 }, 1, FALSE, FALSE,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved bytes */
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
static const UConverterStaticData _DBCSStaticData={
sizeof(UConverterStaticData),
"DBCS",
sizeof(UConverterStaticData),
"DBCS",
0, UCNV_IBM, UCNV_DBCS, 2, 2,
{ 0, 0, 0, 0 },1, FALSE, FALSE, /* subchar */
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved bytes */
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
static const UConverterStaticData _MBCSStaticData={
sizeof(UConverterStaticData),
"MBCS",
sizeof(UConverterStaticData),
"MBCS",
0, UCNV_IBM, UCNV_MBCS, 1, 1,
{ 0, 0, 0, 0 }, 1, FALSE, FALSE,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved bytes */
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
static const UConverterStaticData _EBCDICStatefulStaticData={
sizeof(UConverterStaticData),
"EBCDICStateful",
sizeof(UConverterStaticData),
"EBCDICStateful",
0, UCNV_IBM, UCNV_EBCDIC_STATEFUL, 1, 1,
{ 0, 0, 0, 0 },1, FALSE, FALSE,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved bytes */
{ 0, 0, 0, 0 },1, FALSE, FALSE,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* NULLs for algorithmic types, their tables live in ucnv_bld.c */