ICU-484 add unicodeMask to UConverterStaticData for optimized implementations
X-SVN-Rev: 3280
This commit is contained in:
parent
b2b6812d1e
commit
998f792a5a
@ -376,7 +376,8 @@ const UConverterStaticData _ISO2022StaticData={
|
||||
1,
|
||||
FALSE,
|
||||
FALSE,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} /* reserved */
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
|
||||
|
@ -64,6 +64,10 @@ typedef union UConverterTable UConverterTable;
|
||||
struct UConverterImpl;
|
||||
typedef struct UConverterImpl UConverterImpl;
|
||||
|
||||
/** values for the unicodeMask */
|
||||
#define UCNV_HAS_SUPPLEMENTARY 1
|
||||
#define UCNV_HAS_SURROGATES 2
|
||||
|
||||
typedef struct UConverterStaticData { /* +offset: size */
|
||||
uint32_t structSize; /* +0: 4 Size of this structure */
|
||||
|
||||
@ -83,7 +87,8 @@ typedef struct UConverterStaticData { /* +offset: size */
|
||||
|
||||
uint8_t hasToUnicodeFallback; /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
|
||||
uint8_t hasFromUnicodeFallback; /* +78: 1 */
|
||||
uint8_t reserved[21]; /* +79: 21 to round out the structure */
|
||||
uint8_t unicodeMask; /* +79: 1 bit 0: has supplementary bit 1: has single surrogates */
|
||||
uint8_t reserved[20]; /* +80: 20 to round out the structure */
|
||||
/* total size: 100 */
|
||||
} UConverterStaticData;
|
||||
|
||||
|
@ -518,7 +518,7 @@ const UConverterStaticData _LMBCSStaticData##n={\
|
||||
sizeof(UConverterStaticData),\
|
||||
"LMBCS-" #n,\
|
||||
0, UCNV_IBM, UCNV_LMBCS_##n, 1, 1,\
|
||||
{ 0x3f, 0, 0, 0 },1,FALSE,FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} \
|
||||
{ 0x3f, 0, 0, 0 },1,FALSE,FALSE,0,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} \
|
||||
};\
|
||||
const UConverterSharedData _LMBCSData##n={\
|
||||
sizeof(UConverterSharedData), ~((uint32_t) 0),\
|
||||
|
@ -784,7 +784,9 @@ const UConverterStaticData _UTF8StaticData={
|
||||
sizeof(UConverterStaticData),
|
||||
"UTF8",
|
||||
1208, UCNV_IBM, UCNV_UTF8, 1, 4,
|
||||
{ 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
|
||||
{ 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
|
||||
@ -964,7 +966,9 @@ const UConverterStaticData _UTF16BEStaticData={
|
||||
sizeof(UConverterStaticData),
|
||||
"UTF16_BigEndian",
|
||||
1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
|
||||
{ 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
|
||||
{ 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
|
||||
@ -1154,7 +1158,9 @@ const UConverterStaticData _UTF16LEStaticData={
|
||||
sizeof(UConverterStaticData),
|
||||
"UTF16_LittleEndian",
|
||||
1200, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
|
||||
{ 0xfd, 0xff, 0, 0 },2,0,0,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
|
||||
{ 0xfd, 0xff, 0, 0 },2,0,0,
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
|
||||
@ -1382,7 +1388,8 @@ const UConverterStaticData _UTF32BEStaticData = {
|
||||
0, /* Should be the UTF-32 CCSID */
|
||||
UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
|
||||
{ 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
|
||||
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
const UConverterSharedData _UTF32BEData = {
|
||||
@ -1610,7 +1617,8 @@ const UConverterStaticData _UTF32LEStaticData = {
|
||||
0, /* Should be the UTF-32 CCSID */
|
||||
UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
|
||||
{ 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
|
||||
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
|
||||
|
@ -85,7 +85,8 @@ const UConverterStaticData _HZStaticData={
|
||||
"HZ",
|
||||
2023, UCNV_IBM, UCNV_HZ, 1, 4,
|
||||
{ 0x1a, 0, 0, 0 },1, FALSE, FALSE,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} /* reserved */
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
|
||||
|
@ -195,10 +195,12 @@ static const UConverterImpl _Latin1Impl={
|
||||
};
|
||||
|
||||
const UConverterStaticData _Latin1StaticData={
|
||||
sizeof(UConverterStaticData),
|
||||
"LATIN_1",
|
||||
sizeof(UConverterStaticData),
|
||||
"LATIN_1",
|
||||
819, UCNV_IBM, UCNV_LATIN_1, 1, 1,
|
||||
{ 0x1a, 0, 0, 0 },1,FALSE, FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
|
||||
{ 0x1a, 0, 0, 0 },1,FALSE, FALSE,
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
|
||||
|
@ -33,6 +33,7 @@
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/ucnv_cb.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "ucnv_bld.h"
|
||||
#include "ucnvmbcs.h"
|
||||
#include "ucnv_cnv.h"
|
||||
@ -173,6 +174,10 @@ U_CFUNC void
|
||||
_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_CFUNC void
|
||||
_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_CFUNC UChar32
|
||||
_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode);
|
||||
@ -185,6 +190,10 @@ U_CFUNC void
|
||||
_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_CFUNC void
|
||||
_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
static void
|
||||
fromUCallback(UConverter *cnv,
|
||||
void *context, UConverterFromUnicodeArgs *pArgs,
|
||||
@ -238,6 +247,7 @@ U_CFUNC void
|
||||
_MBCSLoad(UConverterSharedData *sharedData,
|
||||
const uint8_t *raw,
|
||||
UErrorCode *pErrorCode) {
|
||||
UDataInfo info;
|
||||
UConverterMBCSTable *mbcsTable=&sharedData->table->mbcs;
|
||||
_MBCSHeader *header=(_MBCSHeader *)raw;
|
||||
|
||||
@ -255,6 +265,20 @@ _MBCSLoad(UConverterSharedData *sharedData,
|
||||
mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
|
||||
mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
|
||||
mbcsTable->outputType=(uint8_t)header->flags;
|
||||
|
||||
/*
|
||||
* converter versions 6.1 and up contain a unicodeMask that is
|
||||
* used here to select the most efficient function implementations
|
||||
*/
|
||||
info.size=sizeof(UDataInfo);
|
||||
udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
|
||||
if(info.formatVersion[0]>6 || info.formatVersion[0]==6 && info.formatVersion[1]>=1) {
|
||||
/* mask off possible future extensions to be safe */
|
||||
mbcsTable->unicodeMask=sharedData->staticData->unicodeMask&3;
|
||||
} else {
|
||||
/* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
|
||||
mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
|
||||
}
|
||||
}
|
||||
|
||||
U_CFUNC void
|
||||
@ -338,7 +362,11 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
/* use optimized function if possible */
|
||||
cnv=pArgs->converter;
|
||||
if(cnv->sharedData->table->mbcs.countStates==1) {
|
||||
_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
|
||||
if(!(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
|
||||
_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
|
||||
} else {
|
||||
_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@ -669,16 +697,16 @@ callback:
|
||||
*/
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
break;
|
||||
} else if(cnv->UCharErrorBufferLength>0) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
break;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
/* break on error */
|
||||
offset=0;
|
||||
state=0;
|
||||
byteIndex=0;
|
||||
break;
|
||||
} else if(cnv->UCharErrorBufferLength>0) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -717,7 +745,7 @@ endloop:
|
||||
pArgs->offsets=offsets;
|
||||
}
|
||||
|
||||
/* This version of _MBCSToUnicode() is optimized for single-byte, single-state codepages. */
|
||||
/* This version of _MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
|
||||
U_CFUNC void
|
||||
_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
@ -875,13 +903,13 @@ callback:
|
||||
*/
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
break;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
/* break on error */
|
||||
break;
|
||||
} else if(cnv->UCharErrorBufferLength>0) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
break;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
/* break on error */
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -903,6 +931,175 @@ endloop:
|
||||
pArgs->offsets=offsets;
|
||||
}
|
||||
|
||||
/*
|
||||
* This version of _MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
|
||||
* that only map to and from the BMP.
|
||||
* In addition to single-byte optimizations, the offset calculations
|
||||
* become much easier.
|
||||
*/
|
||||
U_CFUNC void
|
||||
_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
UConverter *cnv;
|
||||
const uint8_t *source, *sourceLimit, *lastSource;
|
||||
UChar *target;
|
||||
int32_t targetCapacity, length;
|
||||
int32_t *offsets;
|
||||
|
||||
const int32_t (*stateTable)[256];
|
||||
|
||||
int32_t sourceIndex;
|
||||
|
||||
int32_t entry;
|
||||
uint8_t b;
|
||||
UConverterCallbackReason reason;
|
||||
|
||||
/* set up the local pointers */
|
||||
cnv=pArgs->converter;
|
||||
source=(const uint8_t *)pArgs->source;
|
||||
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
|
||||
target=pArgs->target;
|
||||
targetCapacity=pArgs->targetLimit-pArgs->target;
|
||||
offsets=pArgs->offsets;
|
||||
|
||||
stateTable=cnv->sharedData->table->mbcs.stateTable;
|
||||
|
||||
/* sourceIndex=-1 if the current character began in the previous buffer */
|
||||
sourceIndex=0;
|
||||
lastSource=source;
|
||||
|
||||
/*
|
||||
* since the conversion here is 1:1 UChar:uint8_t, we need only one counter
|
||||
* for the minimum of the sourceLength and targetCapacity
|
||||
*/
|
||||
length=sourceLimit-source;
|
||||
if(length<targetCapacity) {
|
||||
targetCapacity=length;
|
||||
}
|
||||
|
||||
/* conversion loop */
|
||||
while(targetCapacity>0) {
|
||||
b=*source++;
|
||||
entry=stateTable[0][b];
|
||||
/* entry<0 */
|
||||
/*
|
||||
* bit 31 is set, bits:
|
||||
* 30..27 action code
|
||||
* (do not mask out bit 31 for speed, include it in action values)
|
||||
* 26..7 depend on the action code
|
||||
* 6..0 next state
|
||||
*/
|
||||
|
||||
/* switch per action code */
|
||||
switch((uint32_t)entry>>27U) {
|
||||
case 16|MBCS_STATE_ILLEGAL:
|
||||
/* bits 26..7 are not used, 0 */
|
||||
/* callback(illegal) */
|
||||
reason=UCNV_ILLEGAL;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
case 16|MBCS_STATE_UNASSIGNED:
|
||||
/* bits 26..7 are not used, 0 */
|
||||
/* callback(unassigned) */
|
||||
reason=UCNV_UNASSIGNED;
|
||||
*pErrorCode=U_INVALID_CHAR_FOUND;
|
||||
break;
|
||||
case 16|MBCS_STATE_FALLBACK_DIRECT_16:
|
||||
/* bits 26..23 are not used, 0 */
|
||||
/* bits 22..7 contain the Unicode BMP code point */
|
||||
if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
|
||||
/* callback(unassigned) */
|
||||
reason=UCNV_UNASSIGNED;
|
||||
*pErrorCode=U_INVALID_CHAR_FOUND;
|
||||
break;
|
||||
}
|
||||
/* fall through to the MBCS_STATE_VALID_DIRECT_16 branch */
|
||||
case 16|MBCS_STATE_VALID_DIRECT_16:
|
||||
/* bits 26..23 are not used, 0 */
|
||||
/* bits 22..7 contain the Unicode BMP code point */
|
||||
/* output BMP code point */
|
||||
*target++=(UChar)(entry>>7);
|
||||
--targetCapacity;
|
||||
continue;
|
||||
default:
|
||||
/* reserved, must never occur */
|
||||
/* bits 26..7 are not used, 0 */
|
||||
continue;
|
||||
}
|
||||
|
||||
/* call the callback function with all the preparations and post-processing */
|
||||
/* set offsets since the start or the last callback */
|
||||
if(offsets!=NULL) {
|
||||
int32_t count=(int32_t)(source-lastSource);
|
||||
|
||||
/* predecrement: do not set the offset for the callback-causing character */
|
||||
while(--count>0) {
|
||||
*offsets++=sourceIndex++;
|
||||
}
|
||||
/* offset and sourceIndex are now set for the current character */
|
||||
}
|
||||
|
||||
/* update the arguments structure */
|
||||
pArgs->source=(const char *)source;
|
||||
pArgs->target=target;
|
||||
pArgs->offsets=offsets;
|
||||
|
||||
/* copy the current bytes to invalidCharBuffer */
|
||||
cnv->invalidCharBuffer[0]=b;
|
||||
cnv->invalidCharLength=1;
|
||||
|
||||
/* call the callback function */
|
||||
toUCallback(cnv, cnv->toUContext, pArgs, (const char *)&b, 1, reason, pErrorCode);
|
||||
|
||||
/* update target and deal with offsets if necessary */
|
||||
offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
|
||||
target=pArgs->target;
|
||||
|
||||
/* update the source pointer and index */
|
||||
sourceIndex+=1+((const uint8_t *)pArgs->source-source);
|
||||
source=lastSource=(const uint8_t *)pArgs->source;
|
||||
targetCapacity=pArgs->targetLimit-target;
|
||||
length=sourceLimit-source;
|
||||
if(length<targetCapacity) {
|
||||
targetCapacity=length;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the callback overflowed the target, then we need to
|
||||
* stop here with an overflow indication.
|
||||
*/
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
break;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
/* break on error */
|
||||
break;
|
||||
} else if(cnv->UCharErrorBufferLength>0) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
|
||||
/* set offsets since the start or the last callback */
|
||||
if(offsets!=NULL) {
|
||||
size_t count=source-lastSource;
|
||||
while(count>0) {
|
||||
*offsets++=sourceIndex++;
|
||||
--count;
|
||||
}
|
||||
}
|
||||
|
||||
/* write back the updated pointers */
|
||||
pArgs->source=(const char *)source;
|
||||
pArgs->target=target;
|
||||
pArgs->offsets=offsets;
|
||||
}
|
||||
|
||||
U_CFUNC UChar32
|
||||
_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
@ -1521,7 +1718,11 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
||||
cnv=pArgs->converter;
|
||||
outputType=cnv->sharedData->table->mbcs.outputType;
|
||||
if(outputType==MBCS_OUTPUT_1) {
|
||||
_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
|
||||
if(!(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
|
||||
_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
|
||||
} else {
|
||||
_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1917,14 +2118,14 @@ callback:
|
||||
*/
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
break;
|
||||
} else if(cnv->charErrorBufferLength>0) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
break;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
/* break on error */
|
||||
c=0;
|
||||
break;
|
||||
} else if(cnv->charErrorBufferLength>0) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1942,7 +2143,7 @@ callback:
|
||||
if(pArgs->flush && source>=sourceLimit) {
|
||||
/* reset the state for the next conversion */
|
||||
if(c!=0 && U_SUCCESS(*pErrorCode)) {
|
||||
/* a character byte sequence remains incomplete */
|
||||
/* a Unicode code point remains incomplete (only a first surrogate) */
|
||||
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
||||
}
|
||||
cnv->fromUSurrogateLead=0;
|
||||
@ -1969,7 +2170,6 @@ _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
||||
|
||||
const uint16_t *table;
|
||||
const uint8_t *bytes;
|
||||
uint8_t outputType;
|
||||
|
||||
UChar32 c;
|
||||
|
||||
@ -1977,7 +2177,7 @@ _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
||||
|
||||
UConverterCallbackReason reason;
|
||||
uint32_t i;
|
||||
uint32_t value;
|
||||
uint8_t value;
|
||||
|
||||
/* set up the local pointers */
|
||||
cnv=pArgs->converter;
|
||||
@ -1989,7 +2189,6 @@ _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
||||
|
||||
table=cnv->sharedData->table->mbcs.fromUnicodeTable;
|
||||
bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes;
|
||||
outputType=cnv->sharedData->table->mbcs.outputType;
|
||||
|
||||
/* get the converter state from UConverter */
|
||||
c=cnv->fromUSurrogateLead;
|
||||
@ -2064,7 +2263,21 @@ getTrail:
|
||||
value=*p;
|
||||
|
||||
/* is the codepage value really an "unassigned" indicator? */
|
||||
if(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0) {
|
||||
if(!(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0)) {
|
||||
/* assigned, write the output character bytes from value and length */
|
||||
/* length==1 */
|
||||
/* this is easy because we know that there is enough space */
|
||||
*target++=value;
|
||||
if(offsets!=NULL) {
|
||||
*offsets++=sourceIndex;
|
||||
}
|
||||
--targetCapacity;
|
||||
|
||||
/* normal end of conversion: prepare for a new character */
|
||||
c=0;
|
||||
sourceIndex=nextSourceIndex;
|
||||
continue;
|
||||
} else { /* unassigned */
|
||||
/*
|
||||
* We allow a 0 byte output if the Unicode code point is
|
||||
* U+0000 and also if the "assigned" bit is set for this entry.
|
||||
@ -2074,29 +2287,13 @@ getTrail:
|
||||
/* callback(unassigned) */
|
||||
reason=UCNV_UNASSIGNED;
|
||||
*pErrorCode=U_INVALID_CHAR_FOUND;
|
||||
goto callback;
|
||||
}
|
||||
} else {
|
||||
/* callback(unassigned) */
|
||||
reason=UCNV_UNASSIGNED;
|
||||
*pErrorCode=U_INVALID_CHAR_FOUND;
|
||||
goto callback;
|
||||
}
|
||||
|
||||
/* write the output character bytes from value and length */
|
||||
/* length==1 */
|
||||
/* this is easy because we know that there is enough space */
|
||||
*target++=(uint8_t)value;
|
||||
if(offsets!=NULL) {
|
||||
*offsets++=sourceIndex;
|
||||
}
|
||||
--targetCapacity;
|
||||
|
||||
/* normal end of conversion: prepare for a new character */
|
||||
c=0;
|
||||
sourceIndex=nextSourceIndex;
|
||||
continue;
|
||||
|
||||
callback:
|
||||
/* call the callback function with all the preparations and post-processing */
|
||||
/* update the arguments structure */
|
||||
@ -2133,14 +2330,14 @@ callback:
|
||||
*/
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
break;
|
||||
} else if(cnv->charErrorBufferLength>0) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
break;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
/* break on error */
|
||||
c=0;
|
||||
break;
|
||||
} else if(cnv->charErrorBufferLength>0) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2158,7 +2355,242 @@ callback:
|
||||
if(pArgs->flush && source>=sourceLimit) {
|
||||
/* reset the state for the next conversion */
|
||||
if(c!=0 && U_SUCCESS(*pErrorCode)) {
|
||||
/* a character byte sequence remains incomplete */
|
||||
/* a Unicode code point remains incomplete (only a first surrogate) */
|
||||
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
||||
}
|
||||
cnv->fromUSurrogateLead=0;
|
||||
} else {
|
||||
/* set the converter state back into UConverter */
|
||||
cnv->fromUSurrogateLead=(UChar)c;
|
||||
}
|
||||
|
||||
/* write back the updated pointers */
|
||||
pArgs->source=source;
|
||||
pArgs->target=(char *)target;
|
||||
pArgs->offsets=offsets;
|
||||
}
|
||||
|
||||
/*
|
||||
* This version of _MBCSFromUnicode() is optimized for single-byte codepages
|
||||
* that map only to and from the BMP.
|
||||
* In addition to single-byte/state optimizations, the offset calculations
|
||||
* become much easier.
|
||||
*/
|
||||
U_CFUNC void
|
||||
_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
UConverter *cnv;
|
||||
const UChar *source, *sourceLimit, *lastSource;
|
||||
uint8_t *target;
|
||||
int32_t targetCapacity, length;
|
||||
int32_t *offsets;
|
||||
|
||||
const uint16_t *table;
|
||||
const uint8_t *bytes;
|
||||
|
||||
UChar32 c;
|
||||
|
||||
int32_t sourceIndex;
|
||||
|
||||
UConverterCallbackReason reason;
|
||||
uint32_t i;
|
||||
uint8_t value;
|
||||
|
||||
/* set up the local pointers */
|
||||
cnv=pArgs->converter;
|
||||
source=pArgs->source;
|
||||
sourceLimit=pArgs->sourceLimit;
|
||||
target=(uint8_t *)pArgs->target;
|
||||
targetCapacity=pArgs->targetLimit-pArgs->target;
|
||||
offsets=pArgs->offsets;
|
||||
|
||||
table=cnv->sharedData->table->mbcs.fromUnicodeTable;
|
||||
bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes;
|
||||
|
||||
/* get the converter state from UConverter */
|
||||
c=cnv->fromUSurrogateLead;
|
||||
|
||||
/* sourceIndex=-1 if the current character began in the previous buffer */
|
||||
sourceIndex= c==0 ? 0 : -1;
|
||||
lastSource=source;
|
||||
|
||||
/*
|
||||
* since the conversion here is 1:1 UChar:uint8_t, we need only one counter
|
||||
* for the minimum of the sourceLength and targetCapacity
|
||||
*/
|
||||
length=sourceLimit-source;
|
||||
if(length<targetCapacity) {
|
||||
targetCapacity=length;
|
||||
}
|
||||
|
||||
/* conversion loop */
|
||||
if(c!=0 && targetCapacity>0) {
|
||||
goto getTrail;
|
||||
}
|
||||
|
||||
while(targetCapacity>0) {
|
||||
/*
|
||||
* Get a correct Unicode code point:
|
||||
* a single UChar for a BMP code point or
|
||||
* a matched surrogate pair for a "surrogate code point".
|
||||
*/
|
||||
c=*source++;
|
||||
if(!UTF_IS_SURROGATE(c)) {
|
||||
/* convert the Unicode code point in c into codepage bytes */
|
||||
i=0x440+2*((uint32_t)table[c>>10]+((c>>4)&0x3f));
|
||||
|
||||
/* is this code point assigned, or do we use fallbacks? */
|
||||
if((table[i++]&(1<<(c&0xf)))!=0 || UCNV_FROM_U_USE_FALLBACK(cnv, c)) {
|
||||
const uint8_t *p=bytes;
|
||||
|
||||
/* MBCS_OUTPUT_1 */
|
||||
p+=(16*(uint32_t)table[i]+(c&0xf));
|
||||
value=*p;
|
||||
|
||||
/* is the codepage value really an "unassigned" indicator? */
|
||||
if(!(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0)) {
|
||||
/* assigned, write the output character bytes from value and length */
|
||||
/* length==1 */
|
||||
/* this is easy because we know that there is enough space */
|
||||
*target++=value;
|
||||
--targetCapacity;
|
||||
|
||||
/* normal end of conversion: prepare for a new character */
|
||||
c=0;
|
||||
continue;
|
||||
} else { /* unassigned */
|
||||
/*
|
||||
* We allow a 0 byte output if the Unicode code point is
|
||||
* U+0000 and also if the "assigned" bit is set for this entry.
|
||||
* There is no way with this data structure for fallback output
|
||||
* for other than U+0000 to be a zero byte.
|
||||
*/
|
||||
/* callback(unassigned) */
|
||||
reason=UCNV_UNASSIGNED;
|
||||
*pErrorCode=U_INVALID_CHAR_FOUND;
|
||||
}
|
||||
} else {
|
||||
/* callback(unassigned) */
|
||||
reason=UCNV_UNASSIGNED;
|
||||
*pErrorCode=U_INVALID_CHAR_FOUND;
|
||||
}
|
||||
} else {
|
||||
if(UTF_IS_SURROGATE_FIRST(c)) {
|
||||
getTrail:
|
||||
if(source<sourceLimit) {
|
||||
/* test the following code unit */
|
||||
UChar trail=*source;
|
||||
if(UTF_IS_SECOND_SURROGATE(trail)) {
|
||||
++source;
|
||||
c=UTF16_GET_PAIR_VALUE(c, trail);
|
||||
/* this codepage does not map supplementary code points */
|
||||
/* callback(unassigned) */
|
||||
reason=UCNV_UNASSIGNED;
|
||||
*pErrorCode=U_INVALID_CHAR_FOUND;
|
||||
} else {
|
||||
/* this is an unmatched lead code unit (1st surrogate) */
|
||||
/* callback(illegal) */
|
||||
reason=UCNV_ILLEGAL;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
}
|
||||
} else {
|
||||
/* no more input */
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/* this is an unmatched trail code unit (2nd surrogate) */
|
||||
/* callback(illegal) */
|
||||
reason=UCNV_ILLEGAL;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
}
|
||||
}
|
||||
|
||||
/* call the callback function with all the preparations and post-processing */
|
||||
/* get the number of code units for c to correctly advance sourceIndex after the callback call */
|
||||
length=UTF_CHAR_LENGTH(c);
|
||||
|
||||
/* set offsets since the start or the last callback */
|
||||
if(offsets!=NULL) {
|
||||
int32_t count=(int32_t)(source-lastSource);
|
||||
|
||||
/* do not set the offset for the callback-causing character */
|
||||
count-=length;
|
||||
|
||||
while(count>0) {
|
||||
*offsets++=sourceIndex++;
|
||||
--count;
|
||||
}
|
||||
/* offset and sourceIndex are now set for the current character */
|
||||
}
|
||||
|
||||
/* update the arguments structure */
|
||||
pArgs->source=source;
|
||||
pArgs->target=(char *)target;
|
||||
pArgs->offsets=offsets;
|
||||
|
||||
/* set the converter state in UConverter to deal with the next character */
|
||||
cnv->fromUSurrogateLead=0;
|
||||
|
||||
/* write the code point as code units */
|
||||
i=0;
|
||||
UTF_APPEND_CHAR_UNSAFE(cnv->invalidUCharBuffer, i, c);
|
||||
cnv->invalidUCharLength=(int8_t)i;
|
||||
/* i==length */
|
||||
|
||||
/* call the callback function */
|
||||
fromUCallback(cnv, cnv->fromUContext, pArgs, cnv->invalidUCharBuffer, i, c, reason, pErrorCode);
|
||||
|
||||
/* get the converter state from UConverter */
|
||||
c=cnv->fromUSurrogateLead;
|
||||
|
||||
/* update target and deal with offsets if necessary */
|
||||
offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
|
||||
target=(uint8_t *)pArgs->target;
|
||||
|
||||
/* update the source pointer and index */
|
||||
sourceIndex+=length+(pArgs->source-source);
|
||||
source=lastSource=pArgs->source;
|
||||
targetCapacity=(uint8_t *)pArgs->targetLimit-target;
|
||||
length=sourceLimit-source;
|
||||
if(length<targetCapacity) {
|
||||
targetCapacity=length;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the callback overflowed the target, then we need to
|
||||
* stop here with an overflow indication.
|
||||
*/
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
break;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
/* break on error */
|
||||
c=0;
|
||||
break;
|
||||
} else if(cnv->charErrorBufferLength>0) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
|
||||
/* set offsets since the start or the last callback */
|
||||
if(offsets!=NULL) {
|
||||
size_t count=source-lastSource;
|
||||
while(count>0) {
|
||||
*offsets++=sourceIndex++;
|
||||
--count;
|
||||
}
|
||||
}
|
||||
|
||||
if(pArgs->flush && source>=sourceLimit) {
|
||||
/* reset the state for the next conversion */
|
||||
if(c!=0 && U_SUCCESS(*pErrorCode)) {
|
||||
/* a Unicode code point remains incomplete (only a first surrogate) */
|
||||
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
||||
}
|
||||
cnv->fromUSurrogateLead=0;
|
||||
@ -2295,21 +2727,54 @@ _MBCSFromUChar32(UConverterSharedData *sharedData,
|
||||
}
|
||||
|
||||
/* is the codepage value really an "unassigned" indicator? */
|
||||
if(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0) {
|
||||
/*
|
||||
* We allow a 0 byte output if the Unicode code point is
|
||||
* U+0000 and also if the "assigned" bit is set for this entry.
|
||||
* There is no way with this data structure for fallback output
|
||||
* for other than U+0000 to be a zero byte.
|
||||
*/
|
||||
return 0;
|
||||
} else {
|
||||
/*
|
||||
* We allow a 0 byte output if the Unicode code point is
|
||||
* U+0000 and also if the "assigned" bit is set for this entry.
|
||||
* There is no way with this data structure for fallback output
|
||||
* for other than U+0000 to be a zero byte.
|
||||
*/
|
||||
if(!(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0)) {
|
||||
/* assigned */
|
||||
*pValue=value;
|
||||
return length;
|
||||
}
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
|
||||
UChar32 c,
|
||||
UBool useFallback) {
|
||||
const uint16_t *table=sharedData->table->mbcs.fromUnicodeTable;
|
||||
uint32_t i;
|
||||
int32_t value;
|
||||
|
||||
/* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
|
||||
i=0x440+2*((uint32_t)table[c>>10]+((c>>4)&0x3f));
|
||||
|
||||
/* is this code point assigned, or do we use fallbacks? */
|
||||
if((table[i++]&(1<<(c&0xf)))!=0 || FROM_U_USE_FALLBACK(useFallback, c)) {
|
||||
const uint8_t *p=sharedData->table->mbcs.fromUnicodeBytes;
|
||||
|
||||
/* get the byte for the output */
|
||||
/* MBCS_OUTPUT_1 */
|
||||
p+=(16*(uint32_t)table[i]+(c&0xf));
|
||||
value=*p;
|
||||
|
||||
/* is the codepage value really an "unassigned" indicator? */
|
||||
/*
|
||||
* We allow a 0 byte output if the Unicode code point is
|
||||
* U+0000 and also if the "assigned" bit is set for this entry.
|
||||
* There is no way with this data structure for fallback output
|
||||
* for other than U+0000 to be a zero byte.
|
||||
*/
|
||||
if(!(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0)) {
|
||||
/* assigned */
|
||||
return value;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* miscellaneous ------------------------------------------------------------ */
|
||||
@ -2369,8 +2834,6 @@ const UConverterSharedData _MBCSData={
|
||||
|
||||
/* GB 18030 special handling ------------------------------------------------ */
|
||||
|
||||
/* ### IMPORTANT: THIS IS ALPHA-VERSION SUPPORT CODE FOR GB 18030 AND MAY CHANGE WITHOUT NOTICE */
|
||||
|
||||
/* definition of LINEAR macros and gb18030Ranges see near the beginning of the file */
|
||||
|
||||
/* the callback functions handle GB 18030 specially */
|
||||
|
@ -21,6 +21,10 @@
|
||||
|
||||
/* MBCS converter data and state -------------------------------------------- */
|
||||
|
||||
/**
|
||||
* MBCS action codes for conversions to Unicode.
|
||||
* These values are in bits 30..27 of the state table entries.
|
||||
*/
|
||||
enum {
|
||||
MBCS_STATE_ILLEGAL,
|
||||
MBCS_STATE_CHANGE_ONLY,
|
||||
@ -36,6 +40,11 @@ enum {
|
||||
MBCS_STATE_VALID_16_PAIR
|
||||
};
|
||||
|
||||
/**
|
||||
* MBCS output types for conversions from Unicode.
|
||||
* These per-converter types determine the storage method in stage 3 of the lookup table,
|
||||
* mostly how many bytes are stored per entry.
|
||||
*/
|
||||
enum {
|
||||
MBCS_OUTPUT_1,
|
||||
MBCS_OUTPUT_2,
|
||||
@ -46,11 +55,19 @@ enum {
|
||||
MBCS_OUTPUT_4_EUC
|
||||
};
|
||||
|
||||
/**
|
||||
* Fallbacks to Unicode are stored outside the normal state table and code point structures
|
||||
* in a vector of items of this type. They are sorted by offset.
|
||||
*/
|
||||
typedef struct {
|
||||
uint32_t offset;
|
||||
UChar32 codePoint;
|
||||
} _MBCSToUFallback;
|
||||
|
||||
/**
|
||||
* This is the MBCS part of the UConverterTable union (a runtime data structure).
|
||||
* It keeps all the per-converter data and points into the loaded mapping tables.
|
||||
*/
|
||||
typedef struct UConverterMBCSTable {
|
||||
/* toUnicode */
|
||||
uint8_t countStates;
|
||||
@ -63,10 +80,10 @@ typedef struct UConverterMBCSTable {
|
||||
/* fromUnicode */
|
||||
const uint16_t *fromUnicodeTable;
|
||||
const uint8_t *fromUnicodeBytes;
|
||||
uint8_t outputType;
|
||||
uint8_t outputType, unicodeMask;
|
||||
} UConverterMBCSTable;
|
||||
|
||||
/*
|
||||
/**
|
||||
* MBCS data structure as part of a .cnv file:
|
||||
*
|
||||
* uint32_t [8]; -- 8 values:
|
||||
@ -105,20 +122,78 @@ typedef struct {
|
||||
reserved;
|
||||
} _MBCSHeader;
|
||||
|
||||
/** Forward declaration to enable the following function declarations. */
|
||||
struct UConverterSharedData;
|
||||
|
||||
/** Forward declaration to enable the following function declarations. */
|
||||
typedef struct UConverterSharedData UConverterSharedData;
|
||||
|
||||
/**
|
||||
* This is a simple version of _MBCSGetNextUChar() that is used
|
||||
* by other converter implementations.
|
||||
* It does not use state from the converter, nor error codes.
|
||||
*
|
||||
* Return value:
|
||||
* U+fffe unassigned
|
||||
* U+ffff illegal
|
||||
* otherwise the Unicode code point
|
||||
*
|
||||
* NOTE(review): pSource is passed by address, presumably so that the function
|
||||
* can advance *pSource past the bytes it consumed (up to sourceLimit) - confirm in the implementation.
|
||||
*/
|
||||
U_CFUNC UChar32
|
||||
_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
|
||||
const char **pSource, const char *sourceLimit,
|
||||
UBool useFallback);
|
||||
|
||||
/**
 * This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
 * It takes the one input byte b directly instead of a source pointer and limit.
 * NOTE(review): the return value presumably follows the same convention as
 * _MBCSSimpleGetNextUChar() (U+fffe unassigned, U+ffff illegal) - confirm in the implementation.
 */
|
||||
U_CFUNC UChar32
|
||||
_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
|
||||
uint8_t b, UBool useFallback);
|
||||
|
||||
/**
 * This macro version of _MBCSSingleSimpleGetNextUChar() gets a code point from a byte.
 * It works for single-byte, single-state codepages that only map
 * to and from BMP code points, and it always
 * returns fallback values.
 *
 * The byte argument is converted to uint8_t before it is used as an index into
 * state 0 of the state table, so that a plain (possibly signed) char with the
 * high bit set cannot yield a negative index; _MBCS_IS_LEAD_BYTE() indexes the
 * table the same way.
 * The result is the state table entry shifted right by 7 bits, cast to UChar.
 */
#define _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(sharedData, b) \
    (UChar)(((sharedData)->table->mbcs.stateTable[0][(uint8_t)(b)])>>7)
|
||||
|
||||
/**
|
||||
* This is an internal function that allows other converter implementations
|
||||
* to check whether a byte is a lead byte.
|
||||
* A macro version is available as _MBCS_IS_LEAD_BYTE().
|
||||
*/
|
||||
U_CFUNC UBool
|
||||
_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte);
|
||||
|
||||
/**
 * This is a macro version of _MBCSIsLeadByte().
 * It casts the byte to uint8_t, looks up the entry for it in state 0 of the
 * state table, and evaluates to a nonzero UBool if that entry is non-negative.
 */
|
||||
#define _MBCS_IS_LEAD_BYTE(sharedData, byte) \
|
||||
(UBool)((sharedData)->table->mbcs.stateTable[0][(uint8_t)(byte)]>=0)
|
||||
|
||||
/**
|
||||
* This is another simple conversion function for internal use by other
|
||||
* conversion implementations.
|
||||
* It does not use the converter state nor call callbacks.
|
||||
* It converts one single Unicode code point into codepage bytes, encoded
|
||||
* as one 32-bit value. The function returns the number of bytes in *pValue:
|
||||
* 1..4 the number of bytes in *pValue
|
||||
* 0 unassigned (*pValue undefined)
|
||||
* -1 illegal (currently not used, *pValue undefined)
|
||||
*
|
||||
* *pValue will contain the resulting bytes with the last byte in bits 7..0,
|
||||
* the second to last byte in bits 15..8, etc.
|
||||
* For example, a two-byte result 0x81 0x40 is returned as *pValue==0x8140 with a return value of 2.
|
||||
* Currently, the function assumes but does not check that 0<=c<=0x10ffff.
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
_MBCSFromUChar32(UConverterSharedData *sharedData,
|
||||
UChar32 c, uint32_t *pValue,
|
||||
UBool useFallback);
|
||||
|
||||
/**
|
||||
* This version of _MBCSFromUChar32() is optimized for single-byte codepages.
|
||||
* It returns the codepage byte for the code point, or -1 if it is unassigned.
|
||||
* Note that this differs from _MBCSFromUChar32(), which returns 0 for unassigned
|
||||
* and reserves -1 for illegal.
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
|
||||
UChar32 c,
|
||||
UBool useFallback);
|
||||
|
||||
#endif
|
||||
|
@ -1328,11 +1328,12 @@ static const UConverterStaticData _SCSUStaticData={
|
||||
1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
|
||||
{ 0x0e, 0xff, 0xfd, 0 }, 3, /* ### the subchar really must be written by an SCSU function! */
|
||||
FALSE, FALSE,
|
||||
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
const UConverterSharedData _SCSUData={
|
||||
sizeof(UConverterSharedData), 1,
|
||||
sizeof(UConverterSharedData), ~((uint32_t)0),
|
||||
NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl,
|
||||
0
|
||||
};
|
||||
|
@ -173,7 +173,7 @@ static UDataInfo dataInfo={
|
||||
0,
|
||||
|
||||
0x63, 0x6e, 0x76, 0x74, /* dataFormat="cnvt" */
|
||||
6, 0, 0, 0, /* formatVersion */
|
||||
6, 1, 0, 0, /* formatVersion */
|
||||
0, 0, 0, 0 /* dataVersion (calculated at runtime) */
|
||||
};
|
||||
|
||||
@ -648,7 +648,7 @@ void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, U
|
||||
int32_t mbcsLength;
|
||||
char codepointBytes[20];
|
||||
UBool isOK = TRUE;
|
||||
uint8_t precisionMask = 0;
|
||||
uint8_t precisionMask = 0, unicodeMask = 0;
|
||||
char endOfLine;
|
||||
|
||||
if(cnvData->startMappings!=NULL)
|
||||
@ -684,6 +684,13 @@ void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, U
|
||||
/* End of line could be \0 or | (if fallback) */
|
||||
endOfLine= line[nextTokenOffset(line, CODEPOINT_SEPARATORS)];
|
||||
} while((endOfLine != '\0') && (endOfLine != FALLBACK_SEPARATOR));
|
||||
|
||||
if(unicodeValue>=0x10000) {
|
||||
unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
|
||||
} else if(UTF_IS_SURROGATE(unicodeValue)) {
|
||||
unicodeMask|=UCNV_HAS_SURROGATES; /* there are single surrogates */
|
||||
}
|
||||
|
||||
if((uint32_t)unicodeValue > 0x10ffff)
|
||||
{
|
||||
fprintf(stderr, "error: Unicode code point > U+10ffff in '%s'\n", storageLine);
|
||||
@ -730,6 +737,12 @@ void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, U
|
||||
}
|
||||
}
|
||||
|
||||
if(unicodeMask == 3)
|
||||
{
|
||||
fprintf(stderr, "warning: contains mappings to both supplementary code points and single surrogates\n");
|
||||
}
|
||||
staticData->unicodeMask = unicodeMask;
|
||||
|
||||
if(cnvData->finishMappings!=NULL)
|
||||
{
|
||||
cnvData->finishMappings(cnvData, staticData);
|
||||
|
@ -17,36 +17,40 @@
|
||||
|
||||
|
||||
static const UConverterStaticData _SBCSStaticData={
|
||||
sizeof(UConverterStaticData),
|
||||
"SBCS",
|
||||
sizeof(UConverterStaticData),
|
||||
"SBCS",
|
||||
0, UCNV_IBM, UCNV_SBCS, 1, 1,
|
||||
{ 0, 0, 0, 0 }, 1, FALSE, FALSE,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved bytes */
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
|
||||
static const UConverterStaticData _DBCSStaticData={
|
||||
sizeof(UConverterStaticData),
|
||||
"DBCS",
|
||||
sizeof(UConverterStaticData),
|
||||
"DBCS",
|
||||
0, UCNV_IBM, UCNV_DBCS, 2, 2,
|
||||
{ 0, 0, 0, 0 },1, FALSE, FALSE, /* subchar */
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved bytes */
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
static const UConverterStaticData _MBCSStaticData={
|
||||
sizeof(UConverterStaticData),
|
||||
"MBCS",
|
||||
sizeof(UConverterStaticData),
|
||||
"MBCS",
|
||||
0, UCNV_IBM, UCNV_MBCS, 1, 1,
|
||||
{ 0, 0, 0, 0 }, 1, FALSE, FALSE,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved bytes */
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
static const UConverterStaticData _EBCDICStatefulStaticData={
|
||||
sizeof(UConverterStaticData),
|
||||
"EBCDICStateful",
|
||||
sizeof(UConverterStaticData),
|
||||
"EBCDICStateful",
|
||||
0, UCNV_IBM, UCNV_EBCDIC_STATEFUL, 1, 1,
|
||||
{ 0, 0, 0, 0 },1, FALSE, FALSE,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved bytes */
|
||||
{ 0, 0, 0, 0 },1, FALSE, FALSE,
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
/* NULLs for algorithmic types, their tables live in ucnv_bld.c */
|
||||
|
Loading…
Reference in New Issue
Block a user