111462800f
X-SVN-Rev: 736
590 lines
19 KiB
C
590 lines
19 KiB
C
/*
|
|
**********************************************************************
|
|
* Copyright (C) 2000, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
**********************************************************************
|
|
* file name: ucnvmbcs.cpp
|
|
* encoding: US-ASCII
|
|
* tab size: 8 (not used)
|
|
* indentation:4
|
|
*
|
|
* created on: 2000feb03
|
|
* created by: Markus W. Scherer
|
|
*/
|
|
|
|
#include "unicode/utypes.h"
|
|
#include "cmemory.h"
|
|
#include "ucmp16.h"
|
|
#include "ucmp8.h"
|
|
#include "unicode/ucnv_bld.h"
|
|
#include "unicode/ucnv.h"
|
|
#include "ucnv_cnv.h"
|
|
|
|
/* MBCS --------------------------------------------------------------------- */
|
|
|
|
static void
|
|
_MBCSLoad(UConverterSharedData *sharedData, const uint8_t *raw, UErrorCode *pErrorCode) {
|
|
const uint8_t *oldraw;
|
|
|
|
sharedData->table->mbcs.starters = (bool_t*)raw;
|
|
oldraw = raw += sizeof(bool_t)*256;
|
|
|
|
sharedData->table->mbcs.toUnicode = ucmp16_cloneFromData(&raw, pErrorCode);
|
|
if(((raw-oldraw)&3)!=0) {
|
|
raw+=4-((raw-oldraw)&3); /* pad to 4 */
|
|
}
|
|
sharedData->table->mbcs.fromUnicode = ucmp16_cloneFromData(&raw, pErrorCode);
|
|
}
|
|
|
|
static void
|
|
_MBCSUnload(UConverterSharedData *sharedData) {
|
|
ucmp16_close (sharedData->table->mbcs.fromUnicode);
|
|
ucmp16_close (sharedData->table->mbcs.toUnicode);
|
|
uprv_free (sharedData->table);
|
|
}
|
|
|
|
void T_UConverter_toUnicode_MBCS (UConverter * _this,
|
|
UChar ** target,
|
|
const UChar * targetLimit,
|
|
const char **source,
|
|
const char *sourceLimit,
|
|
int32_t *offsets,
|
|
bool_t flush,
|
|
UErrorCode * err)
|
|
{
|
|
const char *mySource = *source;
|
|
UChar *myTarget = *target;
|
|
int32_t mySourceIndex = 0;
|
|
int32_t myTargetIndex = 0;
|
|
int32_t targetLength = targetLimit - myTarget;
|
|
int32_t sourceLength = sourceLimit - mySource;
|
|
CompactShortArray *myToUnicode = NULL;
|
|
UChar targetUniChar = 0x0000;
|
|
UChar mySourceChar = 0x0000;
|
|
bool_t *myStarters = NULL;
|
|
|
|
|
|
|
|
|
|
myToUnicode = _this->sharedData->table->mbcs.toUnicode;
|
|
myStarters = _this->sharedData->table->mbcs.starters;
|
|
|
|
while (mySourceIndex < sourceLength)
|
|
{
|
|
if (myTargetIndex < targetLength)
|
|
{
|
|
/*gets the corresponding UniChar */
|
|
mySourceChar = (unsigned char) (mySource[mySourceIndex++]);
|
|
|
|
|
|
if (myStarters[(uint8_t) mySourceChar] &&
|
|
(_this->toUnicodeStatus == 0x00))
|
|
{
|
|
_this->toUnicodeStatus = (unsigned char) mySourceChar;
|
|
}
|
|
else
|
|
{
|
|
/*In case there is a state, we update the source char
|
|
*by concatenating the previous char with the current
|
|
*one
|
|
*/
|
|
|
|
if (_this->toUnicodeStatus != 0x00)
|
|
{
|
|
mySourceChar |= (UChar) (_this->toUnicodeStatus << 8);
|
|
|
|
_this->toUnicodeStatus = 0x00;
|
|
}
|
|
|
|
/*gets the corresponding Unicode codepoint */
|
|
targetUniChar = (UChar) ucmp16_getu (myToUnicode, mySourceChar);
|
|
|
|
/*writing the UniChar to the output stream */
|
|
if (targetUniChar != missingUCharMarker)
|
|
{
|
|
myTarget[myTargetIndex++] = targetUniChar;
|
|
|
|
}
|
|
else
|
|
{
|
|
*err = U_INVALID_CHAR_FOUND;
|
|
if (mySourceChar > 0xff)
|
|
{
|
|
_this->invalidCharLength = 2;
|
|
_this->invalidCharBuffer[0] = (char) (mySourceChar >> 8);
|
|
_this->invalidCharBuffer[1] = (char) mySourceChar;
|
|
}
|
|
else
|
|
{
|
|
_this->invalidCharLength = 1;
|
|
_this->invalidCharBuffer[0] = (char) mySourceChar;
|
|
}
|
|
|
|
ToU_CALLBACK_MACRO(_this,
|
|
myTarget,
|
|
myTargetIndex,
|
|
targetLimit,
|
|
mySource,
|
|
mySourceIndex,
|
|
sourceLimit,
|
|
offsets,
|
|
flush,
|
|
err);
|
|
|
|
if (U_FAILURE (*err)) break;
|
|
_this->invalidCharLength = 0;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*If at the end of conversion we are still carrying state information
|
|
*flush is TRUE, we can deduce that the input stream is truncated
|
|
*/
|
|
if (_this->toUnicodeStatus
|
|
&& (mySourceIndex == sourceLength)
|
|
&& (flush == TRUE))
|
|
{
|
|
if (U_SUCCESS(*err))
|
|
{
|
|
*err = U_TRUNCATED_CHAR_FOUND;
|
|
_this->toUnicodeStatus = 0x00;
|
|
}
|
|
}
|
|
|
|
*target += myTargetIndex;
|
|
*source += mySourceIndex;
|
|
|
|
return;
|
|
}
|
|
|
|
void T_UConverter_toUnicode_MBCS_OFFSETS_LOGIC (UConverter * _this,
|
|
UChar ** target,
|
|
const UChar * targetLimit,
|
|
const char **source,
|
|
const char *sourceLimit,
|
|
int32_t *offsets,
|
|
bool_t flush,
|
|
UErrorCode * err)
|
|
{
|
|
const char *mySource = *source;
|
|
UChar *myTarget = *target;
|
|
int32_t mySourceIndex = 0;
|
|
int32_t myTargetIndex = 0;
|
|
int32_t targetLength = targetLimit - myTarget;
|
|
int32_t sourceLength = sourceLimit - mySource;
|
|
CompactShortArray *myToUnicode = NULL;
|
|
UChar targetUniChar = 0x0000;
|
|
UChar mySourceChar = 0x0000;
|
|
UChar oldMySourceChar;
|
|
bool_t *myStarters = NULL;
|
|
int32_t* originalOffsets = offsets;
|
|
|
|
|
|
|
|
myToUnicode = _this->sharedData->table->mbcs.toUnicode;
|
|
myStarters = _this->sharedData->table->mbcs.starters;
|
|
|
|
while (mySourceIndex < sourceLength)
|
|
{
|
|
if (myTargetIndex < targetLength)
|
|
{
|
|
/*gets the corresponding UniChar */
|
|
mySourceChar = (unsigned char) (mySource[mySourceIndex++]);
|
|
|
|
|
|
if (myStarters[(uint8_t) mySourceChar] &&
|
|
(_this->toUnicodeStatus == 0x00))
|
|
{
|
|
_this->toUnicodeStatus = (unsigned char) mySourceChar;
|
|
}
|
|
else
|
|
{
|
|
/*In case there is a state, we update the source char
|
|
*by concatenating the previous char with the current
|
|
*one
|
|
*/
|
|
|
|
if (_this->toUnicodeStatus != 0x00)
|
|
{
|
|
mySourceChar |= (UChar) (_this->toUnicodeStatus << 8);
|
|
|
|
_this->toUnicodeStatus = 0x00;
|
|
}
|
|
|
|
/*gets the corresponding Unicode codepoint */
|
|
targetUniChar = (UChar) ucmp16_getu (myToUnicode, mySourceChar);
|
|
|
|
|
|
/*writing the UniChar to the output stream */
|
|
if (targetUniChar != missingUCharMarker)
|
|
{
|
|
/*writes the UniChar to the output stream */
|
|
{
|
|
|
|
|
|
if (targetUniChar > 0x00FF)
|
|
offsets[myTargetIndex] = mySourceIndex -2; /* double byte character - make the offset point to the first char */
|
|
else
|
|
offsets[myTargetIndex] = mySourceIndex -1 ; /* single byte char. Offset is OK */
|
|
|
|
|
|
}
|
|
myTarget[myTargetIndex++] = targetUniChar;
|
|
oldMySourceChar = mySourceChar;
|
|
|
|
}
|
|
else
|
|
{
|
|
int32_t currentOffset = offsets[myTargetIndex-1] + ((oldMySourceChar>0x00FF)?2:1);
|
|
|
|
*err = U_INVALID_CHAR_FOUND;
|
|
if (mySourceChar > 0xff)
|
|
{
|
|
_this->invalidCharLength = 2;
|
|
_this->invalidCharBuffer[0] = (char) (mySourceChar >> 8);
|
|
_this->invalidCharBuffer[1] = (char) mySourceChar;
|
|
}
|
|
else
|
|
{
|
|
_this->invalidCharLength = 1;
|
|
_this->invalidCharBuffer[0] = (char) mySourceChar;
|
|
}
|
|
|
|
ToU_CALLBACK_OFFSETS_LOGIC_MACRO(_this,
|
|
myTarget,
|
|
myTargetIndex,
|
|
targetLimit,
|
|
mySource,
|
|
mySourceIndex,
|
|
sourceLimit,
|
|
offsets,
|
|
flush,
|
|
err);
|
|
|
|
if (U_FAILURE (*err)) break;
|
|
_this->invalidCharLength = 0;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*If at the end of conversion we are still carrying state information
|
|
*flush is TRUE, we can deduce that the input stream is truncated
|
|
*/
|
|
if (_this->toUnicodeStatus
|
|
&& (mySourceIndex == sourceLength)
|
|
&& (flush == TRUE))
|
|
{
|
|
if (U_SUCCESS(*err))
|
|
{
|
|
*err = U_TRUNCATED_CHAR_FOUND;
|
|
_this->toUnicodeStatus = 0x00;
|
|
}
|
|
}
|
|
|
|
*target += myTargetIndex;
|
|
*source += mySourceIndex;
|
|
|
|
return;
|
|
}
|
|
|
|
void T_UConverter_fromUnicode_MBCS (UConverter * _this,
|
|
char **target,
|
|
const char *targetLimit,
|
|
const UChar ** source,
|
|
const UChar * sourceLimit,
|
|
int32_t *offsets,
|
|
bool_t flush,
|
|
UErrorCode * err)
|
|
|
|
{
|
|
const UChar *mySource = *source;
|
|
char *myTarget = *target;
|
|
int32_t mySourceIndex = 0;
|
|
int32_t myTargetIndex = 0;
|
|
int32_t targetLength = targetLimit - myTarget;
|
|
int32_t sourceLength = sourceLimit - mySource;
|
|
CompactShortArray *myFromUnicode = NULL;
|
|
UChar targetUniChar = 0x0000;
|
|
int8_t targetUniCharByteNum = 0;
|
|
UChar mySourceChar = 0x0000;
|
|
|
|
myFromUnicode = _this->sharedData->table->mbcs.fromUnicode;
|
|
|
|
/*writing the char to the output stream */
|
|
while (mySourceIndex < sourceLength)
|
|
{
|
|
if (myTargetIndex < targetLength)
|
|
{
|
|
mySourceChar = (UChar) mySource[mySourceIndex++];
|
|
targetUniChar = (UChar) ucmp16_getu (myFromUnicode, mySourceChar);
|
|
|
|
|
|
if (targetUniChar != missingCharMarker)
|
|
{
|
|
if (targetUniChar <= 0x00FF)
|
|
{
|
|
myTarget[myTargetIndex++] = (char) targetUniChar;
|
|
}
|
|
else
|
|
{
|
|
myTarget[myTargetIndex++] = (char) (targetUniChar >> 8);
|
|
if (myTargetIndex < targetLength)
|
|
{
|
|
myTarget[myTargetIndex++] = (char) targetUniChar;
|
|
}
|
|
else
|
|
{
|
|
_this->charErrorBuffer[0] = (char) targetUniChar;
|
|
_this->charErrorBufferLength = 1;
|
|
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
*err = U_INVALID_CHAR_FOUND;
|
|
_this->invalidUCharBuffer[0] = (UChar) mySourceChar;
|
|
_this->invalidUCharLength = 1;
|
|
|
|
FromU_CALLBACK_MACRO(_this,
|
|
myTarget,
|
|
myTargetIndex,
|
|
targetLimit,
|
|
mySource,
|
|
mySourceIndex,
|
|
sourceLimit,
|
|
offsets,
|
|
flush,
|
|
err);
|
|
|
|
if (U_FAILURE (*err)) break;
|
|
_this->invalidUCharLength = 0;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
break;
|
|
}
|
|
|
|
}
|
|
|
|
|
|
*target += myTargetIndex;
|
|
*source += mySourceIndex;;
|
|
|
|
|
|
return;
|
|
}
|
|
|
|
void T_UConverter_fromUnicode_MBCS_OFFSETS_LOGIC (UConverter * _this,
|
|
char **target,
|
|
const char *targetLimit,
|
|
const UChar ** source,
|
|
const UChar * sourceLimit,
|
|
int32_t *offsets,
|
|
bool_t flush,
|
|
UErrorCode * err)
|
|
|
|
{
|
|
const UChar *mySource = *source;
|
|
char *myTarget = *target;
|
|
int32_t mySourceIndex = 0;
|
|
int32_t myTargetIndex = 0;
|
|
int32_t targetLength = targetLimit - myTarget;
|
|
int32_t sourceLength = sourceLimit - mySource;
|
|
CompactShortArray *myFromUnicode = NULL;
|
|
UChar targetUniChar = 0x0000;
|
|
int8_t targetUniCharByteNum = 0;
|
|
UChar mySourceChar = 0x0000;
|
|
int32_t* originalOffsets = offsets;
|
|
|
|
myFromUnicode = _this->sharedData->table->mbcs.fromUnicode;
|
|
|
|
|
|
|
|
/*writing the char to the output stream */
|
|
while (mySourceIndex < sourceLength)
|
|
{
|
|
if (myTargetIndex < targetLength)
|
|
{
|
|
mySourceChar = (UChar) mySource[mySourceIndex++];
|
|
targetUniChar = (UChar) ucmp16_getu (myFromUnicode, mySourceChar);
|
|
|
|
if (targetUniChar != missingCharMarker)
|
|
{
|
|
if (targetUniChar <= 0x00FF)
|
|
{
|
|
offsets[myTargetIndex] = mySourceIndex-1;
|
|
myTarget[myTargetIndex++] = (char) targetUniChar;
|
|
|
|
}
|
|
else
|
|
{
|
|
offsets[myTargetIndex] = mySourceIndex-1;
|
|
myTarget[myTargetIndex++] = (char) (targetUniChar >> 8);
|
|
if (myTargetIndex < targetLength)
|
|
{
|
|
offsets[myTargetIndex] = mySourceIndex-1;
|
|
myTarget[myTargetIndex++] = (char) targetUniChar;
|
|
}
|
|
else
|
|
{
|
|
_this->charErrorBuffer[0] = (char) targetUniChar;
|
|
_this->charErrorBufferLength = 1;
|
|
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
int32_t currentOffset = mySourceIndex -1;
|
|
int32_t* offsetsAnchor = offsets;
|
|
|
|
*err = U_INVALID_CHAR_FOUND;
|
|
_this->invalidUCharBuffer[0] = (UChar) mySourceChar;
|
|
_this->invalidUCharLength = 1;
|
|
|
|
FromU_CALLBACK_OFFSETS_LOGIC_MACRO(_this,
|
|
myTarget,
|
|
myTargetIndex,
|
|
targetLimit,
|
|
mySource,
|
|
mySourceIndex,
|
|
sourceLimit,
|
|
offsets,
|
|
flush,
|
|
err);
|
|
|
|
if (U_FAILURE (*err)) break;
|
|
_this->invalidUCharLength = 0;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
break;
|
|
}
|
|
|
|
}
|
|
|
|
|
|
*target += myTargetIndex;
|
|
*source += mySourceIndex;;
|
|
|
|
|
|
return;
|
|
}
|
|
|
|
UChar T_UConverter_getNextUChar_MBCS(UConverter* converter,
|
|
const char** source,
|
|
const char* sourceLimit,
|
|
UErrorCode* err)
|
|
{
|
|
UChar myUChar;
|
|
char const *sourceInitial = *source;
|
|
/*safe keeps a ptr to the beginning in case we need to step back*/
|
|
|
|
/*Input boundary check*/
|
|
if ((*source)+1 > sourceLimit)
|
|
{
|
|
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
return 0xFFFD;
|
|
}
|
|
|
|
/*Checks to see if the byte is a lead*/
|
|
if (converter->sharedData->table->mbcs.starters[(uint8_t)**source] == FALSE)
|
|
{
|
|
/*Not lead byte: we update the source ptr and get the codepoint*/
|
|
myUChar = ucmp16_getu(converter->sharedData->table->mbcs.toUnicode,
|
|
(UChar)(**source));
|
|
(*source)++;
|
|
}
|
|
else
|
|
{
|
|
/*Lead byte: we Build the codepoint and get the corresponding character
|
|
* and update the source ptr*/
|
|
if ((*source + 2) > sourceLimit)
|
|
{
|
|
*err = U_TRUNCATED_CHAR_FOUND;
|
|
return 0xFFFD;
|
|
}
|
|
|
|
myUChar = ucmp16_getu(converter->sharedData->table->mbcs.toUnicode,
|
|
(uint16_t)(((UChar)((**source)) << 8) |((uint8_t)*((*source)+1))));
|
|
|
|
(*source) += 2;
|
|
}
|
|
|
|
if (myUChar != 0xFFFD) return myUChar;
|
|
else
|
|
{
|
|
/*rewinds source*/
|
|
const char* sourceFinal = *source;
|
|
UChar* myUCharPtr = &myUChar;
|
|
|
|
*err = U_INVALID_CHAR_FOUND;
|
|
*source = sourceInitial;
|
|
|
|
/*It's is very likely that the ErrorFunctor will write to the
|
|
*internal buffers */
|
|
converter->fromCharErrorBehaviour(converter,
|
|
&myUCharPtr,
|
|
myUCharPtr + 1,
|
|
&sourceFinal,
|
|
sourceLimit,
|
|
NULL,
|
|
TRUE,
|
|
err);
|
|
|
|
/*makes the internal caching transparent to the user*/
|
|
if (*err == U_INDEX_OUTOFBOUNDS_ERROR) *err = U_ZERO_ERROR;
|
|
|
|
return myUChar;
|
|
}
|
|
}
|
|
|
|
void
|
|
_MBCSGetStarters(const UConverter* converter, bool_t starters[256], UErrorCode *pErrorCode) {
|
|
/* fills in the starters boolean array */
|
|
uprv_memcpy(starters, converter->sharedData->table->mbcs.starters, 256*sizeof(bool_t));
|
|
}
|
|
|
|
static const UConverterImpl _MBCSImpl={
|
|
UCNV_MBCS,
|
|
|
|
_MBCSLoad,
|
|
_MBCSUnload,
|
|
|
|
NULL,
|
|
NULL,
|
|
NULL,
|
|
|
|
T_UConverter_toUnicode_MBCS,
|
|
T_UConverter_toUnicode_MBCS_OFFSETS_LOGIC,
|
|
T_UConverter_fromUnicode_MBCS,
|
|
T_UConverter_fromUnicode_MBCS_OFFSETS_LOGIC,
|
|
T_UConverter_getNextUChar_MBCS,
|
|
|
|
_MBCSGetStarters
|
|
};
|
|
|
|
extern const UConverterSharedData _MBCSData={
|
|
sizeof(UConverterSharedData), 1,
|
|
NULL, NULL, &_MBCSImpl, "MBCS",
|
|
0, UCNV_IBM, UCNV_MBCS, 1, 1,
|
|
{ 0, 1, 0, 0, 0, 0 }
|
|
};
|