c69c164be8
basic build now compiles w/o a single warning on Linux. One with --enable-strict is a different matter... X-SVN-Rev: 1124
959 lines
31 KiB
C
959 lines
31 KiB
C
/*
|
|
**********************************************************************
|
|
* Copyright (C) 2000, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
**********************************************************************
|
|
* file name: ucnv_lmb.cpp
|
|
* encoding: US-ASCII
|
|
* tab size: 4 (not used)
|
|
* indentation:4
|
|
*
|
|
* created on: 2000feb09
|
|
* created by: Brendan Murray
|
|
*/
|
|
|
|
#include "unicode/utypes.h"
|
|
#include "cmemory.h"
|
|
#include "ucmp16.h"
|
|
#include "ucmp8.h"
|
|
#include "unicode/ucnv_bld.h"
|
|
#include "unicode/ucnv.h"
|
|
#include "ucnv_cnv.h"
|
|
|
|
/* LMBCS -------------------------------------------------------------------- */
|
|
|
|
/* Group bytes, and things that look like group bytes, should always be 8-bits */
|
|
typedef uint8_t ulmbcs_grp_t;
|
|
|
|
|
|
/* Define some constants instead of using literals */
|
|
|
|
|
|
/* LMBCS groups */
|
|
#define ULMBCS_GRP_EXCEPT 0x00 /* placeholder index for 'oddballs' XY, where Y<0x80 */
|
|
#define ULMBCS_GRP_L1 0x01 /* Latin-1 */
|
|
#define ULMBCS_GRP_GR 0x02 /* Greek */
|
|
#define ULMBCS_GRP_HE 0x03 /* Hebrew */
|
|
#define ULMBCS_GRP_AR 0x04 /* Arabic */
|
|
#define ULMBCS_GRP_RU 0x05 /* Cyrillic */
|
|
#define ULMBCS_GRP_L2 0x06 /* Latin-2 */
|
|
#define ULMBCS_GRP_TR 0x08 /* Turkish */
|
|
#define ULMBCS_GRP_TH 0x0B /* Thai */
|
|
#define ULMBCS_GRP_CTRL 0x0F /* C0/C1 controls */
|
|
#define ULMBCS_GRP_JA 0x10 /* Japanese */
|
|
#define ULMBCS_GRP_KO 0x11 /* Korean */
|
|
#define ULMBCS_GRP_CN 0x12 /* Chinese PRC */
|
|
#define ULMBCS_GRP_TW 0x13 /* Chinese Taiwan */
|
|
#define ULMBCS_GRP_UNICODE 0x14 /* Unicode compatibility group */
|
|
#define ULMBCS_GRP_LAST 0x14 /* last LMBCS group that means anything */
|
|
|
|
/* some special values that can appear in place of optimization groups */
|
|
#define ULMBCS_HT 0x09 /* Fixed control char - Horizontal Tab */
|
|
#define ULMBCS_LF 0x0A /* Fixed control char - Line Feed */
|
|
#define ULMBCS_CR 0x0D /* Fixed control char - Carriage Return */
|
|
#define ULMBCS_123SYSTEMRANGE 0x19 /* Fixed control char for 1-2-3 file data: start system range name */
|
|
#define ULMBCS_DEFAULTOPTGROUP 0x1 /* default optimization group for LMBCS */
|
|
#define ULMBCS_DOUBLEOPTGROUP 0x10 /* start of double-byte optimization groups */
|
|
|
|
/* parts of LMBCS values, or ranges for LMBCS data */
|
|
#define ULMBCS_UNICOMPATZERO 0xF6 /* PUA range for Unicode chars containing LSB = 0 */
|
|
#define ULMBCS_CTRLOFFSET 0x20 /* Offset of control range in group 0x0F */
|
|
#define ULMBCS_C1START 0x80 /* Start of 'C1' upper ascii range in ANSI code pages */
|
|
#define ULMBCS_C0END 0x1F /* last of the 'C0' lower ascii contraol range in ANSI code pages */
|
|
#define ULMBCS_INVALIDCHAR 0xFFFF /* Invalid character value = convert failed */
|
|
|
|
|
|
/* special return values for FindLMBCSUniRange */
|
|
#define ULMBCS_AMBIGUOUS_SBCS 0x80 /* could fit in more than one
|
|
LMBCS sbcs native encoding (example: most accented latin) */
|
|
#define ULMBCS_AMBIGUOUS_MBCS 0x81 /* could fit in more than one
|
|
LMBCS mbcs native encoding (example: Unihan) */
|
|
|
|
/* macro to check compatibility of groups */
|
|
#define ULMBCS_AMBIGUOUS_MATCH(agroup, xgroup) \
|
|
((((agroup) == ULMBCS_AMBIGUOUS_SBCS) && \
|
|
(xgroup) < ULMBCS_DOUBLEOPTGROUP) || \
|
|
(((agroup) == ULMBCS_AMBIGUOUS_MBCS) && \
|
|
(xgroup) >= ULMBCS_DOUBLEOPTGROUP))
|
|
|
|
/* Max size for 1 LMBCS char */
|
|
#define ULMBCS_CHARSIZE_MAX 3
|
|
|
|
|
|
/* JSGTODO: what is ICU standard debug assertion method?
|
|
Invent an all-crash stop here, for now */
|
|
#if 1
|
|
#define MyAssert(b) {if (!(b)) {*(char *)0 = 1;}}
|
|
#else
|
|
#define MyAssert(b)
|
|
#endif
|
|
|
|
|
|
/* Map Optimization group byte to converter name. Note the following:
|
|
0x00 is dummy, and contains the name of the exceptions converter.
|
|
0x02 is currently unavailable: NLTC have been asked to provide.
|
|
0x0F and 0x14 are algorithmically calculated
|
|
0x09, 0x0A, 0x0D are data bytes (HT, LF, CR)
|
|
0x07, 0x0C and 0x0E are unused
|
|
*/
|
|
static const char * OptGroupByteToCPName[ULMBCS_CTRLOFFSET] = {
|
|
/* 0x0000 */ "lmb-excp", /* No zero opt group: for non-standard entries */
|
|
/* 0x0001 */ "ibm-850",
|
|
/* 0x0002 */ "ibm-851",
|
|
/* 0x0003 */ "ibm-1255",
|
|
/* 0x0004 */ "ibm-1256",
|
|
/* 0x0005 */ "ibm-1251",
|
|
/* 0x0006 */ "ibm-852",
|
|
/* 0x0007 */ NULL, /* Unused */
|
|
/* 0x0008 */ "ibm-1254",
|
|
/* 0x0009 */ NULL, /* Control char HT */
|
|
/* 0x000A */ NULL, /* Control char LF */
|
|
/* 0x000B */ "ibm-874",
|
|
/* 0x000C */ NULL, /* Unused */
|
|
/* 0x000D */ NULL, /* Control char CR */
|
|
/* 0x000E */ NULL, /* Unused */
|
|
/* 0x000F */ NULL, /* Control chars: 0x0F20 + C0/C1 character: algorithmic */
|
|
/* 0x0010 */ "ibm-943",
|
|
/* 0x0011 */ "ibm-1361",
|
|
/* 0x0012 */ "ibm-950",
|
|
/* 0x0013 */ "ibm-1386"
|
|
|
|
/* The rest are null, including the 0x0014 Unicode compatibility region
|
|
and 0x0019, the 1-2-3 system range control char */
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/* map UNICODE ranges to converter indexes (or special values) */
|
|
|
|
ulmbcs_grp_t FindLMBCSUniRange(UChar uniChar, UErrorCode* err);
|
|
|
|
struct _UniLMBCSGrpMap
|
|
{
|
|
UChar uniStartRange;
|
|
UChar uniEndRange;
|
|
ulmbcs_grp_t GrpType;
|
|
} UniLMBCSGrpMap[]
|
|
=
|
|
{
|
|
{ 0x0001, 0x001F, ULMBCS_GRP_CTRL },
|
|
{ 0x0080, 0x009F, ULMBCS_GRP_CTRL },
|
|
{ 0x00A0, 0x0113, ULMBCS_AMBIGUOUS_SBCS },
|
|
{ 0x0115, 0x0120, ULMBCS_AMBIGUOUS_SBCS },
|
|
{ 0x0120, 0x012B, ULMBCS_GRP_EXCEPT },
|
|
{ 0x012C, 0x01CD, ULMBCS_AMBIGUOUS_SBCS },
|
|
{ 0x01CE, 0x01CE, ULMBCS_AMBIGUOUS_MBCS },
|
|
{ 0x01CF, 0x1FFF, ULMBCS_AMBIGUOUS_SBCS },
|
|
{ 0x2000, 0xFFFD, ULMBCS_AMBIGUOUS_MBCS },
|
|
{ 0xFFFF, 0xFFFF }
|
|
};
|
|
|
|
ulmbcs_grp_t FindLMBCSUniRange(UChar uniChar, UErrorCode* err)
|
|
{
|
|
struct _UniLMBCSGrpMap * pTable = UniLMBCSGrpMap;
|
|
|
|
while (uniChar > pTable->uniEndRange)
|
|
{
|
|
pTable++;
|
|
}
|
|
|
|
if (uniChar >= pTable->uniStartRange)
|
|
{
|
|
return pTable->GrpType;
|
|
}
|
|
|
|
if (pTable->uniStartRange == 0xFFFF)
|
|
{
|
|
*err = ULMBCS_INVALIDCHAR;
|
|
}
|
|
return ULMBCS_GRP_UNICODE;
|
|
}
|
|
|
|
#if 0
|
|
/* JSGTODO (by Brendan?) some incomplete source data from Brendan to be integrated */
|
|
|
|
0xFE30, ULMBCS_GRP_JA, ULMBCS_FLAGS_CONTINUE,
|
|
0xFA2E, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE,
|
|
0xF8FF, ULMBCS_GRP_JA, ULMBCS_FLAGS_CONTINUE,
|
|
0xD7FF, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE,
|
|
0xABFF, ULMBCS_GRP_KO, ULMBCS_FLAGS_UNICODE,
|
|
0x9FFF, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE,
|
|
0x31FF, ULMBCS_GRP_JA, ULMBCS_FLAGS_CONTINUE,
|
|
0x318F, ULMBCS_GRP_CN, ULMBCS_FLAGS_CONTINUE,
|
|
0x3130, ULMBCS_GRP_KO, ULMBCS_FLAGS_UNICODE,
|
|
0x3100, ULMBCS_GRP_CN, ULMBCS_FLAGS_CONTINUE,
|
|
0x313F, ULMBCS_GRP_JA, ULMBCS_FLAGS_UNICODE,
|
|
0x2FFF, ULMBCS_GRP_JA, ULMBCS_FLAGS_CONTINUE,
|
|
0x2714, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE,
|
|
0x2000, ULMBCS_GRP_L1, ULMBCS_FLAGS_CONTINUE,
|
|
0x0E5C, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE,
|
|
0x0E00, ULMBCS_GRP_TH, ULMBCS_FLAGS_UNICODE,
|
|
0x06FF, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE,
|
|
0x0600, ULMBCS_GRP_AR, ULMBCS_FLAGS_UNICODE,
|
|
0x0500, ULMBCS_GRP_HE, ULMBCS_FLAGS_UNICODE,
|
|
0x0400, ULMBCS_GRP_RU, ULMBCS_FLAGS_UNICODE,
|
|
0x0300, ULMBCS_GRP_GR, ULMBCS_FLAGS_UNICODE,
|
|
0x001F, ULMBCS_GRP_L1, ULMBCS_FLAGS_CONTINUE,
|
|
0x0000, ULMBCS_GRP_CTRL, ULMBCS_FLAGS_UNICODE
|
|
#endif
|
|
|
|
|
|
int LMBCSConversionWorker (
|
|
UConverterDataLMBCS * extraInfo, ulmbcs_grp_t group,
|
|
uint8_t * pStartLMBCS, UChar * pUniChar,
|
|
ulmbcs_grp_t * lastConverterIndex, bool_t * groups_tried,
|
|
UErrorCode* err);
|
|
|
|
int LMBCSConversionWorker (
|
|
UConverterDataLMBCS * extraInfo, ulmbcs_grp_t group,
|
|
uint8_t * pStartLMBCS, UChar * pUniChar,
|
|
ulmbcs_grp_t * lastConverterIndex, bool_t * groups_tried,
|
|
UErrorCode * err)
|
|
{
|
|
uint8_t * pLMBCS = pStartLMBCS;
|
|
UConverter * xcnv = extraInfo->OptGrpConverter[group];
|
|
uint8_t mbChar [ULMBCS_CHARSIZE_MAX];
|
|
uint8_t * pmbChar = mbChar;
|
|
bool_t isDoubleByteGroup = (group >= ULMBCS_DOUBLEOPTGROUP) ? TRUE : FALSE;
|
|
UErrorCode localErr = 0;
|
|
int bytesConverted =0;
|
|
|
|
MyAssert(xcnv);
|
|
MyAssert(group<ULMBCS_GRP_UNICODE);
|
|
|
|
ucnv_fromUnicode(xcnv, (char **)&pmbChar,(char *)mbChar+sizeof(mbChar),(const UChar **)&pUniChar,pUniChar+1,NULL,TRUE,&localErr);
|
|
bytesConverted = pmbChar - mbChar;
|
|
pmbChar = mbChar;
|
|
|
|
/* most common failure mode is the sub-converter using the substitution char (0x7f for our converters)
|
|
*/
|
|
|
|
if (*mbChar == xcnv->subChar[0] || U_FAILURE(localErr) || !bytesConverted )
|
|
{
|
|
/* JSGTODO: are there some local failure modes that ought to be bubbled up in some other way? */
|
|
groups_tried[group] = TRUE;
|
|
return 0;
|
|
}
|
|
|
|
*lastConverterIndex = group;
|
|
|
|
/* All initial byte values in lower ascii range should have been caught by now,
|
|
except with the exception group.
|
|
|
|
Uncomment this assert to find them.
|
|
*/
|
|
|
|
/* MyAssert((*pmbChar <= ULMBCS_C0END) || (*pmbChar >= ULMBCS_C1START) || (group == ULMBCS_GRP_EXCEPT)); */
|
|
|
|
/* use converted data: first write 0, 1 or two group bytes */
|
|
if (group != ULMBCS_GRP_EXCEPT && extraInfo->OptGroup != group)
|
|
{
|
|
*pLMBCS++ = group;
|
|
if (bytesConverted == 1 && isDoubleByteGroup)
|
|
{
|
|
*pLMBCS++ = group;
|
|
}
|
|
}
|
|
/* then move over the converted data */
|
|
do
|
|
{
|
|
*pLMBCS++ = *pmbChar++;
|
|
}
|
|
while(--bytesConverted);
|
|
|
|
return (pLMBCS - pStartLMBCS);
|
|
}
|
|
|
|
|
|
/* Convert Unicode string to LMBCS */
|
|
void _LMBCSFromUnicode(UConverter* _this,
|
|
char** target,
|
|
const char* targetLimit,
|
|
const UChar** source,
|
|
const UChar* sourceLimit,
|
|
int32_t * offsets,
|
|
bool_t flush,
|
|
UErrorCode* err)
|
|
{
|
|
ulmbcs_grp_t lastConverterIndex = 0;
|
|
UChar uniChar;
|
|
uint8_t LMBCS[ULMBCS_CHARSIZE_MAX];
|
|
uint8_t * pLMBCS;
|
|
int bytes_written;
|
|
bool_t groups_tried[ULMBCS_GRP_LAST];
|
|
UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo;
|
|
|
|
/* Arguments Check */
|
|
if (!err || U_FAILURE(*err))
|
|
{
|
|
return;
|
|
}
|
|
|
|
if (sourceLimit < *source)
|
|
{
|
|
*err = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
|
|
|
|
do
|
|
{
|
|
uniChar = *(*source)++;
|
|
bytes_written = 0;
|
|
pLMBCS = LMBCS;
|
|
|
|
/* single byte matches */
|
|
|
|
if (uniChar == 0 || uniChar == ULMBCS_HT || uniChar == ULMBCS_CR ||
|
|
uniChar == ULMBCS_LF || uniChar == ULMBCS_123SYSTEMRANGE ||
|
|
((uniChar >= ULMBCS_CTRLOFFSET) && (uniChar < ULMBCS_C1START)))
|
|
{
|
|
*pLMBCS++ = (uint8_t) uniChar;
|
|
bytes_written = 1;
|
|
}
|
|
|
|
|
|
if (!bytes_written)
|
|
{
|
|
/* Check by UNICODE range */
|
|
ulmbcs_grp_t group = FindLMBCSUniRange(uniChar,err);
|
|
|
|
if (group == ULMBCS_GRP_UNICODE)
|
|
{
|
|
/* encode into LMBCS Unicode range */
|
|
uint8_t LowCh = (uint8_t) (uniChar & 0x00FF);
|
|
uint8_t HighCh = (uint8_t)(uniChar >> 8);
|
|
|
|
*pLMBCS++ = ULMBCS_GRP_UNICODE;
|
|
|
|
if (LowCh == 0)
|
|
{
|
|
*pLMBCS++ = ULMBCS_UNICOMPATZERO;
|
|
*pLMBCS++ = HighCh;
|
|
}
|
|
else
|
|
{
|
|
*pLMBCS++ = HighCh;
|
|
*pLMBCS++ = LowCh;
|
|
}
|
|
|
|
bytes_written = pLMBCS - LMBCS;
|
|
}
|
|
else if (group == ULMBCS_GRP_CTRL)
|
|
{
|
|
/* Handle control characters here */
|
|
if (uniChar <= ULMBCS_C0END)
|
|
{
|
|
*pLMBCS++ = ULMBCS_GRP_CTRL;
|
|
*pLMBCS++ = ULMBCS_CTRLOFFSET + (uint8_t) uniChar;
|
|
}
|
|
else if (uniChar >= ULMBCS_C1START && uniChar <= ULMBCS_C1START + ULMBCS_CTRLOFFSET)
|
|
{
|
|
*pLMBCS++ = ULMBCS_GRP_CTRL;
|
|
*pLMBCS++ = (uint8_t) (uniChar & 0x00FF);
|
|
}
|
|
bytes_written = pLMBCS - LMBCS;
|
|
}
|
|
else if (group < ULMBCS_GRP_UNICODE)
|
|
{
|
|
/* a specific converter has been identified - use it */
|
|
bytes_written = LMBCSConversionWorker (
|
|
extraInfo, group, pLMBCS, &uniChar,
|
|
&lastConverterIndex, groups_tried, err);
|
|
|
|
MyAssert(bytes_written); /* table should never return unusable group */
|
|
|
|
}
|
|
else /* the ambiguous group cases */
|
|
{
|
|
memset(groups_tried, 0, sizeof(groups_tried));
|
|
|
|
/* check for non-default optimization group */
|
|
if (extraInfo->OptGroup != 1
|
|
&& ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->OptGroup))
|
|
{
|
|
bytes_written = LMBCSConversionWorker (extraInfo,
|
|
extraInfo->OptGroup, pLMBCS, &uniChar,
|
|
&lastConverterIndex, groups_tried, err);
|
|
}
|
|
/* check for locale optimization group */
|
|
if (!bytes_written
|
|
&& (extraInfo->localeConverterIndex)
|
|
&& (ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->localeConverterIndex)))
|
|
{
|
|
bytes_written = LMBCSConversionWorker (extraInfo,
|
|
extraInfo->localeConverterIndex, pLMBCS, &uniChar,
|
|
&lastConverterIndex, groups_tried, err);
|
|
}
|
|
/* check for last optimization group used for this string */
|
|
if (!bytes_written
|
|
&& (lastConverterIndex)
|
|
&& (ULMBCS_AMBIGUOUS_MATCH(group, lastConverterIndex)))
|
|
{
|
|
bytes_written = LMBCSConversionWorker (extraInfo,
|
|
lastConverterIndex, pLMBCS, &uniChar,
|
|
&lastConverterIndex, groups_tried, err);
|
|
|
|
}
|
|
if (!bytes_written)
|
|
{
|
|
/* just check every matching converter */
|
|
ulmbcs_grp_t grp_start;
|
|
ulmbcs_grp_t grp_end;
|
|
ulmbcs_grp_t grp_ix;
|
|
grp_start = (group == ULMBCS_AMBIGUOUS_MBCS)
|
|
? ULMBCS_DOUBLEOPTGROUP
|
|
: ULMBCS_GRP_L1;
|
|
grp_end = (group == ULMBCS_AMBIGUOUS_MBCS)
|
|
? ULMBCS_GRP_LAST-1
|
|
: ULMBCS_GRP_TH;
|
|
|
|
for (grp_ix = grp_start;
|
|
grp_ix <= grp_end && !bytes_written;
|
|
grp_ix++)
|
|
{
|
|
if (extraInfo->OptGrpConverter [grp_ix] && !groups_tried [grp_ix])
|
|
{
|
|
bytes_written = LMBCSConversionWorker (extraInfo,
|
|
grp_ix, pLMBCS, &uniChar,
|
|
&lastConverterIndex, groups_tried, err);
|
|
}
|
|
}
|
|
|
|
/* a final conversion fallback for sbcs to the exceptions group */
|
|
if (!bytes_written && group == ULMBCS_AMBIGUOUS_SBCS)
|
|
{
|
|
bytes_written = LMBCSConversionWorker (extraInfo,
|
|
ULMBCS_GRP_EXCEPT, pLMBCS, &uniChar,
|
|
&lastConverterIndex, groups_tried, err);
|
|
}
|
|
/* all of our strategies failed. Fallback to Unicode. Consider adding these to table */
|
|
|
|
if (!bytes_written)
|
|
{
|
|
/* encode into LMBCS Unicode range */
|
|
uint8_t LowCh = (uint8_t) uniChar;
|
|
uint8_t HighCh = (uint8_t)(uniChar >> 8);
|
|
|
|
*pLMBCS++ = ULMBCS_GRP_UNICODE;
|
|
|
|
if (LowCh == 0)
|
|
{
|
|
*pLMBCS++ = ULMBCS_UNICOMPATZERO;
|
|
*pLMBCS++ = HighCh;
|
|
}
|
|
else
|
|
{
|
|
*pLMBCS++ = HighCh;
|
|
*pLMBCS++ = LowCh;
|
|
}
|
|
|
|
bytes_written = pLMBCS - LMBCS;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (*target + bytes_written > targetLimit)
|
|
{
|
|
/* JSGTODO deal with buffer running out here */
|
|
}
|
|
|
|
/* now that we are sure it all fits, move it in */
|
|
for(pLMBCS = LMBCS; bytes_written--; *(*target)++ = *pLMBCS++)
|
|
{ };
|
|
|
|
}
|
|
while (*source< sourceLimit &&
|
|
*target < targetLimit &&
|
|
!U_FAILURE(*err));
|
|
|
|
/* JSGTODO Check the various exit conditions */
|
|
}
|
|
|
|
|
|
|
|
/* Return the Unicode representation for the current LMBCS character */
|
|
UChar32 _LMBCSGetNextUChar(UConverter* _this,
|
|
const char** source,
|
|
const char* sourceLimit,
|
|
UErrorCode* err)
|
|
{
|
|
uint8_t CurByte; /* A byte from the input stream */
|
|
UChar32 uniChar; /* an output UNICODE char */
|
|
UChar mbChar; /* an intermediate multi-byte value (mbcs or LMBCS) */
|
|
CompactShortArray *MyCArray = NULL;
|
|
UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo;
|
|
ulmbcs_grp_t group = 0;
|
|
UConverter* cnv = 0;
|
|
|
|
/* Opt Group (or first data byte) */
|
|
CurByte = *((uint8_t *) (*source)++);
|
|
uniChar = 0;
|
|
|
|
/*
|
|
* at entry of each if clause:
|
|
* 1. 'CurByte' points at the first byte of a LMBCS character
|
|
* 2. '*source'points to the next byte of the source stream after 'CurByte'
|
|
*
|
|
* the job of each if clause is:
|
|
* 1. set '*source' to point at the beginning of next char (nop if LMBCS char is only 1 byte)
|
|
* 2. set 'uniChar' up with the right Unicode value, or set 'err' appropriately
|
|
*/
|
|
|
|
/* First lets check the simple fixed values. */
|
|
/* JSGTODO (from markus): a switch would be much faster here */
|
|
if (CurByte == 0 || CurByte == ULMBCS_HT || CurByte == ULMBCS_CR ||
|
|
CurByte == ULMBCS_LF || CurByte == ULMBCS_123SYSTEMRANGE ||
|
|
((CurByte >= ULMBCS_CTRLOFFSET) && (CurByte < ULMBCS_C1START)))
|
|
{
|
|
uniChar = CurByte;
|
|
}
|
|
else
|
|
if (CurByte == ULMBCS_GRP_CTRL) /* Control character group - no opt group update */
|
|
{
|
|
/* JSGTODO (from markus): please make sure your error code returns are consistent with
|
|
those of the other converters; the utf implementations return truncated only when
|
|
the input is too short; if there is nothing at all, then they set index out of bounds.
|
|
see unicode in here.
|
|
(and, please, come to a common indentation - brendan 2, you 3??)
|
|
(plus, no // comments in c code - it breaks many c compilers!)
|
|
*/
|
|
if (*source >= sourceLimit)
|
|
{
|
|
*err = U_TRUNCATED_CHAR_FOUND;
|
|
}
|
|
else
|
|
{
|
|
uint8_t C0C1byte = *(*source)++;
|
|
uniChar = (C0C1byte < ULMBCS_C1START) ? C0C1byte - ULMBCS_CTRLOFFSET : C0C1byte;
|
|
}
|
|
}
|
|
else
|
|
if (CurByte == ULMBCS_GRP_UNICODE) /* Unicode compatibility group: BE as is */
|
|
{
|
|
uint8_t HighCh, LowCh;
|
|
|
|
if (*source + 2 > sourceLimit)
|
|
{
|
|
if (*source >= sourceLimit)
|
|
{
|
|
*err = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
}
|
|
else
|
|
{
|
|
*err = U_TRUNCATED_CHAR_FOUND;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
HighCh = *(*source)++; /* Big-endian Unicode in LMBCS compatibility group*/
|
|
LowCh = *(*source)++;
|
|
|
|
if (HighCh == ULMBCS_UNICOMPATZERO )
|
|
{
|
|
HighCh = LowCh;
|
|
LowCh = 0; /* zero-byte in LSB special character */
|
|
}
|
|
|
|
uniChar = (HighCh << 8) | LowCh;
|
|
|
|
/* UTF-16 means that there may be a surrogate pair */
|
|
if(UTF_IS_FIRST_SURROGATE(uniChar))
|
|
{
|
|
/* assume that single surrogates only occur in Unicode LMBCS sequences */
|
|
if (*source >= sourceLimit)
|
|
{
|
|
*err = U_TRUNCATED_CHAR_FOUND;
|
|
}
|
|
else
|
|
/* is there really Unicode, and a second surrogate?
|
|
if not, then we ignore it without error
|
|
*/
|
|
if(**source == ULMBCS_GRP_UNICODE)
|
|
{
|
|
if (*source + 3 > sourceLimit)
|
|
{
|
|
*err = U_TRUNCATED_CHAR_FOUND;
|
|
}
|
|
else
|
|
{
|
|
uint16_t second;
|
|
HighCh = *(*source + 1); /* Big-endian Unicode in LMBCS compatibility group*/
|
|
LowCh = *(*source + 2);
|
|
|
|
if (HighCh == ULMBCS_UNICOMPATZERO )
|
|
{
|
|
HighCh = LowCh;
|
|
LowCh = 0; /* zero-byte in LSB special character */
|
|
}
|
|
|
|
second = (HighCh << 8) | LowCh;
|
|
if(UTF_IS_SECOND_SURROGATE(second))
|
|
{
|
|
uniChar = UTF16_GET_PAIR_VALUE(uniChar, second);
|
|
*source += 3;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
else if (CurByte <= ULMBCS_CTRLOFFSET)
|
|
{
|
|
group = CurByte; /* group byte is in the source */
|
|
cnv = extraInfo->OptGrpConverter[group];
|
|
|
|
if (!cnv)
|
|
{
|
|
/* this is not a valid group byte - no converter*/
|
|
*err = U_INVALID_CHAR_FOUND;
|
|
}
|
|
|
|
|
|
else if (group >= ULMBCS_DOUBLEOPTGROUP) /* double byte conversion */
|
|
{
|
|
uint8_t HighCh, LowCh;
|
|
|
|
|
|
HighCh = *(*source)++;
|
|
LowCh = *(*source)++;
|
|
|
|
/* check for LMBCS doubled-group-byte case */
|
|
mbChar = (HighCh == group) ? LowCh : (HighCh<<8) | LowCh;
|
|
|
|
MyCArray = cnv->sharedData->table->mbcs.toUnicode;
|
|
uniChar = (UChar) ucmp16_getu (MyCArray, mbChar);
|
|
|
|
}
|
|
else /* single byte conversion */
|
|
{
|
|
CurByte = *(*source)++;
|
|
if (CurByte >= ULMBCS_C1START)
|
|
{
|
|
uniChar = cnv->sharedData->table->sbcs.toUnicode[CurByte];
|
|
}
|
|
else
|
|
{
|
|
/* The non-optimizable oddballs where there is an explicit byte
|
|
* AND the second byte is not in the upper ascii range
|
|
*/
|
|
cnv = extraInfo->OptGrpConverter [ULMBCS_GRP_EXCEPT];
|
|
|
|
/* Lookup value must include opt group */
|
|
mbChar = (UChar)(group << 8) | (UChar) CurByte;
|
|
|
|
MyCArray = cnv->sharedData->table->mbcs.toUnicode;
|
|
uniChar = (UChar) ucmp16_getu(MyCArray, mbChar);
|
|
|
|
}
|
|
}
|
|
}
|
|
else if (CurByte >= ULMBCS_C1START) /* group byte is implicit */
|
|
{
|
|
group = extraInfo->OptGroup;
|
|
cnv = extraInfo->OptGrpConverter[group];
|
|
|
|
if (group >= ULMBCS_DOUBLEOPTGROUP) /* double byte conversion */
|
|
{
|
|
uint8_t HighCh, LowCh;
|
|
|
|
/* JSGTODO need to deal with case of single byte G1
|
|
chars in mbcs groups */
|
|
|
|
HighCh = CurByte;
|
|
LowCh = *(*source)++;
|
|
|
|
mbChar = (HighCh<<8) | LowCh;
|
|
MyCArray = cnv->sharedData->table->mbcs.toUnicode;
|
|
uniChar = (UChar) ucmp16_getu (MyCArray, mbChar);
|
|
(*source) += sizeof(UChar);
|
|
}
|
|
else /* single byte conversion */
|
|
{
|
|
uniChar = cnv->sharedData->table->sbcs.toUnicode[CurByte];
|
|
}
|
|
}
|
|
else
|
|
{
|
|
#if DEBUG
|
|
/* JSGTODO: assert here: we should never get here. */
|
|
#endif
|
|
|
|
}
|
|
/* JSGTODO: need to correctly deal with partial chars */
|
|
/* JSGTODO (from markus :-) - deal with surrogate pairs;
|
|
see UTF-8/16BE/16LE implementations,
|
|
http://oss.software.ibm.com/icu/archives/icu/icu.0002/msg00043.html
|
|
|
|
behavior: uniChar is now declared UChar32;
|
|
if(UTF_IS_FIRST_SURROGATE(uniChar)) then check for more input length
|
|
if too short, then error
|
|
else get another 16-bit unit
|
|
if(UTF_IS_SECOND_SURROGATE(second unit)) then
|
|
uniChar=UTF16_GET_PAIR_VALUE(uniChar, second unit);
|
|
|
|
You may need to do this only when the following LMBCS byte indicates
|
|
embedded Unicode (ULMBCS_GRP_UNICODE), and get the following surrogate directly
|
|
from the following two bytes like the UTF-16BE implementation.
|
|
|
|
actually, just for the embedded Unicode, i did this. if no other groups
|
|
in LMBCS can carry single surrogates, then we may be done with my changes.
|
|
*/
|
|
return uniChar;
|
|
}
|
|
|
|
|
|
|
|
void _LMBCSToUnicodeWithOffsets(UConverter* _this,
|
|
UChar** target,
|
|
const UChar* targetLimit,
|
|
const char** source,
|
|
const char* sourceLimit,
|
|
int32_t* offsets,
|
|
bool_t flush,
|
|
UErrorCode* err)
|
|
{
|
|
UChar32 uniChar; /* an output UNICODE char */
|
|
CompactShortArray *MyCArray = NULL;
|
|
UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo;
|
|
ulmbcs_grp_t group = 0;
|
|
UConverter* cnv = 0;
|
|
const char * pStartLMBCS = *source;
|
|
|
|
if (!err || U_FAILURE(*err))
|
|
{
|
|
return;
|
|
}
|
|
if ((_this == NULL) || (targetLimit < *target) || (sourceLimit < *source))
|
|
{
|
|
*err = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
|
|
#if 0 /* JSGTODOD - restore incomplete char handling */
|
|
|
|
/* Have we arrived here from a prior conversion ending with a partial char?
|
|
The only possible configurations are:
|
|
1. mode contains the group byte of SBCS LMBCS char;
|
|
2. mode contains the group byte of MBCS LMBCS char
|
|
For both continue with next char in input buffer
|
|
3. mode contains group byte + 1st data byte of MBCS LMBCS char
|
|
Partially process & get the second data byte
|
|
4. mode contains both group bytes of double group-byte MBCS LMBCS char
|
|
Nuke contents after setting up converter & continue with buffer data
|
|
*/
|
|
if (_this->toUnicodeStatus)
|
|
{
|
|
mbChar = (UChar) _this->mode; /* Restore the previously calculated char */
|
|
|
|
_this->toUnicodeStatus = 0; /* Reset other fields*/
|
|
_this->invalidCharLength = 0;
|
|
|
|
/* Check if this is a partial MBCS char (fall through if SBCS) */
|
|
if (mbChar > 0xFF)
|
|
{
|
|
/* Select the correct converter */
|
|
group = (mbChar >> 8) & 0x00FF;
|
|
cnv = extraInfo->OptGrpConverter[group];
|
|
|
|
/* Pick up the converter table */
|
|
MyCArray = cnv->sharedData->table->mbcs.toUnicode;
|
|
|
|
/* Use only data byte: NULL if the character has pair of group-bytes */
|
|
if (mbChar & 0x00FF < ULMBCS_MAXGRPBYTE)
|
|
CurByte = 0;
|
|
else
|
|
CurByte = ((mbChar & 0x00FF) << 8);
|
|
|
|
/* Add the current char from the buffer */
|
|
CurByte |= *((uint8_t *) (*source)++);
|
|
|
|
goto continueWithPartialMBCSChar;
|
|
|
|
}
|
|
else
|
|
{
|
|
goto continueWithPartialChar;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
|
|
|
|
/* Process from source to limit */
|
|
while (!*err && sourceLimit > *source && targetLimit > *target)
|
|
{
|
|
if(offsets)
|
|
{
|
|
*offsets = (*source) - pStartLMBCS;
|
|
}
|
|
|
|
uniChar = _LMBCSGetNextUChar(_this, source, sourceLimit, err);
|
|
|
|
|
|
/* last step is always to move the new value into the buffer */
|
|
if (U_SUCCESS(*err) && uniChar != missingUCharMarker)
|
|
{
|
|
/* JSGTODO deal with missingUCharMarker case for error/info reporting. */
|
|
if(!UTF_NEED_MULTIPLE_UCHAR(uniChar)) {
|
|
*(*target)++ = (UChar)uniChar;
|
|
} else {
|
|
/* JSGTODO (from markus)
|
|
write several UChar's for this UChar32;
|
|
you may need to use macros like UTF_APPEND_CHAR() or similar (from utf.h)
|
|
what does this mean for the target range check and for the offsets?
|
|
*/
|
|
}
|
|
if(offsets)
|
|
{
|
|
offsets++;
|
|
}
|
|
|
|
}
|
|
}
|
|
#if 0
|
|
/* JSGTODO restore partial char handling */
|
|
/* Check to see if we've fallen through because of a partial char */
|
|
if (*err == U_TRUNCATED_CHAR_FOUND)
|
|
{
|
|
_this->mode = mbChar; /* Save current partial char */
|
|
}
|
|
#endif
|
|
}
|
|
|
|
|
|
|
|
/* Convert LMBCS string to Unicode */
|
|
void _LMBCSToUnicode(UConverter* _this,
|
|
UChar** target,
|
|
const UChar* targetLimit,
|
|
const char** source,
|
|
const char* sourceLimit,
|
|
int32_t* offsets,
|
|
bool_t flush,
|
|
UErrorCode* err)
|
|
{
|
|
_LMBCSToUnicodeWithOffsets(_this, target, targetLimit, source, sourceLimit, offsets, flush,err);
|
|
}
|
|
|
|
|
|
|
|
static void _LMBCSOpenWorker(UConverter* _this,
|
|
const char* name,
|
|
const char* locale,
|
|
UErrorCode* err,
|
|
ulmbcs_grp_t OptGroup
|
|
)
|
|
{
|
|
UConverterDataLMBCS * extraInfo = uprv_malloc (sizeof (UConverterDataLMBCS));
|
|
|
|
if(extraInfo != NULL)
|
|
{
|
|
|
|
ulmbcs_grp_t i;
|
|
ulmbcs_grp_t imax;
|
|
|
|
imax = sizeof(extraInfo->OptGrpConverter)/sizeof(extraInfo->OptGrpConverter[0]);
|
|
|
|
for (i=0; i < imax; i++)
|
|
{
|
|
extraInfo->OptGrpConverter[i] =
|
|
(OptGroupByteToCPName[i] != NULL) ?
|
|
ucnv_open(OptGroupByteToCPName[i], err) : NULL;
|
|
}
|
|
|
|
extraInfo->OptGroup = OptGroup;
|
|
/* JSGTODO: add LocaleConverterIndex logic here */
|
|
extraInfo->localeConverterIndex = 0;
|
|
}
|
|
else
|
|
{
|
|
*err = U_MEMORY_ALLOCATION_ERROR;
|
|
}
|
|
|
|
_this->extraInfo = extraInfo;
|
|
}
|
|
|
|
|
|
|
|
|
|
static void _LMBCSClose(UConverter * _this)
|
|
{
|
|
if (_this->extraInfo != NULL)
|
|
{
|
|
ulmbcs_grp_t Ix;
|
|
|
|
for (Ix=0; Ix < ULMBCS_GRP_UNICODE; Ix++)
|
|
{
|
|
UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo;
|
|
if (extraInfo->OptGrpConverter[Ix] != NULL)
|
|
ucnv_close (extraInfo->OptGrpConverter[Ix]);
|
|
}
|
|
uprv_free (_this->extraInfo);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
#define DEFINE_LMBCS_OPEN(n) \
|
|
static void _LMBCSOpen##n(UConverter* _this,const char* name,const char* locale,UErrorCode* err) \
|
|
{ _LMBCSOpenWorker(_this, name,locale, err, n);} \
|
|
|
|
|
|
DEFINE_LMBCS_OPEN(1)
|
|
DEFINE_LMBCS_OPEN(2)
|
|
DEFINE_LMBCS_OPEN(3)
|
|
DEFINE_LMBCS_OPEN(4)
|
|
DEFINE_LMBCS_OPEN(5)
|
|
DEFINE_LMBCS_OPEN(6)
|
|
DEFINE_LMBCS_OPEN(8)
|
|
DEFINE_LMBCS_OPEN(11)
|
|
DEFINE_LMBCS_OPEN(16)
|
|
DEFINE_LMBCS_OPEN(17)
|
|
DEFINE_LMBCS_OPEN(18)
|
|
DEFINE_LMBCS_OPEN(19)
|
|
|
|
#define DECLARE_LMBCS_DATA(n) \
|
|
static const UConverterImpl _LMBCSImpl##n={\
|
|
UCNV_LMBCS_##n,\
|
|
NULL,NULL,\
|
|
_LMBCSOpen##n,\
|
|
_LMBCSClose,\
|
|
NULL,\
|
|
_LMBCSToUnicode,\
|
|
_LMBCSToUnicodeWithOffsets,\
|
|
_LMBCSFromUnicode,\
|
|
NULL,\
|
|
_LMBCSGetNextUChar,\
|
|
NULL\
|
|
};\
|
|
const UConverterSharedData _LMBCSData##n={\
|
|
sizeof(UConverterSharedData), ~0,\
|
|
NULL, NULL, &_LMBCSImpl##n, "LMBCS_" ## #n,\
|
|
0, UCNV_IBM, UCNV_LMBCS_1, 1, 1,\
|
|
{ 0, 1, { 0x3f, 0, 0, 0 } }\
|
|
};
|
|
|
|
DECLARE_LMBCS_DATA(1)
|
|
DECLARE_LMBCS_DATA(2)
|
|
DECLARE_LMBCS_DATA(3)
|
|
DECLARE_LMBCS_DATA(4)
|
|
DECLARE_LMBCS_DATA(5)
|
|
DECLARE_LMBCS_DATA(6)
|
|
DECLARE_LMBCS_DATA(8)
|
|
DECLARE_LMBCS_DATA(11)
|
|
DECLARE_LMBCS_DATA(16)
|
|
DECLARE_LMBCS_DATA(17)
|
|
DECLARE_LMBCS_DATA(18)
|
|
DECLARE_LMBCS_DATA(19)
|
|
|
|
|
|
|
|
|