/* ********************************************************************** * Copyright (C) 2000, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: ucnv_lmb.cpp * encoding: US-ASCII * tab size: 4 (not used) * indentation:4 * * created on: 2000feb09 * created by: Brendan Murray */ #include "unicode/utypes.h" #include "cmemory.h" #include "ucmp16.h" #include "ucmp8.h" #include "unicode/ucnv_bld.h" #include "unicode/ucnv.h" #include "ucnv_cnv.h" /* LMBCS -------------------------------------------------------------------- */ /* Group bytes, and things that look like group bytes, should always be 8-bits */ typedef uint8_t ulmbcs_grp_t; /* Define some constants instead of using literals */ /* LMBCS groups */ #define ULMBCS_GRP_EXCEPT 0x00 /* placeholder index for 'oddballs' XY, where Y<0x80 */ #define ULMBCS_GRP_L1 0x01 /* Latin-1 */ #define ULMBCS_GRP_GR 0x02 /* Greek */ #define ULMBCS_GRP_HE 0x03 /* Hebrew */ #define ULMBCS_GRP_AR 0x04 /* Arabic */ #define ULMBCS_GRP_RU 0x05 /* Cyrillic */ #define ULMBCS_GRP_L2 0x06 /* Latin-2 */ #define ULMBCS_GRP_TR 0x08 /* Turkish */ #define ULMBCS_GRP_TH 0x0B /* Thai */ #define ULMBCS_GRP_CTRL 0x0F /* C0/C1 controls */ #define ULMBCS_GRP_JA 0x10 /* Japanese */ #define ULMBCS_GRP_KO 0x11 /* Korean */ #define ULMBCS_GRP_CN 0x12 /* Chinese PRC */ #define ULMBCS_GRP_TW 0x13 /* Chinese Taiwan */ #define ULMBCS_GRP_UNICODE 0x14 /* Unicode compatibility group */ #define ULMBCS_GRP_LAST 0x14 /* last LMBCS group that means anything */ /* some special values that can appear in place of optimization groups */ #define ULMBCS_HT 0x09 /* Fixed control char - Horizontal Tab */ #define ULMBCS_LF 0x0A /* Fixed control char - Line Feed */ #define ULMBCS_CR 0x0D /* Fixed control char - Carriage Return */ #define ULMBCS_123SYSTEMRANGE 0x19 /* Fixed control char for 1-2-3 file data: start system range name */ #define ULMBCS_DEFAULTOPTGROUP 0x1 /* default optimization group for LMBCS */ #define ULMBCS_DOUBLEOPTGROUP 0x10 /* start of double-byte optimization groups */ /* parts of LMBCS values, or ranges for LMBCS data */ #define ULMBCS_UNICOMPATZERO 0xF6 /* PUA range for Unicode chars containing LSB = 0 */ #define ULMBCS_CTRLOFFSET 0x20 /* Offset of control range in group 0x0F */ #define ULMBCS_C1START 0x80 /* Start of 'C1' upper ascii range in ANSI code pages */ #define ULMBCS_C0END 0x1F /* last of the 'C0' lower ascii contraol range in ANSI code pages */ #define ULMBCS_INVALIDCHAR 0xFFFF /* Invalid character value = convert failed */ /* special return values for FindLMBCSUniRange */ #define ULMBCS_AMBIGUOUS_SBCS 0x80 /* could fit in more than one LMBCS sbcs native encoding (example: most accented latin) */ #define ULMBCS_AMBIGUOUS_MBCS 0x81 /* could fit in more than one LMBCS mbcs native encoding (example: Unihan) */ /* macro to check compatibility of groups */ #define ULMBCS_AMBIGUOUS_MATCH(agroup, xgroup) \ ((((agroup) == ULMBCS_AMBIGUOUS_SBCS) && \ (xgroup) < ULMBCS_DOUBLEOPTGROUP) || \ (((agroup) == ULMBCS_AMBIGUOUS_MBCS) && \ (xgroup) >= ULMBCS_DOUBLEOPTGROUP)) /* Max size for 1 LMBCS char */ #define ULMBCS_CHARSIZE_MAX 3 /* JSGTODO: what is ICU standard debug assertion method? Invent an all-crash stop here, for now */ #if 1 #define MyAssert(b) {if (!(b)) {*(char *)0 = 1;}} #else #define MyAssert(b) #endif /* Map Optimization group byte to converter name. Note the following: 0x00 is dummy, and contains the name of the exceptions converter. 0x02 is currently unavailable: NLTC have been asked to provide. 0x0F and 0x14 are algorithmically calculated 0x09, 0x0A, 0x0D are data bytes (HT, LF, CR) 0x07, 0x0C and 0x0E are unused */ static const char * OptGroupByteToCPName[ULMBCS_CTRLOFFSET] = { /* 0x0000 */ "lmb-excp", /* No zero opt group: for non-standard entries */ /* 0x0001 */ "ibm-850", /* 0x0002 */ "ibm-851", /* 0x0003 */ "ibm-1255", /* 0x0004 */ "ibm-1256", /* 0x0005 */ "ibm-1251", /* 0x0006 */ "ibm-852", /* 0x0007 */ NULL, /* Unused */ /* 0x0008 */ "ibm-1254", /* 0x0009 */ NULL, /* Control char HT */ /* 0x000A */ NULL, /* Control char LF */ /* 0x000B */ "ibm-874", /* 0x000C */ NULL, /* Unused */ /* 0x000D */ NULL, /* Control char CR */ /* 0x000E */ NULL, /* Unused */ /* 0x000F */ NULL, /* Control chars: 0x0F20 + C0/C1 character: algorithmic */ /* 0x0010 */ "ibm-943", /* 0x0011 */ "ibm-1361", /* 0x0012 */ "ibm-950", /* 0x0013 */ "ibm-1386" /* The rest are null, including the 0x0014 Unicode compatibility region and 0x0019, the 1-2-3 system range control char */ }; /* map UNICODE ranges to converter indexes (or special values) */ ulmbcs_grp_t FindLMBCSUniRange(UChar uniChar, UErrorCode* err); struct _UniLMBCSGrpMap { UChar uniStartRange; UChar uniEndRange; ulmbcs_grp_t GrpType; } UniLMBCSGrpMap[] = { { 0x0001, 0x001F, ULMBCS_GRP_CTRL }, { 0x0080, 0x009F, ULMBCS_GRP_CTRL }, { 0x00A0, 0x0113, ULMBCS_AMBIGUOUS_SBCS }, { 0x0115, 0x0120, ULMBCS_AMBIGUOUS_SBCS }, { 0x0120, 0x012B, ULMBCS_GRP_EXCEPT }, { 0x012C, 0x01CD, ULMBCS_AMBIGUOUS_SBCS }, { 0x01CE, 0x01CE, ULMBCS_AMBIGUOUS_MBCS }, { 0x01CF, 0x1FFF, ULMBCS_AMBIGUOUS_SBCS }, { 0x2000, 0xFFFD, ULMBCS_AMBIGUOUS_MBCS }, { 0xFFFF, 0xFFFF } }; ulmbcs_grp_t FindLMBCSUniRange(UChar uniChar, UErrorCode* err) { struct _UniLMBCSGrpMap * pTable = UniLMBCSGrpMap; while (uniChar > pTable->uniEndRange) { pTable++; } if (uniChar >= pTable->uniStartRange) { return pTable->GrpType; } if (pTable->uniStartRange == 0xFFFF) { *err = ULMBCS_INVALIDCHAR; } return ULMBCS_GRP_UNICODE; } #if 0 /* JSGTODO (by Brendan?) some incomplete source data from Brendan to be integrated */ 0xFE30, ULMBCS_GRP_JA, ULMBCS_FLAGS_CONTINUE, 0xFA2E, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE, 0xF8FF, ULMBCS_GRP_JA, ULMBCS_FLAGS_CONTINUE, 0xD7FF, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE, 0xABFF, ULMBCS_GRP_KO, ULMBCS_FLAGS_UNICODE, 0x9FFF, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE, 0x31FF, ULMBCS_GRP_JA, ULMBCS_FLAGS_CONTINUE, 0x318F, ULMBCS_GRP_CN, ULMBCS_FLAGS_CONTINUE, 0x3130, ULMBCS_GRP_KO, ULMBCS_FLAGS_UNICODE, 0x3100, ULMBCS_GRP_CN, ULMBCS_FLAGS_CONTINUE, 0x313F, ULMBCS_GRP_JA, ULMBCS_FLAGS_UNICODE, 0x2FFF, ULMBCS_GRP_JA, ULMBCS_FLAGS_CONTINUE, 0x2714, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE, 0x2000, ULMBCS_GRP_L1, ULMBCS_FLAGS_CONTINUE, 0x0E5C, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE, 0x0E00, ULMBCS_GRP_TH, ULMBCS_FLAGS_UNICODE, 0x06FF, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE, 0x0600, ULMBCS_GRP_AR, ULMBCS_FLAGS_UNICODE, 0x0500, ULMBCS_GRP_HE, ULMBCS_FLAGS_UNICODE, 0x0400, ULMBCS_GRP_RU, ULMBCS_FLAGS_UNICODE, 0x0300, ULMBCS_GRP_GR, ULMBCS_FLAGS_UNICODE, 0x001F, ULMBCS_GRP_L1, ULMBCS_FLAGS_CONTINUE, 0x0000, ULMBCS_GRP_CTRL, ULMBCS_FLAGS_UNICODE #endif int LMBCSConversionWorker ( UConverterDataLMBCS * extraInfo, ulmbcs_grp_t group, uint8_t * pStartLMBCS, UChar * pUniChar, ulmbcs_grp_t * lastConverterIndex, bool_t * groups_tried, UErrorCode* err); int LMBCSConversionWorker ( UConverterDataLMBCS * extraInfo, ulmbcs_grp_t group, uint8_t * pStartLMBCS, UChar * pUniChar, ulmbcs_grp_t * lastConverterIndex, bool_t * groups_tried, UErrorCode * err) { uint8_t * pLMBCS = pStartLMBCS; UConverter * xcnv = extraInfo->OptGrpConverter[group]; uint8_t mbChar [ULMBCS_CHARSIZE_MAX]; uint8_t * pmbChar = mbChar; bool_t isDoubleByteGroup = (group >= ULMBCS_DOUBLEOPTGROUP) ? TRUE : FALSE; UErrorCode localErr = 0; int bytesConverted =0; MyAssert(xcnv); MyAssert(groupsubChar[0] || U_FAILURE(localErr) || !bytesConverted ) { /* JSGTODO: are there some local failure modes that ought to be bubbled up in some other way? */ groups_tried[group] = TRUE; return 0; } *lastConverterIndex = group; /* All initial byte values in lower ascii range should have been caught by now, except with the exception group. Uncomment this assert to find them. */ /* MyAssert((*pmbChar <= ULMBCS_C0END) || (*pmbChar >= ULMBCS_C1START) || (group == ULMBCS_GRP_EXCEPT)); */ /* use converted data: first write 0, 1 or two group bytes */ if (group != ULMBCS_GRP_EXCEPT && extraInfo->OptGroup != group) { *pLMBCS++ = group; if (bytesConverted == 1 && isDoubleByteGroup) { *pLMBCS++ = group; } } /* then move over the converted data */ do { *pLMBCS++ = *pmbChar++; } while(--bytesConverted); return (pLMBCS - pStartLMBCS); } /* Convert Unicode string to LMBCS */ void _LMBCSFromUnicode(UConverter* _this, char** target, const char* targetLimit, const UChar** source, const UChar* sourceLimit, int32_t * offsets, bool_t flush, UErrorCode* err) { ulmbcs_grp_t lastConverterIndex = 0; UChar uniChar; uint8_t LMBCS[ULMBCS_CHARSIZE_MAX]; uint8_t * pLMBCS; int bytes_written; bool_t groups_tried[ULMBCS_GRP_LAST]; UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo; /* Arguments Check */ if (!err || U_FAILURE(*err)) { return; } if (sourceLimit < *source) { *err = U_ILLEGAL_ARGUMENT_ERROR; return; } do { uniChar = *(*source)++; bytes_written = 0; pLMBCS = LMBCS; /* single byte matches */ if (uniChar == 0 || uniChar == ULMBCS_HT || uniChar == ULMBCS_CR || uniChar == ULMBCS_LF || uniChar == ULMBCS_123SYSTEMRANGE || ((uniChar >= ULMBCS_CTRLOFFSET) && (uniChar < ULMBCS_C1START))) { *pLMBCS++ = (uint8_t) uniChar; bytes_written = 1; } if (!bytes_written) { /* Check by UNICODE range */ ulmbcs_grp_t group = FindLMBCSUniRange(uniChar,err); if (group == ULMBCS_GRP_UNICODE) { /* encode into LMBCS Unicode range */ uint8_t LowCh = (uint8_t) (uniChar & 0x00FF); uint8_t HighCh = (uint8_t)(uniChar >> 8); *pLMBCS++ = ULMBCS_GRP_UNICODE; if (LowCh == 0) { *pLMBCS++ = ULMBCS_UNICOMPATZERO; *pLMBCS++ = HighCh; } else { *pLMBCS++ = HighCh; *pLMBCS++ = LowCh; } bytes_written = pLMBCS - LMBCS; } else if (group == ULMBCS_GRP_CTRL) { /* Handle control characters here */ if (uniChar <= ULMBCS_C0END) { *pLMBCS++ = ULMBCS_GRP_CTRL; *pLMBCS++ = ULMBCS_CTRLOFFSET + (uint8_t) uniChar; } else if (uniChar >= ULMBCS_C1START && uniChar <= ULMBCS_C1START + ULMBCS_CTRLOFFSET) { *pLMBCS++ = ULMBCS_GRP_CTRL; *pLMBCS++ = (uint8_t) (uniChar & 0x00FF); } bytes_written = pLMBCS - LMBCS; } else if (group < ULMBCS_GRP_UNICODE) { /* a specific converter has been identified - use it */ bytes_written = LMBCSConversionWorker ( extraInfo, group, pLMBCS, &uniChar, &lastConverterIndex, groups_tried, err); MyAssert(bytes_written); /* table should never return unusable group */ } else /* the ambiguous group cases */ { memset(groups_tried, 0, sizeof(groups_tried)); /* check for non-default optimization group */ if (extraInfo->OptGroup != 1 && ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->OptGroup)) { bytes_written = LMBCSConversionWorker (extraInfo, extraInfo->OptGroup, pLMBCS, &uniChar, &lastConverterIndex, groups_tried, err); } /* check for locale optimization group */ if (!bytes_written && (extraInfo->localeConverterIndex) && (ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->localeConverterIndex))) { bytes_written = LMBCSConversionWorker (extraInfo, extraInfo->localeConverterIndex, pLMBCS, &uniChar, &lastConverterIndex, groups_tried, err); } /* check for last optimization group used for this string */ if (!bytes_written && (lastConverterIndex) && (ULMBCS_AMBIGUOUS_MATCH(group, lastConverterIndex))) { bytes_written = LMBCSConversionWorker (extraInfo, lastConverterIndex, pLMBCS, &uniChar, &lastConverterIndex, groups_tried, err); } if (!bytes_written) { /* just check every matching converter */ ulmbcs_grp_t grp_start; ulmbcs_grp_t grp_end; ulmbcs_grp_t grp_ix; grp_start = (group == ULMBCS_AMBIGUOUS_MBCS) ? ULMBCS_DOUBLEOPTGROUP : ULMBCS_GRP_L1; grp_end = (group == ULMBCS_AMBIGUOUS_MBCS) ? ULMBCS_GRP_LAST-1 : ULMBCS_GRP_TH; for (grp_ix = grp_start; grp_ix <= grp_end && !bytes_written; grp_ix++) { if (extraInfo->OptGrpConverter [grp_ix] && !groups_tried [grp_ix]) { bytes_written = LMBCSConversionWorker (extraInfo, grp_ix, pLMBCS, &uniChar, &lastConverterIndex, groups_tried, err); } } /* a final conversion fallback for sbcs to the exceptions group */ if (!bytes_written && group == ULMBCS_AMBIGUOUS_SBCS) { bytes_written = LMBCSConversionWorker (extraInfo, ULMBCS_GRP_EXCEPT, pLMBCS, &uniChar, &lastConverterIndex, groups_tried, err); } /* all of our strategies failed. Fallback to Unicode. Consider adding these to table */ if (!bytes_written) { /* encode into LMBCS Unicode range */ uint8_t LowCh = (uint8_t) uniChar; uint8_t HighCh = (uint8_t)(uniChar >> 8); *pLMBCS++ = ULMBCS_GRP_UNICODE; if (LowCh == 0) { *pLMBCS++ = ULMBCS_UNICOMPATZERO; *pLMBCS++ = HighCh; } else { *pLMBCS++ = HighCh; *pLMBCS++ = LowCh; } bytes_written = pLMBCS - LMBCS; } } } } if (*target + bytes_written > targetLimit) { /* JSGTODO deal with buffer running out here */ } /* now that we are sure it all fits, move it in */ for(pLMBCS = LMBCS; bytes_written--; *(*target)++ = *pLMBCS++) { }; } while (*source< sourceLimit && *target < targetLimit && !U_FAILURE(*err)); /* JSGTODO Check the various exit conditions */ } /* Return the Unicode representation for the current LMBCS character */ UChar32 _LMBCSGetNextUChar(UConverter* _this, const char** source, const char* sourceLimit, UErrorCode* err) { uint8_t CurByte; /* A byte from the input stream */ UChar32 uniChar; /* an output UNICODE char */ UChar mbChar; /* an intermediate multi-byte value (mbcs or LMBCS) */ CompactShortArray *MyCArray = NULL; UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo; ulmbcs_grp_t group = 0; UConverter* cnv = 0; /* Opt Group (or first data byte) */ CurByte = *((uint8_t *) (*source)++); uniChar = 0; /* * at entry of each if clause: * 1. 'CurByte' points at the first byte of a LMBCS character * 2. '*source'points to the next byte of the source stream after 'CurByte' * * the job of each if clause is: * 1. set '*source' to point at the beginning of next char (nop if LMBCS char is only 1 byte) * 2. set 'uniChar' up with the right Unicode value, or set 'err' appropriately */ /* First lets check the simple fixed values. */ /* JSGTODO (from markus): a switch would be much faster here */ if (CurByte == 0 || CurByte == ULMBCS_HT || CurByte == ULMBCS_CR || CurByte == ULMBCS_LF || CurByte == ULMBCS_123SYSTEMRANGE || ((CurByte >= ULMBCS_CTRLOFFSET) && (CurByte < ULMBCS_C1START))) { uniChar = CurByte; } else if (CurByte == ULMBCS_GRP_CTRL) /* Control character group - no opt group update */ { /* JSGTODO (from markus): please make sure your error code returns are consistent with those of the other converters; the utf implementations return truncated only when the input is too short; if there is nothing at all, then they set index out of bounds. see unicode in here. (and, please, come to a common indentation - brendan 2, you 3??) (plus, no // comments in c code - it breaks many c compilers!) */ if (*source >= sourceLimit) { *err = U_TRUNCATED_CHAR_FOUND; } else { uint8_t C0C1byte = *(*source)++; uniChar = (C0C1byte < ULMBCS_C1START) ? C0C1byte - ULMBCS_CTRLOFFSET : C0C1byte; } } else if (CurByte == ULMBCS_GRP_UNICODE) /* Unicode compatibility group: BE as is */ { uint8_t HighCh, LowCh; if (*source + 2 > sourceLimit) { if (*source >= sourceLimit) { *err = U_INDEX_OUTOFBOUNDS_ERROR; } else { *err = U_TRUNCATED_CHAR_FOUND; } } else { HighCh = *(*source)++; /* Big-endian Unicode in LMBCS compatibility group*/ LowCh = *(*source)++; if (HighCh == ULMBCS_UNICOMPATZERO ) { HighCh = LowCh; LowCh = 0; /* zero-byte in LSB special character */ } uniChar = (HighCh << 8) | LowCh; /* UTF-16 means that there may be a surrogate pair */ if(UTF_IS_FIRST_SURROGATE(uniChar)) { /* assume that single surrogates only occur in Unicode LMBCS sequences */ if (*source >= sourceLimit) { *err = U_TRUNCATED_CHAR_FOUND; } else /* is there really Unicode, and a second surrogate? if not, then we ignore it without error */ if(**source == ULMBCS_GRP_UNICODE) { if (*source + 3 > sourceLimit) { *err = U_TRUNCATED_CHAR_FOUND; } else { uint16_t second; HighCh = *(*source + 1); /* Big-endian Unicode in LMBCS compatibility group*/ LowCh = *(*source + 2); if (HighCh == ULMBCS_UNICOMPATZERO ) { HighCh = LowCh; LowCh = 0; /* zero-byte in LSB special character */ } second = (HighCh << 8) | LowCh; if(UTF_IS_SECOND_SURROGATE(second)) { uniChar = UTF16_GET_PAIR_VALUE(uniChar, second); *source += 3; } } } } } } else if (CurByte <= ULMBCS_CTRLOFFSET) { group = CurByte; /* group byte is in the source */ cnv = extraInfo->OptGrpConverter[group]; if (!cnv) { /* this is not a valid group byte - no converter*/ *err = U_INVALID_CHAR_FOUND; } else if (group >= ULMBCS_DOUBLEOPTGROUP) /* double byte conversion */ { uint8_t HighCh, LowCh; HighCh = *(*source)++; LowCh = *(*source)++; /* check for LMBCS doubled-group-byte case */ mbChar = (HighCh == group) ? LowCh : (HighCh<<8) | LowCh; MyCArray = &cnv->sharedData->table->mbcs.toUnicode; uniChar = (UChar) ucmp16_getu (MyCArray, mbChar); } else /* single byte conversion */ { CurByte = *(*source)++; if (CurByte >= ULMBCS_C1START) { uniChar = cnv->sharedData->table->sbcs.toUnicode[CurByte]; } else { /* The non-optimizable oddballs where there is an explicit byte * AND the second byte is not in the upper ascii range */ cnv = extraInfo->OptGrpConverter [ULMBCS_GRP_EXCEPT]; /* Lookup value must include opt group */ mbChar = (UChar)(group << 8) | (UChar) CurByte; MyCArray = &cnv->sharedData->table->mbcs.toUnicode; uniChar = (UChar) ucmp16_getu(MyCArray, mbChar); } } } else if (CurByte >= ULMBCS_C1START) /* group byte is implicit */ { group = extraInfo->OptGroup; cnv = extraInfo->OptGrpConverter[group]; if (group >= ULMBCS_DOUBLEOPTGROUP) /* double byte conversion */ { uint8_t HighCh, LowCh; /* JSGTODO need to deal with case of single byte G1 chars in mbcs groups */ HighCh = CurByte; LowCh = *(*source)++; mbChar = (HighCh<<8) | LowCh; MyCArray = &cnv->sharedData->table->mbcs.toUnicode; uniChar = (UChar) ucmp16_getu (MyCArray, mbChar); (*source) += sizeof(UChar); } else /* single byte conversion */ { uniChar = cnv->sharedData->table->sbcs.toUnicode[CurByte]; } } else { #if DEBUG /* JSGTODO: assert here: we should never get here. */ #endif } /* JSGTODO: need to correctly deal with partial chars */ /* JSGTODO (from markus :-) - deal with surrogate pairs; see UTF-8/16BE/16LE implementations, http://oss.software.ibm.com/icu/archives/icu/icu.0002/msg00043.html behavior: uniChar is now declared UChar32; if(UTF_IS_FIRST_SURROGATE(uniChar)) then check for more input length if too short, then error else get another 16-bit unit if(UTF_IS_SECOND_SURROGATE(second unit)) then uniChar=UTF16_GET_PAIR_VALUE(uniChar, second unit); You may need to do this only when the following LMBCS byte indicates embedded Unicode (ULMBCS_GRP_UNICODE), and get the following surrogate directly from the following two bytes like the UTF-16BE implementation. actually, just for the embedded Unicode, i did this. if no other groups in LMBCS can carry single surrogates, then we may be done with my changes. */ return uniChar; } void _LMBCSToUnicodeWithOffsets(UConverter* _this, UChar** target, const UChar* targetLimit, const char** source, const char* sourceLimit, int32_t* offsets, bool_t flush, UErrorCode* err) { UChar32 uniChar; /* an output UNICODE char */ CompactShortArray *MyCArray = NULL; UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo; ulmbcs_grp_t group = 0; UConverter* cnv = 0; const char * pStartLMBCS = *source; if (!err || U_FAILURE(*err)) { return; } if ((_this == NULL) || (targetLimit < *target) || (sourceLimit < *source)) { *err = U_ILLEGAL_ARGUMENT_ERROR; return; } #if 0 /* JSGTODOD - restore incomplete char handling */ /* Have we arrived here from a prior conversion ending with a partial char? The only possible configurations are: 1. mode contains the group byte of SBCS LMBCS char; 2. mode contains the group byte of MBCS LMBCS char For both continue with next char in input buffer 3. mode contains group byte + 1st data byte of MBCS LMBCS char Partially process & get the second data byte 4. mode contains both group bytes of double group-byte MBCS LMBCS char Nuke contents after setting up converter & continue with buffer data */ if (_this->toUnicodeStatus) { mbChar = (UChar) _this->mode; /* Restore the previously calculated char */ _this->toUnicodeStatus = 0; /* Reset other fields*/ _this->invalidCharLength = 0; /* Check if this is a partial MBCS char (fall through if SBCS) */ if (mbChar > 0xFF) { /* Select the correct converter */ group = (mbChar >> 8) & 0x00FF; cnv = extraInfo->OptGrpConverter[group]; /* Pick up the converter table */ MyCArray = cnv->sharedData->table->mbcs.toUnicode; /* Use only data byte: NULL if the character has pair of group-bytes */ if (mbChar & 0x00FF < ULMBCS_MAXGRPBYTE) CurByte = 0; else CurByte = ((mbChar & 0x00FF) << 8); /* Add the current char from the buffer */ CurByte |= *((uint8_t *) (*source)++); goto continueWithPartialMBCSChar; } else { goto continueWithPartialChar; } } #endif /* Process from source to limit */ while (!*err && sourceLimit > *source && targetLimit > *target) { if(offsets) { *offsets = (*source) - pStartLMBCS; } uniChar = _LMBCSGetNextUChar(_this, source, sourceLimit, err); /* last step is always to move the new value into the buffer */ if (U_SUCCESS(*err) && uniChar != missingUCharMarker) { /* JSGTODO deal with missingUCharMarker case for error/info reporting. */ if(!UTF_NEED_MULTIPLE_UCHAR(uniChar)) { *(*target)++ = (UChar)uniChar; } else { /* JSGTODO (from markus) write several UChar's for this UChar32; you may need to use macros like UTF_APPEND_CHAR() or similar (from utf.h) what does this mean for the target range check and for the offsets? */ } if(offsets) { offsets++; } } } #if 0 /* JSGTODO restore partial char handling */ /* Check to see if we've fallen through because of a partial char */ if (*err == U_TRUNCATED_CHAR_FOUND) { _this->mode = mbChar; /* Save current partial char */ } #endif } /* Convert LMBCS string to Unicode */ void _LMBCSToUnicode(UConverter* _this, UChar** target, const UChar* targetLimit, const char** source, const char* sourceLimit, int32_t* offsets, bool_t flush, UErrorCode* err) { _LMBCSToUnicodeWithOffsets(_this, target, targetLimit, source, sourceLimit, offsets, flush,err); } static void _LMBCSOpenWorker(UConverter* _this, const char* name, const char* locale, UErrorCode* err, ulmbcs_grp_t OptGroup ) { UConverterDataLMBCS * extraInfo = uprv_malloc (sizeof (UConverterDataLMBCS)); if(extraInfo != NULL) { ulmbcs_grp_t i; ulmbcs_grp_t imax; imax = sizeof(extraInfo->OptGrpConverter)/sizeof(extraInfo->OptGrpConverter[0]); for (i=0; i < imax; i++) { extraInfo->OptGrpConverter[i] = (OptGroupByteToCPName[i] != NULL) ? ucnv_open(OptGroupByteToCPName[i], err) : NULL; } extraInfo->OptGroup = OptGroup; /* JSGTODO: add LocaleConverterIndex logic here */ extraInfo->localeConverterIndex = 0; } else { *err = U_MEMORY_ALLOCATION_ERROR; } _this->extraInfo = extraInfo; } static void _LMBCSClose(UConverter * _this) { if (_this->extraInfo != NULL) { ulmbcs_grp_t Ix; for (Ix=0; Ix < ULMBCS_GRP_UNICODE; Ix++) { UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo; if (extraInfo->OptGrpConverter[Ix] != NULL) ucnv_close (extraInfo->OptGrpConverter[Ix]); } uprv_free (_this->extraInfo); } } #define DEFINE_LMBCS_OPEN(n) \ static void _LMBCSOpen##n(UConverter* _this,const char* name,const char* locale,UErrorCode* err) \ { _LMBCSOpenWorker(_this, name,locale, err, n);} \ DEFINE_LMBCS_OPEN(1) DEFINE_LMBCS_OPEN(2) DEFINE_LMBCS_OPEN(3) DEFINE_LMBCS_OPEN(4) DEFINE_LMBCS_OPEN(5) DEFINE_LMBCS_OPEN(6) DEFINE_LMBCS_OPEN(8) DEFINE_LMBCS_OPEN(11) DEFINE_LMBCS_OPEN(16) DEFINE_LMBCS_OPEN(17) DEFINE_LMBCS_OPEN(18) DEFINE_LMBCS_OPEN(19) #define DECLARE_LMBCS_DATA(n) \ static const UConverterImpl _LMBCSImpl##n={\ UCNV_LMBCS_##n,\ NULL,NULL,\ _LMBCSOpen##n,\ _LMBCSClose,\ NULL,\ _LMBCSToUnicode,\ _LMBCSToUnicodeWithOffsets,\ _LMBCSFromUnicode,\ NULL,\ _LMBCSGetNextUChar,\ NULL\ };\ const UConverterStaticData _LMBCSStaticData##n={\ sizeof(UConverterStaticData),\ "LMBCS_" ## #n,\ 0, UCNV_IBM, UCNV_LMBCS_1, 1, 1,\ 1, { 0x3f, 0, 0, 0 } \ };\ const UConverterSharedData _LMBCSData##n={\ sizeof(UConverterSharedData), ~0,\ NULL, NULL, &_LMBCSStaticData##n, FALSE, &_LMBCSImpl##n, \ 0 \ }; DECLARE_LMBCS_DATA(1) DECLARE_LMBCS_DATA(2) DECLARE_LMBCS_DATA(3) DECLARE_LMBCS_DATA(4) DECLARE_LMBCS_DATA(5) DECLARE_LMBCS_DATA(6) DECLARE_LMBCS_DATA(8) DECLARE_LMBCS_DATA(11) DECLARE_LMBCS_DATA(16) DECLARE_LMBCS_DATA(17) DECLARE_LMBCS_DATA(18) DECLARE_LMBCS_DATA(19)