d57871f769
X-SVN-Rev: 13990
3000 lines
121 KiB
C
3000 lines
121 KiB
C
/*
|
|
**********************************************************************
|
|
* Copyright (C) 2000-2003, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
**********************************************************************
|
|
* file name: ucnv2022.c
|
|
* encoding: US-ASCII
|
|
* tab size: 8 (not used)
|
|
* indentation:4
|
|
*
|
|
* created on: 2000feb03
|
|
* created by: Markus W. Scherer
|
|
*
|
|
* Change history:
|
|
*
|
|
* 06/29/2000 helena Major rewrite of the callback APIs.
|
|
* 08/08/2000 Ram Included support for ISO-2022-JP-2
|
|
* Changed implementation of toUnicode
|
|
* function
|
|
* 08/21/2000 Ram Added support for ISO-2022-KR
|
|
* 08/29/2000 Ram Separated implementation of EBCDIC to
|
|
* ucnvebdc.c
|
|
* 09/20/2000 Ram Added support for ISO-2022-CN
|
|
* Added implementations for getNextUChar()
|
|
* for specific 2022 country variants.
|
|
* 10/31/2000 Ram Implemented offsets logic functions
|
|
*/
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#if !UCONFIG_NO_LEGACY_CONVERSION
|
|
|
|
#include "unicode/ucnv.h"
|
|
#include "unicode/uset.h"
|
|
#include "unicode/ucnv_err.h"
|
|
#include "unicode/ucnv_cb.h"
|
|
#include "ucnv_bld.h"
|
|
#include "ucnv_cnv.h"
|
|
#include "ucnvmbcs.h"
|
|
#include "cstring.h"
|
|
#include "cmemory.h"
|
|
|
|
#ifdef U_ENABLE_GENERIC_ISO_2022
|
|
/*
|
|
* I am disabling the generic ISO-2022 converter after proposing to do so on
|
|
* the icu mailing list two days ago.
|
|
*
|
|
* Reasons:
|
|
* 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
|
|
* its designation sequences, single shifts with return to the previous state,
|
|
* switch-with-no-return to UTF-16BE or similar, etc.
|
|
* This is unlike the language-specific variants like ISO-2022-JP which
|
|
* require a much smaller repertoire of ISO-2022 features.
|
|
* These variants continue to be supported.
|
|
* 2. I believe that no one is really using the generic ISO-2022 converter
|
|
* but rather always one of the language-specific variants.
|
|
* Note that ICU's generic ISO-2022 converter has always output one escape
|
|
* sequence followed by UTF-8 for the whole stream.
|
|
* 3. Switching between subcharsets is extremely slow, because each time
|
|
* the previous converter is closed and a new one opened,
|
|
* without any kind of caching, least-recently-used list, etc.
|
|
* 4. The code is currently buggy, and given the above it does not seem
|
|
* reasonable to spend the time on maintenance.
|
|
*
|
|
* Markus Scherer 2003-dec-03
|
|
*/
|
|
#endif
|
|
|
|
/* Two-byte single-shift sequences: ESC 'N' (SS2) and ESC 'O' (SS3). */
static const char UCNV_SS2[] = "\x1B\x4E";
static const char UCNV_SS3[] = "\x1B\x4F";
/* Lengths of the sequences above, not counting the terminating NUL. */
#define UCNV_SS2_LEN 2
#define UCNV_SS3_LEN 2

/* ASCII control and white-space code values. */
#define CR      0x0D
#define LF      0x0A
#define H_TAB   0x09
#define V_TAB   0x0B
#define SPACE   0x20
|
|
|
|
/* for ISO-2022-JP and -CN implementations */
|
|
/*
 * Charset/state identifiers for the ISO-2022-JP and ISO-2022-CN code.
 * The JP and CN groups deliberately reuse the same small integers
 * (for example ISO8859_1 and GB2312_1 are both 1); which group applies
 * depends on the variant of the converter that uses the value.
 */
typedef enum {
    /* shared values */
    INVALID_STATE = -1,
    ASCII = 0,

    SS2_STATE = 0x10,
    SS3_STATE,              /* 0x11 */

    /* JP */
    ISO8859_1 = 1,
    ISO8859_7 = 2,
    JISX201 = 3,
    JISX208 = 4,
    JISX212 = 5,
    GB2312 = 6,
    KSC5601 = 7,
    HWKANA_7BIT = 8,        /* Halfwidth Katakana 7 bit */

    /* CN */
    /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1 = 1,
    ISO_IR_165 = 2,
    CNS_11643 = 3,

    /*
     * these are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[]
     */
    CNS_11643_0 = 0x20,
    CNS_11643_1,            /* 0x21 */
    CNS_11643_2,            /* 0x22 */
    CNS_11643_3,            /* 0x23 */
    CNS_11643_4,            /* 0x24 */
    CNS_11643_5,            /* 0x25 */
    CNS_11643_6,            /* 0x26 */
    CNS_11643_7             /* 0x27 */
} StateEnum;
|
|
|
|
|
|
|
|
/* Kind of charset currently selected (stored in UConverterDataISO2022.currentType). */
typedef enum {
    ASCII1 = 0,
    LATIN1,     /* 1 */
    SBCS,       /* 2 */
    DBCS,       /* 3 */
    MBCS        /* 4 */
} Cnv2022Type;
|
|
|
|
/* Designation and shift state for one conversion direction. */
typedef struct ISO2022State {
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
} ISO2022State;
|
|
|
|
#define UCNV_OPTIONS_VERSION_MASK 0xf
|
|
#define UCNV_2022_MAX_CONVERTERS 10
|
|
|
|
typedef struct{
|
|
UConverter *currentConverter;
|
|
UConverter *fromUnicodeConverter;
|
|
UBool isFirstBuffer;
|
|
StateEnum toUnicodeCurrentState;
|
|
StateEnum fromUnicodeCurrentState;
|
|
StateEnum toUnicodeSaveState;
|
|
Cnv2022Type currentType;
|
|
ISO2022State toU2022State, fromU2022State;
|
|
UConverter* myConverterArray[UCNV_2022_MAX_CONVERTERS];
|
|
UBool isShiftAppended;
|
|
uint32_t key;
|
|
uint32_t version;
|
|
char locale[3];
|
|
char name[30];
|
|
}UConverterDataISO2022;
|
|
|
|
/* Protos */
|
|
/* ISO-2022 ----------------------------------------------------------------- */
|
|
|
|
/*Forward declaration */
|
|
U_CFUNC void
|
|
T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
|
|
UErrorCode * err);
|
|
U_CFUNC void
|
|
T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
|
|
UErrorCode * err);
|
|
U_CFUNC void
|
|
_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
|
UErrorCode *pErrorCode);
|
|
U_CFUNC void
|
|
_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
|
UErrorCode *pErrorCode);
|
|
|
|
#define ESC_2022 0x1B /*ESC*/
|
|
|
|
/* Classification of the escape-sequence bytes consumed so far (see getKey_2022). */
typedef enum
{
    INVALID_2022 = -1,              /* Doesn't correspond to a valid iso 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,    /* so far corresponds to a valid iso 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,        /* corresponds to a valid iso 2022 escape sequence */
    VALID_MAYBE_TERMINAL_2022 = 2   /* so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence */
} UCNV_TableStates_2022;
|
|
|
|
/*
|
|
* The way these state transition arrays work is:
|
|
* ex : ESC$B is the sequence for JISX208
|
|
* a) First Iteration: char is ESC
|
|
* i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
|
|
* int x = normalize_esq_chars_2022[27] which is equal to 1
|
|
* ii) Search for this value in escSeqStateTable_Key_2022[]
|
|
* value of x is stored at escSeqStateTable_Key_2022[0]
|
|
* iii) Save this index as offset
|
|
* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
|
|
* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
|
|
* b) Switch on this state and continue to next char
|
|
* i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
|
|
* which is normalize_esq_chars_2022[36] == 4
|
|
* ii) x is currently 1(from above)
|
|
* x<<=5 -- x is now 32
|
|
* x+=normalize_esq_chars_2022[36]
|
|
* now x is 36
|
|
* iii) Search for this value in escSeqStateTable_Key_2022[]
|
|
* value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
|
|
* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
|
|
* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
|
|
* c) Switch on this state and continue to next char
|
|
* i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
|
|
* ii) x is currently 36 (from above)
|
|
* x<<=5 -- x is now 1152
|
|
* x+=normalize_esq_chars_2022[66]
|
|
* now x is 1161
|
|
* iii) Search for this value in escSeqStateTable_Key_2022[]
|
|
* value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
|
|
* iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
|
|
* escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
|
|
* v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208
|
|
*/
|
|
|
|
|
|
/*Below are the 3 arrays depicting a state transition table*/
|
|
/*
 * Maps each byte value to a small non-zero code (1..29) when that byte can
 * occur in a recognized escape sequence, and to 0 otherwise.
 * getKey_2022() folds these codes into a key, 5 bits per byte.
 * Each row below covers ten byte values; the comment gives the first index.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*        0    1    2    3    4    5    6    7    8    9  */

/*   0 */ 0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/*  10 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/*  20 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,1   ,0   ,0     /* 27: ESC */
/*  30 */,0   ,0   ,0   ,0   ,0   ,0   ,4   ,7   ,29  ,0     /* 36..38: $ % & */
/*  40 */,2   ,24  ,26  ,27  ,0   ,3   ,23  ,6   ,0   ,0     /* 40..47: ( ) * + , - . / */
/*  50 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/*  60 */,0   ,0   ,0   ,0   ,5   ,8   ,9   ,10  ,11  ,12    /* 64..69: @ A B C D E */
/*  70 */,13  ,14  ,15  ,16  ,17  ,18  ,19  ,20  ,25  ,28    /* 70..79: F G H I J K L M N O */
/*  80 */,0   ,0   ,21  ,0   ,0   ,0   ,0   ,0   ,0   ,0     /* 82: R */
/*  90 */,22  ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0     /* 90: Z */
/* 100 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 110 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 120 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 130 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 140 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 150 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 160 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 170 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 180 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 190 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 200 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 210 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 220 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 230 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 240 */,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0   ,0
/* 250 */,0   ,0   ,0   ,0   ,0   ,0
};
|
|
|
|
#ifdef U_ENABLE_GENERIC_ISO_2022
|
|
/*
|
|
* When the generic ISO-2022 converter is completely removed, not just disabled
|
|
* per #ifdef, then the following state table and the associated tables that are
|
|
* dimensioned with MAX_STATES_2022 should be trimmed.
|
|
*
|
|
* Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
|
|
* the associated escape sequences starting with ESC ( B should be removed.
|
|
* This includes the ones with key values 1097 and all of the ones above 1000000.
|
|
*
|
|
* For the latter, the tables can simply be truncated.
|
|
* For the former, since the tables must be kept parallel, it is probably best
|
|
* to simply duplicate an adjacent table cell, parallel in all tables.
|
|
*
|
|
* It may make sense to restructure the tables, especially by using small search
|
|
* tables for the variants instead of indexing them parallel to the table here.
|
|
*/
|
|
#endif
|
|
|
|
/* Number of entries in each of the parallel escape-sequence tables. */
#define MAX_STATES_2022 74

/*
 * Keys of all recognized escape-sequence prefixes and complete sequences.
 * A key is built by getKey_2022() as key = (key << 5) + normalized byte.
 * The table is in ascending order because getKey_2022() binary-searches it;
 * the index of a key is the "offset" used with the parallel tables.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
/*        0           1           2           3           4           5           6           7           8           9  */

/*  0 */  1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
/* 10 */ ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
/* 20 */ ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
/* 30 */ ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
/* 40 */ ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
/* 50 */ ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
/* 60 */ ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
/* 70 */ ,35947631   ,35947635   ,35947636   ,35947638
};
|
|
|
|
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter names for the generic ISO-2022 converter, parallel to
 * escSeqStateTable_Key_2022[]: entry i is the name for the escape sequence
 * whose key is stored at index i, or NULL where no name is given.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
/*  0 */  NULL                   ,NULL                   ,NULL                   ,NULL                   ,NULL
         ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
/* 10 */ ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"              ,"ibm-865"
         ,"ibm-865"              ,"ibm-865"              ,"JISX-201"             ,"JISX-201"             ,"latin1"
/* 20 */ ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"             ,"JISX-208"
         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
/* 30 */ ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL                   ,"ibm-955"
         ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
/* 40 */ ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"
         ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"             ,"ibm-949"              ,"ISO-IR-165"
/* 50 */ ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4"     ,"CNS-11643-1992,5"
         ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
/* 60 */ ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL                   ,"latin1"
         ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
/* 70 */ ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
};

#endif
|
|
|
|
static const UCNV_TableStates_2022 escSeqStateTable_Value_2022[MAX_STATES_2022] = {
|
|
/* 0 1 2 3 4 5 6 7 8 9 */
|
|
VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
|
|
,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
|
|
,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
|
|
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
|
|
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
|
|
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
|
|
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
|
|
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
|
|
};
|
|
|
|
|
|
/* Type def for refactoring changeState_2022 code*/
|
|
/*
 * Identifies which ISO-2022 variant is asking changeState_2022() to parse an
 * escape sequence. The generic variant exists only when
 * U_ENABLE_GENERIC_ISO_2022 is defined.
 */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022=0,
#endif
    ISO_2022_JP=1,
    ISO_2022_KR=2,
    ISO_2022_CN=3
} Variant2022;
|
|
|
|
/*********** ISO 2022 Converter Protos ***********/
|
|
static void
|
|
_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);
|
|
|
|
static void
|
|
_ISO2022Close(UConverter *converter);
|
|
|
|
static void
|
|
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
|
|
|
|
static const char*
|
|
_ISO2022getName(const UConverter* cnv);
|
|
|
|
static void
|
|
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
|
|
|
|
static UConverter *
|
|
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
|
|
|
|
static void
|
|
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
|
|
|
|
/*const UConverterSharedData _ISO2022Data;*/
|
|
static const UConverterSharedData _ISO2022JPData;
|
|
static const UConverterSharedData _ISO2022KRData;
|
|
static const UConverterSharedData _ISO2022CNData;
|
|
|
|
/*************** Converter implementations ******************/
|
|
|
|
/**********/
|
|
static void
|
|
setInitialStateToUnicodeJPCN(UConverter* converter,UConverterDataISO2022 *myConverterData ){
|
|
myConverterData->toUnicodeCurrentState =ASCII;
|
|
myConverterData->currentConverter = NULL;
|
|
myConverterData->isFirstBuffer = TRUE;
|
|
myConverterData->toUnicodeSaveState = INVALID_STATE;
|
|
converter->mode = UCNV_SI;
|
|
}
|
|
|
|
static void
|
|
setInitialStateFromUnicodeJPCN(UConverter* converter,UConverterDataISO2022 *myConverterData){
|
|
myConverterData->fromUnicodeCurrentState= ASCII;
|
|
myConverterData->isShiftAppended=FALSE;
|
|
myConverterData->currentType = ASCII1;
|
|
converter->fromUnicodeStatus = FALSE;
|
|
}
|
|
|
|
static void
|
|
setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
|
|
|
|
converter->mode = UCNV_SI;
|
|
myConverterData->currentConverter = myConverterData->fromUnicodeConverter;
|
|
|
|
}
|
|
|
|
static void
|
|
setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
|
|
/* in ISO-2022-KR the desginator sequence appears only once
|
|
* in a file so we append it only once
|
|
*/
|
|
if( converter->charErrorBufferLength==0){
|
|
|
|
converter->charErrorBufferLength = 4;
|
|
converter->charErrorBuffer[0] = 0x1b;
|
|
converter->charErrorBuffer[1] = 0x24;
|
|
converter->charErrorBuffer[2] = 0x29;
|
|
converter->charErrorBuffer[3] = 0x43;
|
|
}
|
|
myConverterData->isShiftAppended=FALSE;
|
|
|
|
}
|
|
|
|
static void
|
|
_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
|
|
|
|
char myLocale[6]={' ',' ',' ',' ',' ',' '};
|
|
|
|
cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
|
|
if(cnv->extraInfo != NULL) {
|
|
UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
|
|
uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
|
|
myConverterData->currentConverter = NULL;
|
|
myConverterData->fromUnicodeConverter = NULL;
|
|
myConverterData->currentType = ASCII1;
|
|
myConverterData->key =0;
|
|
myConverterData->isFirstBuffer = TRUE;
|
|
cnv->fromUnicodeStatus =FALSE;
|
|
if(locale){
|
|
uprv_strncpy(myLocale, locale, sizeof(myLocale));
|
|
}
|
|
myConverterData->version= 0;
|
|
myConverterData->myConverterArray[0] =NULL;
|
|
if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
|
|
(myLocale[2]=='_' || myLocale[2]=='\0')){
|
|
int len=0;
|
|
/* open the required converters and cache them */
|
|
myConverterData->myConverterArray[0]= ucnv_open("ASCII", errorCode );
|
|
myConverterData->myConverterArray[1]= ucnv_open("ISO8859_1", errorCode);
|
|
myConverterData->myConverterArray[2]= ucnv_open("ISO8859_7", errorCode);
|
|
myConverterData->myConverterArray[3]= ucnv_open("jisx-201", errorCode);
|
|
myConverterData->myConverterArray[4]= ucnv_open("jisx-208", errorCode);
|
|
myConverterData->myConverterArray[5]= ucnv_open("jisx-212", errorCode);
|
|
myConverterData->myConverterArray[6]= ucnv_open("ibm-5478", errorCode); /* gb_2312_80-1 */
|
|
myConverterData->myConverterArray[7]= ucnv_open("ksc_5601", errorCode);
|
|
myConverterData->myConverterArray[8]= ucnv_open("jisx-201", errorCode);
|
|
myConverterData->myConverterArray[9]= NULL;
|
|
|
|
/* initialize the state variables */
|
|
setInitialStateToUnicodeJPCN(cnv, myConverterData);
|
|
setInitialStateFromUnicodeJPCN(cnv,myConverterData);
|
|
|
|
/* set the function pointers to appropriate funtions */
|
|
cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
|
|
uprv_strcpy(myConverterData->locale,"ja");
|
|
|
|
myConverterData->version =options & UCNV_OPTIONS_VERSION_MASK;
|
|
uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
|
|
len = uprv_strlen(myConverterData->name);
|
|
myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
|
|
myConverterData->name[len+1]='\0';
|
|
}
|
|
else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
|
|
(myLocale[2]=='_' || myLocale[2]=='\0')){
|
|
|
|
/* initialize the state variables */
|
|
setInitialStateToUnicodeKR(cnv, myConverterData);
|
|
setInitialStateFromUnicodeKR(cnv,myConverterData);
|
|
|
|
if ((options & UCNV_OPTIONS_VERSION_MASK)==1){
|
|
myConverterData->version = 1;
|
|
myConverterData->currentConverter=myConverterData->fromUnicodeConverter=
|
|
ucnv_open("icu-internal-25546",errorCode);
|
|
uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
|
|
}else{
|
|
myConverterData->currentConverter=myConverterData->fromUnicodeConverter = ucnv_open("ibm-949",errorCode);
|
|
myConverterData->version = 0;
|
|
uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
|
|
}
|
|
|
|
/* set the function pointers to appropriate funtions */
|
|
cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
|
|
cnv->mode=UCNV_SI;
|
|
uprv_strcpy(myConverterData->locale,"ko");
|
|
}
|
|
else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
|
|
(myLocale[2]=='_' || myLocale[2]=='\0')){
|
|
|
|
/* open the required converters and cache them */
|
|
myConverterData->myConverterArray[0] = NULL;
|
|
myConverterData->myConverterArray[GB2312_1] = ucnv_open("ibm-5478",errorCode);
|
|
myConverterData->myConverterArray[ISO_IR_165] = ucnv_open("iso-ir-165",errorCode);
|
|
myConverterData->myConverterArray[CNS_11643] = ucnv_open("cns-11643-1992",errorCode);
|
|
myConverterData->myConverterArray[4] = NULL;
|
|
|
|
|
|
/*initialize the state variables*/
|
|
setInitialStateToUnicodeJPCN(cnv, myConverterData);
|
|
setInitialStateFromUnicodeJPCN(cnv,myConverterData);
|
|
|
|
/* set the function pointers to appropriate funtions */
|
|
cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
|
|
uprv_strcpy(myConverterData->locale,"cn");
|
|
|
|
if ((options & UCNV_OPTIONS_VERSION_MASK)==1){
|
|
myConverterData->version = 1;
|
|
uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
|
|
}else{
|
|
uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
|
|
myConverterData->version = 0;
|
|
}
|
|
}
|
|
else{
|
|
#ifdef U_ENABLE_GENERIC_ISO_2022
|
|
/* append the UTF-8 escape sequence */
|
|
cnv->charErrorBufferLength = 3;
|
|
cnv->charErrorBuffer[0] = 0x1b;
|
|
cnv->charErrorBuffer[1] = 0x25;
|
|
cnv->charErrorBuffer[2] = 0x42;
|
|
|
|
cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
|
|
/* initialize the state variables */
|
|
uprv_strcpy(myConverterData->name,"ISO_2022");
|
|
#else
|
|
*errorCode = U_UNSUPPORTED_ERROR;
|
|
return;
|
|
#endif
|
|
}
|
|
|
|
cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
|
|
} else {
|
|
*errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
}
|
|
}
|
|
|
|
|
|
static void
|
|
_ISO2022Close(UConverter *converter) {
|
|
UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
|
|
UConverter **array = myData->myConverterArray;
|
|
|
|
if (converter->extraInfo != NULL) {
|
|
/*close the array of converter pointers and free the memory*/
|
|
while(*array!=NULL){
|
|
if(*array==myData->currentConverter){
|
|
myData->currentConverter=NULL;
|
|
}
|
|
ucnv_close(*array++);
|
|
|
|
}
|
|
|
|
ucnv_close(myData->currentConverter); /* if not closed above */
|
|
|
|
if(!converter->isExtraLocal){
|
|
uprv_free (converter->extraInfo);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
|
|
UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
|
|
if(choice<=UCNV_RESET_TO_UNICODE) {
|
|
uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
|
|
myConverterData->key = 0;
|
|
}
|
|
if(choice!=UCNV_RESET_TO_UNICODE) {
|
|
uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
|
|
}
|
|
#ifdef U_ENABLE_GENERIC_ISO_2022
|
|
if(myConverterData->locale[0] == 0){
|
|
if(choice<=UCNV_RESET_TO_UNICODE) {
|
|
myConverterData->isFirstBuffer = TRUE;
|
|
myConverterData->key = 0;
|
|
if (converter->mode == UCNV_SO){
|
|
ucnv_close (myConverterData->currentConverter);
|
|
myConverterData->currentConverter=NULL;
|
|
}
|
|
converter->mode = UCNV_SI;
|
|
}
|
|
if(choice!=UCNV_RESET_TO_UNICODE) {
|
|
/* re-append UTF-8 escape sequence */
|
|
converter->charErrorBufferLength = 3;
|
|
converter->charErrorBuffer[0] = 0x1b;
|
|
converter->charErrorBuffer[1] = 0x28;
|
|
converter->charErrorBuffer[2] = 0x42;
|
|
}
|
|
}
|
|
else
|
|
#endif
|
|
{
|
|
/* reset the state variables */
|
|
if(myConverterData->locale[0] == 'j' || myConverterData->locale[0] == 'c'){
|
|
if(choice<=UCNV_RESET_TO_UNICODE) {
|
|
setInitialStateToUnicodeJPCN(converter, myConverterData);
|
|
}
|
|
if(choice!=UCNV_RESET_TO_UNICODE) {
|
|
setInitialStateFromUnicodeJPCN(converter,myConverterData);
|
|
}
|
|
}
|
|
else if(myConverterData->locale[0] == 'k'){
|
|
if(choice<=UCNV_RESET_TO_UNICODE) {
|
|
setInitialStateToUnicodeKR(converter, myConverterData);
|
|
}
|
|
if(choice!=UCNV_RESET_TO_UNICODE) {
|
|
setInitialStateFromUnicodeKR(converter, myConverterData);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static const char*
|
|
_ISO2022getName(const UConverter* cnv){
|
|
if(cnv->extraInfo){
|
|
UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
|
|
return myData->name;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
|
|
/*************** to unicode *******************/
|
|
/****************************************************************************
|
|
* Recognized escape sequences are
|
|
* <ESC>(B ASCII
|
|
* <ESC>.A ISO-8859-1
|
|
* <ESC>.F ISO-8859-7
|
|
* <ESC>(J JISX-201
|
|
* <ESC>(I JISX-201
|
|
* <ESC>$B JISX-208
|
|
* <ESC>$@ JISX-208
|
|
* <ESC>$(D JISX-212
|
|
* <ESC>$A GB2312
|
|
* <ESC>$(C KSC5601
|
|
*/
|
|
static const StateEnum nextStateToUnicodeJP[5][MAX_STATES_2022]= {
|
|
{
|
|
/* 0 1 2 3 4 5 6 7 8 9 */
|
|
INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,JISX208 ,INVALID_STATE ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
},
|
|
{
|
|
/* 0 1 2 3 4 5 6 7 8 9 */
|
|
INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,JISX208 ,INVALID_STATE ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX212 ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
},
|
|
{
|
|
/* 0 1 2 3 4 5 6 7 8 9 */
|
|
INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
},
|
|
{
|
|
/* 0 1 2 3 4 5 6 7 8 9 */
|
|
INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
},
|
|
{
|
|
/* 0 1 2 3 4 5 6 7 8 9 */
|
|
INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
}
|
|
};
|
|
|
|
/*************** to unicode *******************/
|
|
static const StateEnum nextStateToUnicodeCN[2][MAX_STATES_2022]= {
|
|
{
|
|
/* 0 1 2 3 4 5 6 7 8 9 */
|
|
INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,INVALID_STATE
|
|
,CNS_11643_1 ,CNS_11643_2 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
},
|
|
{
|
|
/* 0 1 2 3 4 5 6 7 8 9 */
|
|
INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
|
|
,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
|
|
}
|
|
};
|
|
|
|
|
|
static UCNV_TableStates_2022
|
|
getKey_2022(char c,int32_t* key,int32_t* offset){
|
|
int32_t togo;
|
|
int32_t low = 0;
|
|
int32_t hi = MAX_STATES_2022;
|
|
int32_t oldmid=0;
|
|
|
|
togo = normalize_esq_chars_2022[(uint8_t)c];
|
|
if(togo == 0) {
|
|
/* not a valid character anywhere in an escape sequence */
|
|
*key = 0;
|
|
*offset = 0;
|
|
return INVALID_2022;
|
|
}
|
|
togo = (*key << 5) + togo;
|
|
|
|
while (hi != low) /*binary search*/{
|
|
|
|
register int32_t mid = (hi+low) >> 1; /*Finds median*/
|
|
|
|
if (mid == oldmid)
|
|
break;
|
|
|
|
if (escSeqStateTable_Key_2022[mid] > togo){
|
|
hi = mid;
|
|
}
|
|
else if (escSeqStateTable_Key_2022[mid] < togo){
|
|
low = mid;
|
|
}
|
|
else /*we found it*/{
|
|
*key = togo;
|
|
*offset = mid;
|
|
return escSeqStateTable_Value_2022[mid];
|
|
}
|
|
oldmid = mid;
|
|
|
|
}
|
|
|
|
*key = 0;
|
|
*offset = 0;
|
|
return INVALID_2022;
|
|
}
|
|
|
|
/*runs through a state machine to determine the escape sequence - codepage correspondance
|
|
*/
|
|
static void
|
|
changeState_2022(UConverter* _this,
|
|
const char** source,
|
|
const char* sourceLimit,
|
|
Variant2022 var,
|
|
UErrorCode* err){
|
|
UConverter* myUConverter;
|
|
UCNV_TableStates_2022 value;
|
|
UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
|
|
uint32_t key = myData2022->key;
|
|
int32_t offset;
|
|
char c;
|
|
|
|
value = VALID_NON_TERMINAL_2022;
|
|
while (*source < sourceLimit) {
|
|
c = *(*source)++;
|
|
_this->toUBytes[_this->toULength++]=(uint8_t)c;
|
|
value = getKey_2022(c,(int32_t *) &key, &offset);
|
|
|
|
switch (value){
|
|
|
|
case VALID_NON_TERMINAL_2022 :
|
|
/* continue with the loop */
|
|
break;
|
|
|
|
case VALID_TERMINAL_2022:
|
|
key = 0;
|
|
goto DONE;
|
|
|
|
case INVALID_2022:
|
|
goto DONE;
|
|
|
|
case VALID_MAYBE_TERMINAL_2022:
|
|
#ifdef U_ENABLE_GENERIC_ISO_2022
|
|
/* ESC ( B is ambiguous only for ISO_2022 itself */
|
|
if(var == ISO_2022) {
|
|
/* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
|
|
_this->toULength = 0;
|
|
|
|
/* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
|
|
|
|
/* continue with the loop */
|
|
value = VALID_NON_TERMINAL_2022;
|
|
break;
|
|
} else
|
|
#endif
|
|
{
|
|
/* not ISO_2022 itself, finish here */
|
|
value = VALID_TERMINAL_2022;
|
|
key = 0;
|
|
goto DONE;
|
|
}
|
|
}
|
|
}
|
|
|
|
DONE:
|
|
myData2022->key = key;
|
|
|
|
if (value == VALID_NON_TERMINAL_2022) {
|
|
/* indicate that the escape sequence is incomplete: key!=0 */
|
|
return;
|
|
} else if (value == INVALID_2022 ) {
|
|
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
|
return;
|
|
} else /* value == VALID_TERMINAL_2022 */ {
|
|
switch(var){
|
|
#ifdef U_ENABLE_GENERIC_ISO_2022
|
|
case ISO_2022:
|
|
{
|
|
const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
|
|
if(chosenConverterName == NULL) {
|
|
/* SS2 or SS3 */
|
|
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
|
return;
|
|
}
|
|
|
|
_this->mode = UCNV_SI;
|
|
ucnv_close(myData2022->currentConverter);
|
|
myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
|
|
if(U_SUCCESS(*err)) {
|
|
myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
|
|
}
|
|
break;
|
|
}
|
|
#endif
|
|
case ISO_2022_JP:
|
|
{
|
|
StateEnum tempState=nextStateToUnicodeJP[myData2022->version][offset];
|
|
_this->mode = UCNV_SI;
|
|
if(tempState == INVALID_STATE) {
|
|
myData2022->toUnicodeCurrentState = INVALID_STATE;
|
|
myData2022->currentConverter = myUConverter = NULL;
|
|
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
|
} else if(tempState == SS2_STATE) {
|
|
myUConverter = myData2022->currentConverter;
|
|
} else {
|
|
myData2022->toUnicodeCurrentState = tempState;
|
|
myData2022->currentConverter = myUConverter = myData2022->myConverterArray[tempState];
|
|
}
|
|
}
|
|
break;
|
|
case ISO_2022_CN:
|
|
{
|
|
StateEnum tempState=nextStateToUnicodeCN[myData2022->version][offset];
|
|
switch(tempState) {
|
|
case INVALID_STATE:
|
|
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
|
break;
|
|
case SS2_STATE:
|
|
if(myData2022->toU2022State.cs[2]!=0) {
|
|
if(myData2022->toU2022State.g<2) {
|
|
myData2022->toU2022State.prevG=myData2022->toU2022State.g;
|
|
}
|
|
myData2022->toU2022State.g=2;
|
|
} else {
|
|
/* illegal to have SS2 before a matching designator */
|
|
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
|
}
|
|
break;
|
|
case SS3_STATE:
|
|
if(myData2022->toU2022State.cs[3]!=0) {
|
|
if(myData2022->toU2022State.g<2) {
|
|
myData2022->toU2022State.prevG=myData2022->toU2022State.g;
|
|
}
|
|
myData2022->toU2022State.g=3;
|
|
} else {
|
|
/* illegal to have SS3 before a matching designator */
|
|
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
|
}
|
|
break;
|
|
case GB2312_1:
|
|
case ISO_IR_165:
|
|
case CNS_11643_1:
|
|
myData2022->toU2022State.cs[1]=(int8_t)tempState;
|
|
break;
|
|
case CNS_11643_2:
|
|
myData2022->toU2022State.cs[2]=(int8_t)tempState;
|
|
break;
|
|
default:
|
|
/* other CNS 11643 planes */
|
|
myData2022->toU2022State.cs[3]=(int8_t)tempState;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
case ISO_2022_KR:
|
|
if(offset==0x30){
|
|
_this->mode = UCNV_SI;
|
|
myUConverter = myData2022->currentConverter=myData2022->fromUnicodeConverter;
|
|
} else {
|
|
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
|
break;
|
|
}
|
|
}
|
|
if(U_SUCCESS(*err)) {
|
|
_this->mode = UCNV_SO;
|
|
_this->toULength = 0;
|
|
}
|
|
}
|
|
|
|
/*Checks the characters of the buffer against valid 2022 escape sequences
|
|
*if the match we return a pointer to the initial start of the sequence otherwise
|
|
*we return sourceLimit
|
|
*/
|
|
/*for 2022 looks ahead in the stream
|
|
*to determine the longest possible convertible
|
|
*data stream
|
|
*/
|
|
static const char*
|
|
getEndOfBuffer_2022(const char** source,
|
|
const char* sourceLimit,
|
|
UBool flush){
|
|
|
|
const char* mySource = *source;
|
|
|
|
if (*source >= sourceLimit)
|
|
return sourceLimit;
|
|
|
|
do{
|
|
|
|
if (*mySource == ESC_2022){
|
|
int8_t i;
|
|
int32_t key = 0;
|
|
int32_t offset;
|
|
UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
|
|
|
|
/* Kludge: I could not
|
|
* figure out the reason for validating an escape sequence
|
|
* twice - once here and once in changeState_2022().
|
|
* is it possible to have an ESC character in a ISO2022
|
|
* byte stream which is valid in a code page? Is it legal?
|
|
*/
|
|
for (i=0;
|
|
(mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
|
|
i++) {
|
|
value = getKey_2022(*(mySource+i), &key, &offset);
|
|
}
|
|
if (value > 0 || *mySource==ESC_2022)
|
|
return mySource;
|
|
|
|
if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
|
|
return sourceLimit;
|
|
}
|
|
}while (++mySource < sourceLimit);
|
|
|
|
return sourceLimit;
|
|
}
|
|
|
|
|
|
static U_INLINE void
|
|
CONCAT_ESCAPE_EX(UConverterFromUnicodeArgs* args,
|
|
const UChar* source,
|
|
unsigned char** target,
|
|
const unsigned char* targetLimit,
|
|
int32_t** offsets,
|
|
const char* strToAppend,
|
|
int len,
|
|
UErrorCode* err)
|
|
{
|
|
|
|
unsigned char* myTarget = *target;
|
|
int32_t* myOffsets = *offsets;
|
|
while(len-->0){
|
|
if(myTarget < targetLimit){
|
|
*(myTarget++) = (unsigned char) *(strToAppend++);
|
|
if(myOffsets){
|
|
*(myOffsets++) = source - args->source -1;
|
|
}
|
|
}
|
|
else{
|
|
args->converter->charErrorBuffer[(int)args->converter->charErrorBufferLength++] = (unsigned char) *(strToAppend++);
|
|
*err =U_BUFFER_OVERFLOW_ERROR;
|
|
}
|
|
}
|
|
*target = myTarget;
|
|
*offsets = myOffsets;
|
|
}
|
|
|
|
/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
|
|
* any future change in _MBCSFromUChar32() function should be reflected in
|
|
* this macro
|
|
*/
|
|
static U_INLINE void
|
|
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
|
|
UChar32 c,
|
|
uint32_t* value,
|
|
UBool useFallback,
|
|
int32_t *length,
|
|
int outputType)
|
|
{
|
|
|
|
const uint16_t *table=sharedData->mbcs.fromUnicodeTable;
|
|
uint32_t stage2Entry;
|
|
uint32_t myValue=0;
|
|
const uint8_t *p;
|
|
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
|
|
if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
|
|
stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
|
|
/* get the bytes and the length for the output */
|
|
if(outputType==MBCS_OUTPUT_2){
|
|
myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
|
|
if(myValue<=0xff) {
|
|
*length=1;
|
|
} else {
|
|
*length=2;
|
|
}
|
|
}else if(outputType==MBCS_OUTPUT_3){
|
|
p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
|
|
myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
|
|
if(myValue<=0xff) {
|
|
*length=1;
|
|
} else if(myValue<=0xffff) {
|
|
*length=2;
|
|
} else {
|
|
*length=3;
|
|
}
|
|
}
|
|
/* is this code point assigned, or do we use fallbacks? */
|
|
if( (stage2Entry&(1<<(16+(c&0xf))))!=0 ||
|
|
(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0)
|
|
) {
|
|
/*
|
|
* We allow a 0 byte output if the "assigned" bit is set for this entry.
|
|
* There is no way with this data structure for fallback output
|
|
* to be a zero byte.
|
|
*/
|
|
/* assigned */
|
|
*value=myValue;
|
|
} else {
|
|
const int32_t *cx=sharedData->mbcs.extIndexes;
|
|
if(cx!=NULL) {
|
|
*length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
|
|
} else {
|
|
/* unassigned */
|
|
*length=0;
|
|
}
|
|
}
|
|
}else{
|
|
*length=0;
|
|
}
|
|
}
|
|
|
|
/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
|
|
* any future change in _MBCSSingleFromUChar32() function should be reflected in
|
|
* this macro
|
|
*/
|
|
static U_INLINE void
|
|
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
|
|
UChar32 c,
|
|
uint32_t* retval,
|
|
UBool useFallback)
|
|
{
|
|
const uint16_t *table;
|
|
int32_t value;
|
|
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
|
|
if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
|
|
value= -1;
|
|
}
|
|
/* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
|
|
table=sharedData->mbcs.fromUnicodeTable;
|
|
/* get the byte for the output */
|
|
value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
|
|
/* is this code point assigned, or do we use fallbacks? */
|
|
if(useFallback ? value>=0x800 : value>=0xc00) {
|
|
value &=0xff;
|
|
} else {
|
|
value= -1;
|
|
}
|
|
*retval=(uint16_t) value;
|
|
}
|
|
|
|
#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*  Alternates between two steps until the input is used up or something has to
*  be reported to the caller:
*    1. convert the bytes up to the next ESC (or the end of the buffer) with the
*       currently selected sub-converter (ASCII is opened if none is selected);
*    2. let changeState_2022() consume the escape sequence.
*/

static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                              UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis = args->converter;
    UConverterDataISO2022* myData = ((UConverterDataISO2022*)(saveThis->extraInfo));
    int8_t length;

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    ((args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart)) ||
                     (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif
|
|
|
|
/*
|
|
* To Unicode Callback helper function
|
|
*/
|
|
static void
|
|
toUnicodeCallback(UConverter *cnv,
|
|
const uint32_t sourceChar, const uint32_t targetUniChar,
|
|
UErrorCode* err){
|
|
if(sourceChar>0xff){
|
|
cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
|
|
cnv->toUBytes[1] = (uint8_t)sourceChar;
|
|
cnv->toULength = 2;
|
|
}
|
|
else{
|
|
cnv->toUBytes[0] =(char) sourceChar;
|
|
cnv->toULength = 2;
|
|
}
|
|
|
|
if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
|
|
*err = U_INVALID_CHAR_FOUND;
|
|
}
|
|
else{
|
|
*err = U_ILLEGAL_CHAR_FOUND;
|
|
}
|
|
}
|
|
|
|
/**************************************ISO-2022-JP*************************************************/
|
|
|
|
/************************************** IMPORTANT **************************************************
|
|
* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
|
|
* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
|
|
* The converter iterates over each Unicode codepoint
|
|
* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
|
|
* processed one char at a time it would make sense to reduce the extra processing a canned converter
|
|
* would do as far as possible.
|
|
*
|
|
* If the implementation of these macros or structure of sharedData struct change in the future, make
|
|
* sure that ISO-2022 is also changed.
|
|
***************************************************************************************************
|
|
*/
|
|
|
|
/***************************************************************************************************
|
|
* Rules for ISO-2022-jp encoding
|
|
* (i) Escape sequences must be fully contained within a line they should not
|
|
* span new lines or CRs
|
|
* (ii) If the last character on a line is represented by two bytes then an ASCII or
|
|
* JIS-Roman character escape sequence should follow before the line terminates
|
|
* (iii) If the first character on the line is represented by two bytes then a two
|
|
* byte character escape sequence should precede it
|
|
* (iv) If no escape sequence is encountered then the characters are ASCII
|
|
* (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
|
|
* and invoked with SS2 (ESC N).
|
|
* (vi) If there is any G0 designation in text, there must be a switch to
|
|
* ASCII or to JIS X 0201-Roman before a space character (but not
|
|
* necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
|
|
* characters such as tab or CRLF.
|
|
* (vii) Supported encodings:
|
|
* ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
|
|
*
|
|
* source : RFC-1554
|
|
*
|
|
* JISX201, JISX208,JISX212 : new .cnv data files created
|
|
* KSC5601 : alias to ibm-949 mapping table
|
|
* GB2312 : alias to ibm-1386 mapping table
|
|
* ISO-8859-1 : Algorithmic implemented as LATIN1 case
|
|
* ISO-8859-7 : alias to ibm-9409 mapping table
|
|
*/
|
|
#define MAX_VALID_CP_JP 9
|
|
static const Cnv2022Type myConverterType[MAX_VALID_CP_JP]={
|
|
ASCII1,
|
|
LATIN1,
|
|
SBCS,
|
|
SBCS,
|
|
DBCS,
|
|
DBCS,
|
|
DBCS,
|
|
DBCS,
|
|
SBCS,
|
|
|
|
};
|
|
|
|
static const StateEnum nextStateArray[5][MAX_VALID_CP_JP]= {
|
|
{JISX201 ,INVALID_STATE,INVALID_STATE,JISX208,ASCII,INVALID_STATE,INVALID_STATE,INVALID_STATE,INVALID_STATE},
|
|
{JISX201,INVALID_STATE,INVALID_STATE,JISX208,JISX212,ASCII,INVALID_STATE,INVALID_STATE,INVALID_STATE},
|
|
{ISO8859_1,ISO8859_7,JISX201,JISX208,JISX212,GB2312,KSC5601,ASCII,INVALID_STATE},
|
|
{JISX201,INVALID_STATE,INVALID_STATE,JISX208,JISX212,HWKANA_7BIT,INVALID_STATE,INVALID_STATE,ASCII},
|
|
{JISX201,INVALID_STATE,INVALID_STATE,JISX208,JISX212,ASCII,INVALID_STATE,INVALID_STATE,INVALID_STATE},
|
|
};
|
|
/* Escape sequence that designates each charset; indexed by the state value. */
static const char escSeqChars[MAX_VALID_CP_JP][6] ={
    /* <ESC>(B   ASCII       */ "\x1B\x28\x42",
    /* <ESC>.A   ISO-8859-1  */ "\x1B\x2E\x41",
    /* <ESC>.F   ISO-8859-7  */ "\x1B\x2E\x46",
    /* <ESC>(J   JISX-201    */ "\x1B\x28\x4A",
    /* <ESC>$B   JISX-208    */ "\x1B\x24\x42",
    /* <ESC>$(D  JISX-212    */ "\x1B\x24\x28\x44",
    /* <ESC>$A   GB2312      */ "\x1B\x24\x41",
    /* <ESC>$(C  KSC5601     */ "\x1B\x24\x28\x43",
    /* <ESC>(I   HWKANA_7BIT */ "\x1B\x28\x49"
};
|
|
/* Byte length of the corresponding escSeqChars[] entry. */
static const int32_t escSeqCharsLen[MAX_VALID_CP_JP] ={
    /* <ESC>(B   ASCII       */ 3,
    /* <ESC>.A   ISO-8859-1  */ 3,
    /* <ESC>.F   ISO-8859-7  */ 3,
    /* <ESC>(J   JISX-201    */ 3,
    /* <ESC>$B   JISX-208    */ 3,
    /* <ESC>$(D  JISX-212    */ 4,
    /* <ESC>$A   GB2312      */ 3,
    /* <ESC>$(C  KSC5601     */ 4,
    /* <ESC>(I   HWKANA_7BIT */ 3
};
|
|
|
|
/*
|
|
* The iteration over various code pages works this way:
|
|
* i) Get the currentState from myConverterData->currentState
|
|
* ii) Check if the character is mapped to a valid character in the currentState
|
|
* Yes -> a) set the initIterState to currentState
|
|
* b) remain in this state until an invalid character is found
|
|
* No -> a) go to the next code page and find the character
|
|
* iii) Before changing the state increment the current state check if the current state
|
|
* is equal to the initIteration state
|
|
* Yes -> A character that cannot be represented in any of the supported encodings
|
|
* break and return a U_INVALID_CHARACTER error
|
|
* No -> Continue and find the character in next code page
|
|
*
|
|
*
|
|
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
|
|
*/
|
|
|
|
static void
|
|
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
|
|
|
|
UConverterDataISO2022 *converterData;
|
|
unsigned char* target = (unsigned char*) args->target;
|
|
const unsigned char* targetLimit = (const unsigned char*) args->targetLimit;
|
|
const UChar* source = args->source;
|
|
const UChar* sourceLimit = args->sourceLimit;
|
|
int32_t* offsets = args->offsets;
|
|
int32_t offset = 0;
|
|
uint32_t targetByteUnit = missingCharMarker;
|
|
UChar32 sourceChar =0x0000;
|
|
const char* escSeq = NULL;
|
|
int32_t len =0; /*length of escSeq chars*/
|
|
UConverterSharedData* sharedData=NULL;
|
|
UBool useFallback;
|
|
|
|
/* state variables*/
|
|
StateEnum* currentState;
|
|
StateEnum initIterState;
|
|
UConverter** currentConverter;
|
|
Cnv2022Type* currentType;
|
|
UConverter** convArray;
|
|
|
|
/* arguments check*/
|
|
if ((args->converter == NULL) || (targetLimit < target) || (sourceLimit < source)){
|
|
*err = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
|
|
/* Initialize */
|
|
converterData = (UConverterDataISO2022*)args->converter->extraInfo;
|
|
useFallback = args->converter->useFallback;
|
|
currentState = &converterData->fromUnicodeCurrentState;
|
|
initIterState = ASCII;
|
|
currentConverter = &converterData->fromUnicodeConverter;
|
|
convArray = converterData->myConverterArray;
|
|
initIterState = *currentState;
|
|
currentType = &converterData->currentType;
|
|
|
|
/* check if the last codepoint of previous buffer was a lead surrogate*/
|
|
if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
|
|
goto getTrail;
|
|
}
|
|
|
|
*currentConverter = convArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
|
|
sharedData= (*currentConverter)->sharedData;
|
|
|
|
while( source < sourceLimit){
|
|
|
|
targetByteUnit = missingCharMarker;
|
|
|
|
if(target < targetLimit){
|
|
sourceChar = *(source++);
|
|
if(sourceChar > SPACE) {
|
|
do{
|
|
switch (*currentType){
|
|
/* most common case*/
|
|
case DBCS:
|
|
{
|
|
uint32_t value=0;
|
|
int32_t length=0;
|
|
/*if(2 == _MBCSFromUChar32(sharedData,sourceChar, &value, useFallback)) {
|
|
targetByteUnit = (uint16_t)value;
|
|
}*/
|
|
MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&value,useFallback,&length,MBCS_OUTPUT_2);
|
|
if(length==2){
|
|
targetByteUnit = value;
|
|
}
|
|
}
|
|
break;
|
|
case ASCII1:
|
|
if(sourceChar < 0x7f){
|
|
targetByteUnit = sourceChar;
|
|
}
|
|
break;
|
|
|
|
case SBCS:
|
|
MBCS_SINGLE_FROM_UCHAR32(sharedData,sourceChar,&targetByteUnit,useFallback);
|
|
/*targetByteUnit=(uint16_t)_MBCSSingleFromUChar32(sharedData,sourceChar,useFallback);*/
|
|
/*
|
|
* If mySourceChar is unassigned, then _MBCSSingleFromUChar32() returns -1
|
|
* which becomes the same as missingCharMarker with the cast to uint16_t.
|
|
*/
|
|
/* Check if the sourceChar is in the HW Kana range*/
|
|
if(0xFF9F-sourceChar<=(0xFF9F-0xFF61)){
|
|
if( converterData->version==3){
|
|
/*we get a1-df from _MBCSSingleFromUChar32 so subtract 0x80*/
|
|
targetByteUnit-=0x80;
|
|
*currentState = HWKANA_7BIT;
|
|
}
|
|
else if( converterData->version==4){
|
|
*currentState = JISX201;
|
|
}
|
|
else{
|
|
targetByteUnit=missingCharMarker;
|
|
}
|
|
*currentConverter = convArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
|
|
*currentType = (Cnv2022Type) myConverterType[*currentState];
|
|
}
|
|
break;
|
|
|
|
case LATIN1:
|
|
if(sourceChar <= 0x00FF){
|
|
targetByteUnit = sourceChar;
|
|
}
|
|
|
|
break;
|
|
default:
|
|
/*not expected */
|
|
break;
|
|
}
|
|
if(targetByteUnit==missingCharMarker){
|
|
*currentState = nextStateArray[converterData->version][*currentState];
|
|
*currentConverter = convArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
|
|
*currentType = (Cnv2022Type) myConverterType[*currentState];
|
|
sharedData= (*currentConverter)->sharedData;
|
|
}
|
|
else
|
|
/*got the mapping so break from while loop*/
|
|
break;
|
|
|
|
}while(initIterState != *currentState);
|
|
|
|
}
|
|
else{
|
|
targetByteUnit = sourceChar;
|
|
*currentState = ASCII;
|
|
*currentType = (Cnv2022Type) myConverterType[*currentState];
|
|
}
|
|
|
|
if(targetByteUnit != missingCharMarker){
|
|
|
|
if( *currentState != initIterState){
|
|
|
|
escSeq = escSeqChars[(int)*currentState];
|
|
len = escSeqCharsLen[(int)*currentState];
|
|
|
|
CONCAT_ESCAPE_EX(args,source, &target,targetLimit, &offsets, escSeq,len,err);
|
|
|
|
/* Append SSN for shifting to G2 */
|
|
if(*currentState==ISO8859_1 || *currentState==ISO8859_7){
|
|
escSeq = UCNV_SS2;
|
|
len = UCNV_SS2_LEN;
|
|
CONCAT_ESCAPE_EX(args, source, &target, targetLimit,&offsets, escSeq,len,err);
|
|
}
|
|
}
|
|
initIterState = *currentState;
|
|
offset = source - args->source -1;
|
|
/* write the targetByteUnit to target */
|
|
if(targetByteUnit <= 0x00FF){
|
|
if( target <targetLimit){
|
|
*(target++) = (unsigned char) targetByteUnit;
|
|
if(offsets){
|
|
*(offsets++) = offset;
|
|
}
|
|
|
|
}else{
|
|
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) targetByteUnit;
|
|
*err = U_BUFFER_OVERFLOW_ERROR;
|
|
}
|
|
}else{
|
|
if(target < targetLimit){
|
|
*(target++) =(unsigned char) (targetByteUnit>>8);
|
|
if(offsets){
|
|
*(offsets++) = offset;
|
|
}
|
|
if(target < targetLimit){
|
|
*(target++) =(unsigned char) (targetByteUnit);
|
|
if(offsets){
|
|
*(offsets++) = offset;
|
|
}
|
|
|
|
}else{
|
|
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
|
|
*err = U_BUFFER_OVERFLOW_ERROR;
|
|
}
|
|
}else{
|
|
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit>>8);
|
|
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
|
|
*err = U_BUFFER_OVERFLOW_ERROR;
|
|
}
|
|
}
|
|
}
|
|
else{
|
|
/* if we cannot find the character after checking all codepages
|
|
* then this is an error
|
|
*/
|
|
|
|
/*check if the char is a First surrogate*/
|
|
if(UTF_IS_SURROGATE(sourceChar)) {
|
|
if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
|
|
getTrail:
|
|
/*look ahead to find the trail surrogate*/
|
|
if(source < sourceLimit) {
|
|
/* test the following code unit */
|
|
UChar trail=(UChar) *source;
|
|
if(UTF_IS_SECOND_SURROGATE(trail)) {
|
|
source++;
|
|
sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
|
|
*err = U_INVALID_CHAR_FOUND;
|
|
/* convert this surrogate code point */
|
|
/* exit this condition tree */
|
|
} else {
|
|
/* this is an unmatched lead code unit (1st surrogate) */
|
|
/* callback(illegal) */
|
|
*err=U_ILLEGAL_CHAR_FOUND;
|
|
}
|
|
} else {
|
|
/* no more input */
|
|
*err = U_ZERO_ERROR;
|
|
}
|
|
} else {
|
|
/* this is an unmatched trail code unit (2nd surrogate) */
|
|
/* callback(illegal) */
|
|
*err=U_ILLEGAL_CHAR_FOUND;
|
|
}
|
|
} else {
|
|
/* callback(unassigned) for a BMP code point */
|
|
*err = U_INVALID_CHAR_FOUND;
|
|
}
|
|
|
|
args->converter->fromUChar32=sourceChar;
|
|
break;
|
|
}
|
|
} /* end if(myTargetIndex<myTargetLength) */
|
|
else{
|
|
*err =U_BUFFER_OVERFLOW_ERROR;
|
|
break;
|
|
}
|
|
|
|
}/* end while(mySourceIndex<mySourceLength) */
|
|
|
|
/*save the state and return */
|
|
args->source = source;
|
|
args->target = (char*)target;
|
|
}
|
|
|
|
/*************** to unicode *******************/
|
|
|
|
static void
|
|
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
|
UErrorCode* err){
|
|
char tempBuf[2];
|
|
const char *mySource = ( char *) args->source;
|
|
UChar *myTarget = args->target;
|
|
const char *mySourceLimit = args->sourceLimit;
|
|
uint32_t targetUniChar = 0x0000;
|
|
uint32_t mySourceChar = 0x0000;
|
|
UConverterDataISO2022* myData;
|
|
StateEnum* currentState;
|
|
uint32_t* toUnicodeStatus;
|
|
|
|
if ((args->converter == NULL) || (myTarget < args->target) || (mySource < args->source)){
|
|
*err = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
myData=(UConverterDataISO2022*)(args->converter->extraInfo);
|
|
currentState = &myData->toUnicodeCurrentState;
|
|
toUnicodeStatus = &args->converter->toUnicodeStatus;
|
|
while(mySource< mySourceLimit){
|
|
|
|
targetUniChar = missingCharMarker;
|
|
|
|
if(myTarget < args->targetLimit){
|
|
|
|
mySourceChar= (unsigned char) *mySource++;
|
|
|
|
/* Consume the escape sequences and ascertain the state */
|
|
if(mySourceChar==UCNV_SI){
|
|
if(myData->version==3 && *toUnicodeStatus==0x00){
|
|
if(myData->toUnicodeSaveState!=INVALID_STATE){
|
|
*currentState = (StateEnum) myData->toUnicodeSaveState;
|
|
continue;
|
|
}
|
|
else{
|
|
*err =U_ILLEGAL_CHAR_FOUND;
|
|
goto CALLBACK;
|
|
}
|
|
|
|
}
|
|
else{
|
|
goto CALLBACK;
|
|
}
|
|
}else if(mySourceChar==UCNV_SO){
|
|
if(myData->version==3 && *toUnicodeStatus==0x00){
|
|
myData->toUnicodeSaveState= (int) *currentState;
|
|
*currentState = HWKANA_7BIT;
|
|
continue;
|
|
}
|
|
else{
|
|
goto CALLBACK;
|
|
}
|
|
}else if(mySourceChar==ESC_2022 || myData->key!=0){
|
|
if(*toUnicodeStatus== 0x00){
|
|
mySource--;
|
|
changeState_2022(args->converter,&(mySource),
|
|
mySourceLimit, ISO_2022_JP, err);
|
|
/*Invalid or illegal escape sequence */
|
|
if(U_SUCCESS(*err)){
|
|
continue;
|
|
|
|
}
|
|
else{
|
|
args->target = myTarget;
|
|
args->source = mySource;
|
|
return;
|
|
}
|
|
}
|
|
else{
|
|
goto CALLBACK;
|
|
}
|
|
}
|
|
|
|
switch(myConverterType[*currentState]){
|
|
case DBCS:
|
|
if(*toUnicodeStatus== 0x00){
|
|
*toUnicodeStatus= (UChar) mySourceChar;
|
|
continue;
|
|
}
|
|
else{
|
|
tempBuf[0] = (char) args->converter->toUnicodeStatus;
|
|
tempBuf[1] = (char) mySourceChar;
|
|
mySourceChar+= (args->converter->toUnicodeStatus)<<8;
|
|
*toUnicodeStatus= 0;
|
|
targetUniChar = _MBCSSimpleGetNextUChar(myData->currentConverter->sharedData, tempBuf, 2, args->converter->useFallback);
|
|
}
|
|
break;
|
|
|
|
|
|
case ASCII1:
|
|
if( mySourceChar < 0x7F){
|
|
targetUniChar = (UChar) mySourceChar;
|
|
}
|
|
else if((uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4) {
|
|
/* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
|
|
targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myData->myConverterArray[JISX201]->sharedData, mySourceChar);
|
|
}
|
|
|
|
break;
|
|
|
|
case SBCS:
|
|
if((uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4) {
|
|
/* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
|
|
targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myData->myConverterArray[JISX201]->sharedData, mySourceChar);
|
|
}
|
|
else if(*currentState==HWKANA_7BIT){
|
|
targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myData->myConverterArray[JISX201]->sharedData, mySourceChar+0x80);
|
|
}
|
|
else {
|
|
targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myData->currentConverter->sharedData, mySourceChar);
|
|
}
|
|
|
|
break;
|
|
|
|
case LATIN1:
|
|
|
|
targetUniChar = (UChar) mySourceChar;
|
|
break;
|
|
|
|
case INVALID_STATE:
|
|
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
|
args->target = myTarget;
|
|
args->source = mySource;
|
|
return;
|
|
|
|
default:
|
|
/* For non-valid state MBCS and others */
|
|
break;
|
|
}
|
|
if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
|
|
if(args->offsets){
|
|
args->offsets[myTarget - args->target]= mySource - args->source - 2
|
|
+(myConverterType[*currentState] <= SBCS);
|
|
|
|
}
|
|
*(myTarget++)=(UChar)targetUniChar;
|
|
targetUniChar=missingCharMarker;
|
|
}
|
|
else{
|
|
CALLBACK:
|
|
/* Call the callback function*/
|
|
toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
|
|
break;
|
|
}
|
|
}
|
|
else{
|
|
*err =U_BUFFER_OVERFLOW_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
args->target = myTarget;
|
|
args->source = mySource;
|
|
}
|
|
|
|
|
|
|
|
/***************************************************************
|
|
* Rules for ISO-2022-KR encoding
|
|
* i) The KSC5601 designator sequence should appear only once in a file,
|
|
* at the beginning of a line before any KSC5601 characters. This usually
|
|
* means that it appears by itself on the first line of the file
|
|
* ii) There are only 2 shifting sequences SO to shift into double byte mode
|
|
* and SI to shift into single byte mode
|
|
*/
|
|
static void
|
|
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
|
|
|
|
UConverter* saveConv = args->converter;
|
|
UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)args->converter->extraInfo;
|
|
args->converter=myConverterData->currentConverter;
|
|
_MBCSFromUnicodeWithOffsets(args,err);
|
|
if(U_FAILURE(*err)){
|
|
if(args->converter->charErrorBufferLength!=0){
|
|
uprv_memcpy(saveConv->charErrorBuffer, args->converter->charErrorBuffer,
|
|
args->converter->charErrorBufferLength);
|
|
saveConv->charErrorBufferLength=args->converter->charErrorBufferLength;
|
|
args->converter->charErrorBufferLength=0;
|
|
}
|
|
if(args->converter->toULength!=0){
|
|
uprv_memcpy(saveConv->toUBytes, args->converter->toUBytes,
|
|
args->converter->toULength);
|
|
saveConv->toULength=args->converter->toULength;
|
|
args->converter->toULength=0;
|
|
}
|
|
}
|
|
args->converter=saveConv;
|
|
}
|
|
|
|
static void
|
|
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
|
|
|
|
const UChar *source = args->source;
|
|
const UChar *sourceLimit = args->sourceLimit;
|
|
unsigned char *target = (unsigned char *) args->target;
|
|
unsigned char *targetLimit = (unsigned char *) args->targetLimit;
|
|
int32_t* offsets = args->offsets;
|
|
uint32_t targetByteUnit = 0x0000;
|
|
UChar32 sourceChar = 0x0000;
|
|
UBool isTargetByteDBCS;
|
|
UBool oldIsTargetByteDBCS;
|
|
UConverterDataISO2022 *converterData;
|
|
UConverterSharedData* sharedData;
|
|
UBool useFallback;
|
|
int32_t length =0;
|
|
|
|
if ((args->converter == NULL) || (args->targetLimit < args->target) || (sourceLimit < args->source)){
|
|
*err = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
/* initialize data */
|
|
converterData=(UConverterDataISO2022*)args->converter->extraInfo;
|
|
sharedData = converterData->fromUnicodeConverter->sharedData;
|
|
useFallback = args->converter->useFallback;
|
|
isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
|
|
oldIsTargetByteDBCS = isTargetByteDBCS;
|
|
/* if the version is 1 then the user is requesting
|
|
* conversion with ibm-25546 pass the arguments to
|
|
* MBCS converter and return
|
|
*/
|
|
if(converterData->version==1){
|
|
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
|
|
return;
|
|
}
|
|
|
|
isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
|
|
if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
|
|
goto getTrail;
|
|
}
|
|
while(source < sourceLimit){
|
|
|
|
targetByteUnit = missingCharMarker;
|
|
|
|
if(target < (unsigned char*) args->targetLimit){
|
|
sourceChar = *source++;
|
|
/* length= _MBCSFromUChar32(converterData->fromUnicodeConverter->sharedData,
|
|
sourceChar,&targetByteUnit,args->converter->useFallback);*/
|
|
MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2);
|
|
/* only DBCS or SBCS characters are expected*/
|
|
/* DB haracters with high bit set to 1 are expected */
|
|
if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
|
|
targetByteUnit=missingCharMarker;
|
|
}
|
|
if (targetByteUnit != missingCharMarker){
|
|
|
|
oldIsTargetByteDBCS = isTargetByteDBCS;
|
|
isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
|
|
/* append the shift sequence */
|
|
if (oldIsTargetByteDBCS != isTargetByteDBCS ){
|
|
|
|
if (isTargetByteDBCS)
|
|
*target++ = UCNV_SO;
|
|
else
|
|
*target++ = UCNV_SI;
|
|
if(offsets)
|
|
*(offsets++)= source - args->source-1;
|
|
}
|
|
/* write the targetUniChar to target */
|
|
if(targetByteUnit <= 0x00FF){
|
|
if( target < targetLimit){
|
|
*(target++) = (unsigned char) targetByteUnit;
|
|
if(offsets){
|
|
*(offsets++) = source - args->source-1;
|
|
}
|
|
|
|
}else{
|
|
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
|
|
*err = U_BUFFER_OVERFLOW_ERROR;
|
|
}
|
|
}else{
|
|
if(target < targetLimit){
|
|
*(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
|
|
if(offsets){
|
|
*(offsets++) = source - args->source-1;
|
|
}
|
|
if(target < targetLimit){
|
|
*(target++) =(unsigned char) (targetByteUnit -0x80);
|
|
if(offsets){
|
|
*(offsets++) = source - args->source-1;
|
|
}
|
|
}else{
|
|
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
|
|
*err = U_BUFFER_OVERFLOW_ERROR;
|
|
}
|
|
}else{
|
|
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
|
|
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
|
|
*err = U_BUFFER_OVERFLOW_ERROR;
|
|
}
|
|
}
|
|
|
|
}
|
|
else{
|
|
/* oops.. the code point is unassingned
|
|
* set the error and reason
|
|
*/
|
|
|
|
/*check if the char is a First surrogate*/
|
|
if(UTF_IS_SURROGATE(sourceChar)) {
|
|
if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
|
|
getTrail:
|
|
/*look ahead to find the trail surrogate*/
|
|
if(source < sourceLimit) {
|
|
/* test the following code unit */
|
|
UChar trail=(UChar) *source;
|
|
if(UTF_IS_SECOND_SURROGATE(trail)) {
|
|
source++;
|
|
sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
|
|
*err = U_INVALID_CHAR_FOUND;
|
|
/* convert this surrogate code point */
|
|
/* exit this condition tree */
|
|
} else {
|
|
/* this is an unmatched lead code unit (1st surrogate) */
|
|
/* callback(illegal) */
|
|
*err=U_ILLEGAL_CHAR_FOUND;
|
|
}
|
|
} else {
|
|
/* no more input */
|
|
*err = U_ZERO_ERROR;
|
|
}
|
|
} else {
|
|
/* this is an unmatched trail code unit (2nd surrogate) */
|
|
/* callback(illegal) */
|
|
*err=U_ILLEGAL_CHAR_FOUND;
|
|
}
|
|
} else {
|
|
/* callback(unassigned) for a BMP code point */
|
|
*err = U_INVALID_CHAR_FOUND;
|
|
}
|
|
|
|
args->converter->fromUChar32=sourceChar;
|
|
args->converter->fromUnicodeStatus = (int32_t)isTargetByteDBCS;
|
|
break;
|
|
}
|
|
} /* end if(myTargetIndex<myTargetLength) */
|
|
else{
|
|
*err =U_BUFFER_OVERFLOW_ERROR;
|
|
break;
|
|
}
|
|
|
|
}/* end while(mySourceIndex<mySourceLength) */
|
|
|
|
/*save the state and return */
|
|
args->source = source;
|
|
args->target = (char*)target;
|
|
args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
|
|
}
|
|
|
|
/************************ To Unicode ***************************************/
|
|
|
|
static void
|
|
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
|
|
UErrorCode* err){
|
|
const char* mySourceLimit;
|
|
char const* sourceStart;
|
|
UConverter* saveThis;
|
|
UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
|
|
do{
|
|
|
|
/*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
|
|
mySourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
|
|
|
|
if (args->converter->mode == UCNV_SO) /*Already doing some conversion*/{
|
|
saveThis = args->converter;
|
|
args->offsets = NULL;
|
|
args->converter = myData->currentConverter;
|
|
_MBCSToUnicodeWithOffsets(args,err);
|
|
if(U_FAILURE(*err)){
|
|
uprv_memcpy(saveThis->invalidUCharBuffer, args->converter->invalidUCharBuffer,
|
|
args->converter->invalidUCharLength);
|
|
saveThis->invalidUCharLength=args->converter->invalidUCharLength;
|
|
}
|
|
args->converter = saveThis;
|
|
}
|
|
|
|
/*-Done with buffer with entire buffer
|
|
-Error while converting
|
|
*/
|
|
if (U_FAILURE(*err) || (args->source == args->sourceLimit))
|
|
return;
|
|
|
|
sourceStart = args->source;
|
|
changeState_2022(args->converter,
|
|
&(args->source),
|
|
args->sourceLimit,
|
|
ISO_2022_KR,
|
|
err);
|
|
/* args->source = sourceStart; */
|
|
|
|
|
|
}while(args->source < args->sourceLimit);
|
|
/* return*/
|
|
}
|
|
|
|
static void
|
|
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
|
UErrorCode* err){
|
|
char tempBuf[2];
|
|
const char *mySource = ( char *) args->source;
|
|
UChar *myTarget = args->target;
|
|
const char *mySourceLimit = args->sourceLimit;
|
|
UChar32 targetUniChar = 0x0000;
|
|
UChar mySourceChar = 0x0000;
|
|
UConverterDataISO2022* myData;
|
|
UConverterSharedData* sharedData ;
|
|
UBool useFallback;
|
|
|
|
|
|
if ((args->converter == NULL) || (args->targetLimit < args->target) || (mySourceLimit < args->source)){
|
|
*err = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
/* initialize state */
|
|
myData=(UConverterDataISO2022*)(args->converter->extraInfo);
|
|
sharedData = myData->fromUnicodeConverter->sharedData;
|
|
useFallback = args->converter->useFallback;
|
|
|
|
if(myData->version==1){
|
|
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
|
|
return;
|
|
}
|
|
while(mySource< mySourceLimit){
|
|
|
|
targetUniChar = missingCharMarker;
|
|
|
|
if(myTarget < args->targetLimit){
|
|
|
|
mySourceChar= (unsigned char) *mySource++;
|
|
|
|
if(mySourceChar==UCNV_SI){
|
|
myData->currentType = SBCS;
|
|
/*consume the source */
|
|
continue;
|
|
}else if(mySourceChar==UCNV_SO){
|
|
myData->currentType = DBCS;
|
|
/*consume the source */
|
|
continue;
|
|
}else if(mySourceChar==ESC_2022 || myData->key!=0){
|
|
|
|
/*
|
|
* Commented out this part to be lenient and allow for
|
|
* more escape sequences in ISO-2022-KR byte stream
|
|
*
|
|
* Already doing some conversion and found escape Sequence
|
|
* if(args->converter->mode == UCNV_SO){
|
|
* *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
|
* }
|
|
* else{
|
|
*
|
|
*/
|
|
|
|
mySource--;
|
|
changeState_2022(args->converter,&(mySource),
|
|
mySourceLimit, ISO_2022_KR, err);
|
|
/*}*/
|
|
if(U_FAILURE(*err)){
|
|
args->target = myTarget;
|
|
args->source = mySource;
|
|
return;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
if(myData->currentType==DBCS){
|
|
if(args->converter->toUnicodeStatus == 0x00){
|
|
args->converter->toUnicodeStatus = (UChar) mySourceChar;
|
|
continue;
|
|
}
|
|
else{
|
|
tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80);
|
|
tempBuf[1] = (char) (mySourceChar+0x80);
|
|
mySourceChar = (UChar)(mySourceChar + (args->converter->toUnicodeStatus<<8));
|
|
args->converter->toUnicodeStatus =0x00;
|
|
targetUniChar = _MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
|
|
}
|
|
}
|
|
else{
|
|
if(args->converter->fromUnicodeStatus == 0x00){
|
|
targetUniChar = _MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
|
|
|
|
}
|
|
|
|
}
|
|
if(targetUniChar < 0xfffe){
|
|
if(args->offsets)
|
|
args->offsets[myTarget - args->target]= mySource - args->source - 1-(myData->currentType==DBCS);
|
|
*(myTarget++)=(UChar)targetUniChar;
|
|
}
|
|
else {
|
|
/* Call the callback function*/
|
|
toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
|
|
break;
|
|
}
|
|
}
|
|
else{
|
|
*err =U_BUFFER_OVERFLOW_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
args->target = myTarget;
|
|
args->source = mySource;
|
|
}
|
|
|
|
/*************************** END ISO2022-KR *********************************/
|
|
|
|
/*************************** ISO-2022-CN *********************************
|
|
*
|
|
* Rules for ISO-2022-CN Encoding:
|
|
* i) The designator sequence must appear once on a line before any instance
|
|
* of character set it designates.
|
|
* ii) If two lines contain characters from the same character set, both lines
|
|
* must include the designator sequence.
|
|
* iii) Once the designator sequence is known, a shifting sequence has to be found
|
|
* to invoke the shifting
|
|
* iv) All lines start in ASCII and end in ASCII.
|
|
* v) Four shifting sequences are employed for this purpose:
|
|
*
|
|
* Sequence ASCII Eq Charsets
|
|
* ---------- ------- ---------
|
|
* SI <SI> US-ASCII
|
|
* SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
|
|
* SS2 <ESC>N CNS-11643-1992 Plane 2
|
|
* SS3 <ESC>O CNS-11643-1992 Planes 3-7
|
|
*
|
|
* vi)
|
|
* SOdesignator : ESC "$" ")" finalchar_for_SO
|
|
* SS2designator : ESC "$" "*" finalchar_for_SS2
|
|
* SS3designator : ESC "$" "+" finalchar_for_SS3
|
|
*
|
|
* ESC $ ) A Indicates the bytes following SO are Chinese
|
|
* characters as defined in GB 2312-80, until
|
|
* another SOdesignation appears
|
|
*
|
|
*
|
|
* ESC $ ) E Indicates the bytes following SO are as defined
|
|
* in ISO-IR-165 (for details, see section 2.1),
|
|
* until another SOdesignation appears
|
|
*
|
|
* ESC $ ) G Indicates the bytes following SO are as defined
|
|
* in CNS 11643-plane-1, until another
|
|
* SOdesignation appears
|
|
*
|
|
* ESC $ * H Indicates the two bytes immediately following
|
|
* SS2 is a Chinese character as defined in CNS
|
|
* 11643-plane-2, until another SS2designation
|
|
* appears
|
|
* (Meaning <ESC>N must precede every 2 byte
|
|
* sequence.)
|
|
*
|
|
* ESC $ + I Indicates the immediate two bytes following SS3
|
|
* is a Chinese character as defined in CNS
|
|
* 11643-plane-3, until another SS3designation
|
|
* appears
|
|
* (Meaning <ESC>O must precede every 2 byte
|
|
* sequence.)
|
|
*
|
|
* ESC $ + J Indicates the immediate two bytes following SS3
|
|
* is a Chinese character as defined in CNS
|
|
* 11643-plane-4, until another SS3designation
|
|
* appears
|
|
* (In English: <ESC>O must precede every 2 byte
|
|
* sequence.)
|
|
*
|
|
* ESC $ + K Indicates the immediate two bytes following SS3
|
|
* is a Chinese character as defined in CNS
|
|
* 11643-plane-5, until another SS3designation
|
|
* appears
|
|
*
|
|
* ESC $ + L Indicates the immediate two bytes following SS3
|
|
* is a Chinese character as defined in CNS
|
|
* 11643-plane-6, until another SS3designation
|
|
* appears
|
|
*
|
|
* ESC $ + M Indicates the immediate two bytes following SS3
|
|
* is a Chinese character as defined in CNS
|
|
* 11643-plane-7, until another SS3designation
|
|
* appears
|
|
*
|
|
* As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
|
|
* has its own designation information before any Chinese characters
|
|
* appear
|
|
*
|
|
*/
|
|
|
|
/* Shift bytes and designator sequences, defined as arrays so that they are truly read-only. */
static const char SHIFT_IN_STR[]               = "\x0F";
static const char SHIFT_OUT_STR[]              = "\x0E";
static const char GB_2312_80_STR[]             = "\x1B\x24\x29\x41";
static const char ISO_IR_165_STR[]             = "\x1B\x24\x29\x45";
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";

/********************** ISO2022-CN Data **************************/

/*
 * Sequences for the ISO-2022-CN charsets: entry 0 is the shift-in byte for
 * ASCII, the others are the 4-byte designators in the order
 * GB 2312, ISO-IR-165, CNS 11643 planes 1..7.
 */
static const char* const escSeqCharsCN[10] ={
    SHIFT_IN_STR,                   /* ASCII */
    GB_2312_80_STR,
    ISO_IR_165_STR,
    CNS_11643_1992_Plane_1_STR,
    CNS_11643_1992_Plane_2_STR,
    CNS_11643_1992_Plane_3_STR,
    CNS_11643_1992_Plane_4_STR,
    CNS_11643_1992_Plane_5_STR,
    CNS_11643_1992_Plane_6_STR,
    CNS_11643_1992_Plane_7_STR
};
|
|
|
|
static void
|
|
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
|
|
|
|
UConverterDataISO2022 *converterData;
|
|
uint8_t *target = (uint8_t *) args->target;
|
|
const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
|
|
const UChar* source = args->source;
|
|
const UChar* sourceLimit = args->sourceLimit;
|
|
int32_t* offsets = args->offsets;
|
|
UChar32 sourceChar;
|
|
char buffer[8];
|
|
int32_t len;
|
|
int8_t choices[3];
|
|
int32_t choiceCount;
|
|
uint32_t targetValue;
|
|
UBool useFallback;
|
|
|
|
/* set up the state */
|
|
converterData = (UConverterDataISO2022*)args->converter->extraInfo;
|
|
useFallback = args->converter->useFallback;
|
|
|
|
choiceCount = 0;
|
|
|
|
/* check if the last codepoint of previous buffer was a lead surrogate*/
|
|
if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
|
|
goto getTrail;
|
|
}
|
|
|
|
while( source < sourceLimit){
|
|
if(target < targetLimit){
|
|
|
|
sourceChar = *(source++);
|
|
/*check if the char is a First surrogate*/
|
|
if(UTF_IS_SURROGATE(sourceChar)) {
|
|
if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
|
|
getTrail:
|
|
/*look ahead to find the trail surrogate*/
|
|
if(source < sourceLimit) {
|
|
/* test the following code unit */
|
|
UChar trail=(UChar) *source;
|
|
if(UTF_IS_SECOND_SURROGATE(trail)) {
|
|
source++;
|
|
sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
|
|
args->converter->fromUChar32=0x00;
|
|
/* convert this supplementary code point */
|
|
/* exit this condition tree */
|
|
} else {
|
|
/* this is an unmatched lead code unit (1st surrogate) */
|
|
/* callback(illegal) */
|
|
*err=U_ILLEGAL_CHAR_FOUND;
|
|
args->converter->fromUChar32=sourceChar;
|
|
break;
|
|
}
|
|
} else {
|
|
/* no more input */
|
|
args->converter->fromUChar32=sourceChar;
|
|
break;
|
|
}
|
|
} else {
|
|
/* this is an unmatched trail code unit (2nd surrogate) */
|
|
/* callback(illegal) */
|
|
*err=U_ILLEGAL_CHAR_FOUND;
|
|
args->converter->fromUChar32=sourceChar;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* do the conversion */
|
|
if(sourceChar <= 0x007f ){
|
|
/* US-ASCII */
|
|
if(converterData->fromU2022State.g == 0) {
|
|
buffer[0] = (char)sourceChar;
|
|
len = 1;
|
|
} else {
|
|
buffer[0] = UCNV_SI;
|
|
buffer[1] = (char)sourceChar;
|
|
len = 2;
|
|
converterData->fromU2022State.g = 0;
|
|
}
|
|
if(sourceChar == CR || sourceChar == LF) {
|
|
/* reset the state at the end of a line */
|
|
uprv_memset(&converterData->fromU2022State, 0, sizeof(ISO2022State));
|
|
}
|
|
}
|
|
else{
|
|
/* convert U+0080..U+10ffff */
|
|
UConverter *cnv;
|
|
int32_t i;
|
|
int8_t cs, g;
|
|
|
|
if(choiceCount == 0) {
|
|
/* try the current SO/G1 converter first */
|
|
choices[0] = converterData->fromU2022State.cs[1];
|
|
|
|
/* default to GB2312_1 if none is designated yet */
|
|
if(choices[0] == 0) {
|
|
choices[0] = GB2312_1;
|
|
}
|
|
|
|
if(converterData->version == 0) {
|
|
/* ISO-2022-CN */
|
|
|
|
/* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
|
|
if(choices[0] == GB2312_1) {
|
|
choices[1] = (int8_t)CNS_11643_1;
|
|
} else {
|
|
choices[1] = (int8_t)GB2312_1;
|
|
}
|
|
|
|
choiceCount = 2;
|
|
} else {
|
|
/* ISO-2022-CN-EXT */
|
|
|
|
/* try one of the other converters */
|
|
switch(choices[0]) {
|
|
case GB2312_1:
|
|
choices[1] = (int8_t)CNS_11643_1;
|
|
choices[2] = (int8_t)ISO_IR_165;
|
|
break;
|
|
case ISO_IR_165:
|
|
choices[1] = (int8_t)GB2312_1;
|
|
choices[2] = (int8_t)CNS_11643_1;
|
|
break;
|
|
default: /* CNS_11643_x */
|
|
choices[1] = (int8_t)GB2312_1;
|
|
choices[2] = (int8_t)ISO_IR_165;
|
|
break;
|
|
}
|
|
|
|
choiceCount = 3;
|
|
}
|
|
}
|
|
|
|
cs = g = 0;
|
|
len = 0;
|
|
|
|
for(i = 0; i < choiceCount && len == 0; ++i) {
|
|
cs = choices[i];
|
|
if(cs > 0) {
|
|
if(cs > CNS_11643_0) {
|
|
cnv = converterData->myConverterArray[CNS_11643];
|
|
MBCS_FROM_UCHAR32_ISO2022(cnv->sharedData,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_3);
|
|
if(len==3) {
|
|
cs = (int8_t)(CNS_11643_0 + (targetValue >> 16) - 0x80);
|
|
len = 2;
|
|
if(cs == CNS_11643_1) {
|
|
g = 1;
|
|
} else if(cs == CNS_11643_2) {
|
|
g = 2;
|
|
} else /* plane 3..7 */ if(converterData->version == 1) {
|
|
g = 3;
|
|
} else {
|
|
/* ISO-2022-CN (without -EXT) does not support plane 3..7 */
|
|
len = 0;
|
|
}
|
|
}
|
|
} else {
|
|
/* GB2312_1 or ISO-IR-165 */
|
|
cnv = converterData->myConverterArray[cs];
|
|
MBCS_FROM_UCHAR32_ISO2022(cnv->sharedData,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_2);
|
|
g = 1; /* used if len == 2 */
|
|
}
|
|
}
|
|
}
|
|
|
|
if(len > 0) {
|
|
len = 0; /* count output bytes; it must have been len == 2 */
|
|
|
|
/* write the designation sequence if necessary */
|
|
if(cs != converterData->fromU2022State.cs[g]) {
|
|
if(cs < CNS_11643) {
|
|
uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
|
|
} else {
|
|
uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
|
|
}
|
|
len = 4;
|
|
converterData->fromU2022State.cs[g] = cs;
|
|
if(g == 1) {
|
|
/* changing the SO/G1 charset invalidates the choices[] */
|
|
choiceCount = 0;
|
|
}
|
|
}
|
|
|
|
/* write the shift sequence if necessary */
|
|
if(g != converterData->fromU2022State.g) {
|
|
switch(g) {
|
|
case 1:
|
|
buffer[len++] = UCNV_SO;
|
|
|
|
/* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
|
|
converterData->fromU2022State.g = 1;
|
|
break;
|
|
case 2:
|
|
buffer[len++] = 0x1b;
|
|
buffer[len++] = 0x4e;
|
|
break;
|
|
default: /* case 3 */
|
|
buffer[len++] = 0x1b;
|
|
buffer[len++] = 0x4f;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* write the two output bytes */
|
|
buffer[len++] = (char)(targetValue >> 8);
|
|
buffer[len++] = (char)targetValue;
|
|
} else {
|
|
/* if we cannot find the character after checking all codepages
|
|
* then this is an error
|
|
*/
|
|
*err = U_INVALID_CHAR_FOUND;
|
|
args->converter->fromUChar32=sourceChar;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* output len>0 bytes in buffer[] */
|
|
if(len == 1) {
|
|
*target++ = buffer[0];
|
|
if(offsets) {
|
|
*offsets++ = source - args->source - 1; /* -1: known to be ASCII */
|
|
}
|
|
} else if(len == 2 && (target + 2) <= targetLimit) {
|
|
*target++ = buffer[0];
|
|
*target++ = buffer[1];
|
|
if(offsets) {
|
|
int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
|
|
*offsets++ = sourceIndex;
|
|
*offsets++ = sourceIndex;
|
|
}
|
|
} else {
|
|
ucnv_fromUWriteBytes(
|
|
args->converter,
|
|
buffer, len,
|
|
(char **)&target, (const char *)targetLimit,
|
|
&offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
|
|
err);
|
|
if(U_FAILURE(*err)) {
|
|
break;
|
|
}
|
|
}
|
|
} /* end if(myTargetIndex<myTargetLength) */
|
|
else{
|
|
*err =U_BUFFER_OVERFLOW_ERROR;
|
|
break;
|
|
}
|
|
|
|
}/* end while(mySourceIndex<mySourceLength) */
|
|
|
|
/*save the state and return */
|
|
args->source = source;
|
|
args->target = (char*)target;
|
|
}
|
|
|
|
|
|
static void
|
|
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
|
UErrorCode* err){
|
|
char tempBuf[3];
|
|
const char *mySource = ( char *) args->source;
|
|
UChar *myTarget = args->target;
|
|
const char *mySourceLimit = args->sourceLimit;
|
|
uint32_t targetUniChar = 0x0000;
|
|
uint32_t mySourceChar = 0x0000;
|
|
UConverterDataISO2022* myData;
|
|
|
|
myData=(UConverterDataISO2022*)(args->converter->extraInfo);
|
|
|
|
if(myData->key != 0) {
|
|
/* continue with a partial escape sequence */
|
|
goto escape;
|
|
} else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
|
|
/* continue with a partial double-byte character */
|
|
mySourceChar = args->converter->toUBytes[0];
|
|
args->converter->toULength = 0;
|
|
goto getTrailByte;
|
|
}
|
|
|
|
while(mySource < mySourceLimit){
|
|
|
|
targetUniChar =missingCharMarker;
|
|
|
|
if(myTarget < args->targetLimit){
|
|
|
|
mySourceChar= (unsigned char) *mySource++;
|
|
|
|
|
|
switch(mySourceChar){
|
|
case UCNV_SI:
|
|
myData->toU2022State.g=0;
|
|
continue;
|
|
|
|
case UCNV_SO:
|
|
if(myData->toU2022State.cs[1] != 0) {
|
|
myData->toU2022State.g=1;
|
|
continue;
|
|
} else {
|
|
/* illegal to have SO before a matching designator */
|
|
break;
|
|
}
|
|
|
|
case ESC_2022:
|
|
mySource--;
|
|
escape:
|
|
changeState_2022(args->converter,&(mySource),
|
|
mySourceLimit, ISO_2022_CN,err);
|
|
|
|
/* invalid or illegal escape sequence */
|
|
if(U_FAILURE(*err)){
|
|
args->target = myTarget;
|
|
args->source = mySource;
|
|
return;
|
|
}
|
|
continue;
|
|
|
|
/* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
|
|
|
|
case CR:
|
|
/*falls through*/
|
|
case LF:
|
|
uprv_memset(&myData->toU2022State, 0, sizeof(ISO2022State));
|
|
/* falls through */
|
|
default:
|
|
/* convert one or two bytes */
|
|
if(myData->toU2022State.g != 0) {
|
|
getTrailByte:
|
|
if(mySource < mySourceLimit) {
|
|
UConverter *cnv;
|
|
StateEnum tempState;
|
|
int32_t tempBufLen;
|
|
char trailByte = *mySource++;
|
|
tempState = (StateEnum)myData->toU2022State.cs[myData->toU2022State.g];
|
|
if(tempState > CNS_11643_0) {
|
|
cnv = myData->myConverterArray[CNS_11643];
|
|
tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
|
|
tempBuf[1] = (char) (mySourceChar);
|
|
tempBuf[2] = trailByte;
|
|
tempBufLen = 3;
|
|
|
|
}else{
|
|
cnv = myData->myConverterArray[tempState];
|
|
tempBuf[0] = (char) (mySourceChar);
|
|
tempBuf[1] = trailByte;
|
|
tempBufLen = 2;
|
|
}
|
|
mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
|
|
if(myData->toU2022State.g>=2) {
|
|
/* return from a single-shift state to the previous one */
|
|
myData->toU2022State.g=myData->toU2022State.prevG;
|
|
}
|
|
targetUniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, tempBuf, tempBufLen, FALSE);
|
|
} else {
|
|
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
|
args->converter->toULength = 1;
|
|
goto endloop;
|
|
}
|
|
}
|
|
else{
|
|
if(mySourceChar <= 0x7f) {
|
|
targetUniChar = (UChar) mySourceChar;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
|
|
if(args->offsets){
|
|
args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
|
|
}
|
|
*(myTarget++)=(UChar)targetUniChar;
|
|
}
|
|
else if(targetUniChar > missingCharMarker){
|
|
/* disassemble the surrogate pair and write to output*/
|
|
targetUniChar-=0x0010000;
|
|
*myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
|
|
if(args->offsets){
|
|
args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
|
|
}
|
|
++myTarget;
|
|
if(myTarget< args->targetLimit){
|
|
*myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
|
|
if(args->offsets){
|
|
args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
|
|
}
|
|
++myTarget;
|
|
}else{
|
|
args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
|
|
(UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
|
|
}
|
|
|
|
}
|
|
else{
|
|
/* Call the callback function*/
|
|
toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
|
|
break;
|
|
}
|
|
}
|
|
else{
|
|
*err =U_BUFFER_OVERFLOW_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
endloop:
|
|
args->target = myTarget;
|
|
args->source = mySource;
|
|
}
|
|
|
|
static void
|
|
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
|
|
UConverter *cnv = args->converter;
|
|
UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
|
|
char *p;
|
|
char buffer[4];
|
|
|
|
p = buffer;
|
|
switch(myConverterData->locale[0]){
|
|
case 'j':
|
|
if(myConverterData->fromUnicodeCurrentState!= ASCII){
|
|
myConverterData->fromUnicodeCurrentState= ASCII;
|
|
myConverterData->currentType = (Cnv2022Type) myConverterType[myConverterData->fromUnicodeCurrentState];
|
|
*p++ = '\x1b';
|
|
*p++ = '\x28';
|
|
*p++ = '\x42';
|
|
|
|
}
|
|
*p++ = cnv->subChar[0];
|
|
break;
|
|
case 'c':
|
|
if(myConverterData->fromU2022State.g != 0) {
|
|
/* not in ASCII mode: switch to ASCII */
|
|
myConverterData->fromU2022State.g = 0;
|
|
*p++ = UCNV_SI;
|
|
}
|
|
*p++ = cnv->subChar[0];
|
|
break;
|
|
case 'k':
|
|
if(args->converter->fromUnicodeStatus){
|
|
args->converter->fromUnicodeStatus=0x00;
|
|
*p++= UCNV_SI;
|
|
}
|
|
|
|
*p++ = cnv->subChar[0];
|
|
|
|
default:
|
|
/* not expected */
|
|
break;
|
|
}
|
|
ucnv_cbFromUWriteBytes(args,
|
|
buffer, (int32_t)(p - buffer),
|
|
offsetIndex, err);
|
|
}
|
|
|
|
/* structure for SafeClone calculations */
|
|
struct cloneStruct
|
|
{
|
|
UConverter cnv;
|
|
UConverterDataISO2022 mydata;
|
|
UConverter currentCnv; /**< for ISO_2022 converter if the current converter is open */
|
|
|
|
UConverter clonedConverters[1]; /* Actually a variable sized array for all of the sub converters to be cloned. */
|
|
};
|
|
|
|
|
|
static UConverter *
|
|
_ISO_2022_SafeClone(
|
|
const UConverter *cnv,
|
|
void *stackBuffer,
|
|
int32_t *pBufferSize,
|
|
UErrorCode *status)
|
|
{
|
|
struct cloneStruct * localClone;
|
|
int32_t bufferSizeNeeded = sizeof(struct cloneStruct);
|
|
UConverterDataISO2022* cnvData = (UConverterDataISO2022*)cnv->extraInfo;
|
|
int32_t i;
|
|
int32_t sizes[UCNV_2022_MAX_CONVERTERS];
|
|
int32_t numConverters = 0;
|
|
int32_t currentConverterIndex = -1;
|
|
int32_t fromUnicodeConverterIndex = -1;
|
|
int32_t currentConverterSize = 0;
|
|
char *ptr; /* buffer pointer */
|
|
|
|
if (U_FAILURE(*status)) {
|
|
return 0;
|
|
}
|
|
|
|
for(i=0;(i<UCNV_2022_MAX_CONVERTERS)&&cnvData->myConverterArray[i];i++) {
|
|
int32_t size;
|
|
|
|
size = 0;
|
|
ucnv_safeClone(cnvData->myConverterArray[i], NULL, &size, status);
|
|
bufferSizeNeeded += size;
|
|
sizes[i] = size;
|
|
numConverters++;
|
|
|
|
if(cnvData->currentConverter == cnvData->myConverterArray[i]) {
|
|
currentConverterIndex = i;
|
|
}
|
|
|
|
if(cnvData->fromUnicodeConverter == cnvData->myConverterArray[i]) {
|
|
fromUnicodeConverterIndex = i;
|
|
}
|
|
}
|
|
|
|
if(currentConverterIndex == -1) { /* -1 means - not found in array. Clone separately */
|
|
currentConverterSize = 0;
|
|
if(cnvData->currentConverter) {
|
|
ucnv_safeClone(cnvData->currentConverter, NULL, ¤tConverterSize, status);
|
|
bufferSizeNeeded += currentConverterSize;
|
|
}
|
|
}
|
|
|
|
for(;i<UCNV_2022_MAX_CONVERTERS;i++) { /* zero the other sizes */
|
|
sizes[i]=0;
|
|
}
|
|
|
|
if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
|
|
*pBufferSize = bufferSizeNeeded;
|
|
return 0;
|
|
}
|
|
|
|
if(*pBufferSize < bufferSizeNeeded) {
|
|
*status = U_BUFFER_OVERFLOW_ERROR;
|
|
return 0;
|
|
}
|
|
|
|
localClone = (struct cloneStruct *)stackBuffer;
|
|
uprv_memcpy(&localClone->cnv, cnv, sizeof(UConverter));
|
|
|
|
uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataISO2022));
|
|
|
|
/* clone back sub cnvs */
|
|
|
|
ptr = (char*)&localClone->clonedConverters;
|
|
for(i=0;i<numConverters;i++) {
|
|
int32_t size;
|
|
size = sizes[i];
|
|
localClone->mydata.myConverterArray[i] = ucnv_safeClone(cnvData->myConverterArray[i], (UConverter*)ptr, &size, status);
|
|
ptr += size;
|
|
}
|
|
for(;i<UCNV_2022_MAX_CONVERTERS;i++) {
|
|
localClone->mydata.myConverterArray[i] = NULL;
|
|
}
|
|
|
|
if(currentConverterIndex == -1) { /* -1 = not found in list */
|
|
/* KR version 1 also uses the state in currentConverter for preserving state
|
|
* so we need to clone it too!
|
|
*/
|
|
if(cnvData->currentConverter) {
|
|
localClone->mydata.currentConverter = ucnv_safeClone(cnvData->currentConverter, ptr, ¤tConverterSize, status);
|
|
ptr += currentConverterSize;
|
|
} else {
|
|
localClone->mydata.currentConverter = NULL;
|
|
}
|
|
} else {
|
|
localClone->mydata.currentConverter = localClone->mydata.myConverterArray[currentConverterIndex];
|
|
}
|
|
|
|
if(fromUnicodeConverterIndex != -1) {
|
|
/* fromUnicodeConverter is in the list */
|
|
localClone->mydata.fromUnicodeConverter = localClone->mydata.myConverterArray[fromUnicodeConverterIndex];
|
|
} else if(cnvData->currentConverter == cnvData->fromUnicodeConverter) {
|
|
/* fromUnicodeConverter is the same as currentConverter */
|
|
localClone->mydata.fromUnicodeConverter = localClone->mydata.currentConverter;
|
|
} else {
|
|
/* fromUnicodeConverter is NULL */
|
|
localClone->mydata.fromUnicodeConverter = NULL;
|
|
}
|
|
|
|
localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
|
|
|
|
return &localClone->cnv;
|
|
}
|
|
|
|
static void
|
|
_ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
|
USet *set,
|
|
UConverterUnicodeSet which,
|
|
UErrorCode *pErrorCode)
|
|
{
|
|
int32_t i;
|
|
USet *cnvSet;
|
|
UConverterDataISO2022* cnvData;
|
|
|
|
if (U_FAILURE(*pErrorCode)) {
|
|
return;
|
|
}
|
|
if (cnv->sharedData == &_ISO2022Data) {
|
|
/* We use UTF-8 in this case */
|
|
uset_addRange(set, 0, 0xd7FF);
|
|
uset_addRange(set, 0xE000, 0x10FFFF);
|
|
return;
|
|
}
|
|
|
|
cnvData = (UConverterDataISO2022*)cnv->extraInfo;
|
|
if (cnv->sharedData == &_ISO2022KRData && cnvData->currentConverter != NULL) {
|
|
ucnv_getUnicodeSet(cnvData->currentConverter, set, which, pErrorCode);
|
|
return;
|
|
}
|
|
|
|
cnvSet = uset_open(0, 0);
|
|
if (!cnvSet) {
|
|
*pErrorCode =U_MEMORY_ALLOCATION_ERROR;
|
|
return;
|
|
}
|
|
|
|
for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
|
|
if(cnvData->myConverterArray[i]!=NULL) {
|
|
ucnv_getUnicodeSet(cnvData->myConverterArray[i], cnvSet, which, pErrorCode);
|
|
uset_addAll(set, cnvSet /* pErrorCode */);
|
|
}
|
|
}
|
|
uset_close(cnvSet);
|
|
}
|
|
|
|
static const UConverterImpl _ISO2022Impl={
|
|
UCNV_ISO_2022,
|
|
|
|
NULL,
|
|
NULL,
|
|
|
|
_ISO2022Open,
|
|
_ISO2022Close,
|
|
_ISO2022Reset,
|
|
|
|
#ifdef U_ENABLE_GENERIC_ISO_2022
|
|
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
|
|
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
|
|
T_UConverter_fromUnicode_UTF8,
|
|
T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC,
|
|
#else
|
|
NULL,
|
|
NULL,
|
|
NULL,
|
|
NULL,
|
|
#endif
|
|
NULL,
|
|
|
|
NULL,
|
|
_ISO2022getName,
|
|
_ISO_2022_WriteSub,
|
|
_ISO_2022_SafeClone,
|
|
_ISO_2022_GetUnicodeSet
|
|
};
|
|
static const UConverterStaticData _ISO2022StaticData={
|
|
sizeof(UConverterStaticData),
|
|
"ISO_2022",
|
|
2022,
|
|
UCNV_IBM,
|
|
UCNV_ISO_2022,
|
|
1,
|
|
3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
|
|
{ 0x1a, 0, 0, 0 },
|
|
1,
|
|
FALSE,
|
|
FALSE,
|
|
0,
|
|
0,
|
|
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
|
};
|
|
const UConverterSharedData _ISO2022Data={
|
|
sizeof(UConverterSharedData),
|
|
~((uint32_t) 0),
|
|
NULL,
|
|
NULL,
|
|
&_ISO2022StaticData,
|
|
FALSE,
|
|
&_ISO2022Impl,
|
|
0
|
|
};
|
|
|
|
/*************JP****************/
|
|
static const UConverterImpl _ISO2022JPImpl={
|
|
UCNV_ISO_2022,
|
|
|
|
NULL,
|
|
NULL,
|
|
|
|
_ISO2022Open,
|
|
_ISO2022Close,
|
|
_ISO2022Reset,
|
|
|
|
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
|
|
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
|
|
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
|
|
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
|
|
NULL,
|
|
|
|
NULL,
|
|
_ISO2022getName,
|
|
_ISO_2022_WriteSub,
|
|
_ISO_2022_SafeClone,
|
|
_ISO_2022_GetUnicodeSet
|
|
};
|
|
static const UConverterStaticData _ISO2022JPStaticData={
|
|
sizeof(UConverterStaticData),
|
|
"ISO_2022_JP",
|
|
0,
|
|
UCNV_IBM,
|
|
UCNV_ISO_2022,
|
|
1,
|
|
6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
|
|
{ 0x1a, 0, 0, 0 },
|
|
1,
|
|
FALSE,
|
|
FALSE,
|
|
0,
|
|
0,
|
|
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
|
};
|
|
static const UConverterSharedData _ISO2022JPData={
|
|
sizeof(UConverterSharedData),
|
|
~((uint32_t) 0),
|
|
NULL,
|
|
NULL,
|
|
&_ISO2022JPStaticData,
|
|
FALSE,
|
|
&_ISO2022JPImpl,
|
|
0
|
|
};
|
|
|
|
/************* KR ***************/
|
|
static const UConverterImpl _ISO2022KRImpl={
|
|
UCNV_ISO_2022,
|
|
|
|
NULL,
|
|
NULL,
|
|
|
|
_ISO2022Open,
|
|
_ISO2022Close,
|
|
_ISO2022Reset,
|
|
|
|
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
|
|
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
|
|
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
|
|
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
|
|
NULL,
|
|
|
|
NULL,
|
|
_ISO2022getName,
|
|
_ISO_2022_WriteSub,
|
|
_ISO_2022_SafeClone,
|
|
_ISO_2022_GetUnicodeSet
|
|
};
|
|
static const UConverterStaticData _ISO2022KRStaticData={
|
|
sizeof(UConverterStaticData),
|
|
"ISO_2022_KR",
|
|
0,
|
|
UCNV_IBM,
|
|
UCNV_ISO_2022,
|
|
1,
|
|
3, /* max 3 bytes per UChar: SO+DBCS */
|
|
{ 0x1a, 0, 0, 0 },
|
|
1,
|
|
FALSE,
|
|
FALSE,
|
|
0,
|
|
0,
|
|
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
|
};
|
|
static const UConverterSharedData _ISO2022KRData={
|
|
sizeof(UConverterSharedData),
|
|
~((uint32_t) 0),
|
|
NULL,
|
|
NULL,
|
|
&_ISO2022KRStaticData,
|
|
FALSE,
|
|
&_ISO2022KRImpl,
|
|
0
|
|
};
|
|
|
|
/*************** CN ***************/
|
|
static const UConverterImpl _ISO2022CNImpl={
|
|
|
|
UCNV_ISO_2022,
|
|
|
|
NULL,
|
|
NULL,
|
|
|
|
_ISO2022Open,
|
|
_ISO2022Close,
|
|
_ISO2022Reset,
|
|
|
|
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
|
|
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
|
|
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
|
|
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
|
|
NULL,
|
|
|
|
NULL,
|
|
_ISO2022getName,
|
|
_ISO_2022_WriteSub,
|
|
_ISO_2022_SafeClone,
|
|
_ISO_2022_GetUnicodeSet
|
|
};
|
|
static const UConverterStaticData _ISO2022CNStaticData={
|
|
sizeof(UConverterStaticData),
|
|
"ISO_2022_CN",
|
|
0,
|
|
UCNV_IBM,
|
|
UCNV_ISO_2022,
|
|
2,
|
|
8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
|
|
{ 0x1a, 0, 0, 0 },
|
|
1,
|
|
FALSE,
|
|
FALSE,
|
|
0,
|
|
0,
|
|
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
|
};
|
|
static const UConverterSharedData _ISO2022CNData={
|
|
sizeof(UConverterSharedData),
|
|
~((uint32_t) 0),
|
|
NULL,
|
|
NULL,
|
|
&_ISO2022CNStaticData,
|
|
FALSE,
|
|
&_ISO2022CNImpl,
|
|
0
|
|
};
|
|
|
|
|
|
|
|
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
|