ICU-6137 Add static method in CharsetMBCS used in ISO2022.

X-SVN-Rev: 23404
This commit is contained in:
Michael Ow 2008-02-09 00:10:25 +00:00
parent b48351f219
commit 90412e106d
2 changed files with 585 additions and 15 deletions

View File

@ -462,17 +462,17 @@ class CharsetMBCS extends CharsetICU {
* MBCS output types for conversions from Unicode. These per-converter types determine the storage method in stage 3
* of the lookup table, mostly how many bytes are stored per entry.
*/
private static final int MBCS_OUTPUT_1 = 0; /* 0 */
private static final int MBCS_OUTPUT_2 = MBCS_OUTPUT_1 + 1; /* 1 */
private static final int MBCS_OUTPUT_3 = MBCS_OUTPUT_2 + 1; /* 2 */
private static final int MBCS_OUTPUT_4 = MBCS_OUTPUT_3 + 1; /* 3 */
private static final int MBCS_OUTPUT_3_EUC = 8; /* 8 */
private static final int MBCS_OUTPUT_4_EUC = MBCS_OUTPUT_3_EUC + 1; /* 9 */
private static final int MBCS_OUTPUT_2_SISO = 12; /* c */
private static final int MBCS_OUTPUT_2_HZ = MBCS_OUTPUT_2_SISO + 1; /* d */
private static final int MBCS_OUTPUT_EXT_ONLY = MBCS_OUTPUT_2_HZ + 1; /* e */
// private static final int MBCS_OUTPUT_COUNT = MBCS_OUTPUT_EXT_ONLY + 1;
private static final int MBCS_OUTPUT_DBCS_ONLY = 0xdb; /* runtime-only type for DBCS-only handling of SISO tables */
/*
 * These constants carry no access modifier, i.e. they are package-private, so that
 * other classes in this package can pass them as the outputType argument of
 * MBCSFromUChar32_ISO2022 (e.g. CharsetMBCS.MBCS_OUTPUT_2).
 * The trailing comment on each line is the constant's numeric value in hexadecimal.
 */
static final int MBCS_OUTPUT_1 = 0; /* 0 */
static final int MBCS_OUTPUT_2 = MBCS_OUTPUT_1 + 1; /* 1 */
static final int MBCS_OUTPUT_3 = MBCS_OUTPUT_2 + 1; /* 2 */
static final int MBCS_OUTPUT_4 = MBCS_OUTPUT_3 + 1; /* 3 */
static final int MBCS_OUTPUT_3_EUC = 8; /* 8 */
static final int MBCS_OUTPUT_4_EUC = MBCS_OUTPUT_3_EUC + 1; /* 9 */
static final int MBCS_OUTPUT_2_SISO = 12; /* c */
static final int MBCS_OUTPUT_2_HZ = MBCS_OUTPUT_2_SISO + 1; /* d */
static final int MBCS_OUTPUT_EXT_ONLY = MBCS_OUTPUT_2_HZ + 1; /* e */
// static final int MBCS_OUTPUT_COUNT = MBCS_OUTPUT_EXT_ONLY + 1;
static final int MBCS_OUTPUT_DBCS_ONLY = 0xdb; /* runtime-only type for DBCS-only handling of SISO tables */
/* GB 18030 data ------------------------------------------------------------ */
@ -927,6 +927,576 @@ class CharsetMBCS extends CharsetICU {
: sharedData.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY ? 1 : -1;
}
/*
 * Looks up the to-Unicode fallback mapping for a given offset.
 *
 * The first countToUFallbacks entries of mbcsTable.toUFallbacks are searched
 * with a binary search, so they are assumed to be sorted by ascending offset
 * (TODO confirm against the code that fills the table).
 *
 * Returns the code point of the entry whose offset equals the requested one,
 * or 0xfffe when the table is empty or holds no such entry.
 */
private static int getFallback(UConverterMBCSTable mbcsTable, int offset) {
    int high = mbcsTable.countToUFallbacks;
    if (high <= 0) {
        /* no fallbacks at all */
        return 0xfffe;
    }

    final MBCSToUFallback[] fallbacks = mbcsTable.toUFallbacks;
    int low = 0;

    /* narrow [low, high) down to a single candidate */
    while (high - low > 1) {
        final int mid = (low + high) / 2;
        if (fallbacks[mid].offset > offset) {
            high = mid;
        } else {
            low = mid;
        }
    }

    /* the candidate only counts if its offset is an exact hit */
    if (fallbacks[low].offset == offset) {
        return fallbacks[low].codePoint;
    }
    return 0xfffe;
}
/*
 * This is a simple version of _MBCSGetNextUChar() that is used
 * by other converter implementations.
 * It only returns an "assigned" result if it consumes the entire input.
 * It does not use state from the converter, nor error codes.
 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
 * It handles conversion extensions but not GB 18030.
 *
 * Parameters:
 *   sharedData   supplies the state table, the Unicode code unit table, the
 *                initial (DBCS-only) state and, optionally, the extension data
 *   source       the bytes of one character; bytes are read with relative
 *                get() calls, so the buffer position advances
 *   useFallback  whether fallback mappings may be returned
 *
 * Return value:
 *   U+fffe  unassigned
 *   U+ffff  illegal (also returned when the input runs out before a final
 *           state table entry is reached)
 *   otherwise the Unicode code point
 */
static int MBCSSimpleGetNextUChar(UConverterSharedData sharedData,
        ByteBuffer source,
        boolean useFallback) {
    int[][] stateTable;
    char[] unicodeCodeUnits;
    int offset;
    int state;
    int action;
    int c;
    int entry;

    /* set up the local pointers */
    stateTable = sharedData.mbcs.stateTable;
    unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits;

    /* converter state */
    offset = 0;
    state = sharedData.mbcs.dbcsOnlyState;

    /* conversion loop: one state table lookup per input byte */
    for (;;) {
        if (!source.hasRemaining()) {
            /* no input at all, or input ended in the middle of a sequence: "illegal" */
            return 0xffff;
        }

        int sourceByte = source.get() & UConverterConstants.UNSIGNED_BYTE_MASK;
        entry = stateTable[state][sourceByte];
        if (MBCS_ENTRY_IS_TRANSITION(entry)) {
            state = MBCS_ENTRY_TRANSITION_STATE(entry);
            offset += MBCS_ENTRY_TRANSITION_OFFSET(entry);
        } else {
            /*
             * An if-else-if chain provides more reliable performance for
             * the most common cases compared to a switch.
             */
            action = MBCS_ENTRY_FINAL_ACTION(entry);
            if (action == MBCS_STATE_VALID_16) {
                offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
                c = unicodeCodeUnits[offset];
                if (c != 0xfffe) {
                    /* done */
                } else if (useFallback) {
                    c = getFallback(sharedData.mbcs, offset);
                    /* else done with 0xfffe */
                }
                break;
            } else if (action == MBCS_STATE_VALID_DIRECT_16) {
                /* output BMP code point */
                c = MBCS_ENTRY_FINAL_VALUE_16(entry);
                break;
            } else if (action == MBCS_STATE_VALID_16_PAIR) {
                offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
                c = unicodeCodeUnits[offset++];
                if (c < 0xd800) {
                    /* output BMP code point below 0xd800 */
                } else if (useFallback ? c <= 0xdfff : c <= 0xdbff) {
                    /* output roundtrip or fallback supplementary code point */
                    c = (((c & 0x3ff) << 10) + unicodeCodeUnits[offset] + (0x10000 - 0xdc00));
                } else if (useFallback ? (c & 0xfffe) == 0xe000 : c == 0xe000) {
                    /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
                    c = unicodeCodeUnits[offset];
                } else if (c == 0xffff) {
                    return 0xffff;
                } else {
                    c = 0xfffe;
                }
                break;
            } else if (action == MBCS_STATE_VALID_DIRECT_20) {
                /* output supplementary code point */
                c = 0x10000 + MBCS_ENTRY_FINAL_VALUE(entry);
                break;
            } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
                if (!useFallback) {
                    c = 0xfffe;
                    break;
                }
                /* output BMP code point */
                c = MBCS_ENTRY_FINAL_VALUE_16(entry);
                break;
            } else if (action == MBCS_STATE_FALLBACK_DIRECT_20) {
                if (!useFallback) {
                    c = 0xfffe;
                    break;
                }
                /* output supplementary code point */
                c = 0x10000 + MBCS_ENTRY_FINAL_VALUE(entry);
                break;
            } else if (action == MBCS_STATE_UNASSIGNED) {
                c = 0xfffe;
                break;
            }

            /*
             * forbid MBCS_STATE_CHANGE_ONLY for this function,
             * and MBCS_STATE_ILLEGAL and reserved action codes
             */
            c = 0xffff;
            break;
        }
    }

    if (c == 0xfffe) {
        /* try an extension mapping */
        ByteBuffer cx = sharedData.mbcs.extIndexes;
        /*
         * The null check must come before any use of cx: a converter without
         * extension data has no extIndexes buffer, and the result is then
         * simply "unassigned".
         */
        if (cx != null) {
            cx.position(0);
            /*
             * NOTE(review): the input is rewound to absolute position 0, so callers
             * presumably pass a buffer that holds exactly one character's bytes
             * starting at index 0 -- confirm against the ISO-2022 caller.
             */
            source.position(0);
            return extSimpleMatchToU(cx, source, useFallback, sharedData);
        }
    }

    return c;
}
/*
 * Extension lookup used by MBCSSimpleGetNextUChar.
 *
 * Runs extMatchToU over the source bytes (no SI/SO state, flush == true) and
 * accepts the result only when the match length equals the length of the
 * source buffer's backing array and the matched value is a single code point.
 *
 * Return value:
 *   U+ffff  the source has no remaining bytes
 *   U+fffe  no usable match, because
 *           - match>0 but the value refers to a string: simple conversion
 *             cannot handle multiple code points
 *           - match>0 && match!=length: not all input consumed, forbidden here
 *           - match==0: no match found in the first place
 *           - match<0: partial match, not supported for simple conversion
 *   otherwise the matched Unicode code point
 */
private static int extSimpleMatchToU(ByteBuffer cx, ByteBuffer source, boolean useFallback, UConverterSharedData sharedData) {
    if (source.remaining() <= 0) {
        return 0xffff;
    }

    final int[] matched = new int[1];
    final int consumed = extMatchToU(cx, (byte)-1, source, null, matched,
            useFallback, true, sharedData);

    /* the whole input must have been consumed by the match */
    final boolean wholeInput = consumed == source.array().length;
    if (wholeInput && TO_U_IS_CODE_POINT(matched[0])) {
        /* simple, single-character conversion */
        return TO_U_GET_CODE_POINT(matched[0]);
    }

    return 0xfffe;
}
/*
 * Extension matching used by extSimpleMatchToU: walks the to-Unicode extension
 * table with the bytes of pre[] followed by the bytes of src[] and remembers
 * the longest match.
 *
 * Parameters:
 *   cx            the extension indexes buffer; null means "no extension data"
 *   sisoState     0 restricts matching to exactly one byte (SBCS state of an
 *                 SI/SO stateful converter) and forces flush; the value is also
 *                 passed to TO_U_VERIFY_SISO_MATCH
 *   pre, src      input bytes; the length of each is taken from the length of
 *                 its backing array, and bytes are read with absolute get(index).
 *                 src may be null.
 *   pMatchValue   receives the matched value in pMatchValue[0] when the
 *                 return value is positive
 *   isUseFallback currently not consulted: only roundtrip mappings are accepted
 *                 (the fallback test is not wired in)
 *   flush         true if no more input will follow
 *   sharedData    not used by this method
 *
 * Return value:
 *   >0  number of input bytes covered by the longest match
 *    0  no match, or no extension data
 *   <0  negated number of bytes consumed so far for a partial match that
 *       needs more input (only when flush is false)
 */
private static int extMatchToU(ByteBuffer cx, byte sisoState, ByteBuffer pre, ByteBuffer src,
        int[] pMatchValue, boolean isUseFallback, boolean flush, UConverterSharedData sharedData) {
    IntBuffer toUTable, toUSection;
    int preLength = pre.array().length;
    int value, matchValue, srcLength;
    int i, j, index, length, matchLength;
    short b;

    if (src == null) {
        srcLength = 0;
    } else {
        srcLength = src.array().length;
    }

    /*
     * ByteBuffer.getInt(int) takes an absolute BYTE offset while EXT_TO_U_LENGTH
     * is the index of a 32-bit slot, so it must be scaled by 4 (same as the
     * EXT_FROM_U_STAGE_1_LENGTH lookup in extMatchFromU).
     */
    if (cx == null || cx.getInt(EXT_TO_U_LENGTH * 4) <= 0) {
        return 0; /* no extension data, no match */
    }

    /* initialize */
    toUTable = (IntBuffer)ARRAY(cx, EXT_TO_U_INDEX, int.class);
    index = 0;

    matchValue = 0;
    i = j = matchLength = 0;

    if (sisoState == 0) {
        /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */
        if (preLength > 1) {
            return 0; /* no match of a DBCS sequence in SBCS mode */
        } else if (preLength == 1) {
            srcLength = 0;
        } else /* preLength==0 */ {
            if (srcLength > 1) {
                srcLength = 1;
            }
        }
        flush = true;
    }

    /* we must not remember fallback matches when not using fallbacks */

    /* match input units until there is a full match or the input is consumed */
    for (;;) {
        /* go to the next section; the table's own position is restored afterwards */
        int oldpos = toUTable.position();
        toUSection = ((IntBuffer) toUTable.position(index)).slice();
        toUTable.position(oldpos);

        /* read first pair of the section */
        value = toUSection.get();
        length = TO_U_GET_BYTE(value);
        value = TO_U_GET_VALUE(value);
        /* NOTE(review): only roundtrip values are accepted here; isUseFallback is ignored */
        if (value != 0 && TO_U_IS_ROUNDTRIP(value)
                && TO_U_VERIFY_SISO_MATCH(sisoState, i + j)) {
            /* remember longest match so far */
            matchValue = value;
            matchLength = i + j;
        }

        /* match pre[] then src[] */
        if (i < preLength) {
            b = (short) (pre.get(i++) & UConverterConstants.UNSIGNED_BYTE_MASK);
        } else if (j < srcLength) {
            b = (short) (src.get(j++) & UConverterConstants.UNSIGNED_BYTE_MASK);
        } else {
            /* all input consumed, partial match */
            if (flush || (length = (i + j)) > MAX_BYTES) {
                /*
                 * end of the entire input stream, stop with the longest match so far or: partial match must not
                 * be longer than UCNV_EXT_MAX_BYTES because it must fit into state buffers
                 */
                break;
            } else {
                /* continue with more input next time */
                return -length;
            }
        }

        /* search for the current byte in the section */
        value = findToU(toUSection, length, b);
        if (value == 0) {
            /* no match here, stop with the longest match so far */
            break;
        } else {
            if (TO_U_IS_PARTIAL(value)) {
                /* partial match, continue */
                index = TO_U_GET_PARTIAL_INDEX(value);
            } else {
                /* NOTE(review): only roundtrip values are accepted here; isUseFallback is ignored */
                if (TO_U_IS_ROUNDTRIP(value) && TO_U_VERIFY_SISO_MATCH(sisoState, i + j)) {
                    /* full match, stop with result */
                    matchValue = value;
                    matchLength = i + j;
                } else {
                    /* full match on fallback not taken, stop with the longest match so far */
                }
                break;
            }
        }
    }

    if (matchLength == 0) {
        /* no match at all */
        return 0;
    }

    /* return result */
    pMatchValue[0] = TO_U_MASK_ROUNDTRIP(matchValue);
    return matchLength;
}
/*
 * This is another simple conversion function for internal use by other
 * conversion implementations.
 * It does not use the converter state nor call callbacks.
 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
 * It handles conversion extensions but not GB 18030.
 *
 * It converts a single Unicode code point into code page bytes, encoded
 * as one 32-bit value stored in value[0], with the last byte in bits 7..0,
 * the second to last byte in bits 15..8, etc.
 *
 * Parameters:
 *   sharedData   supplies the from-Unicode tables and the extension data
 *   c            the code point; assumed but not checked to be 0<=c<=0x10ffff
 *   value        receives the resulting bytes in value[0]
 *   useFallback  passed to CharsetEncoderICU.isFromUUseFallback to decide
 *                whether an unassigned but non-zero result may be used
 *   outputType   MBCS output type, e.g. CharsetMBCS.MBCS_OUTPUT_2; any value
 *                other than MBCS_OUTPUT_2 is handled as MBCS_OUTPUT_3
 *
 * Return value:
 *   >0  number of bytes in value[0] for an assigned (roundtrip) mapping
 *   <0  negated number of bytes in value[0] for a fallback mapping
 *    0  unassigned (value[0] undefined)
 * When the main table yields nothing, the result of the extension lookup
 * (extSimpleMatchFromU) is returned instead, if extension data is present.
 */
static int MBCSFromUChar32_ISO2022(UConverterSharedData sharedData, int c, int[] value, boolean useFallback,
        int outputType) {
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    final boolean coveredByTable = c < 0x10000
            || (sharedData.mbcs.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) != 0;

    if (coveredByTable) {
        final char[] table = sharedData.mbcs.fromUnicodeTable;
        final int stage2Entry = MBCS_STAGE_2_FROM_U(table, c);

        /* fetch the output bytes and derive their count from the magnitude */
        final int packedBytes;
        final int byteCount;
        if (outputType == MBCS_OUTPUT_2) {
            packedBytes = MBCS_VALUE_2_FROM_STAGE_2(sharedData.mbcs.fromUnicodeBytes, stage2Entry, c);
            byteCount = packedBytes <= 0xff ? 1 : 2;
        } else { /* outputType == MBCS_OUTPUT_3 */
            final byte[] bytes = sharedData.mbcs.fromUnicodeBytes;
            final int p = MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
            final int lead = bytes[p] & UConverterConstants.UNSIGNED_BYTE_MASK;
            final int middle = bytes[p + 1] & UConverterConstants.UNSIGNED_BYTE_MASK;
            final int trail = bytes[p + 2] & UConverterConstants.UNSIGNED_BYTE_MASK;
            packedBytes = (lead << 16) | (middle << 8) | trail;
            byteCount = packedBytes <= 0xff ? 1 : (packedBytes <= 0xffff ? 2 : 3);
        }

        /* bits 16..31 of the stage 2 entry flag which of the 16 code points are assigned */
        final boolean assigned = (stage2Entry & (1 << (16 + (c & 0xf)))) != 0;
        if (assigned) {
            value[0] = packedBytes;
            return byteCount;
        }

        /*
         * We allow a 0 byte output if the "assigned" bit is set for this entry.
         * There is no way with this data structure for fallback output
         * to be a zero byte.
         */
        if (CharsetEncoderICU.isFromUUseFallback(useFallback, c) && packedBytes != 0) {
            value[0] = packedBytes;
            return -byteCount;
        }
    }

    /* nothing in the main table: consult the extension data, if any */
    final ByteBuffer cx = sharedData.mbcs.extIndexes;
    if (cx == null) {
        return 0;
    }
    return extSimpleMatchFromU(cx, c, value, useFallback);
}
/*
 * Extension lookup for a single code point, used by the ISO 2022 implementation
 * (via MBCSFromUChar32_ISO2022).
 *
 * Runs extMatchFromU for c alone (no further input, flush == true) and unpacks
 * the matched value when its byte length fits a direct result.
 *
 * @return number of bytes in pValue[0]; negative number if fallback; 0 for no
 *         mapping. 0 is returned because
 *         - match>1 but the result is longer than EXT_FROM_U_MAX_DIRECT_LENGTH:
 *           result too long for simple conversion
 *         - match==1: no match found, <subchar1> preferred
 *         - match==0: no match found in the first place
 *         - match<0: partial match, not supported for simple conversion
 */
private static int extSimpleMatchFromU(ByteBuffer cx, int c, int[] pValue, boolean useFallback) {
    final int[] matched = new int[1];

    /* try to match */
    final int match = extMatchFromU(cx, c, null, null, matched, useFallback, true);
    if (match < 2) {
        return 0;
    }

    final int packed = matched[0];
    final boolean roundtrip = FROM_U_IS_ROUNDTRIP(packed);
    final int byteCount = FROM_U_GET_LENGTH(packed);
    final int data = FROM_U_GET_DATA(packed);

    if (byteCount > EXT_FROM_U_MAX_DIRECT_LENGTH) {
        /* result too long for simple conversion */
        return 0;
    }

    pValue[0] = data;
    if (roundtrip) {
        return byteCount;
    }
    return -byteCount;
}
/*
 * Extension matching from Unicode: looks up firstCP in the from-Unicode trie of
 * the extension data and, if that lookup yields a partial match, continues
 * matching the UTF-16 units of pre[] and then src[] against the section tables,
 * remembering the longest match.
 *
 * Parameters:
 *   cx           the extension indexes buffer; null means "no extension data"
 *   firstCP      the first code point of the input
 *   pre, src     further input units (either may be null)
 *   pMatchValue  receives the matched value in pMatchValue[0] when the return
 *                value is 2 or greater
 *   useFallback  passed to CharsetEncoderICU.isFromUUseFallback (together with
 *                firstCP) to decide whether non-roundtrip values are accepted
 *   flush        true if no more input will follow
 *
 * Return value:
 *   >=2  match length, counted as 2 for firstCP plus the number of units
 *        consumed from pre[] and src[]
 *     1  the matched value is FROM_U_SUBCHAR1
 *     0  no match, or no extension data
 *    <0  -(2 + units consumed) for a partial match that needs more input
 *        (only when flush is false)
 */
private static int extMatchFromU(ByteBuffer cx, int firstCP, char[] pre, char[] src, int[] pMatchValue, boolean useFallback, boolean flush) {
CharBuffer stage12, stage3;
IntBuffer stage3b;
CharBuffer fromUTableUChars, fromUSectionUChars;
IntBuffer fromUTableValues, fromUSectionValues;
int value, matchValue;
int i, j, index, length, matchLength;
char c;
if (cx == null) {
return 0; /* no extension data, no match */
}
/* trie lookup of firstCP */
index = firstCP>>10; /* stage 1 index */
/* getInt() takes a byte offset, hence the int-slot index is multiplied by 4 */
if (index>=cx.getInt(EXT_FROM_U_STAGE_1_LENGTH*4)) { // need to find the correct int in the bytebuffer
return 0; /* the first code point is outside the trie */
}
stage12 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX, char.class);
stage3 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX, char.class);
index = FROM_U(stage12, stage3, index, firstCP);
stage3b = (IntBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX, int.class);
value = stage3b.get(index);
if (value == 0) {
return 0;
}
/*
* Tests for (value&EXT_FROM_U_RESERVED_MASK) == 0:
* Do not interpret values with reserved bits used, for forward compatibility,
* and do not even remember intermediate results with reserved bits used.
*/
/* NOTE(review): this test uses TO_U_IS_PARTIAL while the loop below uses FROM_U_IS_PARTIAL -- confirm this is intended */
if (TO_U_IS_PARTIAL(value)) {
/* partial match, enter the loop below */
index = FROM_U_GET_PARTIAL_INDEX(value);
/* initialize */
fromUTableUChars = (CharBuffer)ARRAY(cx, EXT_FROM_U_UCHARS_INDEX, char.class);
fromUTableValues = (IntBuffer)ARRAY(cx, EXT_FROM_U_VALUES_INDEX, int.class);
matchValue = 0;
i = j = matchLength = 0;
/* we must not remember fallback matches when not using fallbacks */
/*match inputs until there is a full match or the input is consumed */
for(;;) {
/* go to the next section */
/* the section is a slice starting at index; the table's own position is restored right after slicing */
int oldpos = fromUTableUChars.position();
fromUSectionUChars = ((CharBuffer)fromUTableUChars.position(index)).slice();
fromUTableUChars.position(oldpos);
oldpos = fromUTableValues.position();
fromUSectionValues = ((IntBuffer)fromUTableValues.position(index)).slice();
fromUTableValues.position(oldpos);
/*read first pair of the section */
/* the first char of a section is used as its length, the first int as the value for the input matched so far */
length = fromUSectionUChars.get();
value = fromUSectionValues.get();
if (value != 0 &&
(FROM_U_IS_ROUNDTRIP(value) || CharsetEncoderICU.isFromUUseFallback(useFallback, firstCP)) &&
(value&FROM_U_RESERVED_MASK) == 0) {
/* remember longest match so far */
matchValue = value;
matchLength = 2 + i + j;
}
/* match pre[] then src[] */
if (pre != null && i < pre.length) {
c = pre[i++];
} else if (src != null && j < src.length) {
c = src[j++];
} else {
/* all input consumed, partial match */
if (flush || (length=(i+j))> MAX_UCHARS) {
/*
* end of the entire input stream, stop with the longest match so far
* or: partial match must not be longer than MAX_UCHARS
* because it must fit into state buffers
*/
break;
} else {
/* continue with more input next time */
return -(2+length);
}
}
/* search for the current UChar */
index = findFromU(fromUSectionUChars, length, c);
if (index < 0) {
/* no match here, stop with the longest match so far */
break;
} else {
value = fromUSectionValues.get(index);
if (FROM_U_IS_PARTIAL(value)) {
/* partial match, continue */
index = FROM_U_GET_PARTIAL_INDEX(value);
} else {
if ((FROM_U_IS_ROUNDTRIP(value) || CharsetEncoderICU.isFromUUseFallback(useFallback, firstCP)) &&
(value&FROM_U_RESERVED_MASK) == 0 ) {
/* full match, stop with result */
matchValue = value;
matchLength = 2 + i + j;
} else {
/* full match on fallback not taken, stop with the longest match so far */
}
break;
}
}
}
if (matchLength == 0) {
/* no match at all */
return 0;
}
} else { /* result from firstCP trie lookup */
if ((FROM_U_IS_ROUNDTRIP(value) || CharsetEncoderICU.isFromUUseFallback(useFallback, firstCP)) &&
(value&FROM_U_RESERVED_MASK) == 0) {
/* full match, stop with result */
matchValue = value;
matchLength = 2;
} else {
/* fallback not taken */
return 0;
}
}
/* return result */
/* pMatchValue is left untouched in the subchar1 case */
if (matchValue == FROM_U_SUBCHAR1) {
return 1; /* assert matchLength == 2 */
}
pMatchValue[0] = matchValue;
return matchLength;
}
/*
 * Converts one code point to a single output byte using the single-byte
 * from-Unicode lookup (MBCS_SINGLE_RESULT_FROM_U).
 *
 * @param sharedData  supplies the from-Unicode table and result data
 * @param c           the Unicode code point to convert
 * @param retval      receives the output byte in retval[0]; it is written
 *                    whenever the lookup is performed, even when the return
 *                    value is 0
 * @param useFallback selects the lower threshold (0x800 instead of 0xc00)
 *                    for accepting a non-roundtrip lookup result
 * @return 1 roundtrip byte, 0 no mapping, -1 fallback byte
 */
static int MBCSSingleFromUChar32(UConverterSharedData sharedData, int c, int[] retval, boolean useFallback) {
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    final boolean supplementary = c >= 0x10000;
    if (supplementary && (sharedData.mbcs.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
        return 0;
    }

    /* convert the Unicode code point in c into a lookup result */
    final char[] table = sharedData.mbcs.fromUnicodeTable;
    final int result = MBCS_SINGLE_RESULT_FROM_U(table, sharedData.mbcs.fromUnicodeBytes, c);

    /* the low 8 bits are the output byte, the bits above classify the mapping */
    retval[0] = result & 0xff;

    if (result >= 0xf00) {
        return 1; /* roundtrip */
    }

    final int fallbackThreshold = useFallback ? 0x800 : 0xc00;
    if (result >= fallbackThreshold) {
        return -1; /* fallback taken */
    }
    return 0; /* no mapping */
}
class CharsetDecoderMBCS extends CharsetDecoderICU {
CharsetDecoderMBCS(CharsetICU cs) {

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 2006-2007, International Business Machines Corporation and *
* Copyright (C) 2006-2008, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
@ -50,9 +50,9 @@ public final class CharsetProviderICU extends CharsetProvider{
// create the converter object and return it
if(icuCanonicalName==null || icuCanonicalName.length()==0){
// this would make the Charset API to throw
// unsupported encoding exception
return null;
// Try the original name, may be something added and not in the alias table.
// Will get an unsupported encoding exception if it doesn't work.
return getCharset(charsetName);
}
return getCharset(icuCanonicalName);
}catch(UnsupportedCharsetException ex){