ICU-6583 Port over illegal sequence handling code from ticket #5691 to ICU4J. Fix minor bugs in various callback functions and error handling code in ICU4J. Reenable "full" data driven conversion test.

X-SVN-Rev: 25468
This commit is contained in:
Michael Ow 2009-02-23 21:38:21 +00:00
parent e729683a89
commit 27ce5a3df5
7 changed files with 417 additions and 562 deletions

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 2006-2008, International Business Machines Corporation and *
* Copyright (C) 2006-2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
@ -332,10 +332,7 @@ public class CharsetCallback {
}
}
}
/* reset the error */
cr = CoderResult.UNDERFLOW;
cr = encoder.cbFromUWriteUChars(encoder, CharBuffer.wrap(valueString, 0, valueStringLength), target, offsets);
return cr;
}
@ -356,7 +353,7 @@ public class CharsetCallback {
if (context == null || !(context instanceof String)) {
while (i < length) {
uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT; /* adding X */
uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT; /* adding X */
valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
}
} else {
@ -376,9 +373,11 @@ public class CharsetCallback {
uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
}
} else if (((String)context).equals(ESCAPE_C)) {
uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT; /* adding X */
valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
while (i < length) {
uniValueString[valueStringLength++] = UNICODE_RS_CODEPOINT; /* adding \ */
uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */
valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
}
} else {
while (i < length) {
uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
@ -388,10 +387,8 @@ public class CharsetCallback {
}
}
}
/* reset the error */
cr = CoderResult.UNDERFLOW;
CharsetDecoderICU.toUWriteUChars(decoder, uniValueString, 0, valueStringLength, target, offsets, 0);
cr = CharsetDecoderICU.toUWriteUChars(decoder, uniValueString, 0, valueStringLength, target, offsets, 0);
return cr;
}

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 2006-2008, International Business Machines Corporation and *
* Copyright (C) 2006-2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
@ -502,6 +502,8 @@ public abstract class CharsetDecoderICU extends CharsetDecoder{
//UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength);
replayArray.put(preToUArray,0, -preToULength);
// reset position
replayArray.position(0);
source=replayArray;
source.limit(replayArrayIndex-preToULength);
@ -649,7 +651,7 @@ public abstract class CharsetDecoderICU extends CharsetDecoder{
private void copy(byte[] src, int srcOffset, char[] dst, int dstOffset, int length) {
for(int i=srcOffset; i<length; i++){
dst[dstOffset++]=(char)src[srcOffset++];
dst[dstOffset++]=(char)(src[srcOffset++] & UConverterConstants.UNSIGNED_BYTE_MASK);
}
}
/*

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2008, International Business Machines Corporation and *
* Copyright (C) 2008-2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -58,6 +58,7 @@ class CharsetHZ extends CharsetICU {
}
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
CoderResult err = CoderResult.UNDERFLOW;
byte[] tempBuf = new byte[2];
int targetUniChar = 0;
int mySourceChar = 0;
@ -104,10 +105,25 @@ class CharsetHZ extends CharsetICU {
* if the first byte is equal to TILDE and the trail byte is not a valid byte then it is an
* error condition
*/
mySourceChar |= 0x7e00;
targetUniChar = 0xffff;
isEmptySegment = false; /* different error here, reset this to avoid spurious future error */
break;
/*
* Ticket 5691: consistent illegal sequences:
* - We include at least the first byte in the illegal sequence.
* - If any of the non-initial bytes could be the start of a character,
* we stop the illegal sequence before the first one of those.
*/
isEmptySegment = false; /* different error here, reset this to avoid spurious future error */
err = CoderResult.malformedForLength(1);
toUBytesArray[0] = UCNV_TILDE;
if (isStateDBCS ? (0x21 <= mySourceChar && mySourceChar <= 0x7e) : mySourceChar <= 0x7f) {
/* The current byte could be the start of a character: Back it out. */
toULength = 1;
source.position(source.position() - 1);
} else {
/* Include the current byte in the illegal sequence. */
toUBytesArray[1] = (byte)mySourceChar;
toULength = 2;
}
return err;
}
} else if (isStateDBCS) {
if (toUnicodeStatus == 0) {
@ -124,19 +140,36 @@ class CharsetHZ extends CharsetICU {
continue;
} else {
/* trail byte */
boolean leadIsOk, trailIsOk;
int leadByte = toUnicodeStatus & 0xff;
if (0x21 <= leadByte && leadByte <= 0x7d && 0x21 <= mySourceChar && mySourceChar <= 0x7e) {
tempBuf[0] = (byte) (leadByte + 0x80);
tempBuf[1] = (byte) (mySourceChar + 0x80);
targetUniChar = gbDecoder.simpleGetNextUChar(ByteBuffer.wrap(tempBuf), super.isFallbackUsed());
} else {
targetUniChar = 0xffff;
}
targetUniChar = 0xffff;
/*
* add another bit so that the code below writes 2 bytes in case of error
* Ticket 5691: consistent illegal sequence
* - We include at least the first byte in the illegal sequence.
* - If any of the non-initial bytes could be the start of a character,
* we stop the illegal sequence before the first one of those
*
* In HZ DBCS, if the second byte is in the 21..7e range,
* we report only the first byte as the illegal sequence.
* Otherwise we convert or report the pair of bytes.
*/
mySourceChar |= 0x10000 | (leadByte << 8);
toUnicodeStatus = 0;
leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (leadByte - 0x21)) <= (0x7d - 0x21);
trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
if (leadIsOk && trailIsOk) {
tempBuf[0] = (byte)(leadByte + 0x80);
tempBuf[1] = (byte)(mySourceChar + 0x80);
targetUniChar = gbDecoder.simpleGetNextUChar(ByteBuffer.wrap(tempBuf), super.isFallbackUsed());
mySourceChar = (leadByte << 8) | mySourceChar;
} else if (trailIsOk) {
/* report a single illegal byte and continue with the following DBCS starter byte */
source.position(source.position() - 1);
mySourceChar = (int)leadByte;
} else {
/* report a pair of illegal bytes if the second byte is not a DBCS starter */
/* add another bit so that the code below writes 2 bytes in case of error */
mySourceChar = 0x10000 | (leadByte << 8) | mySourceChar;
}
toUnicodeStatus = 0x00;
}
} else {
if (mySourceChar == UCNV_TILDE) {
@ -177,7 +210,7 @@ class CharsetHZ extends CharsetICU {
}
}
return CoderResult.UNDERFLOW;
return err;
}
}

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2008, International Business Machines Corporation and *
* Copyright (C) 2008-2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -200,10 +200,12 @@ class CharsetISO2022 extends CharsetICU {
}
/*
* Commented out because Ticket 5691: Call sites now check for validity. They can just += 0x8080 after that.
*
* This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
* 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
* unchanged.
*/
*
private static int _2022ToGR94DBCS(int value) {
int returnValue = value + 0x8080;
@ -213,7 +215,7 @@ class CharsetISO2022 extends CharsetICU {
} else {
return value;
}
}
}*/
/* is the StateEnum charset value for a DBCS charset? */
private static boolean IS_JP_DBCS(byte cs) {
@ -528,6 +530,7 @@ class CharsetISO2022 extends CharsetICU {
byte value;
int key[] = {myConverterData.key};
int offset[] = {0};
int initialToULength = decoder.toULength;
byte c;
int malformLength = 0;
@ -571,7 +574,7 @@ class CharsetISO2022 extends CharsetICU {
/* indicate that the escape sequence is incomplete: key !=0 */
return err;
} else if (value == INVALID_2022) {
return CoderResult.malformedForLength(malformLength);
err = CoderResult.malformedForLength(malformLength);
} else /* value == VALID_TERMINAL_2022 */ {
switch (var) {
case ISO_2022_JP: {
@ -679,7 +682,39 @@ class CharsetISO2022 extends CharsetICU {
}
if (!err.isError()) {
decoder.toULength = 0;
} else if (err.isMalformed()) {
if (decoder.toULength > 1) {
/*
* Ticket 5691: consistent illegal sequences:
* - We include at least the first byte (ESC) in the illegal sequence.
* - If any of the non-initial bytes could be the start of a character,
* we stop the illegal sequence before the first one of those.
* In escape sequences, all following bytes are "printable", that is,
* unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
* they are valid single/lead bytes.
* For simplicity, we always only report the initial ESC byte as the
* illegal sequence and back out all other bytes we looked at.
*/
/* Back out some bytes. */
int backOutDistance = decoder.toULength - 1;
int bytesFromThisBuffer = decoder.toULength - initialToULength;
if (backOutDistance <= bytesFromThisBuffer) {
/* same as initialToULength<=1 */
source.position(source.position() - backOutDistance);
} else {
/* Back out bytes from the previous buffer: Need to replay them. */
decoder.preToULength = (byte)(bytesFromThisBuffer - backOutDistance);
/* same as -(initialToULength-1) */
/* preToULength is negative! */
for (int i = 0; i < -(decoder.preToULength); i++) {
decoder.preToUArray[i] = decoder.toUBytesArray[i+1];
}
source.position(source.position() - bytesFromThisBuffer);
}
decoder.toULength = 1;
}
}
return err;
}
@ -820,7 +855,7 @@ class CharsetISO2022 extends CharsetICU {
gotoEscape = true;
} else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) {
/* continue with a partial double-byte character */
mySourceChar = toUBytesArray[0];
mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK);
toULength = 0;
cs = myConverterData.toU2022State.cs[myConverterData.toU2022State.g];
// goto getTrailByte;
@ -838,7 +873,7 @@ class CharsetISO2022 extends CharsetICU {
if (gotoEscape || gotoGetTrail || target.hasRemaining()) {
if (!gotoEscape && !gotoGetTrail) {
mySourceChar = UConverterConstants.UNSIGNED_BYTE_MASK & source.get();
mySourceChar = source.get() & UConverterConstants.UNSIGNED_BYTE_MASK;
mySourceCharTemp = mySourceChar;
}
@ -963,26 +998,48 @@ class CharsetISO2022 extends CharsetICU {
// getTrailByte:
int tmpSourceChar;
gotoGetTrail = false;
byte trailByte;
trailByte = source.get();
tmpSourceChar = (mySourceChar << 8) | (short)(UConverterConstants.UNSIGNED_BYTE_MASK & trailByte);
if (cs == JISX208) {
_2022ToSJIS((char)(UConverterConstants.UNSIGNED_BYTE_MASK & mySourceChar),
(char)(UConverterConstants.UNSIGNED_BYTE_MASK & trailByte), tempBuf);
} else {
if (cs == KSC5601) {
tmpSourceChar = _2022ToGR94DBCS(tmpSourceChar);
short trailByte;
boolean leadIsOk, trailIsOk;
trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK);
/*
* Ticket 5691: consistent illegal sequences:
* - We include at least the first byte in the illegal sequence.
* - If any of the non-initial bytes could be the start of a character,
* we stop the illegal sequence before the first one of those.
*
* In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
* an ESC/SO/SI, we report only the first byte as the illegal sequence.
* Otherwise we convert or report the pair of bytes.
*/
leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21);
if (leadIsOk && trailIsOk) {
source.get();
tmpSourceChar = (mySourceChar << 8) | trailByte;
if (cs == JISX208) {
_2022ToSJIS((char)mySourceChar, (char)trailByte, tempBuf);
mySourceChar = tmpSourceChar;
} else {
/* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
mySourceChar = tmpSourceChar;
if (cs == KSC5601) {
tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
}
tempBuf[0] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (tmpSourceChar >> 8));
tempBuf[1] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & tmpSourceChar);
}
tempBuf[0] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (tmpSourceChar >> 8));
tempBuf[1] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & tmpSourceChar);
targetUniChar = MBCSSimpleGetNextUChar(myConverterData.myConverterArray[cs], ByteBuffer.wrap(tempBuf), false);
} else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
/* report a pair of illegal bytes if the second byte is not a DBCS starter */
source.get();
/* add another bit so that the code below writes 2 bytes in case of error */
mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
}
ByteBuffer tempByteBuf = ByteBuffer.wrap(tempBuf);
targetUniChar = MBCSSimpleGetNextUChar(myConverterData.myConverterArray[cs], tempByteBuf, false);
mySourceChar = tmpSourceChar;
} else {
toUBytesArray[0] = (byte)mySourceChar;
toULength = 1;
// goto endloop;
// goto endloop
return err;
}
} /* end of inner switch */
@ -1056,8 +1113,9 @@ class CharsetISO2022 extends CharsetICU {
gotoEscape = true;
} else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) {
/* continue with a partial double-byte character */
mySourceChar = toUBytesArray[0];
mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK);
toULength = 0;
targetUniChar = UConverterConstants.missingCharMarker;
// goto getTrailByte
gotoGetTrailByte = true;
}
@ -1139,36 +1197,58 @@ class CharsetISO2022 extends CharsetICU {
UConverterSharedData cnv;
byte tempState;
int tempBufLen;
byte trailByte;
boolean leadIsOk, trailIsOk;
short trailByte;
// getTrailByte: label
gotoGetTrailByte = false; // reset gotoGetTrailByte
trailByte = source.get();
tempState = myConverterData.toU2022State.cs[myConverterData.toU2022State.g];
if (tempState > CNS_11643_0) {
cnv = myConverterData.myConverterArray[CNS_11643];
tempBuf[0] = (byte)(0x80 + (tempState - CNS_11643_0));
tempBuf[1] = (byte)(mySourceChar);
tempBuf[2] = trailByte;
tempBufLen = 3;
} else {
cnv = myConverterData.myConverterArray[tempState];
tempBuf[0] = (byte)(mySourceChar);
tempBuf[1] = trailByte;
tempBufLen = 2;
trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK);
/*
* Ticket 5691: consistent illegal sequences:
* - We include at least the first byte in the illegal sequence.
* - If any of the non-initial bytes could be the start of a character,
* we stop the illegal sequence before the first one of those.
*
* In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
* an ESC/SO/SI, we report only the first byte as the illegal sequence.
* Otherwise we convert or report the pair of bytes.
*/
leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21);
if (leadIsOk && trailIsOk) {
source.get();
tempState = myConverterData.toU2022State.cs[myConverterData.toU2022State.g];
if (tempState > CNS_11643_0) {
cnv = myConverterData.myConverterArray[CNS_11643];
tempBuf[0] = (byte)(0x80 + (tempState - CNS_11643_0));
tempBuf[1] = (byte)mySourceChar;
tempBuf[2] = (byte)trailByte;
tempBufLen = 3;
} else {
cnv = myConverterData.myConverterArray[tempState];
tempBuf[0] = (byte)mySourceChar;
tempBuf[1] = (byte)trailByte;
tempBufLen = 2;
}
ByteBuffer tempBuffer = ByteBuffer.wrap(tempBuf);
tempBuffer.limit(tempBufLen);
targetUniChar = MBCSSimpleGetNextUChar(cnv, tempBuffer, false);
mySourceChar = (mySourceChar << 8) | trailByte;
} else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
/* report a pair of illegal bytes if the second byte is not a DBCS starter */
source.get();
/* add another bit so that the code below writes 2 bytes in case of error */
mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
}
mySourceChar = (mySourceChar << 8) | (UConverterConstants.UNSIGNED_BYTE_MASK & trailByte);
if (myConverterData.toU2022State.g >= 2) {
/* return from a single-shift state to the previous one */
myConverterData.toU2022State.g = myConverterData.toU2022State.prevG;
}
ByteBuffer tempBuffer = ByteBuffer.wrap(tempBuf);
tempBuffer.limit(tempBufLen);
tempBuffer.position(0);
targetUniChar = MBCSSimpleGetNextUChar(cnv, tempBuffer, false);
} else {
toUBytesArray[0] = (byte)mySourceChar;
toULength = 1;
// goto endloop;
return err;
}
} else {
@ -1228,7 +1308,7 @@ class CharsetISO2022 extends CharsetICU {
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
CoderResult err = CoderResult.UNDERFLOW;
char mySourceChar = 0x0000;
int mySourceChar = 0x0000;
int targetUniChar = 0x0000;
byte[] tempBuf = new byte[2];
boolean usingFallback;
@ -1247,7 +1327,7 @@ class CharsetISO2022 extends CharsetICU {
gotoEscape = true;
} else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) {
/* continue with a partial double-byte character */
mySourceChar = (char)toUBytesArray[0];
mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK);
toULength = 0;
gotoGetTrailByte = true;
}
@ -1255,7 +1335,7 @@ class CharsetISO2022 extends CharsetICU {
while (source.hasRemaining() || gotoGetTrailByte || gotoEscape) {
if (target.hasRemaining() || gotoGetTrailByte || gotoEscape) {
if (!gotoGetTrailByte && !gotoEscape) {
mySourceChar = (char)(source.get()&UConverterConstants.UNSIGNED_BYTE_MASK);
mySourceChar = (char)(source.get() & UConverterConstants.UNSIGNED_BYTE_MASK);
}
if (!gotoGetTrailByte && !gotoEscape && mySourceChar == UConverterConstants.SI) {
@ -1290,31 +1370,52 @@ class CharsetISO2022 extends CharsetICU {
myConverterData.isEmptySegment = false; /* Any invalid char errors will be detected separately, so just reset this */
if (myConverterData.toU2022State.g == 1 || gotoGetTrailByte) {
if (source.hasRemaining() || gotoGetTrailByte) {
boolean leadIsOk, trailIsOk;
short trailByte;
// getTrailByte label
gotoGetTrailByte = false; // reset gotoGetTrailByte flag
byte trailByte;
trailByte = source.get();
tempBuf[0] = (byte)(mySourceChar + 0x80);
tempBuf[1] = (byte)(trailByte + 0x80);
mySourceChar = (char)((mySourceChar << 8) | (short)(trailByte&UConverterConstants.UNSIGNED_BYTE_MASK));
if ((mySourceChar & 0x8080) == 0) {
trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK);
targetUniChar = UConverterConstants.missingCharMarker;
/*
* Ticket 5691: consistent illegal sequences:
* - We include at least the first byte in the illegal sequence.
* - If any of the non-initial bytes could be the start of a character,
* we stop the illegal sequence before the first one of those.
*
* In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
* an ESC/SO/SI, we report only the first byte as the illegal sequence.
* Otherwise we convert or report the pair of bytes.
*/
leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21);
if (leadIsOk && trailIsOk) {
source.get();
tempBuf[0] = (byte)(mySourceChar + 0x80);
tempBuf[1] = (byte)(trailByte + 0x80);
targetUniChar = MBCSSimpleGetNextUChar(myConverterData.currentConverter.sharedData, ByteBuffer.wrap(tempBuf), usingFallback);
} else {
/* illegal bytes > 0x7f */
targetUniChar = UConverterConstants.missingCharMarker;
mySourceChar = (char)((mySourceChar << 8) | trailByte);
} else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
/* report a pair of illegal bytes if the second byte is not a DBCS starter */
source.get();
/* add another bit so that the code below writes 2 bytes in case of error */
mySourceChar = (char)(0x10000 | (mySourceChar << 8) | trailByte);
}
} else {
toUBytesArray[0] = (byte)mySourceChar;
toULength = 1;
break;
}
} else {
int oldSourceLimit = source.limit();
} else if (mySourceChar <= 0x7f) {
int savedSourceLimit = source.limit();
int savedSourcePosition = source.position();
source.limit(source.position());
source.position(source.position()-1);
targetUniChar = MBCSSimpleGetNextUChar(myConverterData.currentConverter.sharedData, source, usingFallback);
source.limit(oldSourceLimit);
source.limit(savedSourceLimit);
source.position(savedSourcePosition);
} else {
targetUniChar = 0xffff;
}
if (targetUniChar < 0xfffe) {
target.put((char)targetUniChar);
@ -1412,7 +1513,7 @@ class CharsetISO2022 extends CharsetICU {
}
}
if (err.isError() || (source.position() == source.limit())) {
if (err.isError() || err.isOverflow() || (source.position() == source.limit())) {
return err;
}
}
@ -2580,7 +2681,11 @@ class CharsetISO2022 extends CharsetICU {
}
/* only DBCS or SBCS characters are expected */
/* DB characters with high bit set to 1 are expected */
if (length > 2 || length == 0 || (((targetByteUnit[0] & 0x8080) != 0x8080) && length == 2)) {
if (length > 2 || length == 0 ||
(length == 1 && targetByteUnit[0] > 0x7f) ||
(length ==2 &&
((char)(targetByteUnit[0] - 0xa1a1) > (0xfefe - 0xa1a1) ||
((targetByteUnit[0] - 0xa1) & UConverterConstants.UNSIGNED_BYTE_MASK) > (0xfe - 0xa1)))) {
targetByteUnit[0] = UConverterConstants.missingCharMarker;
}
}

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 2006-2008, International Business Machines Corporation and *
* Copyright (C) 2006-2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
@ -1573,330 +1573,8 @@ class CharsetMBCS extends CharsetICU {
}
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
CoderResult[] cr = { CoderResult.UNDERFLOW };
int sourceArrayIndex;
int stateTable[][/* 256 */];
char[] unicodeCodeUnits;
int offset;
byte state;
int byteIndex;
byte[] bytes;
int sourceIndex, nextSourceIndex;
int entry = 0;
char c;
byte action;
if (preToULength > 0) {
/*
* pass sourceIndex=-1 because we continue from an earlier buffer in the future, this may change with
* continuous offsets
*/
cr[0] = continueMatchToU(source, target, offsets, -1, flush);
if (cr[0].isError() || preToULength < 0) {
return cr[0];
}
}
if (sharedData.mbcs.countStates == 1) {
if ((sharedData.mbcs.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
cr[0] = cnvMBCSSingleToBMPWithOffsets(source, target, offsets, flush);
} else {
cr[0] = cnvMBCSSingleToUnicodeWithOffsets(source, target, offsets, flush);
}
return cr[0];
}
/* set up the local pointers */
sourceArrayIndex = source.position();
if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
stateTable = sharedData.mbcs.swapLFNLStateTable;
} else {
stateTable = sharedData.mbcs.stateTable;
}
unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits;
/* get the converter state from UConverter */
offset = (int) toUnicodeStatus;
byteIndex = toULength;
bytes = toUBytesArray;
/*
* if we are in the SBCS state for a DBCS-only converter, then load the DBCS state from the MBCS data
* (dbcsOnlyState==0 if it is not a DBCS-only converter)
*/
state = (byte)mode;
if (state == 0) {
state = sharedData.mbcs.dbcsOnlyState;
}
/* sourceIndex=-1 if the current character began in the previous buffer */
sourceIndex = byteIndex == 0 ? 0 : -1;
nextSourceIndex = 0;
/* conversion loop */
while (sourceArrayIndex < source.limit()) {
/*
* This following test is to see if available input would overflow the output. It does not catch output
* of more than one code unit that overflows as a result of a surrogate pair or callback output from the
* last source byte. Therefore, those situations also test for overflows and will then break the loop,
* too.
*/
if (!target.hasRemaining()) {
/* target is full */
cr[0] = CoderResult.OVERFLOW;
break;
}
if (byteIndex == 0) {
/* optimized loop for 1/2-byte input and BMP output */
// agljport:todo see ucnvmbcs.c for deleted block
do {
entry = stateTable[state][source.get(sourceArrayIndex)&UConverterConstants.UNSIGNED_BYTE_MASK];
if (MBCS_ENTRY_IS_TRANSITION(entry)) {
state = (byte) MBCS_ENTRY_TRANSITION_STATE(entry);
offset = MBCS_ENTRY_TRANSITION_OFFSET(entry);
++sourceArrayIndex;
if (sourceArrayIndex < source.limit()
&& MBCS_ENTRY_IS_FINAL(entry = stateTable[state][source.get(sourceArrayIndex)&UConverterConstants.UNSIGNED_BYTE_MASK])
&& MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_VALID_16
&& (c = unicodeCodeUnits[offset + MBCS_ENTRY_FINAL_VALUE_16(entry)]) < 0xfffe) {
++sourceArrayIndex;
target.put(c);
if (offsets != null) {
offsets.put(sourceIndex);
sourceIndex = (nextSourceIndex += 2);
}
state = (byte) MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
offset = 0;
} else {
/* set the state and leave the optimized loop */
++nextSourceIndex;
bytes[0] = source.get(sourceArrayIndex - 1);
byteIndex = 1;
break;
}
} else {
if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
/* output BMP code point */
++sourceArrayIndex;
target.put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
if (offsets != null) {
offsets.put(sourceIndex);
sourceIndex = ++nextSourceIndex;
}
state = (byte) MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
} else {
/* leave the optimized loop */
break;
}
}
} while (sourceArrayIndex < source.limit() && target.hasRemaining());
/*
* these tests and break statements could be put inside the loop if C had "break outerLoop" like
* Java
*/
if (sourceArrayIndex >= source.limit()) {
break;
}
if (!target.hasRemaining()) {
/* target is full */
cr[0] = CoderResult.OVERFLOW;
break;
}
++nextSourceIndex;
bytes[byteIndex++] = source.get(sourceArrayIndex++);
} else /* byteIndex>0 */{
++nextSourceIndex;
entry = stateTable[state][(bytes[byteIndex++] = source.get(sourceArrayIndex++))
& UConverterConstants.UNSIGNED_BYTE_MASK];
}
if (MBCS_ENTRY_IS_TRANSITION(entry)) {
state = (byte) MBCS_ENTRY_TRANSITION_STATE(entry);
offset += MBCS_ENTRY_TRANSITION_OFFSET(entry);
continue;
}
/* save the previous state for proper extension mapping with SI/SO-stateful converters */
mode = state;
/* set the next state early so that we can reuse the entry variable */
state = (byte) MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
/*
* An if-else-if chain provides more reliable performance for the most common cases compared to a
* switch.
*/
action = (byte) (MBCS_ENTRY_FINAL_ACTION(entry));
if (action == MBCS_STATE_VALID_16) {
offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
c = unicodeCodeUnits[offset];
if (c < 0xfffe) {
/* output BMP code point */
target.put(c);
if (offsets != null) {
offsets.put(sourceIndex);
}
byteIndex = 0;
} else if (c == 0xfffe) {
if (isFallbackUsed() && (entry = (int) getFallback(sharedData.mbcs, offset)) != 0xfffe) {
/* output fallback BMP code point */
target.put((char) entry);
if (offsets != null) {
offsets.put(sourceIndex);
}
byteIndex = 0;
}
} else {
/* callback(illegal) */
cr[0] = CoderResult.malformedForLength(byteIndex);
}
} else if (action == MBCS_STATE_VALID_DIRECT_16) {
/* output BMP code point */
target.put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
if (offsets != null) {
offsets.put(sourceIndex);
}
byteIndex = 0;
} else if (action == MBCS_STATE_VALID_16_PAIR) {
offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
c = unicodeCodeUnits[offset++];
if (c < 0xd800) {
/* output BMP code point below 0xd800 */
target.put(c);
if (offsets != null) {
offsets.put(sourceIndex);
}
byteIndex = 0;
} else if (isFallbackUsed() ? c <= 0xdfff : c <= 0xdbff) {
/* output roundtrip or fallback surrogate pair */
target.put((char) (c & 0xdbff));
if (offsets != null) {
offsets.put(sourceIndex);
}
byteIndex = 0;
if (target.hasRemaining()) {
target.put(unicodeCodeUnits[offset]);
if (offsets != null) {
offsets.put(sourceIndex);
}
} else {
/* target overflow */
charErrorBufferArray[0] = unicodeCodeUnits[offset];
charErrorBufferLength = 1;
cr[0] = CoderResult.OVERFLOW;
offset = 0;
break;
}
} else if (isFallbackUsed() ? (c & 0xfffe) == 0xe000 : c == 0xe000) {
/* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
target.put(unicodeCodeUnits[offset]);
if (offsets != null) {
offsets.put(sourceIndex);
}
byteIndex = 0;
} else if (c == 0xffff) {
/* callback(illegal) */
cr[0] = CoderResult.malformedForLength(byteIndex);
}
} else if (action == MBCS_STATE_VALID_DIRECT_20
|| (action == MBCS_STATE_FALLBACK_DIRECT_20 && isFallbackUsed())) {
entry = MBCS_ENTRY_FINAL_VALUE(entry);
/* output surrogate pair */
target.put((char) (0xd800 | (char) (entry >> 10)));
if (offsets != null) {
offsets.put(sourceIndex);
}
byteIndex = 0;
c = (char) (0xdc00 | (char) (entry & 0x3ff));
if (target.hasRemaining()) {
target.put(c);
if (offsets != null) {
offsets.put(sourceIndex);
}
} else {
/* target overflow */
charErrorBufferArray[0] = c;
charErrorBufferLength = 1;
cr[0] = CoderResult.OVERFLOW;
offset = 0;
break;
}
} else if (action == MBCS_STATE_CHANGE_ONLY) {
/*
* This serves as a state change without any output. It is useful for reading simple stateful
* encodings, for example using just Shift-In/Shift-Out codes. The 21 unused bits may later be used
* for more sophisticated state transitions.
*/
if (sharedData.mbcs.dbcsOnlyState == 0) {
byteIndex = 0;
} else {
/* SI/SO are illegal for DBCS-only conversion */
state = (byte) (mode); /* restore the previous state */
/* callback(illegal) */
cr[0] = CoderResult.malformedForLength(byteIndex);
}
} else if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
if (isFallbackUsed()) {
/* output BMP code point */
target.put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
if (offsets != null) {
offsets.put(sourceIndex);
}
byteIndex = 0;
}
} else if (action == MBCS_STATE_UNASSIGNED) {
/* just fall through */
} else if (action == MBCS_STATE_ILLEGAL) {
/* callback(illegal) */
cr[0] = CoderResult.malformedForLength(byteIndex);
} else {
/* reserved, must never occur */
byteIndex = 0;
}
/* end of action codes: prepare for a new character */
offset = 0;
if (byteIndex == 0) {
sourceIndex = nextSourceIndex;
} else if (cr[0].isError()) {
/* callback(illegal) */
break;
} else /* unassigned sequences indicated with byteIndex>0 */{
/* try an extension mapping */
int sourceBeginIndex = sourceArrayIndex;
source.position(sourceArrayIndex);
byteIndex = toU(byteIndex, source, target, offsets, sourceIndex, flush, cr);
sourceArrayIndex = source.position();
sourceIndex = nextSourceIndex + (int) (sourceArrayIndex - sourceBeginIndex);
if (cr[0].isError() || cr[0].isOverflow()) {
/* not mappable or buffer overflow */
break;
}
}
}
/* set the converter state back into UConverter */
toUnicodeStatus = offset;
mode = state;
toULength = byteIndex;
/* write back the updated pointers */
source.position(sourceArrayIndex);
return cr[0];
/* Just call cnvMBCSToUnicodeWithOffsets() to remove duplicate code. */
return cnvMBCSToUnicodeWithOffsets(source, target, offsets, flush);
}
/*
@ -2253,132 +1931,134 @@ class CharsetMBCS extends CharsetICU {
CoderResult cnvMBCSToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
CoderResult[] cr = { CoderResult.UNDERFLOW };
int[][] stateTable;
int sourceArrayIndex, sourceArrayIndexStart;
int stateTable[][/* 256 */];
char[] unicodeCodeUnits;
int sourceIndex, nextSourceIndex;
int offset;
short state;
byte state;
int byteIndex;
byte[] bytes;
int entry;
int sourceIndex, nextSourceIndex;
int entry = 0;
char c;
short action;
if (this.preToULength > 0) {
byte action;
if (preToULength > 0) {
/*
* pass sourceIndex-1 because we continue from an earlier buffer
* in the future, this may change with continuous offsets
* pass sourceIndex=-1 because we continue from an earlier buffer; in the future, this may change with
* continuous offsets
*/
cr[0] = continueMatchToU(source, target, offsets, -1, flush);
if (cr[0].isError() || this.preToULength < 0) {
if (cr[0].isError() || preToULength < 0) {
return cr[0];
}
}
if (sharedData.mbcs.countStates == 1) {
if ((sharedData.mbcs.unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
if ((sharedData.mbcs.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
cr[0] = cnvMBCSSingleToBMPWithOffsets(source, target, offsets, flush);
} else {
cr[0] = cnvMBCSSingleToUnicodeWithOffsets(source, target, offsets, flush);
}
return cr[0];
}
if ((options&UConverterConstants.OPTION_SWAP_LFNL) != 0) {
/* set up the local pointers */
sourceArrayIndex = sourceArrayIndexStart = source.position();
if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
stateTable = sharedData.mbcs.swapLFNLStateTable;
} else {
stateTable = sharedData.mbcs.stateTable;
}
unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits;
/* get the converter state from UConverter */
offset = this.toUnicodeStatus;
byteIndex = this.toULength;
bytes = this.toUBytesArray;
offset = (int)toUnicodeStatus;
byteIndex = toULength;
bytes = toUBytesArray;
/*
* if we are in the SBCS state for a DBCS-only converter,
* then load the DBCS state from the MBCS data
* if we are in the SBCS state for a DBCS-only converter, then load the DBCS state from the MBCS data
* (dbcsOnlyState==0 if it is not a DBCS-only converter)
*/
state = (short)(UConverterConstants.UNSIGNED_BYTE_MASK&this.mode);
state = (byte)mode;
if (state == 0) {
state = sharedData.mbcs.dbcsOnlyState;
}
/* sourceIndex=-1 if the current character begain in the previous buffer */
/* sourceIndex=-1 if the current character began in the previous buffer */
sourceIndex = byteIndex == 0 ? 0 : -1;
nextSourceIndex = 0;
/* conversion loop */
while (source.hasRemaining()) {
while (sourceArrayIndex < source.limit()) {
/*
* This following test is to see if available input would overflow the output.
* It does not catch output of more than one code unit that
* overflows as a result of a surrogate pair or callback output
* from the last source byte.
* Therefore, those situations also test for overflows and will
* then break the loop, too.
* This following test is to see if available input would overflow the output. It does not catch output
* of more than one code unit that overflows as a result of a surrogate pair or callback output from the
* last source byte. Therefore, those situations also test for overflows and will then break the loop,
* too.
*/
if (!target.hasRemaining()) {
/* target is full */
cr[0] = CoderResult.OVERFLOW;
break;
}
if (byteIndex == 0) {
/* optimized loop for 1/2-byte input and BMP output */
// agljport:todo see ucnvmbcs.c for deleted block
do {
entry = stateTable[state][(short)source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK];
entry = stateTable[state][source.get(sourceArrayIndex)&UConverterConstants.UNSIGNED_BYTE_MASK];
if (MBCS_ENTRY_IS_TRANSITION(entry)) {
state = (short)(UConverterConstants.UNSIGNED_BYTE_MASK&MBCS_ENTRY_TRANSITION_STATE(entry));
state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry);
offset = MBCS_ENTRY_TRANSITION_OFFSET(entry);
source.get();
if (source.hasRemaining() &&
MBCS_ENTRY_IS_FINAL(entry=stateTable[state][(short)source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK]) &&
MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_VALID_16 &&
(c = unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)]) < 0xfffe) {
source.get();
++sourceArrayIndex;
if (sourceArrayIndex < source.limit()
&& MBCS_ENTRY_IS_FINAL(entry = stateTable[state][source.get(sourceArrayIndex)&UConverterConstants.UNSIGNED_BYTE_MASK])
&& MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_VALID_16
&& (c = unicodeCodeUnits[offset + MBCS_ENTRY_FINAL_VALUE_16(entry)]) < 0xfffe) {
++sourceArrayIndex;
target.put(c);
if (offsets != null) {
offsets.put(sourceIndex);
sourceIndex = (nextSourceIndex + 2);
sourceIndex = (nextSourceIndex += 2);
}
state = (short)(UConverterConstants.UNSIGNED_BYTE_MASK&MBCS_ENTRY_FINAL_STATE(entry)); /* typically 0 */
state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
offset = 0;
} else {
/* set the state and leave the optimized loop */
++nextSourceIndex;
bytes[0] = source.get(source.position()-1);
bytes[0] = source.get(sourceArrayIndex - 1);
byteIndex = 1;
break;
}
} else {
if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
/* output BMP code point */
source.get();
++sourceArrayIndex;
target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry));
if (offsets != null) {
offsets.put(sourceIndex);
sourceIndex = ++nextSourceIndex;
}
state = (short)(UConverterConstants.UNSIGNED_BYTE_MASK&MBCS_ENTRY_FINAL_STATE(entry)); /* typically 0 */
state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
} else {
/* leave the optimized loop */
break;
}
}
} while (source.hasRemaining() && target.hasRemaining());
/* these tests and break statements could be put inside the loop
* if C had "break outerLoop" like Java
} while (sourceArrayIndex < source.limit() && target.hasRemaining());
/*
* these tests and break statements could be put inside the loop if C had "break outerLoop" like
* Java
*/
if (!source.hasRemaining()) {
if (sourceArrayIndex >= source.limit()) {
break;
}
if (!target.hasRemaining()) {
@ -2386,31 +2066,32 @@ class CharsetMBCS extends CharsetICU {
cr[0] = CoderResult.OVERFLOW;
break;
}
++nextSourceIndex;
bytes[byteIndex++] = source.get();
} else { /* byteIndex>0 */
bytes[byteIndex++] = source.get(sourceArrayIndex++);
} else /* byteIndex>0 */{
++nextSourceIndex;
entry = stateTable[state][(short)(bytes[byteIndex++]=source.get()) & UConverterConstants.UNSIGNED_BYTE_MASK];
entry = stateTable[state][(bytes[byteIndex++] = source.get(sourceArrayIndex++))
& UConverterConstants.UNSIGNED_BYTE_MASK];
}
if (MBCS_ENTRY_IS_TRANSITION(entry)) {
state = (short)(UConverterConstants.UNSIGNED_BYTE_MASK&MBCS_ENTRY_TRANSITION_STATE(entry));
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry);
offset += MBCS_ENTRY_TRANSITION_OFFSET(entry);
continue;
}
/* save the previous state for proper extension mapping with SI/SO-stateful converters */
mode = state;
/* set the next state early so that we can reuse the entry variable */
state = (short)(UConverterConstants.UNSIGNED_BYTE_MASK&MBCS_ENTRY_FINAL_STATE(entry)); /* typically 0 */
state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
/*
* An if-else-if chain provides more reliable performance for
* the most common cases compared to a switch.
* An if-else-if chain provides more reliable performance for the most common cases compared to a
* switch.
*/
action = (short)(UConverterConstants.UNSIGNED_BYTE_MASK&MBCS_ENTRY_FINAL_ACTION(entry));
action = (byte)MBCS_ENTRY_FINAL_ACTION(entry);
if (action == MBCS_STATE_VALID_16) {
offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
c = unicodeCodeUnits[offset];
@ -2422,7 +2103,7 @@ class CharsetMBCS extends CharsetICU {
}
byteIndex = 0;
} else if (c == 0xfffe) {
if (CharsetDecoderICU.isToUUseFallback() && (entry = (int)getFallback(sharedData.mbcs, offset)) != 0xfffe) {
if (isFallbackUsed() && (entry = (int)getFallback(sharedData.mbcs, offset)) != 0xfffe) {
/* output fallback BMP code point */
target.put((char)entry);
if (offsets != null) {
@ -2432,7 +2113,7 @@ class CharsetMBCS extends CharsetICU {
}
} else {
/* callback(illegal) */
cr[0] = CoderResult.malformedForLength(1);
cr[0] = CoderResult.malformedForLength(byteIndex);
}
} else if (action == MBCS_STATE_VALID_DIRECT_16) {
/* output BMP code point */
@ -2451,9 +2132,9 @@ class CharsetMBCS extends CharsetICU {
offsets.put(sourceIndex);
}
byteIndex = 0;
} else if (CharsetDecoderICU.isToUUseFallback() ? c<=0xdfff : c<=0xdbff) {
} else if (isFallbackUsed() ? c <= 0xdfff : c <= 0xdbff) {
/* output roundtrip or fallback surrogate pair */
target.put((char)(c&0xdbff));
target.put((char)(c & 0xdbff));
if (offsets != null) {
offsets.put(sourceIndex);
}
@ -2468,11 +2149,11 @@ class CharsetMBCS extends CharsetICU {
charErrorBufferArray[0] = unicodeCodeUnits[offset];
charErrorBufferLength = 1;
cr[0] = CoderResult.OVERFLOW;
offset = 0;
break;
}
} else if (CharsetDecoderICU.isToUUseFallback() ? (c&0xfffe)==0xe000 : c==0xe000) {
} else if (isFallbackUsed() ? (c & 0xfffe) == 0xe000 : c == 0xe000) {
/* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
target.put(unicodeCodeUnits[offset]);
if (offsets != null) {
@ -2481,18 +2162,18 @@ class CharsetMBCS extends CharsetICU {
byteIndex = 0;
} else if (c == 0xffff) {
/* callback(illegal) */
cr[0] = CoderResult.malformedForLength(1);
cr[0] = CoderResult.malformedForLength(byteIndex);
}
} else if (action == MBCS_STATE_VALID_DIRECT_20 ||
action == MBCS_STATE_FALLBACK_DIRECT_20 && CharsetDecoderICU.isToUUseFallback()) {
} else if (action == MBCS_STATE_VALID_DIRECT_20
|| (action == MBCS_STATE_FALLBACK_DIRECT_20 && isFallbackUsed())) {
entry = MBCS_ENTRY_FINAL_VALUE(entry);
/* output surrogate pair */
target.put((char)(0xd800 | (char)(entry&0x3ff)));
target.put((char)(0xd800 | (char)(entry >> 10)));
if (offsets != null) {
offsets.put(sourceIndex);
}
byteIndex = 0;
c = (char)(0xdc00 | (char)(entry>>10));
c = (char)(0xdc00 | (char)(entry & 0x3ff));
if (target.hasRemaining()) {
target.put(c);
if (offsets != null) {
@ -2503,30 +2184,27 @@ class CharsetMBCS extends CharsetICU {
charErrorBufferArray[0] = c;
charErrorBufferLength = 1;
cr[0] = CoderResult.OVERFLOW;
offset = 0;
break;
}
} else if (action == MBCS_STATE_CHANGE_ONLY) {
/*
* This serves as a state change without any output.
* It is useful for reading simple stateful encodings,
* for example using just Shift-In/Shift-Out codes.
* The 21 unused bits may later be used for more sophisticated
* state transistions.
* This serves as a state change without any output. It is useful for reading simple stateful
* encodings, for example using just Shift-In/Shift-Out codes. The 21 unused bits may later be used
* for more sophisticated state transitions.
*/
if (sharedData.mbcs.dbcsOnlyState == 0) {
byteIndex = 0;
} else {
/* SI/SO are illegal for DBCS-only conversion */
state = (short)(UConverterConstants.UNSIGNED_BYTE_MASK&mode); /* restore the previous state */
state = (byte)(mode); /* restore the previous state */
/* callback(illegal) */
cr[0] = CoderResult.malformedForLength(1);
cr[0] = CoderResult.malformedForLength(byteIndex);
}
} else if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
if (CharsetDecoderICU.isToUUseFallback()) {
if (isFallbackUsed()) {
/* output BMP code point */
target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry));
if (offsets != null) {
@ -2538,37 +2216,70 @@ class CharsetMBCS extends CharsetICU {
/* just fall through */
} else if (action == MBCS_STATE_ILLEGAL) {
/* callback(illegal) */
cr[0] = CoderResult.malformedForLength(1);
cr[0] = CoderResult.malformedForLength(byteIndex);
} else {
/* reserved, must never occur */
byteIndex = 0;
}
/* end of action codes: prepare for new character */
/* end of action codes: prepare for a new character */
offset = 0;
if (byteIndex == 0) {
sourceIndex = nextSourceIndex;
} else if (cr[0].isError()) {
/* callback(illegal) */
if (byteIndex > 1) {
/*
* Ticket 5691: consistent illegal sequences:
* - We include at least the first byte in the illegal sequence.
* - If any of the non-initial bytes could be the start of a character,
* we stop the illegal sequence before the first one of those.
*/
boolean isDBCSOnly = (sharedData.mbcs.dbcsOnlyState != 0);
byte i;
for (i = 1; i < byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, (short)(bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK)); i++) {}
if (i < byteIndex) {
byte backOutDistance = (byte)(byteIndex - i);
int bytesFromThisBuffer = sourceArrayIndex - sourceArrayIndexStart;
byteIndex = i; /* length of reported illegal byte sequence */
if (backOutDistance <= bytesFromThisBuffer) {
sourceArrayIndex -= backOutDistance;
} else {
/* Back out bytes from the previous buffer: Need to replay them. */
this.preToULength = (byte)(bytesFromThisBuffer - backOutDistance);
/* preToULength is negative! */
for (int n = 0; n < -this.preToULength; n++) {
this.preToUArray[n] = bytes[i+n];
}
sourceArrayIndex = sourceArrayIndexStart;
}
}
}
break;
} else { /* unassigned sequences indicated with byteIndex>0 */
} else /* unassigned sequences indicated with byteIndex>0 */{
/* try an extension mapping */
int sourceBeginIndex = sourceArrayIndex;
source.position(sourceArrayIndex);
byteIndex = toU(byteIndex, source, target, offsets, sourceIndex, flush, cr);
sourceIndex = nextSourceIndex + source.position();
if (cr[0].isError()) {
sourceArrayIndex = source.position();
sourceIndex = nextSourceIndex += (int)(sourceArrayIndex - sourceBeginIndex);
if (cr[0].isError() || cr[0].isOverflow()) {
/* not mappable or buffer overflow */
break;
}
}
}
/* set the converter state back into UConverter */
toUnicodeStatus = offset;
mode = state;
toULength = byteIndex;
/* write back the updated pointers */
source.position(sourceArrayIndex);
return cr[0];
}
/*
@ -2908,8 +2619,7 @@ class CharsetMBCS extends CharsetICU {
/* conversion loop */
while (true) {
// entry=stateTable[state][(uint8_t)source[i++]];
entry = stateTable[state][source.get() & UConverterConstants.UNSIGNED_BYTE_MASK];
i = source.position();
entry = stateTable[state][source.get(i++) & UConverterConstants.UNSIGNED_BYTE_MASK];
if (MBCS_ENTRY_IS_TRANSITION(entry)) {
state = MBCS_ENTRY_TRANSITION_STATE(entry);
@ -2991,8 +2701,8 @@ class CharsetMBCS extends CharsetICU {
/* try an extension mapping */
if (sharedData.mbcs.extIndexes != null) {
/* Increase the limit for proper handling. Used in LMBCS. */
if (source.limit() >= source.position() + length) {
source.limit(source.position() + length);
if (source.limit() > i + length) {
source.limit(i + length);
}
return simpleMatchToU(source, useFallback);
}
@ -3000,6 +2710,51 @@ class CharsetMBCS extends CharsetICU {
return c;
}
/**
 * Reports whether the given state, or any state reachable from it through transition
 * entries, contains a final entry whose action is not {@code MBCS_STATE_ILLEGAL}.
 *
 * @param stateTable the state table, indexed as {@code [state][byte value]}
 * @param state      the row of {@code stateTable} to start searching from
 * @return {@code true} as soon as one such final entry is found, {@code false} otherwise
 */
private boolean hasValidTrailBytes(int[][] stateTable, short state) {
    final int[] stateRow = stateTable[state];
    int cell;

    /* Cheap probes first: two byte values that are commonly valid. */
    cell = stateRow[0xa1];
    if (!(MBCS_ENTRY_IS_TRANSITION(cell) || MBCS_ENTRY_FINAL_ACTION(cell) == MBCS_STATE_ILLEGAL)) {
        return true;
    }
    cell = stateRow[0x41];
    if (!(MBCS_ENTRY_IS_TRANSITION(cell) || MBCS_ENTRY_FINAL_ACTION(cell) == MBCS_STATE_ILLEGAL)) {
        return true;
    }

    /* Full pass over the row, looking only at final entries. */
    for (int value = 0; value < 0x100; value++) {
        cell = stateRow[value];
        if (!(MBCS_ENTRY_IS_TRANSITION(cell) || MBCS_ENTRY_FINAL_ACTION(cell) == MBCS_STATE_ILLEGAL)) {
            return true;
        }
    }

    /* No usable final entry in this row: follow every transition entry recursively. */
    for (int value = 0; value < 0x100; value++) {
        cell = stateRow[value];
        if (!MBCS_ENTRY_IS_TRANSITION(cell)) {
            continue;
        }
        short nextState = (short)(MBCS_ENTRY_TRANSITION_STATE(cell) & UConverterConstants.UNSIGNED_BYTE_MASK);
        if (hasValidTrailBytes(stateTable, nextState)) {
            return true;
        }
    }
    return false;
}
/**
 * Decides whether byte value {@code b}, looked up in {@code state}, can start a character:
 * either as a lead byte that has valid trail bytes, or as a final entry that is not illegal.
 *
 * @param stateTable the state table, indexed as {@code [state][byte value]}
 * @param state      the row of {@code stateTable} in which {@code b} is looked up
 * @param isDBCSOnly whether the conversion is DBCS-only; state-change-only entries are
 *                   then rejected
 * @param b          the byte value, used directly as the column index
 * @return {@code true} if {@code b} is accepted as a single byte or a lead byte
 */
private boolean isSingleOrLead(int[][] stateTable, int state, boolean isDBCSOnly, int b) {
    final int cell = stateTable[state][b];

    if (!MBCS_ENTRY_IS_TRANSITION(cell)) {
        /* final entry: classify by its action code */
        final short finalAction = (short)(MBCS_ENTRY_FINAL_ACTION(cell) & UConverterConstants.UNSIGNED_BYTE_MASK);
        if (isDBCSOnly && finalAction == MBCS_STATE_CHANGE_ONLY) {
            /* SI/SO are illegal for DBCS-only conversion */
            return false;
        }
        return finalAction != MBCS_STATE_ILLEGAL;
    }

    /* lead byte: only counts if some valid trail byte sequence can follow it */
    final short nextState = (short)(MBCS_ENTRY_TRANSITION_STATE(cell) & UConverterConstants.UNSIGNED_BYTE_MASK);
    return hasValidTrailBytes(stateTable, nextState);
}
}

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 2006-2008, International Business Machines Corporation and *
* Copyright (C) 2006-2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
@ -4809,21 +4809,6 @@ public class TestCharset extends TestFmwk {
if (!result.isOverflow()) {
errln("Overflow buffer while decoding ISO-2022-KR should have occurred.");
}
/* This is part of the ambiguous converter test in ICU4C and is used here to provide
* better code coverage.
*/
byte [] bytearray2 = {
0x61, 0x5b, 0x5c
};
bb = ByteBuffer.wrap(bytearray2);
cb = CharBuffer.allocate(20);
result = decoder.decode(bb, cb, true);
if (!result.isMalformed()) {
errln("Malformed error while decoding ISO-2022-KR should have occurred.");
}
}
//provide better code coverage for Charset ISO-2022-JP

View File

@ -1090,38 +1090,16 @@ public class TestConversion extends ModuleTest {
output.limit(output.position());
output.rewind();
//TODO: Fix Me! After Ticket#6583 is completed, this code should be removed.
boolean ignoreError = (0 <= cc.caseNr && cc.caseNr <= 15) || cc.caseNr == 17 || cc.caseNr == 18;
//TODO: End
// test to see if the conversion matches actual results
if (output.limit() != expected.length()) {
//TODO: Remove this
if (ignoreError) {
logln("Test failed: output length does not match expected for charset: "+cc.charset+ " [" + cc.caseNr + "]");
} else {
errln("Test failed: output length does not match expected for charset: "+cc.charset+ " [" + cc.caseNr + "]");
res = false;
}
//TODO: End
// errln("Test failed: output length does not match expected for charset: "+cc.charset+ " [" + cc.caseNr + "]");
// res = false;
errln("Test failed: output length does not match expected for charset: "+cc.charset+ " [" + cc.caseNr + "]");
res = false;
} else {
for (int i = 0; i < expected.length(); i++) {
if (output.get(i) != expected.charAt(i)) {
//TODO: Remove this
if (ignoreError) {
logln("Test failed: output does not match expected for charset: " + cc.charset
+ " [" + cc.caseNr + "]");
} else {
errln("Test failed: output does not match expected for charset: " + cc.charset
+ " [" + cc.caseNr + "]");
res = false;
}
//TODO: End
// errln("Test failed: output does not match expected for charset: " + cc.charset
// + " [" + cc.caseNr + "]");
// res = false;
errln("Test failed: output does not match expected for charset: " + cc.charset
+ " [" + cc.caseNr + "]");
res = false;
break;
}
}