ICU-6955 Remove duplicate code in CharsetMBCS; add test case for SCSU.

X-SVN-Rev: 26139
This commit is contained in:
Michael Ow 2009-06-22 19:37:57 +00:00
parent 42da1f8d6b
commit 9dcb89e824
4 changed files with 22 additions and 452 deletions

View File

@ -864,6 +864,13 @@ conversion:table(nofallback) {
:intvector{},
:int{1}, :int{0}, "", ".", :bin{""}
}
{
"SCSU",
:bin{ 0f6441b413a733f2 },
"\u6441\ub413\ua733",
:intvector{},
:int{1}, :int{0}, "illegal", ".", :bin{ f2 }
}
}
}

View File

@ -2772,7 +2772,6 @@ class CharsetMBCS extends CharsetICU {
}
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
CoderResult[] cr = { CoderResult.UNDERFLOW };
// if (!source.hasRemaining() && fromUChar32 == 0)
// return cr[0];
@ -3861,455 +3860,8 @@ class CharsetMBCS extends CharsetICU {
}
/**
 * Converts the UTF-16 text in {@code source} into codepage bytes written to {@code target}.
 *
 * <p>This method previously carried its own copy of the MBCS from-Unicode conversion loop.
 * That copy duplicated {@code encodeLoop}, so the conversion is now delegated to
 * {@code encodeLoop} and all four arguments are passed through unchanged. Keeping a single
 * implementation also removes the statement that was unreachable after the old loop's
 * final {@code return}.
 *
 * @param source  the UTF-16 input to convert
 * @param target  the buffer receiving the converted bytes
 * @param offsets receives source offsets for the output bytes; presumably may be
 *                {@code null} when offsets are not wanted (verify against encodeLoop)
 * @param flush   forwarded to {@code encodeLoop}; presumably signals that no further
 *                input follows (verify against encodeLoop)
 * @return the {@link CoderResult} produced by {@code encodeLoop}
 */
CoderResult cnvMBCSFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
    // Just call encodeLoop to remove duplicate code.
    return encodeLoop(source, target, offsets, flush);
}
/*

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d8e824c842e59c326c65665a7e9f26ec7c7c4c8aa1cec2bde603fb8f27371184
size 772451
oid sha256:5e4ffe9070b3d419a5df23d222bb6dcd68790c8044172727c985f5fd8adbe555
size 772538

View File

@ -5157,6 +5157,17 @@ public class TestCharset extends TestFmwk {
if(!roundTripResult.equals(encoderBuffer)){
errln("Error occured while encoding "+ charset.name());
}
// Test overflow for code coverage reasons
if (i == 0) {
ByteBuffer test = encoderResult;
test.position(0);
CharBuffer smallBuffer = CharBuffer.allocate(11);
decode.reset();
CoderResult status = decode.decode(test, smallBuffer, true);
if (status != CoderResult.OVERFLOW) {
errln("Overflow buffer error should have been thrown.");
}
}
}catch(Exception e){
errln("Exception while converting SCSU thrown: " + e);
}