ICU-6955 Remove duplicate code in CharsetMBCS; add test case for SCSU.
X-SVN-Rev: 26139
This commit is contained in:
parent
42da1f8d6b
commit
9dcb89e824
7
icu4c/source/test/testdata/conversion.txt
vendored
7
icu4c/source/test/testdata/conversion.txt
vendored
@ -864,6 +864,13 @@ conversion:table(nofallback) {
|
||||
:intvector{},
|
||||
:int{1}, :int{0}, "", ".", :bin{""}
|
||||
}
|
||||
{
|
||||
"SCSU",
|
||||
:bin{ 0f6441b413a733f2 },
|
||||
"\u6441\ub413\ua733",
|
||||
:intvector{},
|
||||
:int{1}, :int{0}, "illegal", ".", :bin{ f2 }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2772,7 +2772,6 @@ class CharsetMBCS extends CharsetICU {
|
||||
}
|
||||
|
||||
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
|
||||
|
||||
CoderResult[] cr = { CoderResult.UNDERFLOW };
|
||||
// if (!source.hasRemaining() && fromUChar32 == 0)
|
||||
// return cr[0];
|
||||
@ -3861,455 +3860,8 @@ class CharsetMBCS extends CharsetICU {
|
||||
}
|
||||
|
||||
CoderResult cnvMBCSFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
|
||||
CoderResult[] cr = { CoderResult.UNDERFLOW };
|
||||
|
||||
char[] table;
|
||||
int p;
|
||||
ByteBuffer bytes;
|
||||
short outputType;
|
||||
|
||||
SideEffects x = new SideEffects(0, 0, 0, 0, 0, 0);
|
||||
|
||||
int targetCapacity = target.limit() - target.position();
|
||||
|
||||
int stage2Entry = 0;
|
||||
//int asciiRoundtrips;
|
||||
long value;
|
||||
int length = 0;
|
||||
int uniMask;
|
||||
|
||||
boolean doLoop = true;
|
||||
boolean gotoGetTrail = false;
|
||||
|
||||
if (preFromUFirstCP >= 0) {
|
||||
/*
|
||||
* pass sourceIndex=-1 because we continue from an earlier buffer
|
||||
* in the future, this may change with continuous offsets.
|
||||
*/
|
||||
cr[0] = continueMatchFromU(source, target, offsets, flush, -1);
|
||||
if (cr[0].isError() || preFromULength < 0) {
|
||||
return cr[0];
|
||||
}
|
||||
}
|
||||
|
||||
/* use optimized function if possible */
|
||||
outputType = sharedData.mbcs.outputType;
|
||||
uniMask = sharedData.mbcs.unicodeMask;
|
||||
if (outputType == MBCS_OUTPUT_1 && ((uniMask&UConverterConstants.HAS_SURROGATES) == 0)) {
|
||||
if ((uniMask&UConverterConstants.HAS_SURROGATES) == 0) {
|
||||
cr[0] = cnvMBCSSingleFromBMPWithOffsets(source, target, offsets, flush);
|
||||
} else {
|
||||
cr[0] = cnvMBCSSingleFromUnicodeWithOffsets(source, target, offsets, flush);
|
||||
}
|
||||
return cr[0];
|
||||
}/* else if (outputType == MBCS_OUTPUT_2 && mbcs.sharedData.mbcs.utf8Friendly) {
|
||||
cr[0] = cnvMBCSDoubleFromUnicodeWithOffsets(source, target, offsets, flush);
|
||||
return cr[0];
|
||||
}*/
|
||||
|
||||
table = sharedData.mbcs.fromUnicodeTable;
|
||||
/* if (mbcs.sharedData.mbcs.utf8Friendly) {
|
||||
mbcsIndex = mbcs.sharedData.mbcs.mbcsIndex;
|
||||
} else {
|
||||
mbcsIndex = null;
|
||||
} */
|
||||
|
||||
if ((options&UConverterConstants.OPTION_SWAP_LFNL) != 0) {
|
||||
bytes = ByteBuffer.wrap(sharedData.mbcs.swapLFNLFromUnicodeBytes);
|
||||
} else {
|
||||
bytes = ByteBuffer.wrap(sharedData.mbcs.fromUnicodeBytes);
|
||||
}
|
||||
//asciiRoundtrips = mbcs.sharedData.mbcs.asciiRoundtrips;
|
||||
|
||||
/* get the converter state from UConverter */
|
||||
x.c = fromUChar32;
|
||||
if (outputType == MBCS_OUTPUT_2_SISO) {
|
||||
x.prevLength = fromUnicodeStatus;
|
||||
if (x.prevLength == 0) {
|
||||
/* set the real value */
|
||||
x.prevLength = 1;
|
||||
}
|
||||
} else {
|
||||
/* prevent fromUnicodeStatus from being set to something non-0 */
|
||||
x.prevLength = 0;
|
||||
}
|
||||
|
||||
/* sourceIndex = -1 if the current character began in the previous buffer */
|
||||
x.prevSourceIndex = -1;
|
||||
x.sourceIndex = x.c==0 ? 0 : -1;
|
||||
x.nextSourceIndex = 0;
|
||||
|
||||
/* conversion loop */
|
||||
if (x.c != 0 && targetCapacity > 0) {
|
||||
gotoGetTrail = true; // set gotoGetTrail flag and go to gotoGetTrail label
|
||||
}
|
||||
|
||||
while (gotoGetTrail || source.hasRemaining()) {
|
||||
/*
|
||||
* The following test is to see if available input would overflow the output.
|
||||
* It does not catch output of more than one byte that
|
||||
* overflows as a result of a multi-byte character or callback output
|
||||
* from the last source character.
|
||||
* Therefore, those situations also test for overflows and will
|
||||
* then break the loop, too.
|
||||
*/
|
||||
if (gotoGetTrail || targetCapacity > 0) {
|
||||
/*
|
||||
* Get a correct Unicode code point:
|
||||
* a single UChar for a BMP code point or
|
||||
* a matched surrogate pair for a "supplementary code point."
|
||||
*/
|
||||
if (!gotoGetTrail) {
|
||||
x.c = source.get();
|
||||
++x.nextSourceIndex;
|
||||
/* This is commented out because of the fact that IS_ASCII_ROUNDTRIP is not
|
||||
* being used in ICU4J.
|
||||
*/
|
||||
/*if (x.c <= 0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
|
||||
target.put((byte)x.c);
|
||||
if (offsets != null) {
|
||||
offsets.put(x.sourceIndex);
|
||||
x.prevSourceIndex = x.sourceIndex;
|
||||
x.sourceIndex = x.nextSourceIndex;
|
||||
}
|
||||
targetCapacity--;
|
||||
x.c = 0;
|
||||
continue;
|
||||
}*/
|
||||
}
|
||||
/* Code to use utf8friendly code was removed since it is not needed in Java. */
|
||||
/* This also tests if the codepage maps single surrogates.
|
||||
* If it does, then surrogates are not paired but mapped separately.
|
||||
* Note that in this case unmatched surrogates are not detected.
|
||||
*/
|
||||
if (gotoGetTrail || (UTF16.isSurrogate((char)x.c) && (uniMask&UConverterConstants.HAS_SURROGATES) == 0)) {
|
||||
if (gotoGetTrail || (UTF16.isLeadSurrogate((char)x.c))) {
|
||||
// getTrail label
|
||||
gotoGetTrail = false; // reset gotoGetTrail flag
|
||||
|
||||
x.sourceArrayIndex = source.position();
|
||||
|
||||
doLoop = getTrail(source, target, uniMask, x, flush, cr);
|
||||
if (x.doread && doLoop) {
|
||||
continue;
|
||||
} else if (!x.doread && !doLoop) {
|
||||
break;
|
||||
} else if (!doLoop) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/* this is an unmatched trail code unit (2nd surrogate) */
|
||||
/* callback(illegal) */
|
||||
cr[0] = CoderResult.malformedForLength(1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* convert the Unicode point in c into codepage bytes */
|
||||
/*
|
||||
* The basic lookup is a triple-stage compact array (trie) lookup.
|
||||
*
|
||||
* Single-byte codepages are handled with a different data structure
|
||||
* by _MBCSSingle... functions.
|
||||
*
|
||||
* The result consists of a 32-bit value from stage 2 and
|
||||
* a pointer to as many bytes as are stored per character.
|
||||
* The pointer points to the character's bytes in stage 3.
|
||||
* Bits 15..0 of the stage 2 entry contain the stage 3 index
|
||||
* for that pointer, while bits 31..16 are flags for which of
|
||||
* the 16 characters in the block are roundtrip-assigned.
|
||||
*
|
||||
* For 2-byte and 4 byte codepages, the bytes are stored as uint16_t
|
||||
* respectively as uint32_t, in the platform encoding.
|
||||
* For 3-byte codepages, the bytes are always stored in big-endian order.
|
||||
*
|
||||
* For EUC encodings that use only either 0x8e or 0x8f as the first
|
||||
* byte of their longest byte sequences, the first two bytes in
|
||||
* this third stage indicate with their 7th bits whether these bytes
|
||||
* are to be written directly or actually need to be preceded by
|
||||
* one of the two Single-Shift codes. With this, the third stage
|
||||
* stores one byte fewer per character than the actual maximum length of
|
||||
* EUC byte sequences.
|
||||
*
|
||||
* Other than that, leading zero bytes are removed and the other
|
||||
* bytes output. A single zero byte may be output if the "assigned"
|
||||
* bit in stage 2 was on.
|
||||
* The data structure does not support zero byte output as a fallback,
|
||||
* and also does not allow output of leading zeros.
|
||||
*/
|
||||
stage2Entry = MBCS_STAGE_2_FROM_U(table, x.c);
|
||||
|
||||
/* get the bytes and the length for the output */
|
||||
switch (outputType) {
|
||||
case MBCS_OUTPUT_2:
|
||||
value = MBCS_VALUE_2_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
|
||||
if (value <= 0xff) {
|
||||
length = 1;
|
||||
} else {
|
||||
length = 2;
|
||||
}
|
||||
break;
|
||||
case MBCS_OUTPUT_2_SISO:
|
||||
/* 1/2-byte stateful with Shift-In/Shift-Out */
|
||||
/*
|
||||
* Save the old state in the converter object
|
||||
* right here, then change the local prevLength state variable if necessary.
|
||||
* Then, if this character turns out to be unassigned or a fallback that
|
||||
* is not taken, the callback code must not save the new state in the converter
|
||||
* because the new state is for a character that is not output.
|
||||
* However, the callback must still restore the state from the converter
|
||||
* in case the callback function changed it for its output.
|
||||
*/
|
||||
fromUnicodeStatus = x.prevLength; /* save the old state */
|
||||
value = MBCS_VALUE_2_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
|
||||
if (value <= 0xff) {
|
||||
if (value == 0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, x.c)) {
|
||||
/* no mapping, leave value == 0 */
|
||||
length = 0;
|
||||
} else if (x.prevLength <= 1) {
|
||||
length = 1;
|
||||
} else {
|
||||
/* change from double-byte mode to single-byte */
|
||||
value |= UConverterConstants.UNSIGNED_INT_MASK & (UConverterConstants.SI<<8);
|
||||
length = 2;
|
||||
x.prevLength = 1;
|
||||
}
|
||||
} else {
|
||||
if (x.prevLength == 2) {
|
||||
length = 2;
|
||||
} else {
|
||||
/* change from single-byte mode to double-byte */
|
||||
value |= UConverterConstants.UNSIGNED_INT_MASK & (UConverterConstants.SO<<16);
|
||||
length = 3;
|
||||
x.prevLength = 2;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case MBCS_OUTPUT_DBCS_ONLY:
|
||||
/* table with single-byte results, but only DBCS mappings used */
|
||||
value = MBCS_VALUE_2_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
|
||||
if (value <= 0xff) {
|
||||
/* no mapping or SBCS result, not taken for DBCS-only */
|
||||
value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */
|
||||
length = 0;
|
||||
} else {
|
||||
length = 2;
|
||||
}
|
||||
break;
|
||||
case MBCS_OUTPUT_3:
|
||||
p = MBCS_POINTER_3_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
|
||||
value = UConverterConstants.UNSIGNED_INT_MASK&((int)bytes.get(p)<<16 | (int)bytes.get(p+1)<<8 | bytes.get(p+2));
|
||||
if (value <= 0xff) {
|
||||
length = 1;
|
||||
} else if (value <= 0xffff) {
|
||||
length = 2;
|
||||
} else {
|
||||
length = 3;
|
||||
}
|
||||
break;
|
||||
case MBCS_OUTPUT_4:
|
||||
value = MBCS_VALUE_4_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
|
||||
if (value <= 0xff) {
|
||||
length = 1;
|
||||
} else if (value <= 0xffff) {
|
||||
length = 2;
|
||||
} else if (value <= 0xffffff) {
|
||||
length = 3;
|
||||
} else {
|
||||
length = 4;
|
||||
}
|
||||
break;
|
||||
case MBCS_OUTPUT_3_EUC:
|
||||
value = MBCS_VALUE_2_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
|
||||
/* EUC 16-bit fixed-length representation */
|
||||
if (value <= 0xff) {
|
||||
length = 1;
|
||||
} else if ((value&0x8000) == 0) {
|
||||
value |= 0x8e8000;
|
||||
length = 3;
|
||||
} else if ((value&0x80) == 0) {
|
||||
value |= 0x8f0080;
|
||||
length = 3;
|
||||
} else {
|
||||
length = 2;
|
||||
}
|
||||
break;
|
||||
case MBCS_OUTPUT_4_EUC:
|
||||
p = MBCS_POINTER_3_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
|
||||
value = UConverterConstants.UNSIGNED_INT_MASK&((int)bytes.get(p)<<16 | (int)bytes.get(p+1)<<8 | bytes.get(p+2));
|
||||
/* EUC 16-bit fixed-length representation applied to the first two bytes */
|
||||
if (value <= 0xff) {
|
||||
length = 1;
|
||||
} else if (value <= 0xffff) {
|
||||
length = 2;
|
||||
} else if ((value&0x800000) == 0) {
|
||||
value |= 0x08e800000;
|
||||
length = 4;
|
||||
} else if ((value&0x8000) == 0) {
|
||||
value |= 0x08f008000;
|
||||
length = 4;
|
||||
} else {
|
||||
length = 3;
|
||||
}
|
||||
break;
|
||||
default :
|
||||
/* must not occur */
|
||||
value = stage2Entry = 0;
|
||||
length = 0;
|
||||
break;
|
||||
}
|
||||
/* is this code point assigned, or do we use fallbacks? */
|
||||
if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, x.c)) ||
|
||||
(CharsetEncoderICU.isFromUUseFallback(useFallback, x.c) && value != 0)) {
|
||||
/*
|
||||
* We allow a 0 byte output if the "assigned" bit is set for this entry.
|
||||
* There is no way with this data structure for fallback output
|
||||
* to be a zero byte.
|
||||
*/
|
||||
// unassigned label
|
||||
int currentSourcePos = source.position();
|
||||
doLoop = unassigned(source, target, offsets, x, flush, cr);
|
||||
if (doLoop) {
|
||||
continue;
|
||||
} else {
|
||||
if (source.position() < currentSourcePos) {
|
||||
source.position(currentSourcePos);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* write the output character bytes from value and length */
|
||||
/* from the first if in the loop we know that targetCapacity>0 */
|
||||
if (length <= targetCapacity) {
|
||||
switch (length) {
|
||||
/* each branch falls through to the next one */
|
||||
case 4:
|
||||
target.put((byte)(value>>24));
|
||||
if (offsets != null) {
|
||||
offsets.put(x.sourceIndex);
|
||||
}
|
||||
case 3:
|
||||
target.put((byte)(value>>16));
|
||||
if (offsets != null) {
|
||||
offsets.put(x.sourceIndex);
|
||||
}
|
||||
case 2:
|
||||
target.put((byte)(value>>8));
|
||||
if (offsets != null) {
|
||||
offsets.put(x.sourceIndex);
|
||||
}
|
||||
case 1:
|
||||
target.put((byte)value);
|
||||
if (offsets != null) {
|
||||
offsets.put(x.sourceIndex);
|
||||
}
|
||||
default :
|
||||
/* will never occur */
|
||||
break;
|
||||
}
|
||||
|
||||
targetCapacity -= length;
|
||||
} else {
|
||||
/*
|
||||
* We actually do this backwards here:
|
||||
* In order to save an intermediate variable, we output
|
||||
* first to the overflow buffer what does not fit into the
|
||||
* regular target.
|
||||
*/
|
||||
/* we know that 1<=targetCapacity<length<=4 */
|
||||
length -= targetCapacity;
|
||||
int i = 0; // index for errorBuffer
|
||||
switch (length) {
|
||||
/* each branch falls through to the next one */
|
||||
case 3:
|
||||
errorBuffer[i++] = (byte)(value>>16);
|
||||
case 2:
|
||||
errorBuffer[i++] = (byte)(value>>8);
|
||||
case 1:
|
||||
errorBuffer[i++] = (byte)value;
|
||||
default :
|
||||
/* will never occur */
|
||||
break;
|
||||
}
|
||||
errorBufferLength = length;
|
||||
|
||||
/* now output what fits into the regular target */
|
||||
value>>=8*length; /* length was reduced by targetCapacity */
|
||||
switch (targetCapacity) {
|
||||
/* each branch falls through to the next one */
|
||||
case 3:
|
||||
target.put((byte)(value>>16));
|
||||
if (offsets != null) {
|
||||
offsets.put(x.sourceIndex);
|
||||
}
|
||||
case 2:
|
||||
target.put((byte)(value>>8));
|
||||
if (offsets != null) {
|
||||
offsets.put(x.sourceIndex);
|
||||
}
|
||||
case 1:
|
||||
target.put((byte)value);
|
||||
if (offsets != null) {
|
||||
offsets.put(x.sourceIndex);
|
||||
}
|
||||
default :
|
||||
/* will never occur */
|
||||
break;
|
||||
}
|
||||
|
||||
/* target overflow */
|
||||
targetCapacity = 0;
|
||||
cr[0] = CoderResult.OVERFLOW;
|
||||
x.c = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
/* normal end of conversion: prepare for a new character */
|
||||
x.c = 0;
|
||||
if (offsets != null) {
|
||||
x.prevSourceIndex = x.sourceIndex;
|
||||
x.sourceIndex = x.nextSourceIndex;
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
/* target is full */
|
||||
cr[0] = CoderResult.OVERFLOW;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* the end of the input stream and detection of truncated input
|
||||
* are handled by the framework, but for EBCDIC_STATEFUL conversion
|
||||
* we need to emit an SI at the very end
|
||||
*
|
||||
* conditions:
|
||||
* successful
|
||||
* EBCDIC_STATEFUL in DBCS mode
|
||||
* end of input and no truncated input
|
||||
*/
|
||||
if (!cr[0].isError() && outputType == MBCS_OUTPUT_2_SISO && x.prevLength == 2 && flush && !source.hasRemaining() && x.c == 0) {
|
||||
/* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
|
||||
if (targetCapacity > 0) {
|
||||
target.put((byte)UConverterConstants.SI);
|
||||
if (offsets != null) {
|
||||
/* set the last source character's index (sourceIndex points at sourceLimit now) */
|
||||
offsets.put(x.prevSourceIndex);
|
||||
}
|
||||
} else {
|
||||
/* target is full */
|
||||
errorBuffer[0] = UConverterConstants.SI;
|
||||
errorBufferLength = 1;
|
||||
cr[0] = CoderResult.OVERFLOW;
|
||||
}
|
||||
x.prevLength = 1; /* we switched into SBCS */
|
||||
}
|
||||
/* set the converter state back into UConverter */
|
||||
fromUChar32 = x.c;
|
||||
fromUnicodeStatus = x.prevLength;
|
||||
|
||||
return cr[0];
|
||||
// Just call encodeLoop to remove duplicate code.
|
||||
return encodeLoop(source, target, offsets, flush);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:d8e824c842e59c326c65665a7e9f26ec7c7c4c8aa1cec2bde603fb8f27371184
|
||||
size 772451
|
||||
oid sha256:5e4ffe9070b3d419a5df23d222bb6dcd68790c8044172727c985f5fd8adbe555
|
||||
size 772538
|
||||
|
@ -5157,6 +5157,17 @@ public class TestCharset extends TestFmwk {
|
||||
if(!roundTripResult.equals(encoderBuffer)){
|
||||
errln("Error occured while encoding "+ charset.name());
|
||||
}
|
||||
// Test overflow for code coverage reasons
|
||||
if (i == 0) {
|
||||
ByteBuffer test = encoderResult;
|
||||
test.position(0);
|
||||
CharBuffer smallBuffer = CharBuffer.allocate(11);
|
||||
decode.reset();
|
||||
CoderResult status = decode.decode(test, smallBuffer, true);
|
||||
if (status != CoderResult.OVERFLOW) {
|
||||
errln("Overflow buffer error should have been thrown.");
|
||||
}
|
||||
}
|
||||
}catch(Exception e){
|
||||
errln("Exception while converting SCSU thrown: " + e);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user