ICU-5564 conform to java's spec for UTF-16 converter

X-SVN-Rev: 20917
This commit is contained in:
Ram Viswanadha 2007-01-24 21:54:59 +00:00
parent 98cf7d46ae
commit 31a9f8c37b
12 changed files with 857 additions and 369 deletions

2
.gitattributes vendored
View File

@ -55,6 +55,8 @@ icu4c/source/test/testdata/importtest.bin -text
icu4c/source/test/testdata/uni-text.bin -text
icu4j/ee.foundation.jar -text
icu4j/license.html -text
icu4j/src/com/ibm/icu/charset/CharsetUTF16BE.java -text
icu4j/src/com/ibm/icu/charset/CharsetUTF32BE.java -text
icu4j/src/com/ibm/icu/dev/data/rbbi/english.dict -text
icu4j/src/com/ibm/icu/dev/data/testdata.jar -text
icu4j/src/com/ibm/icu/dev/data/thai6.ucs -text

View File

@ -149,6 +149,10 @@
<srcfiles dir="${build.dir}" includes="${icu4j.data.path}/*.icu"/>
</uptodate>
<!-- <echo message="icu4j.module.resources result: ${icu4j.module.resources}" /> -->
<!-- Record the build start time in ${date.time}. HH is the 24-hour clock, so the
     stamp stays unambiguous without an AM/PM marker (hh would repeat every 12 hours). -->
<tstamp>
<format property="date.time" pattern="yyyy-MM-dd 'at' HH:mm:ss z" locale="en,US"/>
</tstamp>
<echo message="Initialized at ${date.time}"/>
</target>
<!-- build everything but dist-related stuff -->

View File

@ -169,7 +169,7 @@ public abstract class CharsetDecoderICU extends CharsetDecoder{
setSourcePosition(in);
return ret;
}
/**
* Implements the ICU semantic for decode operation
* @param in The input byte buffer

View File

@ -29,6 +29,9 @@ import com.ibm.icu.text.UTF16;
*/
public abstract class CharsetEncoderICU extends CharsetEncoder {
static final int NEED_TO_WRITE_BOM = 1;
boolean writeBOM = false; /* only used by UTF-16, UTF-32 */
byte[] errorBuffer = new byte[30];
int errorBufferLength = 0;

View File

@ -14,6 +14,7 @@ import java.io.InputStreamReader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
@ -62,7 +63,6 @@ public abstract class CharsetICU extends Charset{
byte subChar1; /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */
byte reserved[/*19*/]; /* +81: 19 to round out the structure */
boolean writeBOM = false; /* only used by UTF-16, UTF-32 */
/**
*
@ -120,15 +120,15 @@ public abstract class CharsetICU extends Charset{
algorithmicCharsets.put("US-ASCII", "com.ibm.icu.charset.CharsetASCII" );
algorithmicCharsets.put("ISO-8859-1", "com.ibm.icu.charset.Charset88591" );
algorithmicCharsets.put("UTF-16", "com.ibm.icu.charset.CharsetUTF16" );
algorithmicCharsets.put("UTF-16BE", "com.ibm.icu.charset.CharsetUTF16" );
algorithmicCharsets.put("UTF-16BE", "com.ibm.icu.charset.CharsetUTF16BE" );
algorithmicCharsets.put("UTF-16LE", "com.ibm.icu.charset.CharsetUTF16LE" );
algorithmicCharsets.put("UTF16_OppositeEndian", "com.ibm.icu.charset.CharsetUTF16LE" );
algorithmicCharsets.put("UTF16_PlatformEndian", "com.ibm.icu.charset.CharsetUTF16" );
algorithmicCharsets.put("UTF-32", "com.ibm.icu.charset.CharsetUTF32" );
algorithmicCharsets.put("UTF-32BE", "com.ibm.icu.charset.CharsetUTF32" );
algorithmicCharsets.put("UTF-32BE", "com.ibm.icu.charset.CharsetUTF32BE" );
algorithmicCharsets.put("UTF-32LE", "com.ibm.icu.charset.CharsetUTF32LE" );
algorithmicCharsets.put("UTF32_PlatformEndian", "com.ibm.icu.charset.CharsetUTF32LE" );
algorithmicCharsets.put("UTF32_OppositeEndian", "com.ibm.icu.charset.CharsetUTF32" );
algorithmicCharsets.put("UTF32_OppositeEndian", "com.ibm.icu.charset.CharsetUTF32LE" );
algorithmicCharsets.put("UTF32_PlatformEndian", "com.ibm.icu.charset.CharsetUTF32" );
algorithmicCharsets.put("UTF-7", "com.ibm.icu.charset.CharsetUTF7" );
algorithmicCharsets.put("UTF-8", "com.ibm.icu.charset.CharsetUTF8" );
}
@ -223,11 +223,106 @@ public abstract class CharsetICU extends Charset{
CharsetProviderICU icuProvider = new CharsetProviderICU();
CharsetICU cs = (CharsetICU) icuProvider.charsetForName(charsetName);
if (cs != null) {
cs.writeBOM = true;
return cs;
}
return Charset.forName(charsetName);
}
/**
 * Detects a Unicode signature (BOM) byte sequence at the current position of
 * the byte stream. This follows the ucnv.c function
 * ucnv_detectUnicodeSignature().
 *
 * At most five bytes are inspected, starting at the buffer's current
 * position. When a signature is recognized the buffer's position is advanced
 * past it, so that the signature (for example U+FEFF) is skipped by whoever
 * reads the buffer next; when nothing is recognized the buffer is left
 * untouched.
 *
 * @param source the byte stream to inspect
 * @return the name of the charset indicated by the signature ("UTF-16BE",
 *         "UTF-16LE", "UTF-32BE", "UTF-32LE", "UTF-8", "SCSU", "BOCU-1",
 *         "UTF-7" or "UTF-EBCDIC"), or null when no Unicode signature is
 *         recognized
 */
static String detectUnicodeSignature(ByteBuffer source) {
    final int SIG_MAX_LEN = 5;
    final int base = source.position();
    /*
     * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we
     * don't misdetect something
     */
    byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5,
            (byte) 0xa5 };
    int available = Math.min(source.remaining(), SIG_MAX_LEN);
    for (int i = 0; i < available; i++) {
        /* absolute read relative to the current position; does not move it */
        start[i] = source.get(base + i);
    }

    int signatureLength = 0; // number of bytes of the signature
    String sigUniCharset = null; // states what unicode charset is the BOM

    if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) {
        signatureLength = 2;
        sigUniCharset = "UTF-16BE";
    } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) {
        /* FF FE 00 00 is the UTF-32LE signature, a bare FF FE is UTF-16LE */
        if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) {
            signatureLength = 4;
            sigUniCharset = "UTF-32LE";
        } else {
            signatureLength = 2;
            sigUniCharset = "UTF-16LE";
        }
    } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB
            && start[2] == (byte) 0xBF) {
        signatureLength = 3;
        sigUniCharset = "UTF-8";
    } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00
            && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) {
        signatureLength = 4;
        sigUniCharset = "UTF-32BE";
    } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE
            && start[2] == (byte) 0xFF) {
        signatureLength = 3;
        sigUniCharset = "SCSU";
    } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE
            && start[2] == (byte) 0x28) {
        signatureLength = 3;
        sigUniCharset = "BOCU-1";
    } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F
            && start[2] == (byte) 0x76) {
        if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) {
            signatureLength = 5;
            sigUniCharset = "UTF-7";
        } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39
                || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) {
            signatureLength = 4;
            sigUniCharset = "UTF-7";
        }
    } else if (start[0] == (byte) 0xDD && start[1] == (byte) 0x73
            && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) {
        /* the second byte is start[1]; testing start[2] twice could never match */
        signatureLength = 4;
        sigUniCharset = "UTF-EBCDIC";
    }

    if (sigUniCharset != null) {
        /* skip the signature, relative to where detection started */
        source.position(base + signatureLength);
    }
    /* null: no known Unicode signature byte sequence recognized */
    return sigUniCharset;
}
}

View File

@ -20,7 +20,7 @@ import com.ibm.icu.text.UTF16;
class CharsetUTF16 extends CharsetICU {
protected byte[] fromUSubstitution = new byte[]{(byte)0xff, (byte)0xfd};
public CharsetUTF16(String icuCanonicalName, String javaCanonicalName, String[] aliases){
super(icuCanonicalName, javaCanonicalName, aliases);
maxBytesPerChar = 4;
@ -28,17 +28,152 @@ class CharsetUTF16 extends CharsetICU {
maxCharsPerByte = 1;
}
class CharsetDecoderUTF16 extends CharsetDecoderICU{
ByteBuffer utf16BOM = ByteBuffer.wrap(new byte[]{ (byte)0xfe, (byte)0xff, 0, 0, (byte)0xff, (byte)0xfe, 0, 0 });
/**
* Creates a UTF-16 decoder for the given charset.
* @param cs the charset handed on to the CharsetDecoderICU constructor
*/
public CharsetDecoderUTF16(CharsetICU cs) {
super(cs);
}
/**
 * Decodes UTF-16 input whose byte order is selected by an optional leading
 * BOM, and hands the payload to {@link #decodeLoopImpl}.
 *
 * The detection state is kept in <code>mode</code> between calls:
 * 0 = nothing read yet, 1 = FE seen (could be FE FF), 5 = FF seen (could be
 * FF FE), 8 = UTF-16BE, 9 = UTF-16LE. Without a BOM the input is treated as
 * UTF-16BE; a first byte that looked like the start of a BOM but came from
 * an earlier buffer is replayed from <code>utf16BOM</code>.
 *
 * @param source  the input bytes; its position is advanced past consumed bytes
 * @param target  receives the decoded chars
 * @param offsets optional (may be null) buffer of source offsets; entries
 *                written during this call are shifted by the number of BOM
 *                bytes found in this buffer
 * @param flush   true when this is the last input; a half-read BOM is then
 *                passed to the UTF-16BE loop as truncated input
 * @return the result of the last conversion step
 */
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
    int state, offsetDelta;
    byte b;
    CoderResult cr = CoderResult.UNDERFLOW;
    int offsetsPos = (offsets==null)?0:offsets.position();
    /* a replay in an earlier call may have narrowed the limit */
    utf16BOM.limit(utf16BOM.capacity());
    /*
     * If we detect a BOM in this buffer, then we must add the BOM size to the
     * offsets because the actual converter function will not see and count the BOM.
     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
     */
    offsetDelta=0;
    state=mode;
    int pos = source.position();
    while(pos < source.limit()) {
        switch(state) {
        case 0:
            b=source.get(pos);
            if(b==(byte)0xfe) {
                state=1; /* could be FE FF */
            } else if(b==(byte)0xff) {
                state=5; /* could be FF FE */
            } else {
                state=8; /* default to UTF-16BE */
                continue;
            }
            pos++;
            break;
        case 1:
        case 5:
            /* utf16BOM[1]==FF completes FE FF, utf16BOM[5]==FE completes FF FE */
            if(source.get(pos)==utf16BOM.get(state)) {
                ++pos;
                if(state==1) {
                    state=8; /* detect UTF-16BE */
                    offsetDelta=pos-source.position();
                } else if(state==5) {
                    state=9; /* detect UTF-16LE */
                    offsetDelta=pos-source.position();
                }
            } else {
                /* switch to UTF-16BE and pass the previous bytes */
                if(pos!=source.position()) {
                    /* the first byte is still in this buffer: just reset the source */
                    pos=source.position();
                } else {
                    /*
                     * the first byte came from a previous buffer: replay it from
                     * utf16BOM; flush is false because this is not the real
                     * source stream limit
                     */
                    int bomIndex = state&4;
                    utf16BOM.position(bomIndex);/* select the correct BOM */
                    utf16BOM.limit(bomIndex+1);/* replay previous byte */
                    cr = decodeLoopUTF16BE(utf16BOM, target, offsets, false);
                }
                state=8;
                continue;
            }
            break;
        case 8:
        case 9:
            /* decodeLoopImpl dispatches on mode, so publish the state first */
            mode = state;
            source.position(pos);
            cr = decodeLoopImpl(source, target, offsets, flush);
            pos = source.position();
            break;
        default:
            break; /* does not occur */
        }
        if(cr.isOverflow() || cr.isError()){
            break;
        }
    }
    /* add BOM size to offsets - see comment at offsetDelta declaration */
    if(offsets!=null && offsetDelta!=0) {
        int offsetsLimit=offsets.position();
        /*
         * walk the offsets written during this call; the index must be
         * offsetsPos (not the source index pos) and must advance
         */
        while(offsetsPos<offsetsLimit) {
            offsets.put(offsetsPos, offsetDelta + offsets.get(offsetsPos));
            offsetsPos++;
        }
    }
    source.position(pos);
    if(!source.hasRemaining() && flush) {
        /* handle truncated input */
        switch(state) {
        case 0:
            break; /* no input at all, nothing to do */
        case 8:
            cr = decodeLoopUTF16BE(source, target, offsets, flush);
            break;
        case 9:
            cr = decodeLoopUTF16LE(source, target, offsets, flush);
            break;
        default:
            /* handle 0<state<8: call UTF-16BE with too-short input */
            int bomIndex = state&4;
            utf16BOM.position(bomIndex);/* select the correct BOM */
            utf16BOM.limit(bomIndex+1);/* replay previous byte */
            /* flush is false: this limit is not the real source stream limit */
            cr = decodeLoopUTF16BE(utf16BOM, target, offsets, false);
            state=8;
            break;
        }
    }
    mode=state;
    return cr;
}
/**
 * Runs the byte-order specific UTF-16 loop selected by <code>mode</code>:
 * 8 selects big-endian, 9 selects little-endian.
 *
 * @param source  the input bytes
 * @param target  receives the decoded chars
 * @param offsets optional buffer of source offsets, passed through unchanged
 * @param flush   passed through unchanged
 * @return the result of the selected loop
 * @throws InternalError when mode is neither 8 nor 9
 */
protected CoderResult decodeLoopImpl(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
    switch(mode){
    case 8:
        /* big-endian */
        return decodeLoopUTF16BE(source, target, offsets, flush);
    case 9:
        /* little-endian */
        return decodeLoopUTF16LE(source, target, offsets, flush);
    default:
        /* no byte order has been selected; should not occur */
        throw new InternalError("Unknown State in UTF-16 converter!");
    }
}
final CoderResult decodeLoopUTF16BE(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
CoderResult cr = CoderResult.UNDERFLOW;
if(!source.hasRemaining() && toUnicodeStatus==0) {
/* no input, nothing to do */
return cr;
}
}
if(!target.hasRemaining()) {
return CoderResult.OVERFLOW;
}
@ -241,7 +376,219 @@ class CharsetUTF16 extends CharsetICU {
return cr;
}
/**
* Decodes little-endian UTF-16 bytes from source into target.
*
* Bytes are read by absolute index, starting at source.position(); the
* position is only written back at the end of the method. Partial code units
* and unmatched surrogates are carried between calls in toUBytesArray /
* toULength / toUnicodeStatus, and a trail surrogate that does not fit into
* target is parked in charErrorBufferArray.
*
* @param source  the input bytes
* @param target  receives the decoded chars
* @param offsets optional (may be null); receives one source index per char
*                written, or -1 for chars completed from bytes of an earlier call
* @param flush   not read by this method
* @return UNDERFLOW, OVERFLOW, or a malformed-input result
*/
final CoderResult decodeLoopUTF16LE(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
CoderResult cr = CoderResult.UNDERFLOW;
if(!source.hasRemaining() && toUnicodeStatus==0) {
/* no input, nothing to do */
return cr;
}
if(!target.hasRemaining()) {
return CoderResult.OVERFLOW;
}
/* sourceIndex: offset reported to offsets; sourceArrayIndex: absolute read index into source */
int sourceIndex=0, count=0, length, sourceArrayIndex;
char c=0, trail;
length = source.remaining();
sourceArrayIndex = source.position();
/* complete a partial UChar or pair from the last call */
if(toUnicodeStatus!=0) {
/*
* special case: single byte from a previous buffer,
* where the byte turned out not to belong to a trail surrogate
* and the preceding, unmatched lead surrogate was put into toUBytes[]
* for error handling
*/
toUBytesArray[toUBytesBegin+0]=(byte)toUnicodeStatus;
toULength=1;
toUnicodeStatus=0;
}
if((count=toULength)!=0) {
byte[] pArray=toUBytesArray;
int pArrayIndex = toUBytesBegin;
do {
/* NOTE(review): this write indexes pArray by count alone while the reads below add pArrayIndex; the two agree only if toUBytesBegin is 0 - confirm */
pArray[count++]=source.get(sourceArrayIndex++);
++sourceIndex;
--length;
if(count==2) {
/* little-endian: low byte first */
c=(char)(((pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK));
if(!UTF16.isSurrogate(c)) {
/* output the BMP code point */
target.put(c);
if(offsets!=null) {
offsets.put(-1);
}
count=0;
c=0;
break;
} else if(UTF16.isLeadSurrogate(c)) {
/* continue collecting bytes for the trail surrogate */
c=0; /* avoid unnecessary surrogate handling below */
} else {
/* fall through to error handling for an unmatched trail surrogate */
break;
}
} else if(count==4) {
c=(char)(((pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK));
trail=(char)(((pArray[pArrayIndex+3]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+2]&UConverterConstants.UNSIGNED_BYTE_MASK));
if(UTF16.isTrailSurrogate(trail)) {
/* output the surrogate pair */
target.put(c);
if(target.remaining()>=1) {
target.put(trail);
if(offsets!=null) {
offsets.put(-1);
offsets.put(-1);
}
} else /* targetCapacity==1 */ {
charErrorBufferArray[charErrorBufferBegin+0]=trail;
charErrorBufferLength=1;
/* NOTE(review): this early return skips source.position(sourceArrayIndex) and the toULength update at the end of the loop - confirm that is intended */
return CoderResult.OVERFLOW;
}
count=0;
c=0;
break;
} else {
/* unmatched lead surrogate, handle here for consistent toUBytes[] */
/* back out reading the code unit after it */
/* NOTE(review): sourceArrayIndex starts at source.position() and only grows, and the position is not moved before this point, so this difference is never positive and the first branch looks unreachable - confirm the operand order */
if((source.position()-sourceArrayIndex)>=2) {
sourceArrayIndex-=2;
} else {
/*
* if the trail unit's first byte was in a previous buffer, then
* we need to put it into a special place because toUBytes[] will be
* used for the lead unit's bytes
*/
toUnicodeStatus=0x100|pArray[pArrayIndex+2];
--sourceArrayIndex;
}
toULength=2;
/* NOTE(review): the argument is a buffer index, not the length of the malformed sequence - confirm */
cr = CoderResult.malformedForLength(sourceArrayIndex);
break;
}
}
} while(length>0);
toULength=(byte)count;
}
/* copy an even number of bytes for complete UChars */
count=2*target.remaining();
if(count>length) {
count=length&~1;
}
if(c==0 && count>0) {
length-=count;
/* from here on count is a number of 16-bit units, not bytes */
count>>=1;
//targetCapacity-=count;
if(offsets==null) {
do {
c=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK));
sourceArrayIndex+=2;
if(!UTF16.isSurrogate(c)) {
target.put(c);
} else if(UTF16.isLeadSurrogate(c) && count>=2 &&
UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))
) {
sourceArrayIndex+=2;
--count;
target.put(c);
target.put(trail);
} else {
/* leave the loop with c holding the surrogate for the handling below */
break;
}
} while(--count>0);
} else {
/* same loop as above, additionally recording the source index of every char written */
do {
c=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK));
sourceArrayIndex+=2;
if(!UTF16.isSurrogate(c)) {
target.put(c);
offsets.put(sourceIndex);
sourceIndex+=2;
} else if(UTF16.isLeadSurrogate(c) && count>=2 &&
UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))
) {
sourceArrayIndex+=2;
--count;
target.put(c);
target.put(trail);
offsets.put(sourceIndex);
offsets.put(sourceIndex);
sourceIndex+=4;
} else {
break;
}
} while(--count>0);
}
if(count==0) {
/* done with the loop for complete UChars */
c=0;
} else {
/* keep c for surrogate handling, trail will be set there */
length+=2*(count-1); /* one more byte pair was consumed than count decremented */
}
}
if(c!=0) {
/*
* c is a surrogate, and
* - source or target too short
* - or the surrogate is unmatched
*/
toUBytesArray[toUBytesBegin+0]=(byte)c;
toUBytesArray[toUBytesBegin+1]=(byte)(c>>>8);
toULength=2;
if(UTF16.isLeadSurrogate(c)) {
if(length>=2) {
if(UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))) {
/* output the surrogate pair, will overflow (see conditions comment above) */
sourceArrayIndex+=2;
length-=2;
target.put(c);
if(offsets!=null) {
offsets.put(sourceIndex);
}
charErrorBufferArray[charErrorBufferBegin+0]=trail;
charErrorBufferLength=1;
toULength=0;
cr = CoderResult.OVERFLOW;
} else {
/* unmatched lead surrogate */
cr = CoderResult.malformedForLength(sourceArrayIndex);
}
} else {
/* see if the trail surrogate is in the next buffer */
}
} else {
/* unmatched trail surrogate */
cr = CoderResult.malformedForLength(sourceArrayIndex);
}
}
/* check for a remaining source byte */
if(!cr.isError()){
if(length>0) {
if(!target.hasRemaining()) {
cr = CoderResult.OVERFLOW;
} else {
/* it must be length==1 because otherwise the above would have copied more */
toUBytesArray[toULength++]=source.get(sourceArrayIndex++);
}
}
}
/* publish how far the input was consumed */
source.position(sourceArrayIndex);
return cr;
}
/**
* Resets this decoder; currently only delegates to the superclass reset.
*/
protected void implReset() {
super.implReset();
}
}
class CharsetEncoderUTF16 extends CharsetEncoderICU{
@ -250,13 +597,11 @@ class CharsetUTF16 extends CharsetICU {
implReset();
}
private final static int NEED_TO_WRITE_BOM = 1;
protected void implReset() {
super.implReset();
fromUnicodeStatus = NEED_TO_WRITE_BOM;
writeBOM = true;
}
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){
CoderResult cr = CoderResult.UNDERFLOW;
if(!source.hasRemaining()) {

View File

@ -0,0 +1,50 @@
/**
*******************************************************************************
* Copyright (C) 2007, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
/**
 * UTF-16BE charset: reuses the UTF-16 implementation, always decodes the
 * payload with the big-endian loop and creates encoders that do not write a
 * BOM.
 */
class CharsetUTF16BE extends CharsetUTF16 {

    /**
     * @param icuCanonicalName  passed through to CharsetUTF16
     * @param javaCanonicalName passed through to CharsetUTF16
     * @param aliases           passed through to CharsetUTF16
     */
    public CharsetUTF16BE(String icuCanonicalName, String javaCanonicalName, String[] aliases){
        super(icuCanonicalName, javaCanonicalName, aliases);
    }

    /** Creates a decoder bound to this charset. */
    public CharsetDecoder newDecoder() {
        CharsetDecoder decoder = new CharsetDecoderUTF16BE(this);
        return decoder;
    }

    /** Creates an encoder bound to this charset. */
    public CharsetEncoder newEncoder() {
        CharsetEncoder encoder = new CharsetEncoderUTF16BE(this);
        return encoder;
    }

    /** Decoder whose payload loop is always the big-endian one. */
    class CharsetDecoderUTF16BE extends CharsetDecoderUTF16{

        public CharsetDecoderUTF16BE(CharsetICU cs) {
            super(cs);
        }

        /** Ignores the detected mode and decodes big-endian unconditionally. */
        protected CoderResult decodeLoopImpl(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
            CoderResult result = decodeLoopUTF16BE(source, target, offsets, flush);
            return result;
        }
    }

    /** Encoder that clears the BOM request set up by the UTF-16 encoder. */
    class CharsetEncoderUTF16BE extends CharsetEncoderUTF16{

        public CharsetEncoderUTF16BE(CharsetICU cs) {
            super(cs);
            implReset();
        }

        /** Resets the encoder and then switches BOM writing off. */
        protected void implReset() {
            super.implReset();
            writeBOM = false;
            fromUnicodeStatus = 0;
        }
    }
}

View File

@ -20,245 +20,33 @@ import com.ibm.icu.text.UTF16;
/**
* @author Niti Hantaweepant
*/
class CharsetUTF16LE extends CharsetICU {
protected byte[] fromUSubstitution = new byte[]{(byte)0xfd, (byte)0xff};
class CharsetUTF16LE extends CharsetUTF16 {
public CharsetUTF16LE(String icuCanonicalName, String javaCanonicalName, String[] aliases){
super(icuCanonicalName, javaCanonicalName, aliases);
maxBytesPerChar = 4;
minBytesPerChar = 2;
maxCharsPerByte = 1;
fromUSubstitution = new byte[]{(byte)0xfd, (byte)0xff};
}
class CharsetDecoderUTF16LE extends CharsetDecoderICU{
class CharsetDecoderUTF16LE extends CharsetDecoderUTF16{
public CharsetDecoderUTF16LE(CharsetICU cs) {
super(cs);
}
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
CoderResult cr = CoderResult.UNDERFLOW;
if(!source.hasRemaining() && toUnicodeStatus==0) {
/* no input, nothing to do */
return cr;
}
if(!target.hasRemaining()) {
return CoderResult.OVERFLOW;
}
int sourceIndex=0, count=0, length, sourceArrayIndex;
char c=0, trail;
length = source.remaining();
sourceArrayIndex = source.position();
/* complete a partial UChar or pair from the last call */
if(toUnicodeStatus!=0) {
/*
* special case: single byte from a previous buffer,
* where the byte turned out not to belong to a trail surrogate
* and the preceding, unmatched lead surrogate was put into toUBytes[]
* for error handling
*/
toUBytesArray[toUBytesBegin+0]=(byte)toUnicodeStatus;
toULength=1;
toUnicodeStatus=0;
}
if((count=toULength)!=0) {
byte[] pArray=toUBytesArray;
int pArrayIndex = toUBytesBegin;
do {
pArray[count++]=source.get(sourceArrayIndex++);
++sourceIndex;
--length;
if(count==2) {
c=(char)(((pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK));
if(!UTF16.isSurrogate(c)) {
/* output the BMP code point */
target.put(c);
if(offsets!=null) {
offsets.put(-1);
}
count=0;
c=0;
break;
} else if(UTF16.isLeadSurrogate(c)) {
/* continue collecting bytes for the trail surrogate */
c=0; /* avoid unnecessary surrogate handling below */
} else {
/* fall through to error handling for an unmatched trail surrogate */
break;
}
} else if(count==4) {
c=(char)(((pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK));
trail=(char)(((pArray[pArrayIndex+3]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+2]&UConverterConstants.UNSIGNED_BYTE_MASK));
if(UTF16.isTrailSurrogate(trail)) {
/* output the surrogate pair */
target.put(c);
if(target.remaining()>=1) {
target.put(trail);
if(offsets!=null) {
offsets.put(-1);
offsets.put(-1);
}
} else /* targetCapacity==1 */ {
charErrorBufferArray[charErrorBufferBegin+0]=trail;
charErrorBufferLength=1;
return CoderResult.OVERFLOW;
}
count=0;
c=0;
break;
} else {
/* unmatched lead surrogate, handle here for consistent toUBytes[] */
/* back out reading the code unit after it */
if((source.position()-sourceArrayIndex)>=2) {
sourceArrayIndex-=2;
} else {
/*
* if the trail unit's first byte was in a previous buffer, then
* we need to put it into a special place because toUBytes[] will be
* used for the lead unit's bytes
*/
toUnicodeStatus=0x100|pArray[pArrayIndex+2];
--sourceArrayIndex;
}
toULength=2;
cr = CoderResult.malformedForLength(sourceArrayIndex);
break;
}
}
} while(length>0);
toULength=(byte)count;
}
/* copy an even number of bytes for complete UChars */
count=2*target.remaining();
if(count>length) {
count=length&~1;
}
if(c==0 && count>0) {
length-=count;
count>>=1;
//targetCapacity-=count;
if(offsets==null) {
do {
c=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK));
sourceArrayIndex+=2;
if(!UTF16.isSurrogate(c)) {
target.put(c);
} else if(UTF16.isLeadSurrogate(c) && count>=2 &&
UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))
) {
sourceArrayIndex+=2;
--count;
target.put(c);
target.put(trail);
} else {
break;
}
} while(--count>0);
} else {
do {
c=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK));
sourceArrayIndex+=2;
if(!UTF16.isSurrogate(c)) {
target.put(c);
offsets.put(sourceIndex);
sourceIndex+=2;
} else if(UTF16.isLeadSurrogate(c) && count>=2 &&
UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))
) {
sourceArrayIndex+=2;
--count;
target.put(c);
target.put(trail);
offsets.put(sourceIndex);
offsets.put(sourceIndex);
sourceIndex+=4;
} else {
break;
}
} while(--count>0);
}
if(count==0) {
/* done with the loop for complete UChars */
c=0;
} else {
/* keep c for surrogate handling, trail will be set there */
length+=2*(count-1); /* one more byte pair was consumed than count decremented */
}
}
if(c!=0) {
/*
* c is a surrogate, and
* - source or target too short
* - or the surrogate is unmatched
*/
toUBytesArray[toUBytesBegin+0]=(byte)c;
toUBytesArray[toUBytesBegin+1]=(byte)(c>>>8);
toULength=2;
if(UTF16.isLeadSurrogate(c)) {
if(length>=2) {
if(UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))) {
/* output the surrogate pair, will overflow (see conditions comment above) */
sourceArrayIndex+=2;
length-=2;
target.put(c);
if(offsets!=null) {
offsets.put(sourceIndex);
}
charErrorBufferArray[charErrorBufferBegin+0]=trail;
charErrorBufferLength=1;
toULength=0;
cr = CoderResult.OVERFLOW;
} else {
/* unmatched lead surrogate */
cr = CoderResult.malformedForLength(sourceArrayIndex);
}
} else {
/* see if the trail surrogate is in the next buffer */
}
} else {
/* unmatched trail surrogate */
cr = CoderResult.malformedForLength(sourceArrayIndex);
}
}
/* check for a remaining source byte */
if(!cr.isError()){
if(length>0) {
if(!target.hasRemaining()) {
cr = CoderResult.OVERFLOW;
} else {
/* it must be length==1 because otherwise the above would have copied more */
toUBytesArray[toULength++]=source.get(sourceArrayIndex++);
}
}
}
source.position(sourceArrayIndex);
return cr;
protected CoderResult decodeLoopImpl(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
return decodeLoopUTF16LE(source, target, offsets, flush);
}
}
class CharsetEncoderUTF16LE extends CharsetEncoderICU{
public CharsetEncoderUTF16LE(CharsetICU cs) {
super(cs, fromUSubstitution);
implReset();
}
private final static int NEED_TO_WRITE_BOM = 1;
protected void implReset() {
super.implReset();
fromUnicodeStatus = NEED_TO_WRITE_BOM;
fromUnicodeStatus = 0;
}
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){

View File

@ -29,13 +29,86 @@ class CharsetUTF32 extends CharsetICU {
minBytesPerChar = 4;
maxCharsPerByte = 1;
}
class CharsetDecoderUTF32 extends CharsetDecoderICU{
boolean isFirstBuffer;
final int SIGNATURE_LENGTH=4;
/**
* Creates a UTF-32 decoder for the given charset and marks the next input
* as the first buffer, which is where decodeLoop looks for a signature.
* @param cs the charset handed on to the CharsetDecoderICU constructor
*/
public CharsetDecoderUTF32(CharsetICU cs) {
super(cs);
isFirstBuffer = true;
}
/**
 * Decodes UTF-32 input, looking for a byte order mark in the first
 * SIGNATURE_LENGTH bytes of the stream before handing the payload to
 * {@link #decodeLoopImpl}.
 *
 * The first bytes are gathered in toUBytesArray (possibly across several
 * calls). Once SIGNATURE_LENGTH bytes are available the state becomes
 * 1 (00 00 FE FF, big-endian BOM), 2 (FF FE 00 00, little-endian BOM) or
 * 3 (no BOM). A BOM is dropped; without a BOM the gathered bytes are the
 * first code point, which is handed to the conversion loop through
 * toUnicodeStatus (value + 1) with toULength == SIGNATURE_LENGTH.
 *
 * @param source  the input bytes; its position is advanced past consumed bytes
 * @param target  receives the decoded chars
 * @param offsets optional (may be null) buffer of source offsets; entries
 *                written during this call are shifted by the number of BOM
 *                bytes found in this buffer
 * @param flush   passed through to the conversion loop
 * @return the result of the conversion loop, or UNDERFLOW when it did not run
 */
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
    int state, offsetDelta;
    int offsetsPos = (offsets==null)?0:offsets.position();
    CoderResult cr = CoderResult.UNDERFLOW;
    state=mode;
    /*
     * If we detect a BOM in this buffer, then we must add the BOM size to the
     * offsets because the actual converter function will not see and count the BOM.
     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
     */
    offsetDelta=0;
    int pos = source.position();
    if(isFirstBuffer && toULength<SIGNATURE_LENGTH){
        /* gather signature bytes until SIGNATURE_LENGTH of them are buffered */
        while(pos < source.limit() && toULength < SIGNATURE_LENGTH) {
            toUBytesArray[toULength++] = source.get(pos++);
        }
        if(toULength==SIGNATURE_LENGTH){
            /* bytes are signed, so the literals must be narrowed before comparing */
            if(toUBytesArray[0]==(byte)0x00 && toUBytesArray[1]==(byte)0x00 && toUBytesArray[2]==(byte)0xFE && toUBytesArray[3]==(byte)0xFF){
                // big-endian BOM
                state = 1;
                offsetDelta = pos-source.position();
                toULength = 0; /* the BOM is consumed, nothing is pending */
            }else if(toUBytesArray[0]==(byte)0xFF && toUBytesArray[1]==(byte)0xFE && toUBytesArray[2]==(byte)0x00 && toUBytesArray[3]==(byte)0x00){
                // little-endian BOM
                state = 2;
                offsetDelta = pos-source.position();
                toULength = 0; /* the BOM is consumed, nothing is pending */
            }else{
                // no BOM: the gathered bytes are the first code point; + 1 to make 0 a valid character
                state = 3;
                toUnicodeStatus = getChar(toUBytesArray, toULength)+1;
            }
            isFirstBuffer = false;
        }
    }
    source.position(pos);
    /* decodeLoopImpl dispatches on mode, so publish the state before calling it */
    mode=state;
    /* also run the loop for a complete code point that is only pending in toUnicodeStatus */
    boolean pendingCodePoint = toUnicodeStatus!=0 && toULength==SIGNATURE_LENGTH;
    if(source.hasRemaining() || pendingCodePoint){
        cr = decodeLoopImpl(source, target, offsets, flush);
    }
    /* add BOM size to offsets - see comment at offsetDelta declaration */
    if(offsets!=null && offsetDelta!=0) {
        int offsetsLimit=offsets.position();
        /* walk the offsets written during this call, indexed by offsetsPos */
        while(offsetsPos<offsetsLimit) {
            offsets.put(offsetsPos, offsetDelta + offsets.get(offsetsPos));
            offsetsPos++;
        }
    }
    return cr;
}
/**
 * Assembles the code point held in the first bytes of a byte array. This
 * default reads the bytes big-endian (most significant byte first), matching
 * the big-endian default of decodeLoopImpl; subclasses for a fixed byte
 * order override it.
 *
 * @param bytes  the buffered bytes
 * @param length the number of bytes to combine
 * @return the assembled value
 */
protected int getChar(byte[] bytes, int length){
    int ch = 0;
    for(int i=0; i<length; i++){
        /* mask with 0xFF so that a negative byte does not sign-extend */
        ch = (ch << 8) | (bytes[i] & 0xFF);
    }
    return ch;
}
/**
 * Runs the byte-order specific UTF-32 loop selected by <code>mode</code>:
 * 2 selects little-endian, 1 and every other value select big-endian.
 *
 * @param source  the input bytes
 * @param target  receives the decoded chars
 * @param offsets optional buffer of source offsets, passed through unchanged
 * @param flush   passed through unchanged
 * @return the result of the selected loop
 */
protected CoderResult decodeLoopImpl(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
    if(mode==2){
        /* call UTF-32LE */
        return decodeLoopUTF32LE(source, target, offsets, flush);
    }
    /* call UTF-32BE: explicitly for mode 1, and as the default otherwise */
    return decodeLoopUTF32BE(source, target, offsets, flush);
}
final CoderResult decodeLoopUTF32BE(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
CoderResult cr = CoderResult.UNDERFLOW;
int sourceArrayIndex = source.position();
@ -151,6 +224,127 @@ class CharsetUTF32 extends CharsetICU {
source.position(sourceArrayIndex);
return cr;
}
/**
* Decodes little-endian UTF-32 bytes from source into target.
*
* Each code point is assembled from four bytes, least significant first.
* A partially read code point is carried to the next call in toUnicodeStatus
* (value + 1) with the number of bytes already read in toULength. A trail
* surrogate that does not fit into target is parked in charErrorBufferArray.
* The offsets and flush parameters are not read by this method.
*
* @param source  the input bytes; its position is advanced to the consumed index
* @param target  receives the decoded chars
* @param offsets unused here
* @param flush   unused here
* @return UNDERFLOW, OVERFLOW, or a malformed-input result
*/
final CoderResult decodeLoopUTF32LE(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
CoderResult cr = CoderResult.UNDERFLOW;
int sourceArrayIndex = source.position();
int ch, i;
/* labeled block: "break donefornow" jumps to the common exit code after it */
donefornow:
{
/* UTF-8 returns here for only non-offset, this needs to change.*/
if (toUnicodeStatus != 0 && target.hasRemaining()) {
i = toULength; /* restore # of bytes consumed */
ch = (int)(toUnicodeStatus - 1);/*Stores the previously calculated ch from a previous call*/
toUnicodeStatus = 0;
toULength=0;
while (i < 4) {
if (sourceArrayIndex < source.limit()) {
/* little-endian: byte i contributes bits i*8 .. i*8+7 */
ch |= (source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8);
toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++);
}
else {
/* stores a partially calculated target*/
/* + 1 to make 0 a valid character */
toUnicodeStatus = ch + 1;
toULength = (byte) i;
break donefornow;
}
}
if (ch <= UConverterConstants.MAXIMUM_UTF && !isSurrogate(ch)) {
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
if (ch <= UConverterConstants.MAXIMUM_UCS2)
{
/* fits in 16 bits */
target.put((char)ch);
}
else {
/* write out the surrogates */
target.put(UTF16.getLeadSurrogate(ch));
ch = UTF16.getTrailSurrogate(ch);
if (target.hasRemaining()) {
target.put((char)ch);
}
else {
/* Put in overflow buffer (not handled here) */
charErrorBufferArray[0] = (char) ch;
charErrorBufferLength = 1;
cr = CoderResult.OVERFLOW;
}
}
}
else {
toULength = (byte)i;
/* NOTE(review): the argument is a buffer index, not the length of the malformed sequence - confirm */
cr = CoderResult.malformedForLength(sourceArrayIndex);
break donefornow;
}
}
/* main loop: one code point per iteration while input and output space remain */
while (sourceArrayIndex < source.limit() && target.hasRemaining()) {
i = 0;
ch = 0;
while (i < 4) {
if (sourceArrayIndex < source.limit()) {
ch |= (source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8);
toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++);
}
else {
/* stores a partially calculated target*/
/* + 1 to make 0 a valid character */
toUnicodeStatus = ch + 1;
toULength = (byte) i;
break donefornow;
}
}
/* NOTE(review): this loop reads the limits from UConverterSharedData while the block above uses UConverterConstants - presumably the same values; confirm */
if (ch <= UConverterSharedData.MAXIMUM_UTF && !isSurrogate(ch)) {
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
if (ch <= UConverterSharedData.MAXIMUM_UCS2)
{
/* fits in 16 bits */
target.put((char) ch);
}
else {
/* write out the surrogates */
target.put(UTF16.getLeadSurrogate(ch));
ch = UTF16.getTrailSurrogate(ch);
if (target.hasRemaining()) {
target.put((char)ch);
}
else {
/* Put in overflow buffer (not handled here) */
charErrorBufferArray[0] = (char) ch;
charErrorBufferLength = 1;
cr = CoderResult.OVERFLOW;
break;
}
}
}
else {
toULength = (byte)i;
cr = CoderResult.malformedForLength(sourceArrayIndex);
break;
}
}
}
/* common exit: input left over with a full target is reported as overflow */
if (sourceArrayIndex < source.limit() && !target.hasRemaining()) {
/* End of target buffer */
cr = CoderResult.OVERFLOW;
}
source.position(sourceArrayIndex);
return cr;
}
/**
* Resets this decoder and re-arms signature detection, so that the next
* input is treated as the first buffer again.
*/
protected void implReset() {
super.implReset();
isFirstBuffer = true;
}
}
class CharsetEncoderUTF32 extends CharsetEncoderICU{
@ -158,10 +352,9 @@ class CharsetUTF32 extends CharsetICU {
public CharsetEncoderUTF32(CharsetICU cs) {
super(cs, fromUSubstitution);
implReset();
writeBOM = true;
}
private final static int NEED_TO_WRITE_BOM = 1;
protected void implReset() {
super.implReset();
fromUnicodeStatus = NEED_TO_WRITE_BOM;

View File

@ -0,0 +1,58 @@
/**
*******************************************************************************
* Copyright (C) 2007, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
/**
 * UTF-32 charset with a fixed big-endian byte order.
 *
 * The decoder delegates to the big-endian decode loop of the base class and
 * the encoder is configured so that it does not request a byte order mark.
 */
class CharsetUTF32BE extends CharsetUTF32 {

    /**
     * @param icuCanonicalName  canonical ICU name, passed through to the superclass
     * @param javaCanonicalName canonical Java name, passed through to the superclass
     * @param aliases           alias names, passed through to the superclass
     */
    public CharsetUTF32BE(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
        super(icuCanonicalName, javaCanonicalName, aliases);
    }

    /** Decoder that always interprets the input as big-endian UTF-32. */
    class CharsetDecoderUTF32BE extends CharsetDecoderUTF32 {

        public CharsetDecoderUTF32BE(CharsetICU cs) {
            super(cs);
        }

        /** Decodes by delegating to the big-endian decode loop with the same arguments. */
        protected CoderResult decodeLoopImpl(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
            return decodeLoopUTF32BE(source, target, offsets, flush);
        }

        /**
         * Combines the first {@code length} bytes of {@code bytes} into one
         * integer in big-endian order, i.e. {@code bytes[0]} ends up as the
         * most significant of the combined bytes.
         *
         * @param bytes  buffer holding the bytes of one code unit
         * @param length number of leading bytes of {@code bytes} to combine
         * @return the combined value; 0 when {@code length} is 0
         */
        protected int getChar(byte[] bytes, int length) {
            int ch = 0;
            for (int i = 0; i < length; i++) {
                /*
                 * Shift what has been read so far up by one byte and append the
                 * next byte at the low end. The previous "<< (i * 8)" form put
                 * the first byte lowest, which is the little-endian layout.
                 * The mask keeps a negative byte from sign-extending into the
                 * higher bits.
                 */
                ch = (ch << 8) | (bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK);
            }
            return ch;
        }
    }

    public CharsetDecoder newDecoder() {
        return new CharsetDecoderUTF32BE(this);
    }

    /** Encoder for the fixed big-endian form; it clears the BOM request on every reset. */
    class CharsetEncoderUTF32BE extends CharsetEncoderUTF32 {

        public CharsetEncoderUTF32BE(CharsetICU cs) {
            super(cs);
            /*
             * The superclass constructor assigns writeBOM = true after its own
             * implReset() call, so reset once more to clear the flag again.
             */
            implReset();
        }

        /** Runs the superclass reset, then clears fromUnicodeStatus and writeBOM. */
        protected void implReset() {
            super.implReset();
            fromUnicodeStatus = 0;
            writeBOM = false;
        }
    }

    public CharsetEncoder newEncoder() {
        return new CharsetEncoderUTF32BE(this);
    }
}

View File

@ -19,7 +19,7 @@ import com.ibm.icu.text.UTF16;
/**
* @author Niti Hantaweepant
*/
class CharsetUTF32LE extends CharsetICU {
class CharsetUTF32LE extends CharsetUTF32 {
protected byte[] fromUSubstitution = new byte[]{(byte)0xfd, (byte)0xff, (byte)0, (byte)0};
@ -29,142 +29,34 @@ class CharsetUTF32LE extends CharsetICU {
minBytesPerChar = 4;
maxCharsPerByte = 1;
}
class CharsetDecoderUTF32LE extends CharsetDecoderICU{
class CharsetDecoderUTF32LE extends CharsetDecoderUTF32{
/**
 * Creates a decoder for the given charset; all set-up is left to the
 * superclass constructor.
 *
 * @param cs the charset this decoder belongs to
 */
public CharsetDecoderUTF32LE(CharsetICU cs) {
super(cs);
}
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
CoderResult cr = CoderResult.UNDERFLOW;
int sourceArrayIndex = source.position();
int ch, i;
donefornow:
{
/* UTF-8 returns here for only non-offset, this needs to change.*/
if (toUnicodeStatus != 0 && target.hasRemaining()) {
i = toULength; /* restore # of bytes consumed */
ch = (int)(toUnicodeStatus - 1);/*Stores the previously calculated ch from a previous call*/
toUnicodeStatus = 0;
toULength=0;
while (i < 4) {
if (sourceArrayIndex < source.limit()) {
ch |= (source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8);
toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++);
}
else {
/* stores a partially calculated target*/
/* + 1 to make 0 a valid character */
toUnicodeStatus = ch + 1;
toULength = (byte) i;
break donefornow;
}
}
if (ch <= UConverterConstants.MAXIMUM_UTF && !isSurrogate(ch)) {
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
if (ch <= UConverterConstants.MAXIMUM_UCS2)
{
/* fits in 16 bits */
target.put((char)ch);
}
else {
/* write out the surrogates */
target.put(UTF16.getLeadSurrogate(ch));
ch = UTF16.getTrailSurrogate(ch);
if (target.hasRemaining()) {
target.put((char)ch);
}
else {
/* Put in overflow buffer (not handled here) */
charErrorBufferArray[0] = (char) ch;
charErrorBufferLength = 1;
cr = CoderResult.OVERFLOW;
}
}
}
else {
toULength = (byte)i;
cr = CoderResult.malformedForLength(sourceArrayIndex);
break donefornow;
}
}
while (sourceArrayIndex < source.limit() && target.hasRemaining()) {
i = 0;
ch = 0;
while (i < 4) {
if (sourceArrayIndex < source.limit()) {
ch |= (source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8);
toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++);
}
else {
/* stores a partially calculated target*/
/* + 1 to make 0 a valid character */
toUnicodeStatus = ch + 1;
toULength = (byte) i;
break donefornow;
}
}
if (ch <= UConverterSharedData.MAXIMUM_UTF && !isSurrogate(ch)) {
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
if (ch <= UConverterSharedData.MAXIMUM_UCS2)
{
/* fits in 16 bits */
target.put((char) ch);
}
else {
/* write out the surrogates */
target.put(UTF16.getLeadSurrogate(ch));
ch = UTF16.getTrailSurrogate(ch);
if (target.hasRemaining()) {
target.put((char)ch);
}
else {
/* Put in overflow buffer (not handled here) */
charErrorBufferArray[0] = (char) ch;
charErrorBufferLength = 1;
cr = CoderResult.OVERFLOW;
break;
}
}
}
else {
toULength = (byte)i;
cr = CoderResult.malformedForLength(sourceArrayIndex);
break;
}
}
/**
 * Decodes by delegating to decodeLoopUTF32LE with exactly the arguments
 * received, and returns its result unchanged.
 */
protected CoderResult decodeLoopImpl(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
return decodeLoopUTF32LE(source, target, offsets, flush);
}
protected int getChar(byte[] bytes, int length){
int i=0;
int ch=0;
while(i<length){
ch |= (bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8);
i++;
}
if (sourceArrayIndex < source.limit() && !target.hasRemaining()) {
/* End of target buffer */
cr = CoderResult.OVERFLOW;
}
source.position(sourceArrayIndex);
return cr;
}
return ch;
}
}
class CharsetEncoderUTF32LE extends CharsetEncoderICU{
/**
 * Creates the encoder for the given charset, handing fromUSubstitution to
 * the superclass constructor, and then resets the encoder state.
 *
 * @param cs the charset this encoder belongs to
 */
public CharsetEncoderUTF32LE(CharsetICU cs) {
super(cs, fromUSubstitution);
implReset();
}
private final static int NEED_TO_WRITE_BOM = 1;
/**
 * Runs the superclass reset and then assigns fromUnicodeStatus.
 */
protected void implReset() {
super.implReset();
/* NOTE(review): two back-to-back assignments to the same field; the first */
/* value is overwritten immediately, so the status ends up 0. Confirm that */
/* only one of these lines is meant to be present. */
fromUnicodeStatus = NEED_TO_WRITE_BOM;
fromUnicodeStatus = 0;
}
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){

View File

@ -47,6 +47,7 @@ public class TestCharset extends TestFmwk {
(byte) 0x00,(byte) 0x0d,
(byte) 0x00,(byte) 0x0a };
static final byte[] expectedByteStr ={
(byte) 0xfe,(byte) 0xff,
(byte) 0x00,(byte) 'a',
(byte) 0x00,(byte) 'b',
(byte) 0x00,(byte) 'c',
@ -76,7 +77,7 @@ public class TestCharset extends TestFmwk {
}
public void TestUTF16Converter(){
CharsetProvider icu = new CharsetProviderICU();
Charset cs1 = icu.charsetForName("UTF-16");
Charset cs1 = icu.charsetForName("UTF-16BE");
CharsetEncoder e1 = cs1.newEncoder();
CharsetDecoder d1 = cs1.newDecoder();
@ -168,7 +169,7 @@ public class TestCharset extends TestFmwk {
}
public void TestUTF32Converter(){
CharsetProvider icu = new CharsetProviderICU();
Charset cs1 = icu.charsetForName("UTF-32");
Charset cs1 = icu.charsetForName("UTF-32BE");
CharsetEncoder e1 = cs1.newEncoder();
CharsetDecoder d1 = cs1.newDecoder();
@ -176,7 +177,7 @@ public class TestCharset extends TestFmwk {
CharsetEncoder e2 = cs2.newEncoder();
CharsetDecoder d2 = cs2.newDecoder();
for(int i=0x1d827; i<0x10FFFF; i+=0xFF){
for(int i=0x000; i<0x10FFFF; i+=0xFF){
CharBuffer us = CharBuffer.allocate(0xFF*2);
ByteBuffer bs1 = ByteBuffer.allocate(0xFF*8);
ByteBuffer bs2 = ByteBuffer.allocate(0xFF*8);
@ -868,12 +869,12 @@ public class TestCharset extends TestFmwk {
CharBuffer inBuf = CharBuffer.allocate(in.length);
inBuf.put(in);
CharsetEncoder encoder = cs.newEncoder();
ByteBuffer outBuf = ByteBuffer.allocate(in.length*2);
ByteBuffer outBuf = ByteBuffer.allocate(in.length*2+2);
inBuf.rewind();
encoder.encode(inBuf, outBuf, true);
outBuf.rewind();
if(outBuf.remaining()> in.length*2){
errln("The UTF16 encoder appended bom. Length returned: " + outBuf.remaining());
if(outBuf.get(0)!= (byte)0xFE && outBuf.get(1)!= (byte)0xFF){
errln("The UTF16 encoder did not appended bom. Length returned: " + outBuf.remaining());
}
while(outBuf.hasRemaining()){
logln("0x"+hex(outBuf.get()));
@ -881,7 +882,19 @@ public class TestCharset extends TestFmwk {
CharsetDecoder decoder = cs.newDecoder();
outBuf.rewind();
CharBuffer rt = CharBuffer.allocate(in.length);
decoder.decode(outBuf, rt, true);
CoderResult cr = decoder.decode(outBuf, rt, true);
if(cr.isError()){
errln("Decoding with BOM failed. Error: "+ cr.toString());
}
equals(rt, in);
{
rt.clear();
outBuf.rewind();
Charset utf16 = Charset.forName("UTF-16");
CharsetDecoder dc = utf16.newDecoder();
cr = dc.decode(outBuf, rt, true);
equals(rt, in);
}
}
private void smBufDecode(CharsetDecoder decoder, String encoding, ByteBuffer source, CharBuffer target) {
@ -1531,4 +1544,49 @@ public class TestCharset extends TestFmwk {
}
return null;
}
/**
 * Checks that the ICU "UTF-32" encoder starts its output with a byte order
 * mark and that the encoded bytes decode back to the original text, first
 * with the ICU decoder and then, when the running JDK offers a UTF-32
 * charset, with the JDK decoder as a cross-check.
 */
public void TestUTF32BOM(){
    Charset cs = (new CharsetProviderICU()).charsetForName("UTF-32");
    char[] in = new char[] { 0xd800, 0xdc00,
                             0xd801, 0xdc01,
                             0xdbff, 0xdfff,
                             0xd900, 0xdd00,
                             0x0000, 0x0041,
                             0x0000, 0x0042,
                             0x0000, 0x0043};

    CharBuffer inBuf = CharBuffer.allocate(in.length);
    inBuf.put(in);
    CharsetEncoder encoder = cs.newEncoder();
    /* sized for the worst case of 4 bytes per char plus a 4-byte BOM */
    ByteBuffer outBuf = ByteBuffer.allocate(in.length*4+4);
    inBuf.rewind();
    encoder.encode(inBuf, outBuf, true);
    /*
     * flip() rather than rewind(): the surrogate pairs above encode to 4 bytes
     * per pair, so the buffer is not filled completely. The limit has to stop
     * at the bytes actually written, otherwise the unused zero bytes at the
     * end would be logged and decoded as extra U+0000 characters.
     */
    outBuf.flip();
    /* Any single mismatching byte means the big-endian BOM 00 00 FE FF is missing. */
    if(outBuf.remaining() < 4 ||
       outBuf.get(0)!= (byte)0x00 || outBuf.get(1)!= (byte)0x00 ||
       outBuf.get(2)!= (byte)0xFE || outBuf.get(3)!= (byte)0xFF){
        errln("The UTF32 encoder did not append bom. Length returned: " + outBuf.remaining());
    }
    while(outBuf.hasRemaining()){
        logln("0x"+hex(outBuf.get()));
    }

    /* Round trip through the ICU decoder. */
    CharsetDecoder decoder = cs.newDecoder();
    outBuf.rewind();
    CharBuffer rt = CharBuffer.allocate(in.length);
    CoderResult cr = decoder.decode(outBuf, rt, true);
    if(cr.isError()){
        errln("Decoding with BOM failed. Error: "+ cr.toString());
    }
    equals(rt, in);

    /* Cross-check: the JDK's own UTF-32 decoder must accept the same bytes. */
    try{
        rt.clear();
        outBuf.rewind();
        Charset utf32 = Charset.forName("UTF-32");
        CharsetDecoder dc = utf32.newDecoder();
        cr = dc.decode(outBuf, rt, true);
        if(cr.isError()){
            errln("Decoding with the JDK UTF-32 decoder failed. Error: "+ cr.toString());
        }
        equals(rt, in);
    }catch(UnsupportedCharsetException ignored){
        // Deliberately ignored: this JDK has no UTF-32 charset, so the
        // cross-check cannot be run.
    }
}
}