ICU-5564 Conform to Java's spec for the UTF-16 converter
X-SVN-Rev: 20917
This commit is contained in:
parent
98cf7d46ae
commit
31a9f8c37b
2
.gitattributes
vendored
2
.gitattributes
vendored
@ -55,6 +55,8 @@ icu4c/source/test/testdata/importtest.bin -text
|
||||
icu4c/source/test/testdata/uni-text.bin -text
|
||||
icu4j/ee.foundation.jar -text
|
||||
icu4j/license.html -text
|
||||
icu4j/src/com/ibm/icu/charset/CharsetUTF16BE.java -text
|
||||
icu4j/src/com/ibm/icu/charset/CharsetUTF32BE.java -text
|
||||
icu4j/src/com/ibm/icu/dev/data/rbbi/english.dict -text
|
||||
icu4j/src/com/ibm/icu/dev/data/testdata.jar -text
|
||||
icu4j/src/com/ibm/icu/dev/data/thai6.ucs -text
|
||||
|
@ -149,6 +149,10 @@
|
||||
<srcfiles dir="${build.dir}" includes="${icu4j.data.path}/*.icu"/>
|
||||
</uptodate>
|
||||
<!-- <echo message="icu4j.module.resources result: ${icu4j.module.resources}" /> -->
|
||||
<tstamp>
|
||||
<format property="date.time" pattern="yyyy-MM-dd 'at' hh:mm:ss z" locale="en,US"/>
|
||||
</tstamp>
|
||||
<echo message="Initialized at ${date.time}"/>
|
||||
</target>
|
||||
|
||||
<!-- build everything but dist-related stuff -->
|
||||
|
@ -169,7 +169,7 @@ public abstract class CharsetDecoderICU extends CharsetDecoder{
|
||||
setSourcePosition(in);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Implements the ICU semantic for decode operation
|
||||
* @param in The input byte buffer
|
||||
|
@ -29,6 +29,9 @@ import com.ibm.icu.text.UTF16;
|
||||
*/
|
||||
public abstract class CharsetEncoderICU extends CharsetEncoder {
|
||||
|
||||
static final int NEED_TO_WRITE_BOM = 1;
|
||||
boolean writeBOM = false; /* only used by UTF-16, UTF-32 */
|
||||
|
||||
byte[] errorBuffer = new byte[30];
|
||||
int errorBufferLength = 0;
|
||||
|
||||
|
@ -14,6 +14,7 @@ import java.io.InputStreamReader;
|
||||
import java.lang.reflect.Constructor;
|
||||
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.IllegalCharsetNameException;
|
||||
import java.nio.charset.UnsupportedCharsetException;
|
||||
@ -62,7 +63,6 @@ public abstract class CharsetICU extends Charset{
|
||||
byte subChar1; /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */
|
||||
byte reserved[/*19*/]; /* +81: 19 to round out the structure */
|
||||
|
||||
boolean writeBOM = false; /* only used by UTF-16, UTF-32 */
|
||||
|
||||
/**
|
||||
*
|
||||
@ -120,15 +120,15 @@ public abstract class CharsetICU extends Charset{
|
||||
algorithmicCharsets.put("US-ASCII", "com.ibm.icu.charset.CharsetASCII" );
|
||||
algorithmicCharsets.put("ISO-8859-1", "com.ibm.icu.charset.Charset88591" );
|
||||
algorithmicCharsets.put("UTF-16", "com.ibm.icu.charset.CharsetUTF16" );
|
||||
algorithmicCharsets.put("UTF-16BE", "com.ibm.icu.charset.CharsetUTF16" );
|
||||
algorithmicCharsets.put("UTF-16BE", "com.ibm.icu.charset.CharsetUTF16BE" );
|
||||
algorithmicCharsets.put("UTF-16LE", "com.ibm.icu.charset.CharsetUTF16LE" );
|
||||
algorithmicCharsets.put("UTF16_OppositeEndian", "com.ibm.icu.charset.CharsetUTF16LE" );
|
||||
algorithmicCharsets.put("UTF16_PlatformEndian", "com.ibm.icu.charset.CharsetUTF16" );
|
||||
algorithmicCharsets.put("UTF-32", "com.ibm.icu.charset.CharsetUTF32" );
|
||||
algorithmicCharsets.put("UTF-32BE", "com.ibm.icu.charset.CharsetUTF32" );
|
||||
algorithmicCharsets.put("UTF-32BE", "com.ibm.icu.charset.CharsetUTF32BE" );
|
||||
algorithmicCharsets.put("UTF-32LE", "com.ibm.icu.charset.CharsetUTF32LE" );
|
||||
algorithmicCharsets.put("UTF32_PlatformEndian", "com.ibm.icu.charset.CharsetUTF32LE" );
|
||||
algorithmicCharsets.put("UTF32_OppositeEndian", "com.ibm.icu.charset.CharsetUTF32" );
|
||||
algorithmicCharsets.put("UTF32_OppositeEndian", "com.ibm.icu.charset.CharsetUTF32LE" );
|
||||
algorithmicCharsets.put("UTF32_PlatformEndian", "com.ibm.icu.charset.CharsetUTF32" );
|
||||
algorithmicCharsets.put("UTF-7", "com.ibm.icu.charset.CharsetUTF7" );
|
||||
algorithmicCharsets.put("UTF-8", "com.ibm.icu.charset.CharsetUTF8" );
|
||||
}
|
||||
@ -223,11 +223,106 @@ public abstract class CharsetICU extends Charset{
|
||||
CharsetProviderICU icuProvider = new CharsetProviderICU();
|
||||
CharsetICU cs = (CharsetICU) icuProvider.charsetForName(charsetName);
|
||||
if (cs != null) {
|
||||
cs.writeBOM = true;
|
||||
return cs;
|
||||
}
|
||||
return Charset.forName(charsetName);
|
||||
}
|
||||
|
||||
/**
|
||||
* This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
|
||||
* start of the stream for example U+FEFF (the Unicode BOM/signature
|
||||
* character) that can be ignored.
|
||||
*
|
||||
* Detects Unicode signature byte sequences at the start of the byte stream
|
||||
* and returns number of bytes of the BOM of the indicated Unicode charset.
|
||||
* 0 is returned when no Unicode signature is recognized.
|
||||
*
|
||||
*/
|
||||
static String detectUnicodeSignature(ByteBuffer source) {
|
||||
int signatureLength = 0; // number of bytes of the signature
|
||||
final int SIG_MAX_LEN = 5;
|
||||
String sigUniCharset = null; // states what unicode charset is the BOM
|
||||
int i = 0;
|
||||
|
||||
/*
|
||||
* initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we
|
||||
* don't misdetect something
|
||||
*/
|
||||
byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5,
|
||||
(byte) 0xa5 };
|
||||
|
||||
while (i < source.remaining() && i < SIG_MAX_LEN) {
|
||||
start[i] = source.get(i);
|
||||
i++;
|
||||
}
|
||||
|
||||
if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) {
|
||||
signatureLength = 2;
|
||||
sigUniCharset = "UTF-16BE";
|
||||
source.position(signatureLength);
|
||||
return sigUniCharset;
|
||||
} else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) {
|
||||
if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) {
|
||||
signatureLength = 4;
|
||||
sigUniCharset = "UTF-32LE";
|
||||
source.position(signatureLength);
|
||||
return sigUniCharset;
|
||||
} else {
|
||||
signatureLength = 2;
|
||||
sigUniCharset = "UTF-16LE";
|
||||
source.position(signatureLength);
|
||||
return sigUniCharset;
|
||||
}
|
||||
} else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB
|
||||
&& start[2] == (byte) 0xBF) {
|
||||
signatureLength = 3;
|
||||
sigUniCharset = "UTF-8";
|
||||
source.position(signatureLength);
|
||||
return sigUniCharset;
|
||||
} else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00
|
||||
&& start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) {
|
||||
signatureLength = 4;
|
||||
sigUniCharset = "UTF-32BE";
|
||||
source.position(signatureLength);
|
||||
return sigUniCharset;
|
||||
} else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE
|
||||
&& start[2] == (byte) 0xFF) {
|
||||
signatureLength = 3;
|
||||
sigUniCharset = "SCSU";
|
||||
source.position(signatureLength);
|
||||
return sigUniCharset;
|
||||
} else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE
|
||||
&& start[2] == (byte) 0x28) {
|
||||
signatureLength = 3;
|
||||
sigUniCharset = "BOCU-1";
|
||||
source.position(signatureLength);
|
||||
return sigUniCharset;
|
||||
} else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F
|
||||
&& start[2] == (byte) 0x76) {
|
||||
|
||||
if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) {
|
||||
signatureLength = 5;
|
||||
sigUniCharset = "UTF-7";
|
||||
source.position(signatureLength);
|
||||
return sigUniCharset;
|
||||
} else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39
|
||||
|| start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) {
|
||||
signatureLength = 4;
|
||||
sigUniCharset = "UTF-7";
|
||||
source.position(signatureLength);
|
||||
return sigUniCharset;
|
||||
}
|
||||
} else if (start[0] == (byte) 0xDD && start[2] == (byte) 0x73
|
||||
&& start[2] == (byte) 0x66 && start[3] == (byte) 0x73) {
|
||||
signatureLength = 4;
|
||||
sigUniCharset = "UTF-EBCDIC";
|
||||
source.position(signatureLength);
|
||||
return sigUniCharset;
|
||||
}
|
||||
|
||||
/* no known Unicode signature byte sequence recognized */
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -20,7 +20,7 @@ import com.ibm.icu.text.UTF16;
|
||||
class CharsetUTF16 extends CharsetICU {
|
||||
|
||||
protected byte[] fromUSubstitution = new byte[]{(byte)0xff, (byte)0xfd};
|
||||
|
||||
|
||||
public CharsetUTF16(String icuCanonicalName, String javaCanonicalName, String[] aliases){
|
||||
super(icuCanonicalName, javaCanonicalName, aliases);
|
||||
maxBytesPerChar = 4;
|
||||
@ -28,17 +28,152 @@ class CharsetUTF16 extends CharsetICU {
|
||||
maxCharsPerByte = 1;
|
||||
}
|
||||
class CharsetDecoderUTF16 extends CharsetDecoderICU{
|
||||
|
||||
ByteBuffer utf16BOM = ByteBuffer.wrap(new byte[]{ (byte)0xfe, (byte)0xff, 0, 0, (byte)0xff, (byte)0xfe, 0, 0 });
|
||||
/**
 * Creates a UTF-16 decoder bound to the given charset; all setup is
 * delegated to the CharsetDecoderICU constructor.
 *
 * @param cs the charset this decoder belongs to
 */
public CharsetDecoderUTF16(CharsetICU cs) {
    super(cs);
}
|
||||
|
||||
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
int state, offsetDelta;
|
||||
byte b;
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
int offsetsPos = (offsets==null)?0:offsets.position();
|
||||
utf16BOM.limit(utf16BOM.capacity());
|
||||
/*
|
||||
* If we detect a BOM in this buffer, then we must add the BOM size to the
|
||||
* offsets because the actual converter function will not see and count the BOM.
|
||||
* offsetDelta will have the number of the BOM bytes that are in the current buffer.
|
||||
*/
|
||||
offsetDelta=0;
|
||||
state=mode;
|
||||
int pos = source.position();
|
||||
while(pos < source.limit()) {
|
||||
switch(state) {
|
||||
case 0:
|
||||
b=source.get(pos);
|
||||
if(b==(byte)0xfe) {
|
||||
state=1; /* could be FE FF */
|
||||
} else if(b==(byte)0xff) {
|
||||
state=5; /* could be FF FE */
|
||||
} else {
|
||||
state=8; /* default to UTF-16BE */
|
||||
continue;
|
||||
}
|
||||
pos++;
|
||||
break;
|
||||
case 1:
|
||||
case 5:
|
||||
if(source.get(pos)==utf16BOM.get(state)) {
|
||||
++pos;
|
||||
if(state==1) {
|
||||
state=8; /* detect UTF-16BE */
|
||||
offsetDelta=pos-source.position();
|
||||
} else if(state==5) {
|
||||
state=9; /* detect UTF-16LE */
|
||||
offsetDelta=pos-source.position();
|
||||
}
|
||||
} else {
|
||||
/* switch to UTF-16BE and pass the previous bytes */
|
||||
if(pos!=source.position()) {
|
||||
/* just reset the source */
|
||||
pos=source.position();
|
||||
} else {
|
||||
boolean oldFlush=flush;
|
||||
int bomIndex = state&4;
|
||||
ByteBuffer oldSource = source;
|
||||
source = utf16BOM;
|
||||
utf16BOM.position(bomIndex);/* select the correct BOM */
|
||||
source.limit(bomIndex+1);/* replay previous byte */
|
||||
flush = false; /* this sourceLimit is not the real source stream limit */
|
||||
cr = decodeLoopUTF16BE(source, target, offsets, flush);
|
||||
/* restore real pointers; pArgs->source will be set in case 8/9 */
|
||||
flush = oldFlush;
|
||||
source = oldSource;
|
||||
}
|
||||
state=8;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
case 9:
|
||||
mode = state;
|
||||
source.position(pos);
|
||||
cr = decodeLoopImpl(source, target, offsets, flush);
|
||||
pos = source.position();
|
||||
break;
|
||||
default:
|
||||
break; /* does not occur */
|
||||
}
|
||||
if(cr.isOverflow() || cr.isError()){
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* add BOM size to offsets - see comment at offsetDelta declaration */
|
||||
if(offsets!=null && offsetDelta!=0) {
|
||||
int offsetsLimit=offsets.position();
|
||||
while(offsetsPos<offsetsLimit) {
|
||||
int delta = offsetDelta + offsets.get(pos);
|
||||
offsets.put(pos++, delta);
|
||||
}
|
||||
}
|
||||
|
||||
source.position(pos);
|
||||
|
||||
if(!source.hasRemaining() && flush) {
|
||||
/* handle truncated input */
|
||||
switch(state) {
|
||||
case 0:
|
||||
break; /* no input at all, nothing to do */
|
||||
case 8:
|
||||
cr = decodeLoopUTF16BE(source, target, offsets, flush);
|
||||
break;
|
||||
case 9:
|
||||
cr = decodeLoopUTF16LE(source, target, offsets, flush);
|
||||
break;
|
||||
default:
|
||||
/* handle 0<state<8: call UTF-16BE with too-short input */
|
||||
boolean oldFlush=flush;
|
||||
int bomIndex = state&4;
|
||||
ByteBuffer oldSource = source;
|
||||
source = utf16BOM;
|
||||
utf16BOM.position(bomIndex);/* select the correct BOM */
|
||||
source.limit(bomIndex+1);/* replay previous byte */
|
||||
flush = false; /* this sourceLimit is not the real source stream limit */
|
||||
cr = decodeLoopUTF16BE(source, target, offsets, flush);
|
||||
/* restore real pointers; pArgs->source will be set in case 8/9 */
|
||||
flush = oldFlush;
|
||||
source = oldSource;
|
||||
state=8;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
mode=state;
|
||||
return cr;
|
||||
}
|
||||
protected CoderResult decodeLoopImpl(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
if(mode==8){
|
||||
/* call UTF-16BE */
|
||||
cr = decodeLoopUTF16BE(source, target, offsets, flush);
|
||||
}else if(mode==9){
|
||||
/* call UTF-16LE */
|
||||
cr =decodeLoopUTF16LE(source, target, offsets, flush);
|
||||
}else{
|
||||
/* should not occur */
|
||||
throw new InternalError("Unknown State in UTF-16 converter!");
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
final CoderResult decodeLoopUTF16BE(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
|
||||
if(!source.hasRemaining() && toUnicodeStatus==0) {
|
||||
/* no input, nothing to do */
|
||||
return cr;
|
||||
}
|
||||
}
|
||||
if(!target.hasRemaining()) {
|
||||
return CoderResult.OVERFLOW;
|
||||
}
|
||||
@ -241,7 +376,219 @@ class CharsetUTF16 extends CharsetICU {
|
||||
|
||||
return cr;
|
||||
}
|
||||
final CoderResult decodeLoopUTF16LE(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
if(!source.hasRemaining() && toUnicodeStatus==0) {
|
||||
/* no input, nothing to do */
|
||||
return cr;
|
||||
}
|
||||
if(!target.hasRemaining()) {
|
||||
return CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
int sourceIndex=0, count=0, length, sourceArrayIndex;
|
||||
char c=0, trail;
|
||||
length = source.remaining();
|
||||
sourceArrayIndex = source.position();
|
||||
|
||||
/* complete a partial UChar or pair from the last call */
|
||||
if(toUnicodeStatus!=0) {
|
||||
/*
|
||||
* special case: single byte from a previous buffer,
|
||||
* where the byte turned out not to belong to a trail surrogate
|
||||
* and the preceding, unmatched lead surrogate was put into toUBytes[]
|
||||
* for error handling
|
||||
*/
|
||||
toUBytesArray[toUBytesBegin+0]=(byte)toUnicodeStatus;
|
||||
toULength=1;
|
||||
toUnicodeStatus=0;
|
||||
}
|
||||
if((count=toULength)!=0) {
|
||||
byte[] pArray=toUBytesArray;
|
||||
int pArrayIndex = toUBytesBegin;
|
||||
do {
|
||||
pArray[count++]=source.get(sourceArrayIndex++);
|
||||
++sourceIndex;
|
||||
--length;
|
||||
if(count==2) {
|
||||
c=(char)(((pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
/* output the BMP code point */
|
||||
target.put(c);
|
||||
if(offsets!=null) {
|
||||
offsets.put(-1);
|
||||
}
|
||||
count=0;
|
||||
c=0;
|
||||
break;
|
||||
} else if(UTF16.isLeadSurrogate(c)) {
|
||||
/* continue collecting bytes for the trail surrogate */
|
||||
c=0; /* avoid unnecessary surrogate handling below */
|
||||
} else {
|
||||
/* fall through to error handling for an unmatched trail surrogate */
|
||||
break;
|
||||
}
|
||||
} else if(count==4) {
|
||||
c=(char)(((pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
trail=(char)(((pArray[pArrayIndex+3]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+2]&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
if(UTF16.isTrailSurrogate(trail)) {
|
||||
/* output the surrogate pair */
|
||||
target.put(c);
|
||||
if(target.remaining()>=1) {
|
||||
target.put(trail);
|
||||
if(offsets!=null) {
|
||||
offsets.put(-1);
|
||||
offsets.put(-1);
|
||||
}
|
||||
} else /* targetCapacity==1 */ {
|
||||
charErrorBufferArray[charErrorBufferBegin+0]=trail;
|
||||
charErrorBufferLength=1;
|
||||
return CoderResult.OVERFLOW;
|
||||
}
|
||||
count=0;
|
||||
c=0;
|
||||
break;
|
||||
} else {
|
||||
/* unmatched lead surrogate, handle here for consistent toUBytes[] */
|
||||
|
||||
/* back out reading the code unit after it */
|
||||
if((source.position()-sourceArrayIndex)>=2) {
|
||||
sourceArrayIndex-=2;
|
||||
} else {
|
||||
/*
|
||||
* if the trail unit's first byte was in a previous buffer, then
|
||||
* we need to put it into a special place because toUBytes[] will be
|
||||
* used for the lead unit's bytes
|
||||
*/
|
||||
toUnicodeStatus=0x100|pArray[pArrayIndex+2];
|
||||
--sourceArrayIndex;
|
||||
}
|
||||
toULength=2;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while(length>0);
|
||||
toULength=(byte)count;
|
||||
}
|
||||
|
||||
/* copy an even number of bytes for complete UChars */
|
||||
count=2*target.remaining();
|
||||
if(count>length) {
|
||||
count=length&~1;
|
||||
}
|
||||
if(c==0 && count>0) {
|
||||
length-=count;
|
||||
count>>=1;
|
||||
//targetCapacity-=count;
|
||||
if(offsets==null) {
|
||||
do {
|
||||
c=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
sourceArrayIndex+=2;
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
target.put(c);
|
||||
} else if(UTF16.isLeadSurrogate(c) && count>=2 &&
|
||||
UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))
|
||||
) {
|
||||
sourceArrayIndex+=2;
|
||||
--count;
|
||||
target.put(c);
|
||||
target.put(trail);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while(--count>0);
|
||||
} else {
|
||||
do {
|
||||
c=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
sourceArrayIndex+=2;
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
target.put(c);
|
||||
offsets.put(sourceIndex);
|
||||
sourceIndex+=2;
|
||||
} else if(UTF16.isLeadSurrogate(c) && count>=2 &&
|
||||
UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))
|
||||
) {
|
||||
sourceArrayIndex+=2;
|
||||
--count;
|
||||
target.put(c);
|
||||
target.put(trail);
|
||||
offsets.put(sourceIndex);
|
||||
offsets.put(sourceIndex);
|
||||
sourceIndex+=4;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while(--count>0);
|
||||
}
|
||||
|
||||
if(count==0) {
|
||||
/* done with the loop for complete UChars */
|
||||
c=0;
|
||||
} else {
|
||||
/* keep c for surrogate handling, trail will be set there */
|
||||
length+=2*(count-1); /* one more byte pair was consumed than count decremented */
|
||||
}
|
||||
}
|
||||
|
||||
if(c!=0) {
|
||||
/*
|
||||
* c is a surrogate, and
|
||||
* - source or target too short
|
||||
* - or the surrogate is unmatched
|
||||
*/
|
||||
|
||||
toUBytesArray[toUBytesBegin+0]=(byte)c;
|
||||
toUBytesArray[toUBytesBegin+1]=(byte)(c>>>8);
|
||||
toULength=2;
|
||||
|
||||
if(UTF16.isLeadSurrogate(c)) {
|
||||
if(length>=2) {
|
||||
if(UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))) {
|
||||
/* output the surrogate pair, will overflow (see conditions comment above) */
|
||||
sourceArrayIndex+=2;
|
||||
length-=2;
|
||||
target.put(c);
|
||||
if(offsets!=null) {
|
||||
offsets.put(sourceIndex);
|
||||
}
|
||||
charErrorBufferArray[charErrorBufferBegin+0]=trail;
|
||||
charErrorBufferLength=1;
|
||||
toULength=0;
|
||||
cr = CoderResult.OVERFLOW;
|
||||
} else {
|
||||
/* unmatched lead surrogate */
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
}
|
||||
} else {
|
||||
/* see if the trail surrogate is in the next buffer */
|
||||
}
|
||||
} else {
|
||||
/* unmatched trail surrogate */
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* check for a remaining source byte */
|
||||
if(!cr.isError()){
|
||||
if(length>0) {
|
||||
if(!target.hasRemaining()) {
|
||||
cr = CoderResult.OVERFLOW;
|
||||
} else {
|
||||
/* it must be length==1 because otherwise the above would have copied more */
|
||||
toUBytesArray[toULength++]=source.get(sourceArrayIndex++);
|
||||
}
|
||||
}
|
||||
}
|
||||
source.position(sourceArrayIndex);
|
||||
|
||||
return cr;
|
||||
}
|
||||
|
||||
protected void implReset() {
|
||||
super.implReset();
|
||||
}
|
||||
}
|
||||
class CharsetEncoderUTF16 extends CharsetEncoderICU{
|
||||
|
||||
@ -250,13 +597,11 @@ class CharsetUTF16 extends CharsetICU {
|
||||
implReset();
|
||||
}
|
||||
|
||||
private final static int NEED_TO_WRITE_BOM = 1;
|
||||
|
||||
protected void implReset() {
|
||||
super.implReset();
|
||||
fromUnicodeStatus = NEED_TO_WRITE_BOM;
|
||||
writeBOM = true;
|
||||
}
|
||||
|
||||
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
if(!source.hasRemaining()) {
|
||||
|
50
icu4j/src/com/ibm/icu/charset/CharsetUTF16BE.java
Normal file
50
icu4j/src/com/ibm/icu/charset/CharsetUTF16BE.java
Normal file
@ -0,0 +1,50 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2007, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.charset;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
class CharsetUTF16BE extends CharsetUTF16 {
|
||||
public CharsetUTF16BE(String icuCanonicalName, String javaCanonicalName, String[] aliases){
|
||||
super(icuCanonicalName, javaCanonicalName, aliases);
|
||||
}
|
||||
class CharsetDecoderUTF16BE extends CharsetDecoderUTF16{
|
||||
|
||||
public CharsetDecoderUTF16BE(CharsetICU cs) {
|
||||
super(cs);
|
||||
}
|
||||
protected CoderResult decodeLoopImpl(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
return decodeLoopUTF16BE(source, target, offsets, flush);
|
||||
}
|
||||
}
|
||||
public CharsetDecoder newDecoder() {
|
||||
return new CharsetDecoderUTF16BE(this);
|
||||
}
|
||||
class CharsetEncoderUTF16BE extends CharsetEncoderUTF16{
|
||||
|
||||
public CharsetEncoderUTF16BE(CharsetICU cs) {
|
||||
super(cs);
|
||||
implReset();
|
||||
}
|
||||
|
||||
protected void implReset() {
|
||||
super.implReset();
|
||||
fromUnicodeStatus = 0;
|
||||
writeBOM = false;
|
||||
}
|
||||
}
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderUTF16BE(this);
|
||||
}
|
||||
}
|
@ -20,245 +20,33 @@ import com.ibm.icu.text.UTF16;
|
||||
/**
|
||||
* @author Niti Hantaweepant
|
||||
*/
|
||||
class CharsetUTF16LE extends CharsetICU {
|
||||
|
||||
protected byte[] fromUSubstitution = new byte[]{(byte)0xfd, (byte)0xff};
|
||||
class CharsetUTF16LE extends CharsetUTF16 {
|
||||
|
||||
public CharsetUTF16LE(String icuCanonicalName, String javaCanonicalName, String[] aliases){
|
||||
super(icuCanonicalName, javaCanonicalName, aliases);
|
||||
maxBytesPerChar = 4;
|
||||
minBytesPerChar = 2;
|
||||
maxCharsPerByte = 1;
|
||||
fromUSubstitution = new byte[]{(byte)0xfd, (byte)0xff};
|
||||
}
|
||||
class CharsetDecoderUTF16LE extends CharsetDecoderICU{
|
||||
|
||||
class CharsetDecoderUTF16LE extends CharsetDecoderUTF16{
|
||||
|
||||
public CharsetDecoderUTF16LE(CharsetICU cs) {
|
||||
super(cs);
|
||||
}
|
||||
|
||||
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
if(!source.hasRemaining() && toUnicodeStatus==0) {
|
||||
/* no input, nothing to do */
|
||||
return cr;
|
||||
}
|
||||
if(!target.hasRemaining()) {
|
||||
return CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
int sourceIndex=0, count=0, length, sourceArrayIndex;
|
||||
char c=0, trail;
|
||||
length = source.remaining();
|
||||
sourceArrayIndex = source.position();
|
||||
|
||||
/* complete a partial UChar or pair from the last call */
|
||||
if(toUnicodeStatus!=0) {
|
||||
/*
|
||||
* special case: single byte from a previous buffer,
|
||||
* where the byte turned out not to belong to a trail surrogate
|
||||
* and the preceding, unmatched lead surrogate was put into toUBytes[]
|
||||
* for error handling
|
||||
*/
|
||||
toUBytesArray[toUBytesBegin+0]=(byte)toUnicodeStatus;
|
||||
toULength=1;
|
||||
toUnicodeStatus=0;
|
||||
}
|
||||
if((count=toULength)!=0) {
|
||||
byte[] pArray=toUBytesArray;
|
||||
int pArrayIndex = toUBytesBegin;
|
||||
do {
|
||||
pArray[count++]=source.get(sourceArrayIndex++);
|
||||
++sourceIndex;
|
||||
--length;
|
||||
if(count==2) {
|
||||
c=(char)(((pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
/* output the BMP code point */
|
||||
target.put(c);
|
||||
if(offsets!=null) {
|
||||
offsets.put(-1);
|
||||
}
|
||||
count=0;
|
||||
c=0;
|
||||
break;
|
||||
} else if(UTF16.isLeadSurrogate(c)) {
|
||||
/* continue collecting bytes for the trail surrogate */
|
||||
c=0; /* avoid unnecessary surrogate handling below */
|
||||
} else {
|
||||
/* fall through to error handling for an unmatched trail surrogate */
|
||||
break;
|
||||
}
|
||||
} else if(count==4) {
|
||||
c=(char)(((pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
trail=(char)(((pArray[pArrayIndex+3]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+2]&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
if(UTF16.isTrailSurrogate(trail)) {
|
||||
/* output the surrogate pair */
|
||||
target.put(c);
|
||||
if(target.remaining()>=1) {
|
||||
target.put(trail);
|
||||
if(offsets!=null) {
|
||||
offsets.put(-1);
|
||||
offsets.put(-1);
|
||||
}
|
||||
} else /* targetCapacity==1 */ {
|
||||
charErrorBufferArray[charErrorBufferBegin+0]=trail;
|
||||
charErrorBufferLength=1;
|
||||
return CoderResult.OVERFLOW;
|
||||
}
|
||||
count=0;
|
||||
c=0;
|
||||
break;
|
||||
} else {
|
||||
/* unmatched lead surrogate, handle here for consistent toUBytes[] */
|
||||
|
||||
/* back out reading the code unit after it */
|
||||
if((source.position()-sourceArrayIndex)>=2) {
|
||||
sourceArrayIndex-=2;
|
||||
} else {
|
||||
/*
|
||||
* if the trail unit's first byte was in a previous buffer, then
|
||||
* we need to put it into a special place because toUBytes[] will be
|
||||
* used for the lead unit's bytes
|
||||
*/
|
||||
toUnicodeStatus=0x100|pArray[pArrayIndex+2];
|
||||
--sourceArrayIndex;
|
||||
}
|
||||
toULength=2;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while(length>0);
|
||||
toULength=(byte)count;
|
||||
}
|
||||
|
||||
/* copy an even number of bytes for complete UChars */
|
||||
count=2*target.remaining();
|
||||
if(count>length) {
|
||||
count=length&~1;
|
||||
}
|
||||
if(c==0 && count>0) {
|
||||
length-=count;
|
||||
count>>=1;
|
||||
//targetCapacity-=count;
|
||||
if(offsets==null) {
|
||||
do {
|
||||
c=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
sourceArrayIndex+=2;
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
target.put(c);
|
||||
} else if(UTF16.isLeadSurrogate(c) && count>=2 &&
|
||||
UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))
|
||||
) {
|
||||
sourceArrayIndex+=2;
|
||||
--count;
|
||||
target.put(c);
|
||||
target.put(trail);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while(--count>0);
|
||||
} else {
|
||||
do {
|
||||
c=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
sourceArrayIndex+=2;
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
target.put(c);
|
||||
offsets.put(sourceIndex);
|
||||
sourceIndex+=2;
|
||||
} else if(UTF16.isLeadSurrogate(c) && count>=2 &&
|
||||
UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))
|
||||
) {
|
||||
sourceArrayIndex+=2;
|
||||
--count;
|
||||
target.put(c);
|
||||
target.put(trail);
|
||||
offsets.put(sourceIndex);
|
||||
offsets.put(sourceIndex);
|
||||
sourceIndex+=4;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while(--count>0);
|
||||
}
|
||||
|
||||
if(count==0) {
|
||||
/* done with the loop for complete UChars */
|
||||
c=0;
|
||||
} else {
|
||||
/* keep c for surrogate handling, trail will be set there */
|
||||
length+=2*(count-1); /* one more byte pair was consumed than count decremented */
|
||||
}
|
||||
}
|
||||
|
||||
if(c!=0) {
|
||||
/*
|
||||
* c is a surrogate, and
|
||||
* - source or target too short
|
||||
* - or the surrogate is unmatched
|
||||
*/
|
||||
|
||||
toUBytesArray[toUBytesBegin+0]=(byte)c;
|
||||
toUBytesArray[toUBytesBegin+1]=(byte)(c>>>8);
|
||||
toULength=2;
|
||||
|
||||
if(UTF16.isLeadSurrogate(c)) {
|
||||
if(length>=2) {
|
||||
if(UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))) {
|
||||
/* output the surrogate pair, will overflow (see conditions comment above) */
|
||||
sourceArrayIndex+=2;
|
||||
length-=2;
|
||||
target.put(c);
|
||||
if(offsets!=null) {
|
||||
offsets.put(sourceIndex);
|
||||
}
|
||||
charErrorBufferArray[charErrorBufferBegin+0]=trail;
|
||||
charErrorBufferLength=1;
|
||||
toULength=0;
|
||||
cr = CoderResult.OVERFLOW;
|
||||
} else {
|
||||
/* unmatched lead surrogate */
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
}
|
||||
} else {
|
||||
/* see if the trail surrogate is in the next buffer */
|
||||
}
|
||||
} else {
|
||||
/* unmatched trail surrogate */
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* check for a remaining source byte */
|
||||
if(!cr.isError()){
|
||||
if(length>0) {
|
||||
if(!target.hasRemaining()) {
|
||||
cr = CoderResult.OVERFLOW;
|
||||
} else {
|
||||
/* it must be length==1 because otherwise the above would have copied more */
|
||||
toUBytesArray[toULength++]=source.get(sourceArrayIndex++);
|
||||
}
|
||||
}
|
||||
}
|
||||
source.position(sourceArrayIndex);
|
||||
|
||||
return cr;
|
||||
protected CoderResult decodeLoopImpl(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
return decodeLoopUTF16LE(source, target, offsets, flush);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class CharsetEncoderUTF16LE extends CharsetEncoderICU{
|
||||
|
||||
|
||||
/**
 * Creates the encoder for the given charset, passing fromUSubstitution to the
 * superclass constructor (presumably the substitution byte sequence -
 * confirm against CharsetEncoderICU), and then resets the encoder state.
 *
 * @param cs the charset this encoder belongs to
 */
public CharsetEncoderUTF16LE(CharsetICU cs) {
|
||||
super(cs, fromUSubstitution);
|
||||
// start from the same state a later reset would produce
implReset();
|
||||
}
|
||||
|
||||
private final static int NEED_TO_WRITE_BOM = 1;
|
||||
|
||||
protected void implReset() {
|
||||
super.implReset();
|
||||
fromUnicodeStatus = NEED_TO_WRITE_BOM;
|
||||
fromUnicodeStatus = 0;
|
||||
}
|
||||
|
||||
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){
|
||||
|
@ -29,13 +29,86 @@ class CharsetUTF32 extends CharsetICU {
|
||||
minBytesPerChar = 4;
|
||||
maxCharsPerByte = 1;
|
||||
}
|
||||
|
||||
|
||||
class CharsetDecoderUTF32 extends CharsetDecoderICU{
|
||||
|
||||
boolean isFirstBuffer;
|
||||
final int SIGNATURE_LENGTH=4;
|
||||
/**
 * Creates the decoder for the given charset and marks that no input has
 * been examined yet.
 *
 * @param cs the charset this decoder belongs to
 */
public CharsetDecoderUTF32(CharsetICU cs) {
|
||||
super(cs);
|
||||
// decodeLoop only looks for a signature while this flag is set
isFirstBuffer = true;
|
||||
}
|
||||
|
||||
|
||||
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
int state, offsetDelta;
|
||||
int offsetsPos = (offsets==null)?0:offsets.position();
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
state=mode;
|
||||
|
||||
/*
|
||||
* If we detect a BOM in this buffer, then we must add the BOM size to the
|
||||
* offsets because the actual converter function will not see and count the BOM.
|
||||
* offsetDelta will have the number of the BOM bytes that are in the current buffer.
|
||||
*/
|
||||
offsetDelta=0;
|
||||
int pos = source.position();
|
||||
if(isFirstBuffer && toULength<SIGNATURE_LENGTH){
|
||||
while(pos < source.limit() && pos < toULength) {
|
||||
toUBytesArray[toULength++] = source.get(pos++);
|
||||
}
|
||||
if(toULength==SIGNATURE_LENGTH){
|
||||
if(toUBytesArray[0]==0x00 && toUBytesArray[1]==0x00 && toUBytesArray[2]==0xFE && toUBytesArray[3]==0xFF){
|
||||
// may be BE
|
||||
state = 1;
|
||||
offsetDelta=4;
|
||||
}else if(toUBytesArray[0]==0xFF && toUBytesArray[1]==0xFE && toUBytesArray[2]==0x00 && toUBytesArray[3]==0x00){
|
||||
//may be LE
|
||||
state = 2;
|
||||
offsetDelta=4;
|
||||
}else{
|
||||
//default to the subclass charset
|
||||
state = 3;
|
||||
toUnicodeStatus = getChar(toUBytesArray, toULength)+1;
|
||||
}
|
||||
isFirstBuffer = false;
|
||||
}
|
||||
}
|
||||
|
||||
/* add BOM size to offsets - see comment at offsetDelta declaration */
|
||||
if(offsets!=null && offsetDelta!=0) {
|
||||
int offsetsLimit=offsets.position();
|
||||
while(offsetsPos<offsetsLimit) {
|
||||
int delta = offsetDelta + offsets.get(pos);
|
||||
offsets.put(pos++, delta);
|
||||
}
|
||||
}
|
||||
|
||||
source.position(pos);
|
||||
if(!cr.isError() && source.hasRemaining()){
|
||||
cr = decodeLoopImpl(source, target, offsets, flush);
|
||||
}
|
||||
mode=state;
|
||||
return cr;
|
||||
}
|
||||
protected int getChar(byte[] bytes, int length){
|
||||
return -1;
|
||||
}
|
||||
protected CoderResult decodeLoopImpl(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
if(mode==1){
|
||||
/* call UTF-16BE */
|
||||
cr = decodeLoopUTF32BE(source, target, offsets, flush);
|
||||
}else if(mode==2){
|
||||
/* call UTF-16LE */
|
||||
cr =decodeLoopUTF32LE(source, target, offsets, flush);
|
||||
}else{
|
||||
/* should not occur */
|
||||
cr = decodeLoopUTF32BE(source, target, offsets, flush);
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
final CoderResult decodeLoopUTF32BE(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
|
||||
int sourceArrayIndex = source.position();
|
||||
@ -151,6 +224,127 @@ class CharsetUTF32 extends CharsetICU {
|
||||
source.position(sourceArrayIndex);
|
||||
return cr;
|
||||
}
|
||||
|
||||
final CoderResult decodeLoopUTF32LE(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
|
||||
int sourceArrayIndex = source.position();
|
||||
int ch, i;
|
||||
|
||||
donefornow:
|
||||
{
|
||||
/* UTF-8 returns here for only non-offset, this needs to change.*/
|
||||
if (toUnicodeStatus != 0 && target.hasRemaining()) {
|
||||
i = toULength; /* restore # of bytes consumed */
|
||||
|
||||
ch = (int)(toUnicodeStatus - 1);/*Stores the previously calculated ch from a previous call*/
|
||||
toUnicodeStatus = 0;
|
||||
toULength=0;
|
||||
|
||||
while (i < 4) {
|
||||
if (sourceArrayIndex < source.limit()) {
|
||||
ch |= (source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8);
|
||||
toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++);
|
||||
}
|
||||
else {
|
||||
/* stores a partially calculated target*/
|
||||
/* + 1 to make 0 a valid character */
|
||||
toUnicodeStatus = ch + 1;
|
||||
toULength = (byte) i;
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
if (ch <= UConverterConstants.MAXIMUM_UTF && !isSurrogate(ch)) {
|
||||
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
|
||||
if (ch <= UConverterConstants.MAXIMUM_UCS2)
|
||||
{
|
||||
/* fits in 16 bits */
|
||||
target.put((char)ch);
|
||||
}
|
||||
else {
|
||||
/* write out the surrogates */
|
||||
target.put(UTF16.getLeadSurrogate(ch));
|
||||
ch = UTF16.getTrailSurrogate(ch);
|
||||
if (target.hasRemaining()) {
|
||||
target.put((char)ch);
|
||||
}
|
||||
else {
|
||||
/* Put in overflow buffer (not handled here) */
|
||||
charErrorBufferArray[0] = (char) ch;
|
||||
charErrorBufferLength = 1;
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
toULength = (byte)i;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
while (sourceArrayIndex < source.limit() && target.hasRemaining()) {
|
||||
i = 0;
|
||||
ch = 0;
|
||||
|
||||
while (i < 4) {
|
||||
if (sourceArrayIndex < source.limit()) {
|
||||
ch |= (source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8);
|
||||
toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++);
|
||||
}
|
||||
else {
|
||||
/* stores a partially calculated target*/
|
||||
/* + 1 to make 0 a valid character */
|
||||
toUnicodeStatus = ch + 1;
|
||||
toULength = (byte) i;
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
if (ch <= UConverterSharedData.MAXIMUM_UTF && !isSurrogate(ch)) {
|
||||
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
|
||||
if (ch <= UConverterSharedData.MAXIMUM_UCS2)
|
||||
{
|
||||
/* fits in 16 bits */
|
||||
target.put((char) ch);
|
||||
}
|
||||
else {
|
||||
/* write out the surrogates */
|
||||
target.put(UTF16.getLeadSurrogate(ch));
|
||||
ch = UTF16.getTrailSurrogate(ch);
|
||||
if (target.hasRemaining()) {
|
||||
target.put((char)ch);
|
||||
}
|
||||
else {
|
||||
/* Put in overflow buffer (not handled here) */
|
||||
charErrorBufferArray[0] = (char) ch;
|
||||
charErrorBufferLength = 1;
|
||||
cr = CoderResult.OVERFLOW;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
toULength = (byte)i;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (sourceArrayIndex < source.limit() && !target.hasRemaining()) {
|
||||
/* End of target buffer */
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
source.position(sourceArrayIndex);
|
||||
return cr;
|
||||
}
|
||||
/**
 * Resets the decoder. After the superclass reset, isFirstBuffer is set again
 * so that decodeLoop looks for a signature at the start of the next input.
 */
protected void implReset() {
|
||||
super.implReset();
|
||||
// re-arm the signature check performed by decodeLoop
isFirstBuffer = true;
|
||||
}
|
||||
}
|
||||
|
||||
class CharsetEncoderUTF32 extends CharsetEncoderICU{
|
||||
@ -158,10 +352,9 @@ class CharsetUTF32 extends CharsetICU {
|
||||
/**
 * Creates the encoder for the given charset, passing fromUSubstitution to the
 * superclass constructor, resets the encoder state and then turns writeBOM on.
 *
 * @param cs the charset this encoder belongs to
 */
public CharsetEncoderUTF32(CharsetICU cs) {
|
||||
super(cs, fromUSubstitution);
|
||||
implReset();
|
||||
// set after implReset(), so the flag is true once construction of this class completes
writeBOM = true;
|
||||
}
|
||||
|
||||
private final static int NEED_TO_WRITE_BOM = 1;
|
||||
|
||||
protected void implReset() {
|
||||
super.implReset();
|
||||
fromUnicodeStatus = NEED_TO_WRITE_BOM;
|
||||
|
58
icu4j/src/com/ibm/icu/charset/CharsetUTF32BE.java
Normal file
58
icu4j/src/com/ibm/icu/charset/CharsetUTF32BE.java
Normal file
@ -0,0 +1,58 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2007, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.charset;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
class CharsetUTF32BE extends CharsetUTF32 {
|
||||
public CharsetUTF32BE(String icuCanonicalName, String javaCanonicalName, String[] aliases){
|
||||
super(icuCanonicalName, javaCanonicalName, aliases);
|
||||
}
|
||||
class CharsetDecoderUTF32BE extends CharsetDecoderUTF32{
|
||||
|
||||
public CharsetDecoderUTF32BE(CharsetICU cs) {
|
||||
super(cs);
|
||||
}
|
||||
protected CoderResult decodeLoopImpl(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
return decodeLoopUTF32BE(source, target, offsets, flush);
|
||||
}
|
||||
protected int getChar(byte[] bytes, int length){
|
||||
int i=0, ch=0;
|
||||
while (i<length){
|
||||
ch |= (bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8);
|
||||
i++;
|
||||
}
|
||||
return ch;
|
||||
}
|
||||
}
|
||||
public CharsetDecoder newDecoder() {
|
||||
return new CharsetDecoderUTF32BE(this);
|
||||
}
|
||||
class CharsetEncoderUTF32BE extends CharsetEncoderUTF32{
|
||||
|
||||
public CharsetEncoderUTF32BE(CharsetICU cs) {
|
||||
super(cs);
|
||||
implReset();
|
||||
}
|
||||
|
||||
protected void implReset() {
|
||||
super.implReset();
|
||||
fromUnicodeStatus = 0;
|
||||
writeBOM = false;
|
||||
}
|
||||
}
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderUTF32BE(this);
|
||||
}
|
||||
}
|
@ -19,7 +19,7 @@ import com.ibm.icu.text.UTF16;
|
||||
/**
|
||||
* @author Niti Hantaweepant
|
||||
*/
|
||||
class CharsetUTF32LE extends CharsetICU {
|
||||
class CharsetUTF32LE extends CharsetUTF32 {
|
||||
|
||||
protected byte[] fromUSubstitution = new byte[]{(byte)0xfd, (byte)0xff, (byte)0, (byte)0};
|
||||
|
||||
@ -29,142 +29,34 @@ class CharsetUTF32LE extends CharsetICU {
|
||||
minBytesPerChar = 4;
|
||||
maxCharsPerByte = 1;
|
||||
}
|
||||
class CharsetDecoderUTF32LE extends CharsetDecoderICU{
|
||||
|
||||
class CharsetDecoderUTF32LE extends CharsetDecoderUTF32{
|
||||
|
||||
public CharsetDecoderUTF32LE(CharsetICU cs) {
|
||||
super(cs);
|
||||
}
|
||||
|
||||
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
|
||||
int sourceArrayIndex = source.position();
|
||||
int ch, i;
|
||||
|
||||
donefornow:
|
||||
{
|
||||
/* UTF-8 returns here for only non-offset, this needs to change.*/
|
||||
if (toUnicodeStatus != 0 && target.hasRemaining()) {
|
||||
i = toULength; /* restore # of bytes consumed */
|
||||
|
||||
ch = (int)(toUnicodeStatus - 1);/*Stores the previously calculated ch from a previous call*/
|
||||
toUnicodeStatus = 0;
|
||||
toULength=0;
|
||||
|
||||
while (i < 4) {
|
||||
if (sourceArrayIndex < source.limit()) {
|
||||
ch |= (source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8);
|
||||
toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++);
|
||||
}
|
||||
else {
|
||||
/* stores a partially calculated target*/
|
||||
/* + 1 to make 0 a valid character */
|
||||
toUnicodeStatus = ch + 1;
|
||||
toULength = (byte) i;
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
if (ch <= UConverterConstants.MAXIMUM_UTF && !isSurrogate(ch)) {
|
||||
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
|
||||
if (ch <= UConverterConstants.MAXIMUM_UCS2)
|
||||
{
|
||||
/* fits in 16 bits */
|
||||
target.put((char)ch);
|
||||
}
|
||||
else {
|
||||
/* write out the surrogates */
|
||||
target.put(UTF16.getLeadSurrogate(ch));
|
||||
ch = UTF16.getTrailSurrogate(ch);
|
||||
if (target.hasRemaining()) {
|
||||
target.put((char)ch);
|
||||
}
|
||||
else {
|
||||
/* Put in overflow buffer (not handled here) */
|
||||
charErrorBufferArray[0] = (char) ch;
|
||||
charErrorBufferLength = 1;
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
toULength = (byte)i;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
while (sourceArrayIndex < source.limit() && target.hasRemaining()) {
|
||||
i = 0;
|
||||
ch = 0;
|
||||
|
||||
while (i < 4) {
|
||||
if (sourceArrayIndex < source.limit()) {
|
||||
ch |= (source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8);
|
||||
toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++);
|
||||
}
|
||||
else {
|
||||
/* stores a partially calculated target*/
|
||||
/* + 1 to make 0 a valid character */
|
||||
toUnicodeStatus = ch + 1;
|
||||
toULength = (byte) i;
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
if (ch <= UConverterSharedData.MAXIMUM_UTF && !isSurrogate(ch)) {
|
||||
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
|
||||
if (ch <= UConverterSharedData.MAXIMUM_UCS2)
|
||||
{
|
||||
/* fits in 16 bits */
|
||||
target.put((char) ch);
|
||||
}
|
||||
else {
|
||||
/* write out the surrogates */
|
||||
target.put(UTF16.getLeadSurrogate(ch));
|
||||
ch = UTF16.getTrailSurrogate(ch);
|
||||
if (target.hasRemaining()) {
|
||||
target.put((char)ch);
|
||||
}
|
||||
else {
|
||||
/* Put in overflow buffer (not handled here) */
|
||||
charErrorBufferArray[0] = (char) ch;
|
||||
charErrorBufferLength = 1;
|
||||
cr = CoderResult.OVERFLOW;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
toULength = (byte)i;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break;
|
||||
}
|
||||
}
|
||||
protected CoderResult decodeLoopImpl(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
return decodeLoopUTF32LE(source, target, offsets, flush);
|
||||
}
|
||||
protected int getChar(byte[] bytes, int length){
|
||||
int i=0;
|
||||
int ch=0;
|
||||
while(i<length){
|
||||
ch |= (bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8);
|
||||
i++;
|
||||
}
|
||||
|
||||
if (sourceArrayIndex < source.limit() && !target.hasRemaining()) {
|
||||
/* End of target buffer */
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
source.position(sourceArrayIndex);
|
||||
return cr;
|
||||
}
|
||||
return ch;
|
||||
}
|
||||
}
|
||||
|
||||
class CharsetEncoderUTF32LE extends CharsetEncoderICU{
|
||||
|
||||
public CharsetEncoderUTF32LE(CharsetICU cs) {
|
||||
super(cs, fromUSubstitution);
|
||||
implReset();
|
||||
}
|
||||
|
||||
private final static int NEED_TO_WRITE_BOM = 1;
|
||||
|
||||
protected void implReset() {
|
||||
super.implReset();
|
||||
fromUnicodeStatus = NEED_TO_WRITE_BOM;
|
||||
fromUnicodeStatus = 0;
|
||||
}
|
||||
|
||||
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){
|
||||
|
@ -47,6 +47,7 @@ public class TestCharset extends TestFmwk {
|
||||
(byte) 0x00,(byte) 0x0d,
|
||||
(byte) 0x00,(byte) 0x0a };
|
||||
static final byte[] expectedByteStr ={
|
||||
(byte) 0xfe,(byte) 0xff,
|
||||
(byte) 0x00,(byte) 'a',
|
||||
(byte) 0x00,(byte) 'b',
|
||||
(byte) 0x00,(byte) 'c',
|
||||
@ -76,7 +77,7 @@ public class TestCharset extends TestFmwk {
|
||||
}
|
||||
public void TestUTF16Converter(){
|
||||
CharsetProvider icu = new CharsetProviderICU();
|
||||
Charset cs1 = icu.charsetForName("UTF-16");
|
||||
Charset cs1 = icu.charsetForName("UTF-16BE");
|
||||
CharsetEncoder e1 = cs1.newEncoder();
|
||||
CharsetDecoder d1 = cs1.newDecoder();
|
||||
|
||||
@ -168,7 +169,7 @@ public class TestCharset extends TestFmwk {
|
||||
}
|
||||
public void TestUTF32Converter(){
|
||||
CharsetProvider icu = new CharsetProviderICU();
|
||||
Charset cs1 = icu.charsetForName("UTF-32");
|
||||
Charset cs1 = icu.charsetForName("UTF-32BE");
|
||||
CharsetEncoder e1 = cs1.newEncoder();
|
||||
CharsetDecoder d1 = cs1.newDecoder();
|
||||
|
||||
@ -176,7 +177,7 @@ public class TestCharset extends TestFmwk {
|
||||
CharsetEncoder e2 = cs2.newEncoder();
|
||||
CharsetDecoder d2 = cs2.newDecoder();
|
||||
|
||||
for(int i=0x1d827; i<0x10FFFF; i+=0xFF){
|
||||
for(int i=0x000; i<0x10FFFF; i+=0xFF){
|
||||
CharBuffer us = CharBuffer.allocate(0xFF*2);
|
||||
ByteBuffer bs1 = ByteBuffer.allocate(0xFF*8);
|
||||
ByteBuffer bs2 = ByteBuffer.allocate(0xFF*8);
|
||||
@ -868,12 +869,12 @@ public class TestCharset extends TestFmwk {
|
||||
CharBuffer inBuf = CharBuffer.allocate(in.length);
|
||||
inBuf.put(in);
|
||||
CharsetEncoder encoder = cs.newEncoder();
|
||||
ByteBuffer outBuf = ByteBuffer.allocate(in.length*2);
|
||||
ByteBuffer outBuf = ByteBuffer.allocate(in.length*2+2);
|
||||
inBuf.rewind();
|
||||
encoder.encode(inBuf, outBuf, true);
|
||||
outBuf.rewind();
|
||||
if(outBuf.remaining()> in.length*2){
|
||||
errln("The UTF16 encoder appended bom. Length returned: " + outBuf.remaining());
|
||||
if(outBuf.get(0)!= (byte)0xFE && outBuf.get(1)!= (byte)0xFF){
|
||||
errln("The UTF16 encoder did not appended bom. Length returned: " + outBuf.remaining());
|
||||
}
|
||||
while(outBuf.hasRemaining()){
|
||||
logln("0x"+hex(outBuf.get()));
|
||||
@ -881,7 +882,19 @@ public class TestCharset extends TestFmwk {
|
||||
CharsetDecoder decoder = cs.newDecoder();
|
||||
outBuf.rewind();
|
||||
CharBuffer rt = CharBuffer.allocate(in.length);
|
||||
decoder.decode(outBuf, rt, true);
|
||||
CoderResult cr = decoder.decode(outBuf, rt, true);
|
||||
if(cr.isError()){
|
||||
errln("Decoding with BOM failed. Error: "+ cr.toString());
|
||||
}
|
||||
equals(rt, in);
|
||||
{
|
||||
rt.clear();
|
||||
outBuf.rewind();
|
||||
Charset utf16 = Charset.forName("UTF-16");
|
||||
CharsetDecoder dc = utf16.newDecoder();
|
||||
cr = dc.decode(outBuf, rt, true);
|
||||
equals(rt, in);
|
||||
}
|
||||
}
|
||||
|
||||
private void smBufDecode(CharsetDecoder decoder, String encoding, ByteBuffer source, CharBuffer target) {
|
||||
@ -1531,4 +1544,49 @@ public class TestCharset extends TestFmwk {
|
||||
}
|
||||
return null;
|
||||
}
|
||||
public void TestUTF32BOM(){
|
||||
|
||||
Charset cs = (new CharsetProviderICU()).charsetForName("UTF-32");
|
||||
char[] in = new char[] { 0xd800, 0xdc00,
|
||||
0xd801, 0xdc01,
|
||||
0xdbff, 0xdfff,
|
||||
0xd900, 0xdd00,
|
||||
0x0000, 0x0041,
|
||||
0x0000, 0x0042,
|
||||
0x0000, 0x0043};
|
||||
|
||||
CharBuffer inBuf = CharBuffer.allocate(in.length);
|
||||
inBuf.put(in);
|
||||
CharsetEncoder encoder = cs.newEncoder();
|
||||
ByteBuffer outBuf = ByteBuffer.allocate(in.length*4+4);
|
||||
inBuf.rewind();
|
||||
encoder.encode(inBuf, outBuf, true);
|
||||
outBuf.rewind();
|
||||
if(outBuf.get(0)!= (byte)0x00 && outBuf.get(1)!= (byte)0x00 &&
|
||||
outBuf.get(2)!= (byte)0xFF && outBuf.get(3)!= (byte)0xFE){
|
||||
errln("The UTF16 encoder did not appended bom. Length returned: " + outBuf.remaining());
|
||||
}
|
||||
while(outBuf.hasRemaining()){
|
||||
logln("0x"+hex(outBuf.get()));
|
||||
}
|
||||
CharsetDecoder decoder = cs.newDecoder();
|
||||
outBuf.limit(outBuf.position());
|
||||
outBuf.rewind();
|
||||
CharBuffer rt = CharBuffer.allocate(in.length);
|
||||
CoderResult cr = decoder.decode(outBuf, rt, true);
|
||||
if(cr.isError()){
|
||||
errln("Decoding with BOM failed. Error: "+ cr.toString());
|
||||
}
|
||||
equals(rt, in);
|
||||
try{
|
||||
rt.clear();
|
||||
outBuf.rewind();
|
||||
Charset utf16 = Charset.forName("UTF-32");
|
||||
CharsetDecoder dc = utf16.newDecoder();
|
||||
cr = dc.decode(outBuf, rt, true);
|
||||
equals(rt, in);
|
||||
}catch(UnsupportedCharsetException ex){
|
||||
// swallow the expection.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user