ICU-3840 Update Charset ISCII to support new characters in Unicode.

X-SVN-Rev: 21869
This commit is contained in:
Michael Ow 2007-06-30 00:18:01 +00:00
parent 3fc4373c3e
commit 9ef6426440
2 changed files with 125 additions and 4 deletions

View File

@ -34,6 +34,7 @@ class CharsetISCII extends CharsetICU {
// Named ISCII byte values.
// NOTE(review): how NUKTA, HALANT, DANDA and INV are consumed is not visible in this chunk — confirm against the converter code.
private final short ISCII_NUKTA = 0xe9;
private final short ISCII_HALANT = 0xe8;
private final short ISCII_DANDA = 0xea;
// Used as a case label in the to-Unicode switch: when this byte follows a byte listed in
// vowelSignESpecialCases (0xA4), the pair is mapped to a single code point (U+0904).
private final short ISCII_VOWEL_SIGN_E = 0xe0;
private final short ISCII_INV = 0xd9;
// NOTE(review): presumably the bounds of the Unicode range this converter handles — confirm where these are read.
private final short INDIC_BLOCK_BEGIN = 0x0900;
private final short INDIC_BLOCK_END = 0x0d7f;
@ -174,7 +175,7 @@ class CharsetISCII extends CharsetICU {
/* 0xa1: 0xb8: 0x901 */ MaskEnum.DEV_MASK + MaskEnum.ZERO + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO,
/* 0xa2: 0xfe: 0x902 */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
/* 0xa3: 0xbf: 0x903 */ MaskEnum.DEV_MASK + MaskEnum.ZERO + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
/* 0x00: 0x00: 0x904 */ MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO,
/* 0x00: 0x00: 0x904 */ MaskEnum.DEV_MASK + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO,
/* 0xa4: 0xff: 0x905 */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
/* 0xa5: 0xff: 0x906 */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
/* 0xa6: 0xff: 0x907 */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
@ -296,7 +297,7 @@ class CharsetISCII extends CharsetICU {
0x00a1, /* 0x0901 */
0x00a2, /* 0x0902 */
0x00a3, /* 0x0903 */
0xFFFF, /* 0x0904 */
0xa4e0, /* 0x0904 */
0x00a4, /* 0x0905 */
0x00a5, /* 0x0906 */
0x00a6, /* 0x0907 */
@ -697,6 +698,10 @@ class CharsetISCII extends CharsetICU {
{ 0xDB, 0x0962 },
{ 0xDC, 0x0963 }
};
/*
 * Special two-byte ISCII sequences of the form <CHAR> + ISCII_VOWEL_SIGN_E that map to a
 * single Unicode code point.
 *
 * Row 0 is a header: its first element is the total number of rows (header included) and
 * is used as the exclusive upper bound when the table is scanned starting at row 1.
 * Every following row is { preceding ISCII byte, resulting Unicode code point }.
 * When adding a row, the count in row 0 must be updated by hand.
 */
private static final char vowelSignESpecialCases[][] = {
{ 2 /* length of array */ , 0 },
{ 0xA4, 0x0904 } /* 0xA4 followed by vowel sign E (0xE0) -> U+0904 */
};
private static final short lookupTable[][] = {
{ MaskEnum.ZERO, MaskEnum.ZERO }, /* DEFAULT */
@ -925,6 +930,25 @@ class CharsetISCII extends CharsetICU {
}
/* else fall through to default */
}
case ISCII_VOWEL_SIGN_E:
/* find <CHAR> + SIGN_VOWEL_E special mapping */
int i = 1;
boolean found = false;
for (; i < vowelSignESpecialCases[0][0]; i++) {
if (vowelSignESpecialCases[i][0] == ((short)data.contextCharToUnicode & UConverterConstants.UNSIGNED_BYTE_MASK)) {
targetUniChar = vowelSignESpecialCases[i][1];
found = true;
break;
}
}
if (found) {
/* find out if the mapping is valid in this state */
if ((validityTable[(byte)targetUniChar] & data.currentMaskFromUnicode) > 0) {
data.contextCharToUnicode = NO_CHAR_MARKER;
this.toUnicodeStatus = UConverterConstants.missingCharMarker;
break;
}
}
default:
targetUniChar = GetMapping(sourceChar, targetUniChar, data);
data.contextCharToUnicode = (char)sourceChar;
@ -979,7 +1003,7 @@ class CharsetISCII extends CharsetICU {
toULength = 0;
}
if (toUnicodeStatus != UConverterConstants.missingCharMarker) {
if (this.toUnicodeStatus != UConverterConstants.missingCharMarker) {
/* output a remaining target character */
WriteToTargetToU(offsets, (source.position() - 2), source, target, this.toUnicodeStatus, data.currentDeltaToUnicode);
this.toUnicodeStatus = UConverterConstants.missingCharMarker;

View File

@ -2286,6 +2286,34 @@ public class TestCharset extends TestFmwk {
errln("ISCII round trip test failed.");
}
//Test new characters in the ISCII charset
encoder = cs.newEncoder();
decoder = cs.newDecoder();
char u_pts[] = {
(char)0x0904
};
byte b_pts[] = {
/*(byte)0xef, (byte)0x42, */(byte)0xa4, (byte)0xe0
};
us = CharBuffer.allocate(u_pts.length);
bs = ByteBuffer.allocate(b_pts.length);
us.put(u_pts);
bs.put(b_pts);
bs.limit(bs.position());
bs.position(0);
us.limit(us.position());
us.position(0);
try {
smBufDecode(decoder, "ISCII-update", bs, us, true, true);
bs.position(0);
us.position(0);
smBufEncode(encoder, "ISCII-update", us, bs, true, true);
} catch (Exception ex) {
errln("Error occurred while encoding/decoding ISCII with the new characters.");
}
//The rest of the code in this method is to provide better code coverage
CharBuffer ccus = CharBuffer.allocate(0x10);
ByteBuffer ccbs = ByteBuffer.allocate(0x10);
@ -2836,6 +2864,75 @@ public class TestCharset extends TestFmwk {
errln("Exception while encoding UTF32LE (6) should have been thrown.");
} catch (Exception ex) {
}
}
//Test for charset UTF16LE to provide better code coverage
/**
 * Feeds the ICU UTF-16LE encoder three inputs containing unpaired surrogates and reports
 * an error through {@code errln} whenever the encoder does not answer with a malformed
 * result. The buffers are re-windowed with limit/position between calls so that each
 * call sees only the slice of interest.
 */
public void TestCharsetUTF16LE() {
    // Source characters: index 0 is a lone lead surrogate, 1-3 are trail surrogates,
    // 4-5 are two lead surrogates in a row.
    final char[] surrogateChars = {
        (char)0xD805,
        (char)0xDC01, (char)0xDC02, (char)0xDC03,
        (char)0xD901, (char)0xD902
    };
    // Seven zero bytes; they only serve to size the output buffer.
    final byte[] zeroBytes = {
        (byte)0x00,
        (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00
    };

    CharsetProvider provider = new CharsetProviderICU();
    Charset cs = provider.charsetForName("UTF-16LE");
    CharsetEncoder encoder = cs.newEncoder();
    // The decoder is created but never driven by this method.
    CharsetDecoder decoder = cs.newDecoder();

    CharBuffer us = CharBuffer.allocate(surrogateChars.length);
    us.put(surrogateChars);
    ByteBuffer bs = ByteBuffer.allocate(zeroBytes.length);
    bs.put(zeroBytes);

    // Case 1: a single lead surrogate at end of input.
    // The limit is always set before the position so the position never exceeds the limit.
    us.limit(1).position(0);
    bs.limit(1).position(0);
    CoderResult result = encoder.encode(us, bs, true);
    if (!result.isMalformed()) {
        errln("Error while encoding UTF-16LE (1) should have occured.");
    }

    // Case 2: the encoder is deliberately NOT reset, so whatever it kept from case 1 is
    // still pending when the trail surrogates at indices 1-3 arrive.
    us.limit(4).position(1);
    bs.limit(7).position(1);
    result = encoder.encode(us, bs, true);
    if (!result.isMalformed()) {
        errln("Error while encoding UTF-16LE (2) should have occured.");
    }

    // Case 3: start clean, replay the lone lead surrogate (outcome intentionally not
    // checked), then supply two more lead surrogates where a trail would be required.
    encoder.reset();
    us.limit(1).position(0);
    bs.limit(1).position(0);
    result = encoder.encode(us, bs, true);

    us.limit(6).position(4);
    bs.limit(4).position(1);
    result = encoder.encode(us, bs, true);
    if (!result.isMalformed()) {
        errln("Error while encoding UTF-16LE (3) should have occured.");
    }
}
}