ICU-5444 Added some fixes to the decode and encode loops.

X-SVN-Rev: 21613
This commit is contained in:
Michael Ow 2007-06-01 03:52:20 +00:00
parent 81cb18307a
commit 9c9687d072

View File

@ -21,96 +21,96 @@ import com.ibm.icu.text.UTF16;
*
*/
class CharsetISCII extends CharsetICU {
private final char UCNV_OPTIONS_VERSION_MASK = 0X0f;
private final char NUKTA = 0x093c;
private final char HALANT = 0x094d;
private final char ZWNJ = 0x200c; /* Zero Width Non Joiner */
private final char ZWJ = 0x200d; /* Zero Width Joiner */
private final char INVALID_CHAR = 0xffff;
private final char ATR = 0xef; /* Attribute code */
private final char EXT = 0xff; /* Extension code */
private final char DANDA = 0x0964;
private final char DOUBLE_DANDA = 0x0965;
private final char ISCII_NUKTA = 0xe9;
private final char ISCII_HALANT = 0xe8;
private final char ISCII_DANDA = 0xea;
private final char ISCII_INV = 0xd9;
private final char INDIC_BLOCK_BEGIN = 0x0900;
private final char INDIC_BLOCK_END = 0x0d7f;
private final char INDIC_RANGE = (INDIC_BLOCK_END - INDIC_BLOCK_BEGIN);
private final char VOCALLIC_RR = 0x0931;
private final char LF = 0x0a;
private final char ASCII_END = 0xa0;
private final char NO_CHAR_MARKER = 0xfffe;
private final char TELUGU_DELTA = (char)(UniLang.DELTA * UniLang.TELUGU);
private final char DEV_ABBR_SIGN = 0x0970;
private final char DEV_ANUDATTA = 0x0952;
private final char EXT_RANGE_BEGIN = 0xa1;
private final char EXT_RANGE_END = 0xee;
private final short UCNV_OPTIONS_VERSION_MASK = 0X0f;
private final short NUKTA = 0x093c;
private final short HALANT = 0x094d;
private final short ZWNJ = 0x200c; /* Zero Width Non Joiner */
private final short ZWJ = 0x200d; /* Zero Width Joiner */
private final int INVALID_CHAR = 0xffff;
private final short ATR = 0xef; /* Attribute code */
private final short EXT = 0xf0; /* Extension code */
private final short DANDA = 0x0964;
private final short DOUBLE_DANDA = 0x0965;
private final short ISCII_NUKTA = 0xe9;
private final short ISCII_HALANT = 0xe8;
private final short ISCII_DANDA = 0xea;
private final short ISCII_INV = 0xd9;
private final short INDIC_BLOCK_BEGIN = 0x0900;
private final short INDIC_BLOCK_END = 0x0d7f;
private final short INDIC_RANGE = (INDIC_BLOCK_END - INDIC_BLOCK_BEGIN);
private final short VOCALLIC_RR = 0x0931;
private final short LF = 0x0a;
private final short ASCII_END = 0xa0;
private final int NO_CHAR_MARKER = 0xfffe;
private final short TELUGU_DELTA = (UniLang.DELTA * UniLang.TELUGU);
private final short DEV_ABBR_SIGN = 0x0970;
private final short DEV_ANUDATTA = 0x0952;
private final short EXT_RANGE_BEGIN = 0xa1;
private final short EXT_RANGE_END = 0xee;
private static final class UniLang {
static final int DEVALANGARI = 0;
static final int BENGALI = DEVALANGARI + 1;
static final int GURMUKHI = BENGALI + 1;
static final int GUJARATI = GURMUKHI + 1;
static final int ORIYA = GUJARATI + 1;
static final int TAMIL = ORIYA + 1;
static final int TELUGU = TAMIL + 1;
static final int KANNADA = TELUGU + 1;
static final int MALAYALAM = KANNADA + 1;
static final int DELTA = 0x80;
static final short DEVALANGARI = 0;
static final short BENGALI = DEVALANGARI + 1;
static final short GURMUKHI = BENGALI + 1;
static final short GUJARATI = GURMUKHI + 1;
static final short ORIYA = GUJARATI + 1;
static final short TAMIL = ORIYA + 1;
static final short TELUGU = TAMIL + 1;
static final short KANNADA = TELUGU + 1;
static final short MALAYALAM = KANNADA + 1;
static final short DELTA = 0x80;
}
private static final class ISCIILang {
static final int DEF = 0x40;
static final int RMN = 0x41;
static final int DEV = 0x42;
static final int BNG = 0x43;
static final int TML = 0x44;
static final int TLG = 0x45;
static final int ASM = 0x46;
static final int ORI = 0x47;
static final int KND = 0x48;
static final int MLM = 0x49;
static final int GJR = 0x4a;
static final int PNJ = 0x4b;
static final int ARB = 0x71;
static final int PES = 0x72;
static final int URD = 0x73;
static final int SND = 0x74;
static final int KSM = 0x75;
static final int PST = 0x76;
static final short DEF = 0x40;
static final short RMN = 0x41;
static final short DEV = 0x42;
static final short BNG = 0x43;
static final short TML = 0x44;
static final short TLG = 0x45;
static final short ASM = 0x46;
static final short ORI = 0x47;
static final short KND = 0x48;
static final short MLM = 0x49;
static final short GJR = 0x4a;
static final short PNJ = 0x4b;
static final short ARB = 0x71;
static final short PES = 0x72;
static final short URD = 0x73;
static final short SND = 0x74;
static final short KSM = 0x75;
static final short PST = 0x76;
}
private static final class MaskEnum {
static final int DEV_MASK = 0x80;
static final int PNJ_MASK = 0x40;
static final int GJR_MASK = 0x20;
static final int ORI_MASK = 0x10;
static final int BNG_MASK = 0x08;
static final int KND_MASK = 0x04;
static final int MLM_MASK = 0x02;
static final int TML_MASK = 0x01;
static final int ZERO = 0x00;
static final short DEV_MASK = 0x80;
static final short PNJ_MASK = 0x40;
static final short GJR_MASK = 0x20;
static final short ORI_MASK = 0x10;
static final short BNG_MASK = 0x08;
static final short KND_MASK = 0x04;
static final short MLM_MASK = 0x02;
static final short TML_MASK = 0x01;
static final short ZERO = 0x00;
}
private final String ISCII_CNV_PREFIX = "ISCII,version=";
private final class UConverterDataISCII {
char contextCharToUnicode; /* previous Unicode codepoint for contextual analysis */
char contextCharFromUnicode; /* previous Unicode codepoint for contextual analysis */
char defDeltaToUnicode; /* delta for switching to default state when DEF is encountered */
char currentDeltaFromUnicode; /* current delta in Indic block */
char currentDeltaToUnicode; /* current delta in Indic block */
int currentMaskFromUnicode; /* mask for current state in fromUnicode */
int currentMaskToUnicode; /* mask for current state in toUnicode */
int defMaskToUnicode; /* mask for default state in toUnicode */
int contextCharToUnicode; /* previous Unicode codepoint for contextual analysis */
int contextCharFromUnicode; /* previous Unicode codepoint for contextual analysis */
short defDeltaToUnicode; /* delta for switching to default state when DEF is encountered */
short currentDeltaFromUnicode; /* current delta in Indic block */
short currentDeltaToUnicode; /* current delta in Indic block */
short currentMaskFromUnicode; /* mask for current state in fromUnicode */
short currentMaskToUnicode; /* mask for current state in toUnicode */
short defMaskToUnicode; /* mask for default state in toUnicode */
boolean isFirstBuffer; /* boolean for fromUnicode to see if we need to announce the first script */
boolean resetToDefaultToUnicode; /* boolean for reseting to default delta and mask when a newline is encountered */
String name;
UConverterDataISCII(char contextCharToUnicode, char contextCharFromUnicode, char defDeltaToUnicode, char currentDeltaFromUnicode,
char currentDeltaToUnicode, int currentMaskFromUnicode, int currentMaskToUnicode, int defMaskToUnicode,
UConverterDataISCII(int contextCharToUnicode, int contextCharFromUnicode, short defDeltaToUnicode, short currentDeltaFromUnicode,
short currentDeltaToUnicode, short currentMaskFromUnicode, short currentMaskToUnicode, short defMaskToUnicode,
boolean isFirstBuffer, boolean resetToDefaultToUnicode, String name) {
this.contextCharToUnicode = contextCharToUnicode;
this.contextCharFromUnicode = contextCharFromUnicode;
@ -127,11 +127,11 @@ class CharsetISCII extends CharsetICU {
}
private static final class LookupDataStruct {
int uniLang;
int maskEnum;
int isciiLang;
short uniLang;
short maskEnum;
short isciiLang;
LookupDataStruct(int uniLang, int maskEnum, int isciiLang) {
LookupDataStruct(short uniLang, short maskEnum, short isciiLang) {
this.uniLang = uniLang;
this.maskEnum = maskEnum;
this.isciiLang = isciiLang;
@ -166,7 +166,7 @@ class CharsetISCII extends CharsetICU {
* Telugu and Kannada have the same codepoints except for Vocallic_RR, which we special-case,
* so we combine them and use 1 bit to represent both languages
*/
private static final char validityTable[] = {
private static final short validityTable[] = {
/* This state table is tool generated so please do not edit unless you know exactly what you are doing */
/* Note: This table was edited to mirror the Windows XP implementation */
/* ISCII: Valid: Unicode */
@ -698,7 +698,7 @@ class CharsetISCII extends CharsetICU {
{ 0xDC, 0x0963 }
};
private static final int lookupTable[][] = {
private static final short lookupTable[][] = {
{ MaskEnum.ZERO, MaskEnum.ZERO }, /* DEFAULT */
{ MaskEnum.ZERO, MaskEnum.ZERO }, /* ROMAN */
{ UniLang.DEVALANGARI, MaskEnum.DEV_MASK },
@ -714,24 +714,28 @@ class CharsetISCII extends CharsetICU {
};
private UConverterDataISCII extraInfo = null;
protected byte[] fromUSubstitution = new byte[]{0x2b, 0x2f, 0x76}; //TODO: change this to the appropriate value
protected byte[] fromUSubstitution = new byte[]{(byte)0x1A};
public CharsetISCII(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
super(icuCanonicalName, javaCanonicalName, aliases);
//TODO: change these three to the appropriate value
maxBytesPerChar = 3;
maxBytesPerChar = 4;
minBytesPerChar = 1;
maxCharsPerByte = 1;
int test = lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].uniLang;
int test2 = UniLang.DELTA;
char temp = (char)(test * test2);
int test3 = temp + 3;
extraInfo = new UConverterDataISCII(
NO_CHAR_MARKER, /* contextCharToUnicode */
(char)0x0000, /* contextCharFromUnicode */
(char)(lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].uniLang * UniLang.DELTA), /* defDeltaToUnicode */
(char)(lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].uniLang * UniLang.DELTA), /* currentDeltaFromUnicode */
(char)(lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].uniLang * UniLang.DELTA), /* currentDeltaToUnicode */
lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].maskEnum, /* currentMaskToUnicode */
lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].maskEnum, /* currentMaskFromUnicode */
lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].maskEnum, /* defMaskToUnicode */
0x0000, /* contextCharFromUnicode */
(short)(lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].uniLang * UniLang.DELTA), /* defDeltaToUnicode */
(short)(lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].uniLang * UniLang.DELTA), /* currentDeltaFromUnicode */
(short)(lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].uniLang * UniLang.DELTA), /* currentDeltaToUnicode */
(short)lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].maskEnum, /* currentMaskToUnicode */
(short)lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].maskEnum, /* currentMaskFromUnicode */
(short)lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].maskEnum, /* defMaskToUnicode */
true, /* isFirstBuffer */
false, /* resetToDefaultToUnicode */
ISCII_CNV_PREFIX /* name */
@ -765,12 +769,13 @@ class CharsetISCII extends CharsetICU {
protected void implReset() {
super.implReset();
this.toUnicodeStatus = 0xFFFF;
}
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
CoderResult cr = CoderResult.UNDERFLOW;
int targetUniChar = 0x0000;
byte sourceChar = 0x0000;
short sourceChar = 0x0000;
UConverterDataISCII data;
boolean gotoCallBack = false;
@ -782,7 +787,7 @@ class CharsetISCII extends CharsetICU {
targetUniChar = UConverterConstants.missingCharMarker;
if (target.hasRemaining()) {
sourceChar = source.get();
sourceChar = (short)((short)source.get() & UConverterConstants.UNSIGNED_BYTE_MASK);
/* look at the post-context and perform special processing */
if (data.contextCharToUnicode == ATR) {
@ -791,8 +796,8 @@ class CharsetISCII extends CharsetICU {
*/
/* check if the sourceChar is in the supported script range */
if (((short)(ISCIILang.PNJ - sourceChar) & UConverterConstants.UNSIGNED_BYTE_MASK) <= (ISCIILang.PNJ - ISCIILang.DEV)) {
data.currentDeltaToUnicode = (char)(lookupTable[sourceChar & 0x0F][0] * UniLang.DELTA);
data.currentMaskToUnicode = (int)lookupTable[sourceChar & 0x0F][1];
data.currentDeltaToUnicode = (short)(lookupTable[sourceChar & 0x0F][0] * UniLang.DELTA);
data.currentMaskToUnicode = lookupTable[sourceChar & 0x0F][1];
} else if (sourceChar == ISCIILang.DEF) {
/* switch back to default */
data.currentDeltaToUnicode = data.defDeltaToUnicode;
@ -822,7 +827,7 @@ class CharsetISCII extends CharsetICU {
data.contextCharToUnicode = NO_CHAR_MARKER;
/* write to target */
WriteToTargetToU(offsets, source, target, targetUniChar, data.currentDeltaToUnicode);
WriteToTargetToU(offsets, (source.position() - 2), source, target, targetUniChar, data.currentDeltaToUnicode);
continue;
}
@ -844,22 +849,21 @@ class CharsetISCII extends CharsetICU {
}
/* write to target */
//TODO: change the delta entry
WriteToTargetToU(offsets, source, target, targetUniChar, data.currentDeltaToUnicode);
WriteToTargetToU(offsets, (source.position() - 2), source, target, targetUniChar, data.currentDeltaToUnicode);
/* reset */
data.contextCharToUnicode = NO_CHAR_MARKER;
}
/* look at the pre-context and perform special processing */
if (!gotoCallBack) {
switch ((char)sourceChar) {
switch (sourceChar) {
case ISCII_INV:
case EXT: /* falls through */
case ATR:
data.contextCharToUnicode = (char)sourceChar;
if (this.toUnicodeStatus != UConverterConstants.missingCharMarker) {
//TODO: add write to target and add offset and offsets entry to all
WriteToTargetToU(offsets, (source.position() - 2), source, target, this.toUnicodeStatus, data.currentDeltaToUnicode);
this.toUnicodeStatus = UConverterConstants.missingCharMarker;
}
continue;
@ -875,6 +879,17 @@ class CharsetISCII extends CharsetICU {
data.contextCharToUnicode = (char)sourceChar;
}
break;
case ISCII_HALANT:
/* handle explicit halant */
if (data.contextCharToUnicode == ISCII_HALANT) {
targetUniChar = ZWNJ;
/* clear context */
data.contextCharToUnicode = NO_CHAR_MARKER;
} else {
targetUniChar = GetMapping(sourceChar, targetUniChar, data);
data.contextCharToUnicode = (char)sourceChar;
}
break;
case 0x0A:
/* fall through */
case 0x0D:
@ -920,7 +935,7 @@ class CharsetISCII extends CharsetICU {
if (this.toUnicodeStatus != UConverterConstants.missingCharMarker && !gotoCallBack) {
/* write the previously mapped codepoint */
//TODO: add WriteToTargetToU call with the correct values
WriteToTargetToU(offsets, (source.position() - 2), source, target, this.toUnicodeStatus, data.currentDeltaToUnicode);
this.toUnicodeStatus = UConverterConstants.missingCharMarker;
}
@ -967,14 +982,14 @@ class CharsetISCII extends CharsetICU {
if (toUnicodeStatus != UConverterConstants.missingCharMarker) {
/* output a remaining target character */
WriteToTargetToU(offsets, source, target, source.get(), data.currentDeltaToUnicode);
WriteToTargetToU(offsets, (source.position() - 2), source, target, this.toUnicodeStatus, data.currentDeltaToUnicode);
this.toUnicodeStatus = UConverterConstants.missingCharMarker;
}
}
return cr;
}
private CoderResult WriteToTargetToU(IntBuffer offsets, ByteBuffer source, CharBuffer target, int targetUniChar, char delta) {
private CoderResult WriteToTargetToU(IntBuffer offsets, int offset, ByteBuffer source, CharBuffer target, int targetUniChar, short delta) {
CoderResult cr = CoderResult.UNDERFLOW;
/* add offset to current Indic Block */
if (targetUniChar > ASCII_END &&
@ -988,7 +1003,7 @@ class CharsetISCII extends CharsetICU {
if (target.hasRemaining()) {
target.put((char)targetUniChar);
if (offsets != null) {
//TODO: add offsets code
offsets.put(offset);
}
} else {
charErrorBufferArray[charErrorBufferLength++] = (char)targetUniChar;
@ -997,7 +1012,7 @@ class CharsetISCII extends CharsetICU {
return cr;
}
private int GetMapping(byte sourceChar, int targetUniChar, UConverterDataISCII data) {
private int GetMapping(short sourceChar, int targetUniChar, UConverterDataISCII data) {
targetUniChar = toUnicodeTable[sourceChar];
/* is the code point valid in current script? */
if (sourceChar > ASCII_END &&
@ -1035,15 +1050,15 @@ class CharsetISCII extends CharsetICU {
int targetByteUnit = 0x0000;
int sourceChar = 0x0000;
UConverterDataISCII converterData;
char newDelta = 0;
char range = 0;
short newDelta = 0;
short range = 0;
boolean deltaChanged = false;
boolean gotoGetTrail = false;
/* initialize data */
converterData = extraInfo;
newDelta = converterData.currentDeltaFromUnicode;
range = (char)(newDelta / UniLang.DELTA);
range = (short)(newDelta / UniLang.DELTA);
if ((sourceChar = fromUChar32) != 0) {
gotoGetTrail = true;
@ -1057,8 +1072,7 @@ class CharsetISCII extends CharsetICU {
/* check if input is in ASCII and C0 control codes range */
if (sourceChar <= ASCII_END) {
//TODO: add correct parameters
cr = WriteToTargetFromU(offsets, source, target, targetByteUnit);
cr = WriteToTargetFromU(offsets, source, target, sourceChar);
if (cr.isOverflow()) {
break;
}
@ -1067,7 +1081,6 @@ class CharsetISCII extends CharsetICU {
targetByteUnit += (byte)lookupInitialData[range].isciiLang;
fromUnicodeStatus = sourceChar;
/* now append ATR and language code */
//TODO: add correct parameters
cr = WriteToTargetFromU(offsets, source, target, targetByteUnit);
if (cr.isOverflow()) {
break;
@ -1105,8 +1118,8 @@ class CharsetISCII extends CharsetICU {
*/
if (sourceChar != DANDA && sourceChar != DOUBLE_DANDA) {
/* find out to which block the sourceChar belongs */
range = (char)((sourceChar - INDIC_BLOCK_BEGIN) / UniLang.DELTA);
newDelta = (char)(range * UniLang.DELTA);
range = (short)((sourceChar - INDIC_BLOCK_BEGIN) / UniLang.DELTA);
newDelta = (short)(range * UniLang.DELTA);
/* Now are we in the same block as previous? */
if (newDelta != converterData.currentDeltaFromUnicode || converterData.isFirstBuffer) {
@ -1140,8 +1153,7 @@ class CharsetISCII extends CharsetICU {
/* reset */
deltaChanged = false;
/* now append ATR and language code */
//TODO: put in arguments
cr = WriteToTargetFromU(offsets, source, target, targetByteUnit);
cr = WriteToTargetFromU(offsets, source, target, temp);
if (cr.isOverflow()) {
break;
}
@ -1158,7 +1170,6 @@ class CharsetISCII extends CharsetICU {
converterData.contextCharFromUnicode = (char)targetByteUnit;
}
/*write targetByteUnit to target */
//TODO: add correct parameters
cr = WriteToTargetFromU(offsets, source, target, targetByteUnit);
if (cr.isOverflow()) {
break;
@ -1217,17 +1228,17 @@ class CharsetISCII extends CharsetICU {
if (targetByteUnit <= 0xFF) {
target.put((byte)targetByteUnit);
if (offsets != null) {
//TODO: add offsets code
offsets.put((source.position() - 1));
}
} else {
target.put((byte)(targetByteUnit >> 8));
if (offsets != null) {
//TODO: add offsets code
offsets.put((source.position() - 1));
}
if (target.hasRemaining()) {
target.put((byte)targetByteUnit);
if (offsets != null) {
//TODO: add offsets code
offsets.put((source.position() - 1));
}
} else {
errorBuffer[errorBufferLength++] = (byte)targetByteUnit;