ICU-6143 getUnicodeSet API merged from icu4j/branches/krajwade/GetUnicodeSet-Test [23632:23722]

X-SVN-Rev: 23723
This commit is contained in:
Kedar Rajwade 2008-04-01 19:49:22 +00:00
parent b878ede950
commit dc4b395371
14 changed files with 671 additions and 12 deletions

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 2006-2007, International Business Machines Corporation and *
* Copyright (C) 2006-2008, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -15,6 +15,8 @@ import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.text.UnicodeSet;
class Charset88591 extends CharsetASCII {
public Charset88591(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
super(icuCanonicalName, javaCanonicalName, aliases);
@ -107,5 +109,8 @@ class Charset88591 extends CharsetASCII {
public CharsetEncoder newEncoder() {
return new CharsetEncoder88591(this);
}
/**
 * Adds the code points handled by this converter to the given set:
 * the contiguous range U+0000..U+00FF.
 *
 * @param setFillIn set that receives the range; it is not cleared here
 * @param which     set selector; not consulted by this implementation
 */
void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
    final int firstCodePoint = 0;
    final int lastCodePoint = 0xff;
    setFillIn.add(firstCodePoint, lastCodePoint);
}
}

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 2006-2007, International Business Machines Corporation and *
* Copyright (C) 2006-2008, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
@ -18,6 +18,7 @@ import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
class CharsetASCII extends CharsetICU {
protected byte[] fromUSubstitution = new byte[] { (byte) 0x1a };
@ -351,4 +352,8 @@ class CharsetASCII extends CharsetICU {
public CharsetEncoder newEncoder() {
return new CharsetEncoderASCII(this);
}
/**
 * Adds the code points handled by this converter to the given set:
 * the contiguous range U+0000..U+007F.
 *
 * @param setFillIn set that receives the range; it is not cleared here
 * @param which     set selector; not consulted by this implementation
 */
void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
    final int firstCodePoint = 0;
    final int lastCodePoint = 0x7f;
    setFillIn.add(firstCodePoint, lastCodePoint);
}
}

View File

@ -6,6 +6,8 @@
*/
package com.ibm.icu.charset;
import com.ibm.icu.text.UnicodeSet;
/**
* The purpose of this class is to set isCESU8 to true in the super class, and to allow the Charset framework to open
* the variant UTF-8 converter without extra setup work. CESU-8 encodes/decodes supplementary characters as 6 bytes
@ -15,4 +17,10 @@ class CharsetCESU8 extends CharsetUTF8 {
public CharsetCESU8(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
super(icuCanonicalName, javaCanonicalName, aliases);
}
/**
 * Fills the set by delegating to getCompleteUnicodeSet(), i.e. without
 * excluding any code points.
 *
 * @param setFillIn set that receives the code points; it is not cleared here
 * @param which     set selector; not consulted by this implementation
 */
void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
    getCompleteUnicodeSet(setFillIn);
}
}

View File

@ -14,6 +14,7 @@ import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
public class CharsetHZ extends CharsetICU {
@ -342,4 +343,10 @@ public class CharsetHZ extends CharsetICU {
public CharsetEncoder newEncoder() {
return new CharsetEncoderHZ(this);
}
/**
 * Adds the code points handled by the HZ converter: U+0000..U+007F plus
 * whatever the GB sub-converter's table contributes under the HZ filter.
 *
 * @param setFillIn set that receives the code points; it is not cleared here
 * @param which     set selector, passed through to the GB table enumeration
 */
void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
    /* single-byte part */
    final int lastSingleByte = 0x7f;
    setFillIn.add(0, lastSingleByte);

    /* double-byte part: enumerate the GB table, restricted by the HZ filter */
    gbCharset.MBCSGetFilteredUnicodeSetForUnicode(
            gbCharset.sharedData,
            setFillIn,
            which,
            CharsetMBCS.UCNV_SET_FILTER_HZ);
}
}

View File

@ -17,6 +17,8 @@ import java.lang.reflect.InvocationTargetException;
import java.nio.charset.*;
import java.util.HashMap;
import com.ibm.icu.text.UnicodeSet;
/**
* <p>A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
* This API is used to convert codepage or character encoded data to and
@ -57,6 +59,13 @@ public abstract class CharsetICU extends Charset{
byte subChar1; /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */
//byte reserved[/*19*/]; /* +81: 19 to round out the structure */
/**
* Parameter that selects the set of roundtrippable Unicode code points.
* This is the only selector value accepted by getUnicodeSet().
* @draft ICU 4.0
*/
public static final int ROUNDTRIP_SET=1; //UCNV_ROUNDTRIP_SET,
/**
* Parameter that selects the set of Unicode code points that are either
* roundtrippable or reachable through fallback mappings.
* Used internally by the getUnicodeSetImpl() implementations;
* getUnicodeSet() currently rejects it with an IllegalArgumentException.
*/
public static final int ROUNDTRIP_AND_FALLBACK_SET =2;
/**
*
@ -323,6 +332,53 @@ public abstract class CharsetICU extends Charset{
// /* no known Unicode signature byte sequence recognized */
// return null;
// }
/**
* Charset-specific worker behind getUnicodeSet(): adds to setFillIn the
* Unicode code points this charset can convert. getUnicodeSet() validates
* its arguments and clears the set before calling this method, so
* implementations only add (or remove) entries.
*
* @param setFillIn the set to add to
* @param which the set selector (ROUNDTRIP_SET or ROUNDTRIP_AND_FALLBACK_SET)
*/
abstract void getUnicodeSetImpl(UnicodeSet setFillIn, int which);
/**
 * <p>Returns the set of Unicode code points that can be converted by an ICU Converter.
 *
 * <p>Which kind of set is returned is chosen with the <code>which</code> selector.
 *
 * <p>ROUNDTRIP_SET
 *
 * <p>The set of all Unicode code points that can be roundtrip-converted
 * (converted without any data loss) with the converter.
 * This set will not include code points that have fallback mappings
 * or are only the result of reverse fallback mappings.
 *
 * <p>This is useful for example for
 * <ul>
 * <li>checking that a string or document can be roundtrip-converted with a converter,
 *   without/before actually performing the conversion</li>
 * <li>testing if a converter can be used for typical text for a certain locale,
 *   by comparing its roundtrip set with the set of ExemplarCharacters from
 *   ICU's locale data or other sources</li>
 * </ul>
 *
 * @param setFillIn A valid UnicodeSet. It will be cleared by this function before
 *                  the converter's specific set is filled in.
 * @param which A selector;
 *              currently ROUNDTRIP_SET is the only supported value.
 * @throws IllegalArgumentException if <code>setFillIn</code> is null or
 *              <code>which</code> is not ROUNDTRIP_SET.
 * @draft ICU 4.0
 * @provisional This API might change or be removed in a future release.
 */
public void getUnicodeSet(UnicodeSet setFillIn, int which) {
    /* Report the two failure modes separately so callers can tell them apart. */
    if (setFillIn == null) {
        throw new IllegalArgumentException("setFillIn must not be null");
    }
    if (which != ROUNDTRIP_SET) {
        throw new IllegalArgumentException(
                "Unsupported set selector " + which + "; only ROUNDTRIP_SET is supported");
    }
    setFillIn.clear();
    getUnicodeSetImpl(setFillIn, which);
}
/**
 * Adds U+0000..U+D7FF and U+E000..U+10FFFF to the set, i.e. every code
 * point except the range U+D800..U+DFFF.
 *
 * @param setFillIn set that receives the two ranges; it is not cleared here
 */
static void getNonSurrogateUnicodeSet(UnicodeSet setFillIn) {
    final int lastBeforeGap = 0xd7ff;
    final int firstAfterGap = 0xe000;
    final int lastCodePoint = 0x10ffff;

    setFillIn.add(0, lastBeforeGap);
    setFillIn.add(firstAfterGap, lastCodePoint);
}
/**
 * Adds the whole range U+0000..U+10FFFF to the set.
 *
 * @param setFillIn set that receives the range; it is not cleared here
 */
static void getCompleteUnicodeSet(UnicodeSet setFillIn) {
    final int lastCodePoint = 0x10ffff;
    setFillIn.add(0, lastCodePoint);
}
}

View File

@ -14,6 +14,7 @@ import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* @author Michael Ow
@ -221,7 +222,7 @@ class CharsetISCII extends CharsetICU {
/* 0xc9: 0xfe: 0x92b */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.ZERO,
/* 0xca: 0xfe: 0x92c */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.ZERO,
/* 0xcb: 0xfe: 0x92d */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.ZERO,
/* 0xcc: 0xfe: 0x92e */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.ZERO,
/* 0xcc: 0xfe: 0x92e */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
/* 0xcd: 0xff: 0x92f */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
/* 0xcf: 0xff: 0x930 */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
/* 0xd0: 0x87: 0x931 */ MaskEnum.DEV_MASK + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
@ -1270,4 +1271,25 @@ class CharsetISCII extends CharsetICU {
public CharsetEncoder newEncoder() {
return new CharsetEncoderISCII(this);
}
/**
 * Adds the code points handled by the ISCII converter: the range
 * 0..ASCII_END, every position of each script block from
 * UniLang.DEVALANGARI through UniLang.MALAYALAM that validityTable marks as
 * valid for that script, and finally DANDA, DOUBLE_DANDA, ZWNJ and ZWJ.
 *
 * @param setFillIn set that receives the code points; it is not cleared here
 * @param which     set selector; not consulted by this implementation
 */
void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
    setFillIn.add(0, ASCII_END);

    int script = UniLang.DEVALANGARI;
    while (script <= UniLang.MALAYALAM) {
        final char scriptMask = (char) lookupInitialData[script].maskEnum;
        final int blockStart = (script * UniLang.DELTA) + INDIC_BLOCK_BEGIN;

        for (int offset = 0; offset < UniLang.DELTA; offset++) {
            final boolean validForScript = (validityTable[offset] & scriptMask) != 0;
            /* one Telugu position is accepted regardless of the validity table */
            final boolean teluguException = (script == UniLang.TELUGU) && (offset == 0x31);
            if (validForScript || teluguException) {
                setFillIn.add(offset + blockStart);
            }
        }
        script++;
    }

    /* characters added unconditionally */
    setFillIn.add(DANDA);
    setFillIn.add(DOUBLE_DANDA);
    setFillIn.add(ZWNJ);
    setFillIn.add(ZWJ);
}
}

View File

@ -17,6 +17,7 @@ import java.util.Arrays;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.charset.CharsetMBCS.MBCSToUFallback;
import com.ibm.icu.charset.CharsetMBCS.UConverterMBCSTable;
import com.ibm.icu.charset.CharsetMBCS;
@ -3077,6 +3078,98 @@ class CharsetISO2022 extends CharsetICU {
myConverterData.currentEncoder.fromUnicodeStatus = 1; /* prevLength */
}
}
/**
 * Adds the code points handled by this ISO-2022 converter.
 *
 * The method first adds the ranges that are handled algorithmically for the
 * current variant, then enumerates every loaded sub-converter table with a
 * variant-specific filter, and finally removes SO, SI, ESC and
 * U+0080..U+009F, which an ISO-2022 converter never converts.
 *
 * @param setFillIn set that receives the code points; it is not cleared here
 * @param which     ROUNDTRIP_SET or ROUNDTRIP_AND_FALLBACK_SET
 */
void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
    /* Step 1: code points that are round-tripped algorithmically. */
    switch (variant) {
    case ISO_2022_JP: {
        /* JIS X 0201 is hardcoded */
        setFillIn.add(0xa5);
        setFillIn.add(0x203e);

        /* Latin-1 for the JP variants that have it, plain ASCII otherwise */
        final boolean hasLatin1 = (jpCharsetMasks[myConverterData.version] & CSM(ISO8859_1)) != 0;
        if (hasLatin1) {
            setFillIn.add(0, 0xff);
        } else {
            setFillIn.add(0, 0x7f);
        }

        /*
         * Half-width Katakana.
         *
         * Do not test (jpCharsetMasks[version] & CSM(HWKANA_7BIT)) != 0: that bit is on
         * for all JP versions, because every ISO_2022_JP variant leniently accepts
         * half-width Katakana via ESC in toUnicode, although only versions 3 and 4
         * (JIS7 and JIS8) emit it in fromUnicode.
         *
         * When fallbacks are included, half-width Katakana is needed for all JP
         * variants because JIS X 0208 has hardcoded fallbacks for them (they map to
         * full-width Katakana).
         */
        final boolean emitsHalfWidthKana = myConverterData.version == 3 || myConverterData.version == 4;
        if (emitsHalfWidthKana || which == ROUNDTRIP_AND_FALLBACK_SET) {
            setFillIn.add(HWKANA_START, HWKANA_END);
        }
        break;
    }
    case ISO_2022_CN:
        /* ASCII for CN */
        setFillIn.add(0, 0x7f);
        break;
    case ISO_2022_KR:
        /* KR has exactly one converter; let it fill in its own set */
        myConverterData.currentConverter.getUnicodeSetImpl(setFillIn, which);
        break;
    default:
        break;
    }

    /* Step 2: enumerate each loaded sub-converter table with the matching filter. */
    for (int slot = 0; slot < UCNV_2022_MAX_CONVERTERS; slot++) {
        if (myConverterData.myConverterArray[slot] == null) {
            continue;
        }

        final int tableFilter;
        if (variant == ISO_2022_CN && myConverterData.version == 0 && slot == CNS_11643) {
            /*
             * Version-specific for CN: version 0 does not map CNS planes 3..7 although
             * they are all available in the CNS conversion table; version 1 (-EXT)
             * maps them all. The two versions therefore produce different sets.
             */
            tableFilter = CharsetMBCS.UCNV_SET_FILTER_2022_CN;
        } else if (variant == ISO_2022_JP && slot == JISX208) {
            /* only code points that map to Shift-JIS codes corresponding to JIS X 0208 */
            tableFilter = CharsetMBCS.UCNV_SET_FILTER_SJIS;
        } else if (slot == KSC5601) {
            /*
             * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple
             * tables) are broader than GR94.
             */
            tableFilter = CharsetMBCS.UCNV_SET_FILTER_GR94DBCS;
        } else {
            tableFilter = CharsetMBCS.UCNV_SET_FILTER_NONE;
        }

        myConverterData.currentConverter.MBCSGetFilteredUnicodeSetForUnicode(
                myConverterData.myConverterArray[slot], setFillIn, which, tableFilter);
    }

    /*
     * Step 3: an ISO-2022 converter must not convert SO/SI/ESC regardless of what the
     * sub-converters do on their own, so take them out again.
     */
    setFillIn.remove(0x0e);
    setFillIn.remove(0x0f);
    setFillIn.remove(0x1b);

    /* ISO-2022 converters do not convert C1 controls either */
    setFillIn.remove(0x80, 0x9f);
}
}

View File

@ -26,12 +26,25 @@ import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.InvalidFormatException;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.charset.UConverterConstants;
class CharsetMBCS extends CharsetICU {
private byte[] fromUSubstitution = null;
UConverterSharedData sharedData = null;
private static final int MAX_VERSION_LENGTH = 4;
// these variables are used in getUnicodeSet() and may be changed in future
// They select which from-Unicode results MBCSGetFilteredUnicodeSetForUnicode()
// and extGetUnicodeSet() accept while enumerating a conversion table.
// typedef enum UConverterSetFilter {
// no restriction on the result bytes
static final int UCNV_SET_FILTER_NONE = 1;
// ignore single-byte results (< 0x100)
static final int UCNV_SET_FILTER_DBCS_ONLY = 2;
// only results in CNS 11643 planes 1 and 2 (non-EXT ISO-2022-CN)
static final int UCNV_SET_FILTER_2022_CN = 3;
// only results in the Shift-JIS range 0x8140..0xeffc
static final int UCNV_SET_FILTER_SJIS= 4 ;
// only results that are ISO 2022 GR 94 DBCS codes
static final int UCNV_SET_FILTER_GR94DBCS = 5;
// only results that are suitable for HZ DBCS
static final int UCNV_SET_FILTER_HZ = 6;
// NOTE(review): presumably an end marker mirroring the C enum; not used in the code shown here
static final int UCNV_SET_FILTER_COUNT = 7;
// } UConverterSetFilter;
/**
* Fallbacks to Unicode are stored outside the normal state table and code point structures in a vector of items of
@ -4781,5 +4794,355 @@ class CharsetMBCS extends CharsetICU {
public CharsetEncoder newEncoder() {
return new CharsetEncoderMBCS(this);
}
/**
 * Enumerates the from-Unicode lookup tables (stage 1, stage 2, stage 3) of an
 * MBCS conversion table and adds every code point whose mapping passes the
 * given selector and filter. Afterwards the extension table is enumerated
 * through extGetUnicodeSet().
 *
 * @param data      shared converter data whose <code>mbcs</code> table is enumerated
 * @param setFillIn set that receives the code points; it is not cleared here
 * @param which     ROUNDTRIP_SET to accept only roundtrip mappings, or
 *                  ROUNDTRIP_AND_FALLBACK_SET to accept fallback mappings as well
 * @param filter    one of the UCNV_SET_FILTER_* constants restricting the accepted
 *                  result bytes. For multi-byte tables an unrecognized value makes
 *                  the method return at the first non-empty stage 3 block, without
 *                  enumerating the extension table.
 */
void MBCSGetFilteredUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which, int filter){
    UConverterMBCSTable mbcsTable;
    char[] table;
    char st1, maxStage1, st2;
    int st3;
    int c;

    /* enumerate the from-Unicode lookup table */
    mbcsTable = data.mbcs;
    table = mbcsTable.fromUnicodeTable;
    if ((mbcsTable.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) != 0) {
        maxStage1 = 0x440;
    } else {
        maxStage1 = 0x40;
    }

    c = 0; /* keep track of the current code point while enumerating */

    if (mbcsTable.outputType == MBCS_OUTPUT_1) {
        /* single-byte table: stage 3 holds 16-bit results */
        char stage2, stage3;
        char minValue;
        CharBuffer results;

        results = ByteBuffer.wrap(mbcsTable.fromUnicodeBytes).asCharBuffer();

        if (which == ROUNDTRIP_SET) {
            /* use only roundtrips */
            minValue = 0xf00;
        } else {
            /* use all roundtrip and fallback results */
            minValue = 0x800;
        }

        for (st1 = 0; st1 < maxStage1; ++st1) {
            st2 = table[st1];
            if (st2 > maxStage1) {
                stage2 = st2;
                for (st2 = 0; st2 < 64; ++st2) {
                    st3 = table[stage2 + st2];
                    if (st3 != 0) {
                        /* read the stage 3 block */
                        stage3 = (char) st3;
                        do {
                            if (results.get(stage3++) >= minValue) {
                                setFillIn.add(c);
                            }
                        } while ((++c & 0xf) != 0);
                    } else {
                        c += 16; /* empty stage 3 block */
                    }
                }
            } else {
                c += 1024; /* empty stage 2 block */
            }
        }
    } else {
        /* multi-byte table: stage 2 holds 32-bit entries, stage 3 holds raw result bytes */
        int stage2, stage3;
        byte[] bytes;
        ByteBuffer byteView;
        int st3Multiplier;
        int value;
        boolean useFallBack;

        bytes = mbcsTable.fromUnicodeBytes;
        /* one big-endian view for all 16-bit reads; only absolute gets are used on it */
        byteView = ByteBuffer.wrap(bytes);

        useFallBack = (which == ROUNDTRIP_AND_FALLBACK_SET);

        /* number of result bytes stored per code point in stage 3 */
        switch (mbcsTable.outputType) {
        case MBCS_OUTPUT_3:
        case MBCS_OUTPUT_4_EUC:
            st3Multiplier = 3;
            break;
        case MBCS_OUTPUT_4:
            st3Multiplier = 4;
            break;
        default:
            st3Multiplier = 2;
            break;
        }

        for (st1 = 0; st1 < maxStage1; ++st1) {
            st2 = table[st1];
            if (st2 > (maxStage1 >> 1)) {
                stage2 = st2;
                /* each 32-bit stage 2 entry occupies two consecutive 16-bit table units */
                for (st2 = 0; st2 < 128; ++st2) {
                    /* read the stage 2 entry: high half first, then low half */
                    st3 = table[stage2 * 2 + st2] << 16;
                    st3 += table[stage2 * 2 + ++st2];
                    if (st3 != 0) {
                        /* byte offset of the stage 3 block: low 16 bits of the entry */
                        stage3 = st3Multiplier * 16 * (int) (st3 & UConverterConstants.UNSIGNED_SHORT_MASK);

                        /* get the roundtrip flags for the stage 3 block: high 16 bits, one bit per code point */
                        st3 >>= 16;
                        st3 &= UConverterConstants.UNSIGNED_SHORT_MASK;

                        /*
                         * Add code points for which the roundtrip flag is set,
                         * or which map to non-zero bytes if fallbacks are used.
                         */
                        switch (filter) {
                        case UCNV_SET_FILTER_NONE:
                            do {
                                if ((st3 & 1) != 0) {
                                    setFillIn.add(c);
                                    stage3 += st3Multiplier;
                                } else if (useFallBack) {
                                    /*
                                     * OR together exactly the st3Multiplier bytes of this mapping.
                                     * Reading single bytes (not 16-bit units) keeps the check from
                                     * picking up a byte of the next mapping or running past the
                                     * end of the array.
                                     */
                                    int b = 0;
                                    switch (st3Multiplier) {
                                    case 4:
                                        b |= bytes[stage3++];
                                        /* fall through */
                                    case 3:
                                        b |= bytes[stage3++];
                                        /* fall through */
                                    case 2:
                                        b |= bytes[stage3] | bytes[stage3 + 1];
                                        stage3 += 2;
                                        /* fall through */
                                    default:
                                        break;
                                    }
                                    if (b != 0) {
                                        setFillIn.add(c);
                                    }
                                }
                                st3 >>= 1;
                            } while ((++c & 0xf) != 0);
                            break;
                        case UCNV_SET_FILTER_DBCS_ONLY:
                            /* Ignore single-byte results (<0x100). */
                            do {
                                if (((st3 & 1) != 0 || useFallBack)
                                        && (UConverterConstants.UNSIGNED_SHORT_MASK & byteView.getChar(stage3)) >= 0x100) {
                                    setFillIn.add(c);
                                }
                                st3 >>= 1;
                                stage3 += 2;
                            } while ((++c & 0xf) != 0);
                            break;
                        case UCNV_SET_FILTER_2022_CN:
                            /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
                            do {
                                if ((st3 & 1) != 0 || useFallBack) {
                                    value = UConverterConstants.UNSIGNED_BYTE_MASK & byteView.get(stage3);
                                    if (value == 0x81 || value == 0x82) {
                                        setFillIn.add(c);
                                    }
                                }
                                st3 >>= 1;
                                stage3 += 3;
                            } while ((++c & 0xf) != 0);
                            break;
                        case UCNV_SET_FILTER_SJIS:
                            /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
                            do {
                                if ((st3 & 1) != 0 || useFallBack) {
                                    value = UConverterConstants.UNSIGNED_SHORT_MASK & byteView.getChar(stage3);
                                    if (value >= 0x8140 && value <= 0xeffc) {
                                        setFillIn.add(c);
                                    }
                                }
                                st3 >>= 1;
                                stage3 += 2;
                            } while ((++c & 0xf) != 0);
                            break;
                        case UCNV_SET_FILTER_GR94DBCS:
                            /* Only add code points that map to ISO 2022 GR 94 DBCS codes. */
                            do {
                                if ((st3 & 1) != 0 || useFallBack) {
                                    value = UConverterConstants.UNSIGNED_SHORT_MASK & byteView.getChar(stage3);
                                    if ((UConverterConstants.UNSIGNED_SHORT_MASK & (value - 0xa1a1)) <= (0xfefe - 0xa1a1)
                                            && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)) {
                                        setFillIn.add(c);
                                    }
                                }
                                st3 >>= 1;
                                stage3 += 2;
                            } while ((++c & 0xf) != 0);
                            break;
                        case UCNV_SET_FILTER_HZ:
                            /* Only add code points that are suitable for HZ DBCS. */
                            do {
                                if ((st3 & 1) != 0 || useFallBack) {
                                    value = UConverterConstants.UNSIGNED_SHORT_MASK & byteView.getChar(stage3);
                                    if ((UConverterConstants.UNSIGNED_SHORT_MASK & (value - 0xa1a1)) <= (0xfdfe - 0xa1a1)
                                            && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)) {
                                        setFillIn.add(c);
                                    }
                                }
                                st3 >>= 1;
                                stage3 += 2;
                            } while ((++c & 0xf) != 0);
                            break;
                        default:
                            /* unknown filter: stop here, the extension table is not enumerated either */
                            return;
                        }
                    } else {
                        c += 16; /* empty stage 3 block */
                    }
                }
            } else {
                c += 1024; /* empty stage 2 block */
            }
        }
    }
    extGetUnicodeSet(setFillIn, which, filter, data);
}
/**
 * Enumerates one section of the extension table's from-Unicode string
 * mappings and adds the accepted entries to the set. Called by
 * extGetUnicodeSet() for a code point with a partial match, and recursively
 * for longer input sequences.
 *
 * @param cx           the extension index buffer
 * @param setFillIn    set that receives code points and strings; it is not cleared here
 * @param useFallback  true to accept fallback mappings as well as roundtrips
 * @param minLength    minimum result length a mapping must have to be accepted
 * @param c            the initial code point when called for a single code point,
 *                     or a negative value when <code>s[0..length)</code> holds the
 *                     input sequence matched so far
 * @param s            scratch buffer holding the input sequence; <code>s[length]</code>
 *                     is overwritten while enumerating
 * @param length       number of code units of <code>s</code> in use
 * @param sectionIndex index of the section inside the UChars/values arrays
 */
static void extGetUnicodeSetString(ByteBuffer cx,UnicodeSet setFillIn, boolean useFallback,
        int minLength, int c, char s[],int length,int sectionIndex){
    CharBuffer fromUSectionUChar;
    IntBuffer fromUSectionValues;

    fromUSectionUChar = (CharBuffer) ARRAY(cx, EXT_FROM_U_UCHARS_INDEX, char.class);
    fromUSectionValues = (IntBuffer) ARRAY(cx, EXT_FROM_U_VALUES_INDEX, int.class);
    int fromUSectionUCharIndex = fromUSectionUChar.position() + sectionIndex;
    int fromUSectionValuesIndex = fromUSectionValues.position() + sectionIndex;
    int value, i, count;

    /* read first pair of the section */
    count = fromUSectionUChar.get(fromUSectionUCharIndex++);
    /*
     * Keep all 32 bits: the roundtrip and length tests below need the complete value,
     * exactly as for the other entries read in the loop.
     */
    value = fromUSectionValues.get(fromUSectionValuesIndex++);

    if (value != 0 && (FROM_U_IS_ROUNDTRIP(value) || useFallback) && FROM_U_GET_LENGTH(value) >= minLength) {
        if (c >= 0) {
            /* add the initial code point */
            setFillIn.add(c);
        } else {
            /* add the string so far, as one multi-code-unit entry */
            setFillIn.add(new String(s, 0, length));
        }
    }

    /* enumerate all pairs */
    for (i = 0; i < count; ++i) {
        /* append this code unit and recurse or add the string */
        s[length] = fromUSectionUChar.get(fromUSectionUCharIndex + i);
        value = fromUSectionValues.get(fromUSectionValuesIndex + i);

        if (value == 0) {
            /* no mapping, do nothing */
        } else if (FROM_U_IS_PARTIAL(value)) {
            /*
             * Pass the sentinel as an int. Narrowing it to char first would always
             * produce a non-negative value and make the callee treat it as a code point.
             */
            extGetUnicodeSetString(cx, setFillIn, useFallback, minLength,
                    (int) UConverterConstants.U_SENTINEL, s, length + 1,
                    FROM_U_GET_PARTIAL_INDEX(value));
        } else if ((useFallback
                        ? (value & FROM_U_RESERVED_MASK) == 0
                        : ((value & (FROM_U_ROUNDTRIP_FLAG | FROM_U_RESERVED_MASK)) == FROM_U_ROUNDTRIP_FLAG))
                && FROM_U_GET_LENGTH(value) >= minLength) {
            setFillIn.add(new String(s, 0, length + 1));
        }
    }
}
/**
 * Enumerates the from-Unicode part of a converter's extension table and adds
 * every code point (and, through extGetUnicodeSetString(), every input
 * sequence) whose mapping passes the selector and filter. Does nothing when
 * the converter has no extension table.
 *
 * @param setFillIn set that receives the entries; it is not cleared here
 * @param which     ROUNDTRIP_SET to accept only roundtrip mappings, or
 *                  ROUNDTRIP_AND_FALLBACK_SET to accept fallback mappings as well
 * @param filter    one of the UCNV_SET_FILTER_* constants
 * @param Data      shared converter data providing <code>mbcs.extIndexes</code>
 *                  and <code>mbcs.outputType</code>
 */
static void extGetUnicodeSet(UnicodeSet setFillIn, int which, int filter, UConverterSharedData Data){
    int st1, stage1Length, st2, st3, minLength;
    int ps2, ps3;
    CharBuffer stage12, stage3;
    int value, length;
    IntBuffer stage3b;
    boolean useFallback;
    char s[] = new char[MAX_UCHARS];
    int c;

    ByteBuffer cx = Data.mbcs.extIndexes;
    if (cx == null) {
        /* no extension table */
        return;
    }

    stage12 = (CharBuffer) ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX, char.class);
    stage3 = (CharBuffer) ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX, char.class);
    stage3b = (IntBuffer) ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX, int.class);

    stage1Length = cx.asIntBuffer().get(EXT_FROM_U_STAGE_1_LENGTH);
    useFallback = (which == ROUNDTRIP_AND_FALLBACK_SET);

    c = 0; /* keep track of the current code point while enumerating */

    if (filter == UCNV_SET_FILTER_2022_CN) {
        minLength = 3;
    } else if (Data.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY || filter != UCNV_SET_FILTER_NONE) {
        /* DBCS-only, ignore single-byte results */
        minLength = 2;
    } else {
        minLength = 1;
    }

    for (st1 = 0; st1 < stage1Length; ++st1) {
        st2 = stage12.get(st1);
        if (st2 > stage1Length) {
            ps2 = st2;
            for (st2 = 0; st2 < 64; ++st2) {
                if ((st3 = ((int) stage12.get(ps2 + st2)) << STAGE_2_LEFT_SHIFT) != 0) {
                    /* read the stage 3 block */
                    ps3 = st3;
                    do {
                        value = stage3b.get((int) (UConverterConstants.UNSIGNED_SHORT_MASK & stage3.get(ps3++)));
                        if (value == 0) {
                            /* no mapping, do nothing */
                        } else if (FROM_U_IS_PARTIAL(value)) {
                            /*
                             * Store c at the start of s and keep the new length, so the
                             * helper appends after the code point instead of overwriting it.
                             */
                            length = UTF16.append(s, 0, c);
                            extGetUnicodeSetString(cx, setFillIn, useFallback, minLength, c, s, length,
                                    (int) FROM_U_GET_PARTIAL_INDEX(value));
                        } else if ((useFallback
                                        ? (value & FROM_U_RESERVED_MASK) == 0
                                        : ((value & (FROM_U_ROUNDTRIP_FLAG | FROM_U_RESERVED_MASK)) == FROM_U_ROUNDTRIP_FLAG))
                                && FROM_U_GET_LENGTH(value) >= minLength) {
                            /* a "continue" below skips this code point but still advances c in the loop condition */
                            switch (filter) {
                            case UCNV_SET_FILTER_2022_CN:
                                if (!(FROM_U_GET_LENGTH(value) == 3 && FROM_U_GET_DATA(value) <= 0x82ffff)) {
                                    continue;
                                }
                                break;
                            case UCNV_SET_FILTER_SJIS:
                                if (!(FROM_U_GET_LENGTH(value) == 2 && (value = FROM_U_GET_DATA(value)) >= 0x8140 && value <= 0xeffc)) {
                                    continue;
                                }
                                break;
                            case UCNV_SET_FILTER_GR94DBCS:
                                if (!(FROM_U_GET_LENGTH(value) == 2
                                        && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value = FROM_U_GET_DATA(value)) - 0xa1a1)) <= (0xfefe - 0xa1a1)
                                        && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1))) {
                                    continue;
                                }
                                break;
                            case UCNV_SET_FILTER_HZ:
                                if (!(FROM_U_GET_LENGTH(value) == 2
                                        && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value = FROM_U_GET_DATA(value)) - 0xa1a1)) <= (0xfdfe - 0xa1a1)
                                        && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1))) {
                                    continue;
                                }
                                break;
                            default:
                                /*
                                 * UCNV_SET_FILTER_NONE,
                                 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
                                 */
                                break;
                            }
                            setFillIn.add(c);
                        }
                    } while ((++c & 0xf) != 0);
                } else {
                    c += 16; /* empty stage 3 block */
                }
            }
        } else {
            c += 1024; /* empty stage 2 block */
        }
    }
}
/**
 * Enumerates the given conversion table with the default filter for its
 * output type: UCNV_SET_FILTER_DBCS_ONLY for a DBCS-only table,
 * UCNV_SET_FILTER_NONE otherwise.
 *
 * @param data      shared converter data to enumerate
 * @param setFillIn set that receives the code points; it is not cleared here
 * @param which     ROUNDTRIP_SET or ROUNDTRIP_AND_FALLBACK_SET
 */
void MBCSGetUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which){
    /* Choose the filter from the table that is actually enumerated, not from this charset's own data. */
    final int filter;
    if (data.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY) {
        filter = UCNV_SET_FILTER_DBCS_ONLY;
    } else {
        filter = UCNV_SET_FILTER_NONE;
    }
    MBCSGetFilteredUnicodeSetForUnicode(data, setFillIn, which, filter);
}
/**
 * Adds the code points handled by this MBCS converter. When the
 * MBCS_OPTION_GB18030 option bit is set, that is U+0000..U+D7FF plus
 * U+E000..U+10FFFF; otherwise the converter's own table is enumerated.
 *
 * @param setFillIn set that receives the code points; it is not cleared here
 * @param which     set selector, passed on to the table enumeration
 */
void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
    final boolean isGB18030 = (options & MBCS_OPTION_GB18030) != 0;
    if (!isGB18030) {
        this.MBCSGetUnicodeSetForUnicode(sharedData, setFillIn, which);
        return;
    }
    /* everything except U+D800..U+DFFF */
    setFillIn.add(0, 0xd7ff);
    setFillIn.add(0xe000, 0x10ffff);
}
}

View File

@ -14,6 +14,7 @@ import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* @author Niti Hantaweepant
@ -280,4 +281,8 @@ class CharsetUTF16 extends CharsetICU {
public CharsetEncoder newEncoder() {
return new CharsetEncoderUTF16(this);
}
/**
 * Fills the set by delegating to getNonSurrogateUnicodeSet().
 *
 * @param setFillIn set that receives the code points; it is not cleared here
 * @param which     set selector; not consulted by this implementation
 */
void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
    getNonSurrogateUnicodeSet(setFillIn);
}
}

View File

@ -14,6 +14,7 @@ import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* @author Niti Hantaweepant
@ -242,4 +243,9 @@ class CharsetUTF32 extends CharsetICU {
public CharsetEncoder newEncoder() {
return new CharsetEncoderUTF32(this);
}
/**
 * Fills the set by delegating to getNonSurrogateUnicodeSet().
 *
 * @param setFillIn set that receives the code points; it is not cleared here
 * @param which     set selector; not consulted by this implementation
 */
void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
    getNonSurrogateUnicodeSet(setFillIn);
}
}

View File

@ -13,6 +13,8 @@ import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.text.UnicodeSet;
/**
* @author Michael Ow
*
@ -747,4 +749,8 @@ class CharsetUTF7 extends CharsetICU {
public CharsetEncoder newEncoder() {
return new CharsetEncoderUTF7(this);
}
/**
 * Fills the set by delegating to getCompleteUnicodeSet(), i.e. without
 * excluding any code points.
 *
 * @param setFillIn set that receives the code points; it is not cleared here
 * @param which     set selector; not consulted by this implementation
 */
void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
    getCompleteUnicodeSet(setFillIn);
}
}

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 2006-2007, International Business Machines Corporation and *
* Copyright (C) 2006-2008, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
@ -17,6 +17,7 @@ import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* @author Niti Hantaweepant
@ -684,4 +685,9 @@ class CharsetUTF8 extends CharsetICU {
public CharsetEncoder newEncoder() {
return new CharsetEncoderUTF8(this);
}
/**
 * Fills the set by delegating to getNonSurrogateUnicodeSet().
 *
 * @param setFillIn set that receives the code points; it is not cleared here
 * @param which     set selector; not consulted by this implementation
 */
void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
    getNonSurrogateUnicodeSet(setFillIn);
}
}

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* Copyright (C) 2008, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*

View File

@ -21,10 +21,12 @@ import java.util.Iterator;
import com.ibm.icu.charset.CharsetCallback;
import com.ibm.icu.charset.CharsetEncoderICU;
import com.ibm.icu.charset.CharsetDecoderICU;
import com.ibm.icu.charset.CharsetICU;
import com.ibm.icu.charset.CharsetProviderICU;
import com.ibm.icu.dev.test.ModuleTest;
import com.ibm.icu.dev.test.TestDataModule.DataMap;
import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.text.UnicodeSet;
/**
* This maps to convtest.c which tests the test file for data-driven conversion tests.
@ -828,6 +830,7 @@ public class TestConversion extends ModuleTest {
return checkResultsToUnicode(cc, cc.unicode, cc.toUnicodeResult);
}
private void TestGetUnicodeSet(DataMap testcase) {
/*
* charset - will be opened, and ucnv_getUnicodeSet() called on it //
@ -836,20 +839,94 @@ public class TestConversion extends ModuleTest {
* returned set // which - numeric UConverterUnicodeSet value Headers {
* "charset", "map", "mapnot", "which" }
*/
// retrieve test case data
ConversionCase cc = new ConversionCase();
// retrieve test case data
CharsetProviderICU provider = new CharsetProviderICU();
CharsetICU charset ;
UnicodeSet mapset = new UnicodeSet();
UnicodeSet mapnotset = new UnicodeSet();
UnicodeSet unicodeset = new UnicodeSet();
String ellipsis = "0x2e";
cc.charset = ((ICUResourceBundle) testcase.getObject("charset"))
.getString();
cc.map = ((ICUResourceBundle) testcase.getObject("map")).getString();
cc.mapnot = ((ICUResourceBundle) testcase.getObject("mapnot"))
.getString();
cc.which = ((ICUResourceBundle) testcase.getObject("which")).getUInt();
// create charset and encoder for each test case
logln("TestGetUnicodeSet not supported at this time");
int which = 1; // only checking for ROUNDTRIP_SET
try{
// if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata
charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*')
? (CharsetICU) provider.charsetForName(cc.charset.substring(1), "../dev/data/testdata")
: (CharsetICU) provider.charsetForName(cc.charset);
//checking for converter that are not supported at this point
try{
if(charset.name()=="BOCU-1" ||charset.name()== "SCSU"|| charset.name()=="lmbcs1" || charset.name()== "lmbcs2"
|| charset.name()== "lmbcs3" || charset.name()== "lmbcs4" || charset.name()=="lmbcs5" || charset.name()=="lmbcs6" ||
charset.name()== "lmbcs8" || charset.name()=="lmbcs11" || charset.name()=="lmbcs16" || charset.name()=="lmbcs17" || charset.name()=="lmbcs18"
|| charset.name()=="lmbcs19"){
logln("Converter not supported at this point :" +charset.displayName());
}
}catch(Exception e){
return;
}
mapset.clear();
mapnotset.clear();
mapset.applyPattern(cc.map.toString(),false);
mapnotset.applyPattern(cc.mapnot,false);
charset.getUnicodeSet(unicodeset, which);
UnicodeSet diffset = new UnicodeSet();
//are there items that must be in unicodeset but are not?
(diffset = mapset).removeAll(unicodeset);
if(!diffset.isEmpty()){
StringBuffer s = new StringBuffer(diffset.toPattern(true));
if(s.length()>100){
s.replace(0, 0x7fffffff, ellipsis);
}
logln("error in missing items - conversion/getUnicodeSet test case "+cc.charset);
logln(s.toString());
}
//are the items that must not be in unicodeset but are?
(diffset=mapnotset).retainAll(unicodeset);
if(!diffset.isEmpty()){
StringBuffer s = new StringBuffer(diffset.toPattern(true));
if(s.length()>100){
s.replace(0, 0x7fffffff, ellipsis);
}
logln("contains unexpected items - conversion/getUnicodeSet test case "+cc.charset);
logln(s.toString());
}
} catch (Exception e) {
logln("getUnicodeSet returned an error code");
logln("ErrorCode expected is: " + cc.outErrorCode);
logln("Error Result is: " + e.toString());
return;
}
}
/**
* This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
* start of the stream for example U+FEFF (the Unicode BOM/signature