ICU-6143 getUnicodeSet API merged from icu4j/branches/krajwade/GetUnicodeSet-Test [23632:23722]

X-SVN-Rev: 23723
2008-04-01 19:49:22 +00:00 · 2008-04-01 19:49:22 +00:00 · dc4b395371
commit dc4b395371
parent b878ede950
14 changed files with 671 additions and 12 deletions
--- a/icu4j/src/com/ibm/icu/charset/Charset88591.java
+++ b/icu4j/src/com/ibm/icu/charset/Charset88591.java
@ -1,6 +1,6 @@
 /**
 *******************************************************************************
- * Copyright (C) 2006-2007, International Business Machines Corporation and    *
+ * Copyright (C) 2006-2008, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
@ -15,6 +15,8 @@ import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;

+import com.ibm.icu.text.UnicodeSet;
+
 class Charset88591 extends CharsetASCII {
    public Charset88591(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
        super(icuCanonicalName, javaCanonicalName, aliases);
@ -107,5 +109,8 @@ class Charset88591 extends CharsetASCII {
    public CharsetEncoder newEncoder() {
        return new CharsetEncoder88591(this);
    }
-
+    
+    void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
+        setFillIn.add(0,0xff); // adds the whole range U+0000..U+00FF; 'which' is not consulted here
+     }
 }
--- a/icu4j/src/com/ibm/icu/charset/CharsetASCII.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetASCII.java
@ -1,6 +1,6 @@
 /**
 *******************************************************************************
- * Copyright (C) 2006-2007, International Business Machines Corporation and    *
+ * Copyright (C) 2006-2008, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
@ -18,6 +18,7 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;

 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;

 class CharsetASCII extends CharsetICU {
    protected byte[] fromUSubstitution = new byte[] { (byte) 0x1a };
@ -351,4 +352,8 @@ class CharsetASCII extends CharsetICU {
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderASCII(this);
    }
+    
+    void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
+        setFillIn.add(0,0x7f); // adds the whole range U+0000..U+007F; 'which' is not consulted here
+     }
 }
--- a/icu4j/src/com/ibm/icu/charset/CharsetCESU8.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetCESU8.java
@ -6,6 +6,8 @@
 */
 package com.ibm.icu.charset;

+import com.ibm.icu.text.UnicodeSet;
+
 /**
 * The purpose of this class is to set isCESU8 to true in the super class, and to allow the Charset framework to open
 * the variant UTF-8 converter without extra setup work. CESU-8 encodes/decodes supplementary characters as 6 bytes
@ -15,4 +17,10 @@ class CharsetCESU8 extends CharsetUTF8 {
    public CharsetCESU8(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
        super(icuCanonicalName, javaCanonicalName, aliases);
    }
+    
+    
+    void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
+        getCompleteUnicodeSet(setFillIn); // adds U+0000..U+10FFFF (surrogates included); 'which' is not consulted here
+            
+    }
 }
--- a/icu4j/src/com/ibm/icu/charset/CharsetHZ.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetHZ.java
@ -14,6 +14,7 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;

 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;

 public class CharsetHZ extends CharsetICU {

@ -342,4 +343,10 @@ public class CharsetHZ extends CharsetICU {
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderHZ(this);
    }
+    
+    void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
+        setFillIn.add(0,0x7f); // the ASCII range is always part of the set
+       // NOTE(review): an earlier draft looked the table up via forNameICU("icu-internal-25546"); presumably gbCharset is that same converter - confirm
+        gbCharset.MBCSGetFilteredUnicodeSetForUnicode(gbCharset.sharedData, setFillIn, which, CharsetMBCS.UCNV_SET_FILTER_HZ); // then add gbCharset's mappings that pass the HZ filter
+    }
 }
--- a/icu4j/src/com/ibm/icu/charset/CharsetICU.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetICU.java
@ -17,6 +17,8 @@ import java.lang.reflect.InvocationTargetException;
 import java.nio.charset.*;
 import java.util.HashMap;

+import com.ibm.icu.text.UnicodeSet;
+
 /**
 * <p>A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
 * This API is used to convert codepage or character encoded data to and
@ -57,6 +59,13 @@ public abstract class CharsetICU extends Charset{
     byte subChar1;               /* +80: 1  single-byte substitution character for IBM MBCS (0 if none) */
     //byte reserved[/*19*/];           /* +81: 19 to round out the structure */
     
+     /** 
+      * Parameter that selects the set of roundtrippable Unicode code points. 
+      * @draft ICU 4.0
+      */
+      public static final int ROUNDTRIP_SET=1; //UCNV_ROUNDTRIP_SET,
+      public static final int ROUNDTRIP_AND_FALLBACK_SET =2; // selector for roundtrip plus fallback mappings; getUnicodeSet() currently rejects it
+     
     
    /**
     * 
@ -323,6 +332,53 @@ public abstract class CharsetICU extends Charset{
 //        /* no known Unicode signature byte sequence recognized */
 //        return null;
 //    }
+    
+    
+    abstract void getUnicodeSetImpl(UnicodeSet setFillIn, int which); // per-charset hook; getUnicodeSet() calls it after validating the arguments and clearing setFillIn
+    
+    /**
+    * <p>Returns the set of Unicode code points that can be converted by an ICU Converter. 
+    * 
+    * Returns one of several kinds of sets:
+    *
+    * <p>ROUNDTRIP_SET
+    * 
+    * The set of all Unicode code points that can be roundtrip-converted
+    * (converted without any data loss) with the converter.
+    * This set will not include code points that have fallback mappings
+    * or are only the result of reverse fallback mappings.
+    * 
+    * <p>This is useful for example for
+    * - checking that a string or document can be roundtrip-converted with a converter,
+    *   without/before actually performing the conversion
+    * - testing if a converter can be used for typical text for a certain locale,
+    *   by comparing its roundtrip set with the set of ExemplarCharacters from
+    *   ICU's locale data or other sources
+    *
+    *@param setFillIn A valid UnicodeSet. It will be cleared by this function before
+    *            the converter's specific set is filled in.
+    *@param which A selector;
+    *              currently ROUNDTRIP_SET is the only supported value.
+    *@throws IllegalArgumentException if setFillIn is null or which is not ROUNDTRIP_SET.
+    *@draft ICU 4.0
+    *@provisional This API might change or be removed in a future release.
+    */
+       public void getUnicodeSet(UnicodeSet setFillIn, int which){
+           if( setFillIn == null || which != ROUNDTRIP_SET ){ // any selector other than ROUNDTRIP_SET is rejected, including ROUNDTRIP_AND_FALLBACK_SET
+               throw new IllegalArgumentException();
+           }
+           setFillIn.clear(); // previous contents are discarded before the charset fills the set
+           getUnicodeSetImpl(setFillIn, which);
+       }
+      
+       static void getNonSurrogateUnicodeSet(UnicodeSet setFillIn){
+           setFillIn.add(0, 0xd7ff); // everything below the surrogate range U+D800..U+DFFF
+           setFillIn.add(0xe000, 0x10ffff); // everything above it, up to U+10FFFF
+       }
+       
+       static void getCompleteUnicodeSet(UnicodeSet setFillIn){
+           setFillIn.add(0, 0x10ffff); // all code points U+0000..U+10FFFF, surrogates included
+       }

 }

--- a/icu4j/src/com/ibm/icu/charset/CharsetISCII.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetISCII.java
@ -14,6 +14,7 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;

 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;

 /**
 * @author Michael Ow
@ -221,7 +222,7 @@ class CharsetISCII extends CharsetICU {
        /* 0xc9: 0xfe: 0x92b */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.ZERO,
        /* 0xca: 0xfe: 0x92c */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.ZERO,
        /* 0xcb: 0xfe: 0x92d */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.ZERO,
-        /* 0xcc: 0xfe: 0x92e */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.ZERO,
+        /* 0xcc: 0xfe: 0x92e */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
        /* 0xcd: 0xff: 0x92f */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
        /* 0xcf: 0xff: 0x930 */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
        /* 0xd0: 0x87: 0x931 */ MaskEnum.DEV_MASK + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
@ -1270,4 +1271,25 @@ class CharsetISCII extends CharsetICU {
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderISCII(this);
    }
+    
+    void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
+        int idx,script;
+        char mask;
+        // Adds 0..ASCII_END, then the validityTable-approved code points of scripts DEVALANGARI..MALAYALAM, then DANDA, DOUBLE_DANDA, ZWNJ and ZWJ.
+        setFillIn.add(0,ASCII_END );
+        for(script = UniLang.DEVALANGARI ; script<= UniLang.MALAYALAM ;script++){
+            mask = (char)lookupInitialData[script].maskEnum; // bit mask tested against validityTable for this script
+            for(idx=0; idx < UniLang.DELTA ; idx++){
+                // Keep idx when validityTable flags it for this script; idx 0x31 is additionally forced in for Telugu
+                if((validityTable[idx] & mask)!=0 || (script == UniLang.TELUGU && idx==0x31)){ 
+                   setFillIn.add(idx+(script*UniLang.DELTA)+INDIC_BLOCK_BEGIN ); // code point = INDIC_BLOCK_BEGIN + script*DELTA + idx
+                }
+            }
+        }
+        setFillIn.add(DANDA);
+        setFillIn.add(DOUBLE_DANDA);
+        setFillIn.add(ZWNJ);
+        setFillIn.add(ZWJ);
+             
+    }
 }
--- a/icu4j/src/com/ibm/icu/charset/CharsetISO2022.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetISO2022.java
@ -17,6 +17,7 @@ import java.util.Arrays;

 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.charset.CharsetMBCS.MBCSToUFallback;
 import com.ibm.icu.charset.CharsetMBCS.UConverterMBCSTable;
 import com.ibm.icu.charset.CharsetMBCS;
@ -3077,6 +3078,98 @@ class CharsetISO2022 extends CharsetICU {
            myConverterData.currentEncoder.fromUnicodeStatus = 1; /* prevLength */
        }
    }
+    
+    void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
+        int i;
+        /* first seed setFillIn with the code points that the selected variant round-trips algorithmically */
+        
+        switch(variant){
+        case ISO_2022_JP:
+           /* include JIS X 0201 which is hardcoded */
+            setFillIn.add(0xa5);
+            setFillIn.add(0x203e);
+            if((jpCharsetMasks[myConverterData.version]&CSM(ISO8859_1))!=0){
+                /* include Latin-1 for some variants of JP */
+                setFillIn.add(0, 0xff);
+            
+            }
+            else {
+                /* include ASCII for JP */
+                setFillIn.add(0, 0x7f);
+             }
+            if(myConverterData.version==3 || myConverterData.version==4 ||which == ROUNDTRIP_AND_FALLBACK_SET){
+            /*
+             * Do not test (jpCharsetMasks[myConverterData.version]&CSM(HWKANA_7BIT))!=0 because the bit
+             * is on for all JP versions although only versions 3 & 4 (JIS7 and JIS8) use half-width Katakana.
+             * This is because all ISO_2022_JP variants are lenient in that they accept (in toUnicode) half-width
+             * Katakana via ESC.
+             * However, we only emit (fromUnicode) half-width Katakana according to the
+             * definition of each variant.
+             *
+             * When including fallbacks,
+             * we need to include half-width Katakana Unicode code points for all JP variants because
+             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
+             */
+            /* include half-width Katakana for JP */
+                setFillIn.add(HWKANA_START, HWKANA_END);
+             }
+            break;
+        case ISO_2022_CN:
+            /* Include ASCII for CN */
+            setFillIn.add(0, 0x7f);
+            break;
+        case ISO_2022_KR:
+            /* there is only one converter for KR */
+          myConverterData.currentConverter.getUnicodeSetImpl(setFillIn, which);
+          break;
+        default:
+            break;
+        }
+        
+        //TODO Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until
+        for(i=0; i<UCNV_2022_MAX_CONVERTERS;i++){
+            int filter;
+            if(myConverterData.myConverterArray[i]!=null){
+                if(variant==ISO_2022_CN && myConverterData.version==0 && i==CNS_11643){
+                    /*
+                     * 
+                     * version-specific for CN:
+                     * CN version 0 does not map CNS planes 3..7 although
+                     * they are all available in the CNS conversion table;
+                     * CN version 1 (-EXT) does map them all.
+                     * The two versions create different Unicode sets.
+                     */
+                    filter=CharsetMBCS.UCNV_SET_FILTER_2022_CN;
+                } else if(variant==ISO_2022_JP && i == JISX208){
+                    /* 
+                     * Only add code points that map to Shift-JIS codes
+                     * corresponding to JIS X 0208
+                     */
+                    filter=CharsetMBCS.UCNV_SET_FILTER_SJIS;
+                } else if(i==KSC5601){
+                    /*
+                     * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
+                     * are broader than GR94.
+                     */
+                    filter=CharsetMBCS.UCNV_SET_FILTER_GR94DBCS;
+                } else {
+                    filter=CharsetMBCS.UCNV_SET_FILTER_NONE;
+                }
+                /* NOTE(review): currentConverter is only the call receiver here; the table comes from myConverterArray[i]. Confirm currentConverter is non-null for JP/CN. */
+                myConverterData.currentConverter.MBCSGetFilteredUnicodeSetForUnicode(myConverterData.myConverterArray[i],setFillIn, which, filter);
+           }
+        }
+        /*
+         * ISO 2022 converters must not convert SO/SI/ESC despite what sub-converters do by themselves.
+         * Remove these characters from the set.
+         */
+        setFillIn.remove(0x0e);
+        setFillIn.remove(0x0f);
+        setFillIn.remove(0x1b);
+        
+        /* ISO 2022 converters do not convert C1 controls either */
+        setFillIn.remove(0x80, 0x9f);
+    }
 }


--- a/icu4j/src/com/ibm/icu/charset/CharsetMBCS.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetMBCS.java
@ -26,12 +26,25 @@ import com.ibm.icu.impl.ICUResourceBundle;
 import com.ibm.icu.impl.InvalidFormatException;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.charset.UConverterConstants;

 class CharsetMBCS extends CharsetICU {

    private byte[] fromUSubstitution = null;
    UConverterSharedData sharedData = null;
    private static final int MAX_VERSION_LENGTH = 4;
+    
+ // Filter selectors passed to MBCSGetFilteredUnicodeSetForUnicode()/extGetUnicodeSet(); they may change in the future
+    // typedef enum UConverterSetFilter {
+      static final int UCNV_SET_FILTER_NONE = 1; // no restriction on the result bytes
+      static final int UCNV_SET_FILTER_DBCS_ONLY = 2; // ignore single-byte results (< 0x100)
+      static final int UCNV_SET_FILTER_2022_CN = 3; // only results whose lead byte is 0x81 or 0x82 (CNS 11643 planes 1 and 2)
+      static final int UCNV_SET_FILTER_SJIS= 4 ; // only two-byte results in 0x8140..0xeffc
+      static final int UCNV_SET_FILTER_GR94DBCS = 5; // only two-byte results in 0xa1a1..0xfefe with trail byte 0xa1..0xfe
+      static final int UCNV_SET_FILTER_HZ = 6; // only two-byte results in 0xa1a1..0xfdfe with trail byte 0xa1..0xfe
+      static final int UCNV_SET_FILTER_COUNT = 7; // NOTE(review): values start at 1, so this is one past the last filter rather than a count - confirm
+   //  } UConverterSetFilter;

    /**
     * Fallbacks to Unicode are stored outside the normal state table and code point structures in a vector of items of
@ -4781,5 +4794,355 @@ class CharsetMBCS extends CharsetICU {
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderMBCS(this);
    }
+    
+    void MBCSGetFilteredUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which, int filter){
+        UConverterMBCSTable mbcsTable;
+        char[] table;
+        char st1,maxStage1, st2;
+        int st3;
+        int c ;
+        /* Enumerates the from-Unicode table of 'data' and adds to setFillIn every code point whose result passes 'which' and 'filter'; extension mappings are added at the end. */
+        mbcsTable = data.mbcs;
+        table = mbcsTable.fromUnicodeTable; 
+        if((mbcsTable.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY)!=0){
+            maxStage1 = 0x440;
+        }
+        else{
+            maxStage1 = 0x40;
+        }
+        c=0; /* keep track of current code point while enumerating */
+        
+        if(mbcsTable.outputType==MBCS_OUTPUT_1){
+            char stage2, stage3;
+            char minValue;
+            CharBuffer results;
+            results = ByteBuffer.wrap(mbcsTable.fromUnicodeBytes).asCharBuffer();
+                                   
+            if(which==ROUNDTRIP_SET) {
+                /* use only roundtrips */
+                minValue=0xf00;
+            } else {
+                /* use all roundtrip and fallback results */
+                minValue=0x800;
+            }
+            for(st1=0;st1<maxStage1;++st1){
+                st2 = table[st1];
+                if(st2>maxStage1){
+                    stage2 = st2;
+                    for(st2=0; st2<64; ++st2){
+                        st3 = table[stage2 + st2];
+                        if(st3!=0){
+                            /* read the stage 3 block */
+                            stage3 = (char)st3;
+                            do {
+                                if(results.get(stage3++)>=minValue){
+                                     setFillIn.add(c);
+                                }
+                               
+                            }while((++c&0xf) !=0);
+                          } else {
+                            c+= 16; /* empty stage 3 block */
+                        }
+                    }
+                } else {
+                    c+=1024; /* empty stage 2 block */
+                }
+            }
+        } else {
+            int stage2,stage3;
+            byte[] bytes;
+            int st3Multiplier;
+            int value;
+            boolean useFallBack;
+            bytes = mbcsTable.fromUnicodeBytes;
+            useFallBack = (which == ROUNDTRIP_AND_FALLBACK_SET);
+            switch(mbcsTable.outputType) {
+            case MBCS_OUTPUT_3:
+            case MBCS_OUTPUT_4_EUC:
+                st3Multiplier = 3;
+                break;
+            case MBCS_OUTPUT_4:
+                st3Multiplier =4;
+                break;
+            default:
+                st3Multiplier =2;
+                break;
+            }
+            ByteBuffer byteView = ByteBuffer.wrap(bytes); /* one shared big-endian view; only absolute reads are used, so its position never moves */
+            
+            for(st1=0;st1<maxStage1;++st1){
+                st2 = table[st1]; 
+                if(st2>(maxStage1>>1)){
+                    stage2 =  st2 ;
+                    for(st2=0;st2<128;++st2){
+                        /* read the stage 3 block: each stage 2 entry is 32 bits wide, stored as two consecutive chars */
+                        st3 = table[stage2*2 + st2]<<16;
+                        st3+=table[stage2*2 + ++st2];
+                        if(st3!=0){
+                        /* low 16 bits: stage 3 block index; high 16 bits: one roundtrip flag per code point */
+                            stage3 = st3Multiplier*16*(int)(st3&UConverterConstants.UNSIGNED_SHORT_MASK);
+                            
+                            /* get the roundtrip flags for the stage 3 block */
+                            st3>>=16;
+                            st3 &= UConverterConstants.UNSIGNED_SHORT_MASK;
+                            switch(filter) {
+                            case UCNV_SET_FILTER_NONE:
+                                do {
+                                    
+                                   if((st3&1)!=0){
+                                        setFillIn.add(c);
+                                        stage3+=st3Multiplier;
+                                   }else if (useFallBack) {
+                                        /* a fallback exists when any byte of this entry is non-zero */
+                                        char b =0;
+                                        switch(st3Multiplier) {
+                                        case 4 :
+                                            /* read exactly one byte per step so the probe never reaches into the next entry; intentional fall-through */
+                                            b|= bytes[stage3++];
+                                           
+                                        case 3 :
+                                            /* intentional fall-through */
+                                            b|= bytes[stage3++];
+                                           
+                                        case 2 :
+                                            /* the last two bytes of this entry */
+                                            b|= bytes[stage3] | bytes[stage3+1];
+                                            stage3+=2;
+                                        default:
+                                            break;
+                                        }
+                                        if(b!=0) {
+                                            setFillIn.add(c);
+                                        }
+                                    }
+                                    st3>>=1;
+                                }while((++c&0xf)!=0);
+                                break;
+                            case UCNV_SET_FILTER_DBCS_ONLY:
+                                /* Ignore single bytes results (<0x100). */
+                                do {
+                                    if(((st3&1) != 0 || useFallBack) && (UConverterConstants.UNSIGNED_SHORT_MASK & (byteView.getChar(stage3))) >= 0x100){
+                                        setFillIn.add(c);
+                                    }
+                                    st3>>=1;
+                                    stage3+=2;
+                                }while((++c&0xf) != 0);
+                               break;
+                            case UCNV_SET_FILTER_2022_CN :
+                                /* only add code points that map to CNS 11643 planes 1&2 for non-EXT ISO-2022-CN. */
+                                do {
+                                    if(((st3&1) != 0 || useFallBack) && ((value= (UConverterConstants.UNSIGNED_BYTE_MASK & (byteView.get(stage3))))==0x81
+                                            || value==0x82) ){
+                                        setFillIn.add(c);
+                                    }
+                                    st3>>=1;
+                                    stage3+=3;
+                                }while((++c&0xf)!=0);
+                                break;
+                            case UCNV_SET_FILTER_SJIS:
+                                /* only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
+                                do{
+                                    
+                                    if(((st3&1) != 0 || useFallBack) && (value=(UConverterConstants.UNSIGNED_SHORT_MASK & (byteView.getChar(stage3))))>=0x8140 && value<=0xeffc){
+                                        setFillIn.add(c);
+                                    }
+                                    st3>>=1;
+                                    stage3+=2;
+                                }while((++c&0xf)!=0);
+                                break;
+                            case UCNV_SET_FILTER_GR94DBCS:
+                                /* only add code points that map to ISO 2022 GR 94 DBCS codes */
+                                do {
+                                    if(((st3&1) != 0 || useFallBack) && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=(UConverterConstants.UNSIGNED_SHORT_MASK & (byteView.getChar(stage3))))- 0xa1a1))<=(0xfefe - 0xa1a1) && 
+                                            ( UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)){
+                                        setFillIn.add(c);
+                                    }
+                                    st3>>=1;
+                                    stage3+=2;
+                                }while((++c&0xf)!=0);
+                                break;
+                            case UCNV_SET_FILTER_HZ:
+                                /* only add code points that are suitable for HZ DBCS */
+                                do {
+                                    if( ((st3&1) != 0 || useFallBack) && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=(UConverterConstants.UNSIGNED_SHORT_MASK & (byteView.getChar(stage3))))-0xa1a1))<=(0xfdfe - 0xa1a1) &&
+                                            (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)){
+                                        setFillIn.add(c);
+                                    }
+                                    st3>>=1;
+                                    stage3+=2;
+                                }while((++c&0xf) != 0);
+                                break;
+                            default:
+                                return;
+                            }
+                        } else {
+                            c+=16; /* empty stage 3 block */
+                        }
+                    }
+                } else {
+                    c+=1024; /* empty stage 2 block */
+                }
+            }
+        }
+        extGetUnicodeSet(setFillIn, which, filter, data);
+    }
+   
+    static void extGetUnicodeSetString(ByteBuffer cx,UnicodeSet setFillIn, boolean useFallback, 
+            int minLength, int c, char s[],int length,int sectionIndex){
+        CharBuffer fromUSectionUChar;
+        IntBuffer fromUSectionValues;
+        fromUSectionUChar = (CharBuffer)ARRAY(cx, EXT_FROM_U_UCHARS_INDEX,char.class );
+        fromUSectionValues = (IntBuffer)ARRAY(cx, EXT_FROM_U_VALUES_INDEX,int.class );
+        int fromUSectionUCharIndex = fromUSectionUChar.position()+sectionIndex;
+        int fromUSectionValuesIndex = fromUSectionValues.position()+sectionIndex;
+        int value, i, count;
+        /* Recursively walks one from-Unicode section: s[0..length) holds the UTF-16 units matched so far; c is the starting code point, or negative when called from a deeper level. */
+        /* read first pair of the section */
+       count = fromUSectionUChar.get(fromUSectionUCharIndex++);
+       value = fromUSectionValues.get(fromUSectionValuesIndex++); /* keep all 32 bits: the roundtrip flag and the length live above bit 15 */
+       /* the first pair describes the mapping for the prefix matched so far, if there is one */
+       if(value!=0 && (FROM_U_IS_ROUNDTRIP(value) || useFallback) && FROM_U_GET_LENGTH(value)>=minLength) {
+           if(c>=0){
+               setFillIn.add(c);
+           } else {
+               for(int j=0;j<length;j++){
+                   setFillIn.add(s[j]);
+               }
+             /* NOTE(review): the units of s are added one by one rather than as one multi-character string - confirm this is intended */
+             }
+       }
+       
+       for(i=0; i<count; ++i){
+           s[length] = fromUSectionUChar.get(fromUSectionUCharIndex + i);
+           value = fromUSectionValues.get(fromUSectionValuesIndex + i);
+           
+           if(value==0) {
+               /* no mapping, do nothing */
+           } else if (FROM_U_IS_PARTIAL(value)) {
+               extGetUnicodeSetString( cx, setFillIn, useFallback, minLength, UConverterConstants.U_SENTINEL, s, length+1,
+                       FROM_U_GET_PARTIAL_INDEX(value)); /* the sentinel must stay negative; a (char) cast would turn it into U+FFFF */
+           } else if ((useFallback ? (value&FROM_U_RESERVED_MASK)==0:((value&(FROM_U_ROUNDTRIP_FLAG|FROM_U_RESERVED_MASK))==FROM_U_ROUNDTRIP_FLAG)) 
+                   && FROM_U_GET_LENGTH(value)>=minLength) {
+               for(int j=0; j<(length+1);j++){
+                   setFillIn.add(s[j]);
+               }
+             
+           }
+       }
+        
+    }
+    
+    
+    static void extGetUnicodeSet(UnicodeSet setFillIn, int which, int filter, UConverterSharedData Data){
+        int st1, stage1Length, st2, st3, minLength;
+        int ps2, ps3;
+        /* Adds the code points mapped by the extension table of Data (if it has one) that pass 'which' and 'filter'. */
+        CharBuffer stage12, stage3;
+        int value, length;
+        IntBuffer stage3b;
+        boolean useFallback;
+        char s[] = new char[MAX_UCHARS];
+        int c;
+        ByteBuffer cx = Data.mbcs.extIndexes;
+        if(cx == null){
+            return; /* no extension table, nothing to add */
+        }
+        stage12 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX,char.class );
+        stage3 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX,char.class );
+        stage3b = (IntBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX,int.class );
+        
+        stage1Length = cx.asIntBuffer().get(EXT_FROM_U_STAGE_1_LENGTH);
+        useFallback = (which==ROUNDTRIP_AND_FALLBACK_SET);
+        
+        c = 0; /* keep track of the current code point while enumerating */
+        if(filter == UCNV_SET_FILTER_2022_CN) {
+            minLength = 3;
+        } else if (Data.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY || filter != UCNV_SET_FILTER_NONE) {
+            /* DBCS-only, ignore single-byte results */
+            minLength = 2;
+        } else {
+            minLength = 1;
+        }
+        
+        for(st1=0; st1< stage1Length; ++st1){
+            st2 = stage12.get(st1);
+            if(st2>stage1Length) {
+                ps2 = st2;
+                for(st2=0;st2<64;++st2){
+                    
+                    if((st3=((int) stage12.get(ps2+st2))<<STAGE_2_LEFT_SHIFT)!= 0){
+                        ps3 = st3;
+                        do {
+                            value = stage3b.get((int)(UConverterConstants.UNSIGNED_SHORT_MASK&stage3.get(ps3++)));
+                            if(value==0){
+                                /* no mapping do nothing */
+                            }else if (FROM_U_IS_PARTIAL(value)){
+                                length = 0;
+                                length = UTF16.append(s, length, c); /* append returns the new length; without it the callee overwrites s[0] */
+                                extGetUnicodeSetString(cx,setFillIn,useFallback,minLength,c,s,length,(int)FROM_U_GET_PARTIAL_INDEX(value));
+                            } else if ((useFallback ?  (value&FROM_U_RESERVED_MASK)==0 
+                                        :((value&(FROM_U_ROUNDTRIP_FLAG|FROM_U_RESERVED_MASK))== 
+                                        FROM_U_ROUNDTRIP_FLAG)) && FROM_U_GET_LENGTH(value)>=minLength){
+                                            switch(filter) {
+                                            case UCNV_SET_FILTER_2022_CN:
+                                                if(!(FROM_U_GET_LENGTH(value)==3 && FROM_U_GET_DATA(value)<=0x82ffff)){
+                                                    continue; /* jumps to the do-while condition, which still advances c */
+                                                }
+                                                break;
+                                            case UCNV_SET_FILTER_SJIS:
+                                                if(!(FROM_U_GET_LENGTH(value)==2 && (value=FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)){
+                                                    continue;
+                                                }
+                                                break;
+                                            case UCNV_SET_FILTER_GR94DBCS:
+                                                if(!(FROM_U_GET_LENGTH(value)==2 && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=FROM_U_GET_DATA(value)) - 0xa1a1))<=(0xfefe - 0xa1a1) 
+                                                        && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1))<= (0xfe - 0xa1))){
+                                                    
+                                                    continue;
+                                                }
+                                                break;
+                                            case UCNV_SET_FILTER_HZ:
+                                                if(!(FROM_U_GET_LENGTH(value)==2 && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=FROM_U_GET_DATA(value)) - 0xa1a1))<=(0xfdfe - 0xa1a1) 
+                                                        && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1))<= (0xfe - 0xa1))){
+                                                    continue;
+                                                }
+                                                break;
+                                            default:
+                                                /*
+                                                 * UCNV_SET_FILTER_NONE,
+                                                 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
+                                                 */
+                                                break;
+                                            }
+                                            setFillIn.add(c);
+                                          
+                                        }
+                        }while((++c&0xf) != 0);
+                      
+                    } else {
+                        c+=16;   /* empty stage 3 block */
+                    }
+                }
+            } else {
+                c+=1024;  /* empty stage 2 block */
+            }
+        }
+    }
+    
+    void MBCSGetUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which){
+        MBCSGetFilteredUnicodeSetForUnicode(data, setFillIn, which, data.mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
+                UCNV_SET_FILTER_DBCS_ONLY : /* filter follows the table being enumerated ('data'), not this.sharedData */
+                    UCNV_SET_FILTER_NONE );
+    }
+    
+    void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
+        if((options & MBCS_OPTION_GB18030)!=0){
+            /* GB 18030 option: every code point except the surrogates U+D800..U+DFFF, via the helper shared with UTF-16/32 */
+            getNonSurrogateUnicodeSet(setFillIn);
+        }
+        else {
+            this.MBCSGetUnicodeSetForUnicode(sharedData, setFillIn, which);
+        }
+    }

 }
--- a/icu4j/src/com/ibm/icu/charset/CharsetUTF16.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetUTF16.java
@ -14,6 +14,7 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;

 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;

 /**
 * @author Niti Hantaweepant
@ -280,4 +281,8 @@ class CharsetUTF16 extends CharsetICU {
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderUTF16(this);
    }
+    
+    void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ // fills setFillIn for UTF-16; 'which' is accepted but not consulted
+        getNonSurrogateUnicodeSet(setFillIn);            // helper is not visible in this hunk; by its name presumably adds every non-surrogate code point -- TODO confirm
+    }
 }
--- a/icu4j/src/com/ibm/icu/charset/CharsetUTF32.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetUTF32.java
@ -14,6 +14,7 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;

 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;

 /**
 * @author Niti Hantaweepant
@ -242,4 +243,9 @@ class CharsetUTF32 extends CharsetICU {
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderUTF32(this);
    }
+    
+    
+    void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ // fills setFillIn for UTF-32; 'which' is accepted but not consulted
+        getNonSurrogateUnicodeSet(setFillIn);                    // helper is not visible in this hunk; by its name presumably adds every non-surrogate code point -- TODO confirm
+    }
 }
--- a/icu4j/src/com/ibm/icu/charset/CharsetUTF7.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetUTF7.java
@ -13,6 +13,8 @@ import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;

+import com.ibm.icu.text.UnicodeSet;
+
 /**
 * @author Michael Ow
 *
@ -747,4 +749,8 @@ class CharsetUTF7 extends CharsetICU {
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderUTF7(this);
    }
+    
+    void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ // fills setFillIn for UTF-7; 'which' is accepted but not consulted
+        getCompleteUnicodeSet(setFillIn); // helper is not visible in this hunk; by its name presumably adds the full Unicode range -- TODO confirm
+    }
 }
--- a/icu4j/src/com/ibm/icu/charset/CharsetUTF8.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetUTF8.java
@ -1,6 +1,6 @@
 /**
 *******************************************************************************
- * Copyright (C) 2006-2007, International Business Machines Corporation and    *
+ * Copyright (C) 2006-2008, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
@ -17,6 +17,7 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;

 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;

 /**
 * @author Niti Hantaweepant
@ -684,4 +685,9 @@ class CharsetUTF8 extends CharsetICU {
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderUTF8(this);
    }
+    
+    
+    void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ // fills setFillIn for UTF-8; 'which' is accepted but not consulted
+        getNonSurrogateUnicodeSet(setFillIn); // helper is not visible in this hunk; by its name presumably adds every non-surrogate code point -- TODO confirm
+    }
 }
--- a/icu4j/src/com/ibm/icu/dev/test/charset/TestAll.java
+++ b/icu4j/src/com/ibm/icu/dev/test/charset/TestAll.java
@ -1,6 +1,6 @@
 /**
 *******************************************************************************
-* Copyright (C) 2006, International Business Machines Corporation and    *
+* Copyright (C) 2008, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
--- a/icu4j/src/com/ibm/icu/dev/test/charset/TestConversion.java
+++ b/icu4j/src/com/ibm/icu/dev/test/charset/TestConversion.java
@ -21,10 +21,12 @@ import java.util.Iterator;
 import com.ibm.icu.charset.CharsetCallback;
 import com.ibm.icu.charset.CharsetEncoderICU;
 import com.ibm.icu.charset.CharsetDecoderICU;
+import com.ibm.icu.charset.CharsetICU;
 import com.ibm.icu.charset.CharsetProviderICU;
 import com.ibm.icu.dev.test.ModuleTest;
 import com.ibm.icu.dev.test.TestDataModule.DataMap;
 import com.ibm.icu.impl.ICUResourceBundle;
+import com.ibm.icu.text.UnicodeSet;

 /**
 * This maps to convtest.c which tests the test file for data-driven conversion tests. 
@ -828,6 +830,7 @@ public class TestConversion extends ModuleTest {
        return checkResultsToUnicode(cc, cc.unicode, cc.toUnicodeResult);
    }

+    
    private void TestGetUnicodeSet(DataMap testcase) {
        /*
         * charset - will be opened, and ucnv_getUnicodeSet() called on it //
@ -836,20 +839,90 @@ public class TestConversion extends ModuleTest {
         * returned set // which - numeric UConverterUnicodeSet value Headers {
         * "charset", "map", "mapnot", "which" }
         */
+        // Retrieve the test case data; "map" and "mapnot" hold UnicodeSet patterns.
        ConversionCase cc = new ConversionCase();
-        // retrieve test case data
+        CharsetProviderICU provider = new CharsetProviderICU();
+        CharsetICU charset;
+        UnicodeSet mapset = new UnicodeSet();
+        UnicodeSet mapnotset = new UnicodeSet();
+        UnicodeSet unicodeset = new UnicodeSet();
+        // Reported patterns longer than this are cut and end in the ellipsis.
+        final int maxPatternLength = 100;
+        final String ellipsis = "...";
+        // Converters this test skips because they are not supported at this point.
+        final String[] unsupported = {
+            "BOCU-1", "SCSU", "lmbcs1", "lmbcs2", "lmbcs3", "lmbcs4", "lmbcs5",
+            "lmbcs6", "lmbcs8", "lmbcs11", "lmbcs16", "lmbcs17", "lmbcs18", "lmbcs19"
+        };
        cc.charset = ((ICUResourceBundle) testcase.getObject("charset"))
                .getString();
        cc.map = ((ICUResourceBundle) testcase.getObject("map")).getString();
        cc.mapnot = ((ICUResourceBundle) testcase.getObject("mapnot"))
                .getString();
-        cc.which = ((ICUResourceBundle) testcase.getObject("which")).getUInt();
-
-        // create charset and encoder for each test case
-        logln("TestGetUnicodeSet not supported at this time");
-
+
+        // NOTE(review): the selector is hard-coded to 1 while the comment names
+        // ROUNDTRIP_SET; confirm that 1 is the intended selector constant.
+        int which = 1; // only checking for ROUNDTRIP_SET
+        try {
+            // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata
+            charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*')
+                    ? (CharsetICU) provider.charsetForName(cc.charset.substring(1), "../dev/data/testdata")
+                    : (CharsetICU) provider.charsetForName(cc.charset);
+
+            // A provider that returns null has no converter to exercise.
+            if (charset == null) {
+                logln("Converter not available :" + cc.charset);
+                return;
+            }
+
+            // Skip converters that are not supported at this point. Names are compared
+            // with equals() because == on Strings only tests object identity.
+            String name = charset.name();
+            for (int i = 0; i < unsupported.length; i++) {
+                if (unsupported[i].equals(name)) {
+                    logln("Converter not supported at this point :" + charset.displayName());
+                    return;
+                }
+            }
+
+            mapset.applyPattern(cc.map, false);
+            mapnotset.applyPattern(cc.mapnot, false);
+            charset.getUnicodeSet(unicodeset, which);
+
+            // NOTE(review): mismatches below are reported with logln, so they never
+            // fail the test; consider errln once the test data is known to pass.
+            // are there items that must be in unicodeset but are not?
+            UnicodeSet diffset = mapset;
+            diffset.removeAll(unicodeset);
+            if (!diffset.isEmpty()) {
+                StringBuffer s = new StringBuffer(diffset.toPattern(true));
+                if (s.length() > maxPatternLength) {
+                    s.replace(maxPatternLength, s.length(), ellipsis);
+                }
+                logln("error in missing items - conversion/getUnicodeSet test case " + cc.charset);
+                logln(s.toString());
+            }
+
+            // are there items that must not be in unicodeset but are?
+            diffset = mapnotset;
+            diffset.retainAll(unicodeset);
+            if (!diffset.isEmpty()) {
+                StringBuffer s = new StringBuffer(diffset.toPattern(true));
+                if (s.length() > maxPatternLength) {
+                    s.replace(maxPatternLength, s.length(), ellipsis);
+                }
+                logln("contains unexpected items - conversion/getUnicodeSet test case " + cc.charset);
+                logln(s.toString());
+            }
+        } catch (Exception e) {
+            logln("getUnicodeSet returned an error code");
+            logln("ErrorCode expected is: " + cc.outErrorCode);
+            logln("Error Result is: " + e.toString());
+        }
    }

+    
+
    /**
     * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
     * start of the stream for example U+FEFF (the Unicode BOM/signature