ICU-6143 getUnicodeSet API merged from icu4j/branches/krajwade/GetUnicodeSet-Test [23632:23722]
X-SVN-Rev: 23723
This commit is contained in:
parent
b878ede950
commit
dc4b395371
@ -1,6 +1,6 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006-2007, International Business Machines Corporation and *
|
||||
* Copyright (C) 2006-2008, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -15,6 +15,8 @@ import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
class Charset88591 extends CharsetASCII {
|
||||
public Charset88591(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
|
||||
super(icuCanonicalName, javaCanonicalName, aliases);
|
||||
@ -107,5 +109,8 @@ class Charset88591 extends CharsetASCII {
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoder88591(this);
|
||||
}
|
||||
|
||||
|
||||
void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
|
||||
setFillIn.add(0,0xff);
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006-2007, International Business Machines Corporation and *
|
||||
* Copyright (C) 2006-2008, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
@ -18,6 +18,7 @@ import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
class CharsetASCII extends CharsetICU {
|
||||
protected byte[] fromUSubstitution = new byte[] { (byte) 0x1a };
|
||||
@ -351,4 +352,8 @@ class CharsetASCII extends CharsetICU {
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderASCII(this);
|
||||
}
|
||||
|
||||
void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
|
||||
setFillIn.add(0,0x7f);
|
||||
}
|
||||
}
|
||||
|
@ -6,6 +6,8 @@
|
||||
*/
|
||||
package com.ibm.icu.charset;
|
||||
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* The purpose of this class is to set isCESU8 to true in the super class, and to allow the Charset framework to open
|
||||
* the variant UTF-8 converter without extra setup work. CESU-8 encodes/decodes supplementary characters as 6 bytes
|
||||
@ -15,4 +17,10 @@ class CharsetCESU8 extends CharsetUTF8 {
|
||||
/**
 * Constructs the charset by handing the three arguments, unchanged and in the same order,
 * to the superclass constructor.
 *
 * @param icuCanonicalName  forwarded to the superclass as its first argument
 * @param javaCanonicalName forwarded to the superclass as its second argument
 * @param aliases           forwarded to the superclass as its third argument
 */
public CharsetCESU8(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
|
||||
super(icuCanonicalName, javaCanonicalName, aliases);
|
||||
}
|
||||
|
||||
|
||||
void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
|
||||
getCompleteUnicodeSet(setFillIn);
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -14,6 +14,7 @@ import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
public class CharsetHZ extends CharsetICU {
|
||||
|
||||
@ -342,4 +343,10 @@ public class CharsetHZ extends CharsetICU {
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderHZ(this);
|
||||
}
|
||||
|
||||
void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
|
||||
setFillIn.add(0,0x7f);
|
||||
// CharsetMBCS mbcshz = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546");
|
||||
gbCharset.MBCSGetFilteredUnicodeSetForUnicode(gbCharset.sharedData, setFillIn, which, CharsetMBCS.UCNV_SET_FILTER_HZ);
|
||||
}
|
||||
}
|
||||
|
@ -17,6 +17,8 @@ import java.lang.reflect.InvocationTargetException;
|
||||
import java.nio.charset.*;
|
||||
import java.util.HashMap;
|
||||
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* <p>A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
|
||||
* This API is used to convert codepage or character encoded data to and
|
||||
@ -57,6 +59,13 @@ public abstract class CharsetICU extends Charset{
|
||||
byte subChar1; /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */
|
||||
//byte reserved[/*19*/]; /* +81: 19 to round out the structure */
|
||||
|
||||
/**
 * Selector value for getUnicodeSet(): requests the set of roundtrippable Unicode code points.
 * @draft ICU 4.0
 */
|
||||
public static final int ROUNDTRIP_SET=1; //UCNV_ROUNDTRIP_SET,
|
||||
// Selector value that additionally asks for code points reachable through fallback mappings.
// Note: getUnicodeSet() in this class currently throws IllegalArgumentException for every
// selector other than ROUNDTRIP_SET, so this value is only honored by internal callers.
public static final int ROUNDTRIP_AND_FALLBACK_SET =2;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
@ -323,6 +332,53 @@ public abstract class CharsetICU extends Charset{
|
||||
// /* no known Unicode signature byte sequence recognized */
|
||||
// return null;
|
||||
// }
|
||||
|
||||
|
||||
/**
 * Subclass hook behind getUnicodeSet(): that method validates its arguments, clears
 * setFillIn and then calls this method so the concrete charset can add its code points.
 *
 * @param setFillIn the set to add code points to
 * @param which     the set selector that was passed to getUnicodeSet()
 */
abstract void getUnicodeSetImpl(UnicodeSet setFillIn, int which);
|
||||
|
||||
/**
|
||||
* <p>Returns the set of Unicode code points that can be converted by an ICU Converter.
|
||||
*
|
||||
* Returns one of the several kind of set
|
||||
*
|
||||
* <p>ROUNDTRIP_SET
|
||||
*
|
||||
* The set of all Unicode code points that can be roundtrip-converted
|
||||
* (converted without any data loss) with the converter.
|
||||
* This set will not include code points that have fallback mappings
|
||||
* or are only the result of reverse fallback mappings.
|
||||
*
|
||||
* <p>This is useful for example for
|
||||
* - checking that a string or document can be roundtrip-converted with a converter,
|
||||
* without/before actually performing the conversion
|
||||
* - testing if a converter can be used for text for typical text for a certain locale,
|
||||
* by comparing its roundtrip set with the set of ExemplarCharacters from
|
||||
* ICU's locale data or other sources
|
||||
*
|
||||
*@param setFillIn A valid UnicodeSet. It will be cleared by this function before
|
||||
* the converter's specific set is filled in.
|
||||
*@param which A selector;
|
||||
* currently ROUNDTRIP_SET is the only supported value.
|
||||
*@throws IllegalArgumentException if the parameters does not match.
|
||||
*@draft ICU 4.0
|
||||
*@provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public void getUnicodeSet(UnicodeSet setFillIn, int which){
|
||||
if( setFillIn == null || which != ROUNDTRIP_SET ){
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
setFillIn.clear();
|
||||
getUnicodeSetImpl(setFillIn, which);
|
||||
}
|
||||
|
||||
static void getNonSurrogateUnicodeSet(UnicodeSet setFillIn){
|
||||
setFillIn.add(0, 0xd7ff);
|
||||
setFillIn.add(0xe000, 0x10ffff);
|
||||
}
|
||||
|
||||
static void getCompleteUnicodeSet(UnicodeSet setFillIn){
|
||||
setFillIn.add(0, 0x10ffff);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -14,6 +14,7 @@ import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* @author Michael Ow
|
||||
@ -221,7 +222,7 @@ class CharsetISCII extends CharsetICU {
|
||||
/* 0xc9: 0xfe: 0x92b */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.ZERO,
|
||||
/* 0xca: 0xfe: 0x92c */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.ZERO,
|
||||
/* 0xcb: 0xfe: 0x92d */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.ZERO,
|
||||
/* 0xcc: 0xfe: 0x92e */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.ZERO,
|
||||
/* 0xcc: 0xfe: 0x92e */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
|
||||
/* 0xcd: 0xff: 0x92f */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
|
||||
/* 0xcf: 0xff: 0x930 */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
|
||||
/* 0xd0: 0x87: 0x931 */ MaskEnum.DEV_MASK + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
|
||||
@ -1270,4 +1271,25 @@ class CharsetISCII extends CharsetICU {
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderISCII(this);
|
||||
}
|
||||
|
||||
void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
|
||||
int idx,script;
|
||||
char mask;
|
||||
|
||||
setFillIn.add(0,ASCII_END );
|
||||
for(script = UniLang.DEVALANGARI ; script<= UniLang.MALAYALAM ;script++){
|
||||
mask = (char)lookupInitialData[script].maskEnum;
|
||||
for(idx=0; idx < UniLang.DELTA ; idx++){
|
||||
// Special check for telugu character
|
||||
if((validityTable[idx] & mask)!=0 || (script == UniLang.TELUGU && idx==0x31)){
|
||||
setFillIn.add(idx+(script*UniLang.DELTA)+INDIC_BLOCK_BEGIN );
|
||||
}
|
||||
}
|
||||
}
|
||||
setFillIn.add(DANDA);
|
||||
setFillIn.add(DOUBLE_DANDA);
|
||||
setFillIn.add(ZWNJ);
|
||||
setFillIn.add(ZWJ);
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -17,6 +17,7 @@ import java.util.Arrays;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.charset.CharsetMBCS.MBCSToUFallback;
|
||||
import com.ibm.icu.charset.CharsetMBCS.UConverterMBCSTable;
|
||||
import com.ibm.icu.charset.CharsetMBCS;
|
||||
@ -3077,6 +3078,98 @@ class CharsetISO2022 extends CharsetICU {
|
||||
myConverterData.currentEncoder.fromUnicodeStatus = 1; /* prevLength */
|
||||
}
|
||||
}
|
||||
|
||||
void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
|
||||
int i;
|
||||
/*open a set and initialize it with code points that are algorithmically round-tripped */
|
||||
|
||||
switch(variant){
|
||||
case ISO_2022_JP:
|
||||
/*include JIS X 0201 which is hardcoded */
|
||||
setFillIn.add(0xa5);
|
||||
setFillIn.add(0x203e);
|
||||
if((jpCharsetMasks[myConverterData.version]&CSM(ISO8859_1))!=0){
|
||||
/*include Latin-1 some variants of JP */
|
||||
setFillIn.add(0, 0xff);
|
||||
|
||||
}
|
||||
else {
|
||||
/* include ASCII for JP */
|
||||
setFillIn.add(0, 0x7f);
|
||||
}
|
||||
if(myConverterData.version==3 || myConverterData.version==4 ||which == ROUNDTRIP_AND_FALLBACK_SET){
|
||||
/*
|
||||
* Do not test(jpCharsetMasks[myConverterData.version]&CSM(HWKANA_7BIT))!=0 because the bit
|
||||
* is on for all JP versions although version 3 & 4 (JIS7 and JIS8) use half-width Katakana.
|
||||
* This is because all ISO_2022_JP variant are lenient in that they accept (in toUnicode) half-width
|
||||
* Katakana via ESC.
|
||||
* However, we only emit (fromUnicode) half-width Katakana according to the
|
||||
* definition of each variant.
|
||||
*
|
||||
* When including fallbacks,
|
||||
* we need to include half-width Katakana Unicode code points for all JP variants because
|
||||
* JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
|
||||
*/
|
||||
/* include half-width Katakana for JP */
|
||||
setFillIn.add(HWKANA_START, HWKANA_END);
|
||||
}
|
||||
break;
|
||||
case ISO_2022_CN:
|
||||
/* Include ASCII for CN */
|
||||
setFillIn.add(0, 0x7f);
|
||||
break;
|
||||
case ISO_2022_KR:
|
||||
/* there is only one converter for KR */
|
||||
myConverterData.currentConverter.getUnicodeSetImpl(setFillIn, which);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
//TODO Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until
|
||||
for(i=0; i<UCNV_2022_MAX_CONVERTERS;i++){
|
||||
int filter;
|
||||
if(myConverterData.myConverterArray[i]!=null){
|
||||
if(variant==ISO_2022_CN && myConverterData.version==0 && i==CNS_11643){
|
||||
/*
|
||||
*
|
||||
* version -specific for CN:
|
||||
* CN version 0 does not map CNS planes 3..7 although
|
||||
* they are all available in the CNS conversion table;
|
||||
* CN version 1 (-EXT) does map them all.
|
||||
* The two versions create different Unicode sets.
|
||||
*/
|
||||
filter=CharsetMBCS.UCNV_SET_FILTER_2022_CN;
|
||||
} else if(variant==ISO_2022_JP && i == JISX208){
|
||||
/*
|
||||
* Only add code points that map to Shift-JIS codes
|
||||
* corrosponding to JIS X 208
|
||||
*/
|
||||
filter=CharsetMBCS.UCNV_SET_FILTER_SJIS;
|
||||
} else if(i==KSC5601){
|
||||
/*
|
||||
* Some of the KSC 5601 tables (Convrtrs.txt has this aliases on multiple tables)
|
||||
* are broader than GR94.
|
||||
*/
|
||||
filter=CharsetMBCS.UCNV_SET_FILTER_GR94DBCS;
|
||||
} else {
|
||||
filter=CharsetMBCS.UCNV_SET_FILTER_NONE;
|
||||
}
|
||||
|
||||
myConverterData.currentConverter.MBCSGetFilteredUnicodeSetForUnicode(myConverterData.myConverterArray[i],setFillIn, which, filter);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* ISO Converter must not convert SO/SI/ESC despite what sub-converters do by themselves
|
||||
* Remove these characters from the set.
|
||||
*/
|
||||
setFillIn.remove(0x0e);
|
||||
setFillIn.remove(0x0f);
|
||||
setFillIn.remove(0x1b);
|
||||
|
||||
/* ISO 2022 converter do not convert C! controls either */
|
||||
setFillIn.remove(0x80, 0x9f);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -26,12 +26,25 @@ import com.ibm.icu.impl.ICUResourceBundle;
|
||||
import com.ibm.icu.impl.InvalidFormatException;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.charset.UConverterConstants;
|
||||
|
||||
class CharsetMBCS extends CharsetICU {
|
||||
|
||||
private byte[] fromUSubstitution = null;
|
||||
UConverterSharedData sharedData = null;
|
||||
private static final int MAX_VERSION_LENGTH = 4;
|
||||
|
||||
// These filter selectors are used by the getUnicodeSet() implementation and may be changed in
// the future. They are passed as the "filter" argument of MBCSGetFilteredUnicodeSetForUnicode()
// and extGetUnicodeSet().
|
||||
// typedef enum UConverterSetFilter {
|
||||
static final int UCNV_SET_FILTER_NONE = 1; // no restriction on the byte results
|
||||
static final int UCNV_SET_FILTER_DBCS_ONLY = 2; // ignore single-byte results (< 0x100)
|
||||
static final int UCNV_SET_FILTER_2022_CN = 3; // only results whose lead byte is 0x81 or 0x82 (non-EXT ISO-2022-CN)
|
||||
static final int UCNV_SET_FILTER_SJIS= 4 ; // only two-byte results in 0x8140..0xeffc
|
||||
static final int UCNV_SET_FILTER_GR94DBCS = 5; // only two-byte results in 0xa1a1..0xfefe with trail byte 0xa1..0xfe
|
||||
static final int UCNV_SET_FILTER_HZ = 6; // only two-byte results in 0xa1a1..0xfdfe with trail byte 0xa1..0xfe
|
||||
static final int UCNV_SET_FILTER_COUNT = 7; // one past the last filter value above
|
||||
// } UConverterSetFilter;
|
||||
|
||||
/**
|
||||
* Fallbacks to Unicode are stored outside the normal state table and code point structures in a vector of items of
|
||||
@ -4781,5 +4794,355 @@ class CharsetMBCS extends CharsetICU {
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderMBCS(this);
|
||||
}
|
||||
|
||||
void MBCSGetFilteredUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which, int filter){
|
||||
UConverterMBCSTable mbcsTable;
|
||||
char[] table;
|
||||
char st1,maxStage1, st2;
|
||||
int st3;
|
||||
int c ;
|
||||
|
||||
mbcsTable = data.mbcs;
|
||||
table = mbcsTable.fromUnicodeTable;
|
||||
if((mbcsTable.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY)!=0){
|
||||
maxStage1 = 0x440;
|
||||
}
|
||||
else{
|
||||
maxStage1 = 0x40;
|
||||
}
|
||||
c=0; /* keep track of current code point while enumerating */
|
||||
|
||||
if(mbcsTable.outputType==MBCS_OUTPUT_1){
|
||||
char stage2, stage3;
|
||||
char minValue;
|
||||
CharBuffer results;
|
||||
results = ByteBuffer.wrap(mbcsTable.fromUnicodeBytes).asCharBuffer();
|
||||
|
||||
if(which==ROUNDTRIP_SET) {
|
||||
/* use only roundtrips */
|
||||
minValue=0xf00;
|
||||
} else {
|
||||
/* use all roundtrip and fallback results */
|
||||
minValue=0x800;
|
||||
}
|
||||
for(st1=0;st1<maxStage1;++st1){
|
||||
st2 = table[st1];
|
||||
if(st2>maxStage1){
|
||||
stage2 = st2;
|
||||
for(st2=0; st2<64; ++st2){
|
||||
st3 = table[stage2 + st2];
|
||||
if(st3!=0){
|
||||
/*read the stage 3 block */
|
||||
stage3 = (char)st3;
|
||||
do {
|
||||
if(results.get(stage3++)>=minValue){
|
||||
setFillIn.add(c);
|
||||
}
|
||||
|
||||
}while((++c&0xf) !=0);
|
||||
} else {
|
||||
c+= 16; /*empty stage 2 block */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
c+=1024; /* empty stage 2 block */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
int stage2,stage3;
|
||||
byte[] bytes;
|
||||
int st3Multiplier;
|
||||
int value;
|
||||
boolean useFallBack;
|
||||
bytes = mbcsTable.fromUnicodeBytes;
|
||||
useFallBack = (which == ROUNDTRIP_AND_FALLBACK_SET);
|
||||
switch(mbcsTable.outputType) {
|
||||
case MBCS_OUTPUT_3:
|
||||
case MBCS_OUTPUT_4_EUC:
|
||||
st3Multiplier = 3;
|
||||
break;
|
||||
case MBCS_OUTPUT_4:
|
||||
st3Multiplier =4;
|
||||
break;
|
||||
default:
|
||||
st3Multiplier =2;
|
||||
break;
|
||||
}
|
||||
//ByteBuffer buffer = (ByteBuffer)charTobyte(table);
|
||||
|
||||
for(st1=0;st1<maxStage1;++st1){
|
||||
st2 = table[st1];
|
||||
if(st2>(maxStage1>>1)){
|
||||
stage2 = st2 ;
|
||||
for(st2=0;st2<128;++st2){
|
||||
/*read the stage 3 block */
|
||||
st3 = table[stage2*2 + st2]<<16;
|
||||
st3+=table[stage2*2 + ++st2];
|
||||
if(st3!=0){
|
||||
//if((st3=table[stage2+st2])!=0){
|
||||
stage3 = st3Multiplier*16*(int)(st3&UConverterConstants.UNSIGNED_SHORT_MASK);
|
||||
|
||||
/* get the roundtrip flags for the stage 3 block */
|
||||
st3>>=16;
|
||||
st3 &= UConverterConstants.UNSIGNED_SHORT_MASK;
|
||||
switch(filter) {
|
||||
case UCNV_SET_FILTER_NONE:
|
||||
do {
|
||||
|
||||
if((st3&1)!=0){
|
||||
setFillIn.add(c);
|
||||
stage3+=st3Multiplier;
|
||||
}else if (useFallBack) {
|
||||
|
||||
char b =0;
|
||||
switch(st3Multiplier) {
|
||||
case 4 :
|
||||
|
||||
b|= ByteBuffer.wrap(bytes).getChar(stage3++);
|
||||
|
||||
case 3 :
|
||||
|
||||
b|= ByteBuffer.wrap(bytes).getChar(stage3++);
|
||||
|
||||
case 2 :
|
||||
|
||||
b|= ByteBuffer.wrap(bytes).getChar(stage3) | ByteBuffer.wrap(bytes).getChar(stage3+1);
|
||||
stage3+=2;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if(b!=0) {
|
||||
setFillIn.add(c);
|
||||
}
|
||||
}
|
||||
st3>>=1;
|
||||
}while((++c&0xf)!=0);
|
||||
break;
|
||||
case UCNV_SET_FILTER_DBCS_ONLY:
|
||||
/* Ignore single bytes results (<0x100). */
|
||||
do {
|
||||
if(((st3&1) != 0 || useFallBack) && (UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))) >= 0x100){
|
||||
setFillIn.add(c);
|
||||
}
|
||||
st3>>=1;
|
||||
stage3+=2;
|
||||
}while((++c&0xf) != 0);
|
||||
break;
|
||||
case UCNV_SET_FILTER_2022_CN :
|
||||
/* only add code points that map to CNS 11643 planes 1&2 for non-EXT ISO-2202-CN. */
|
||||
do {
|
||||
if(((st3&1) != 0 || useFallBack) && ((value= (UConverterConstants.UNSIGNED_BYTE_MASK & (ByteBuffer.wrap(bytes).get(stage3))))==0x81
|
||||
|| value==0x82) ){
|
||||
setFillIn.add(c);
|
||||
}
|
||||
st3>>=1;
|
||||
stage3+=3;
|
||||
}while((++c&0xf)!=0);
|
||||
break;
|
||||
case UCNV_SET_FILTER_SJIS:
|
||||
/* only add code points that map tp Shift-JIS codes corrosponding to JIS X 0280. */
|
||||
do{
|
||||
|
||||
if(((st3&1) != 0 || useFallBack) && (value=(UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))))>=0x8140 && value<=0xeffc){
|
||||
setFillIn.add(c);
|
||||
}
|
||||
st3>>=1;
|
||||
stage3+=2;
|
||||
}while((++c&0xf)!=0);
|
||||
break;
|
||||
case UCNV_SET_FILTER_GR94DBCS:
|
||||
/* only add code points that maps to ISO 2022 GR 94 DBCS codes*/
|
||||
do {
|
||||
if(((st3&1) != 0 || useFallBack) && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=(UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))))- 0xa1a1))<=(0xfefe - 0xa1a1) &&
|
||||
( UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)){
|
||||
setFillIn.add(c);
|
||||
}
|
||||
st3>>=1;
|
||||
stage3+=2;
|
||||
}while((++c&0xf)!=0);
|
||||
break;
|
||||
case UCNV_SET_FILTER_HZ:
|
||||
/*Only add code points that are suitable for HZ DBCS*/
|
||||
do {
|
||||
if( ((st3&1) != 0 || useFallBack) && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=(UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))))-0xa1a1))<=(0xfdfe - 0xa1a1) &&
|
||||
(UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)){
|
||||
setFillIn.add(c);
|
||||
}
|
||||
st3>>=1;
|
||||
stage3+=2;
|
||||
}while((++c&0xf) != 0);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
c+=16; /* empty stage 3 block */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
c+=1024; /*empty stage2 block */
|
||||
}
|
||||
}
|
||||
}
|
||||
extGetUnicodeSet(setFillIn, which, filter, data);
|
||||
}
|
||||
|
||||
static void extGetUnicodeSetString(ByteBuffer cx,UnicodeSet setFillIn, boolean useFallback,
|
||||
int minLength, int c, char s[],int length,int sectionIndex){
|
||||
CharBuffer fromUSectionUChar;
|
||||
IntBuffer fromUSectionValues;
|
||||
fromUSectionUChar = (CharBuffer)ARRAY(cx, EXT_FROM_U_UCHARS_INDEX,char.class );
|
||||
fromUSectionValues = (IntBuffer)ARRAY(cx, EXT_FROM_U_VALUES_INDEX,int.class );
|
||||
int fromUSectionUCharIndex = fromUSectionUChar.position()+sectionIndex;
|
||||
int fromUSectionValuesIndex = fromUSectionValues.position()+sectionIndex;
|
||||
int value, i, count;
|
||||
|
||||
/* read first pair of the section */
|
||||
count = fromUSectionUChar.get(fromUSectionUCharIndex++);
|
||||
value = UConverterConstants.UNSIGNED_SHORT_MASK & fromUSectionValues.get(fromUSectionValuesIndex++);
|
||||
|
||||
if(value!=0 && (FROM_U_IS_ROUNDTRIP(value) || useFallback) && FROM_U_GET_LENGTH(value)>=minLength) {
|
||||
if(c>=0){
|
||||
setFillIn.add(c);
|
||||
} else {
|
||||
for(int j=0;j<length;j++){
|
||||
setFillIn.add(s[j]);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
for(i=0; i<count; ++i){
|
||||
s[length] = fromUSectionUChar.get(fromUSectionUCharIndex + i);
|
||||
value = fromUSectionValues.get(fromUSectionValuesIndex + i);
|
||||
|
||||
if(value==0) {
|
||||
/* no mapping, do nothing */
|
||||
} else if (FROM_U_IS_PARTIAL(value)) {
|
||||
extGetUnicodeSetString( cx, setFillIn, useFallback, minLength, (char)UConverterConstants.U_SENTINEL, s, length+1,
|
||||
FROM_U_GET_PARTIAL_INDEX(value));
|
||||
} else if ((useFallback ? (value&FROM_U_RESERVED_MASK)==0:((value&(FROM_U_ROUNDTRIP_FLAG|FROM_U_RESERVED_MASK))==FROM_U_ROUNDTRIP_FLAG))
|
||||
&& FROM_U_GET_LENGTH(value)>=minLength) {
|
||||
for(int j=0; j<(length+1);j++){
|
||||
setFillIn.add(s[j]);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
static void extGetUnicodeSet(UnicodeSet setFillIn, int which, int filter, UConverterSharedData Data){
|
||||
int st1, stage1Length, st2, st3, minLength;
|
||||
int ps2, ps3;
|
||||
|
||||
CharBuffer stage12, stage3;
|
||||
int value, length;
|
||||
IntBuffer stage3b;
|
||||
boolean useFallback;
|
||||
char s[] = new char[MAX_UCHARS];
|
||||
int c;
|
||||
ByteBuffer cx = Data.mbcs.extIndexes;
|
||||
if(cx == null){
|
||||
return;
|
||||
}
|
||||
stage12 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX,char.class );
|
||||
stage3 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX,char.class );
|
||||
stage3b = (IntBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX,int.class );
|
||||
|
||||
stage1Length = cx.asIntBuffer().get(EXT_FROM_U_STAGE_1_LENGTH);
|
||||
useFallback =(boolean)(which==ROUNDTRIP_AND_FALLBACK_SET);
|
||||
|
||||
c = 0;
|
||||
if(filter == UCNV_SET_FILTER_2022_CN) {
|
||||
minLength = 3;
|
||||
} else if (Data.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY || filter != UCNV_SET_FILTER_NONE) {
|
||||
/* DBCS-only, ignore single-byte results */
|
||||
minLength = 2;
|
||||
} else {
|
||||
minLength = 1;
|
||||
}
|
||||
|
||||
for(st1=0; st1< stage1Length; ++st1){
|
||||
st2 = stage12.get(st1);
|
||||
if(st2>stage1Length) {
|
||||
ps2 = st2;
|
||||
for(st2=0;st2<64;++st2){
|
||||
|
||||
if((st3=((int) stage12.get(ps2+st2))<<STAGE_2_LEFT_SHIFT)!= 0){
|
||||
ps3 = st3;
|
||||
do {
|
||||
value = stage3b.get((int)(UConverterConstants.UNSIGNED_SHORT_MASK&stage3.get(ps3++)));
|
||||
if(value==0){
|
||||
/* no mapping do nothing */
|
||||
}else if (FROM_U_IS_PARTIAL(value)){
|
||||
length = 0;
|
||||
UTF16.append(s, length, c);
|
||||
extGetUnicodeSetString(cx,setFillIn,useFallback,minLength,c,s,length,(int)FROM_U_GET_PARTIAL_INDEX(value));
|
||||
} else if ((useFallback ? (value&FROM_U_RESERVED_MASK)==0
|
||||
:((value&(FROM_U_ROUNDTRIP_FLAG|FROM_U_RESERVED_MASK))==
|
||||
FROM_U_ROUNDTRIP_FLAG)) && FROM_U_GET_LENGTH(value)>=minLength){
|
||||
switch(filter) {
|
||||
case UCNV_SET_FILTER_2022_CN:
|
||||
if(!(FROM_U_GET_LENGTH(value)==3 && FROM_U_GET_DATA(value)<=0x82ffff)){
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case UCNV_SET_FILTER_SJIS:
|
||||
if(!(FROM_U_GET_LENGTH(value)==2 && (value=FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)){
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case UCNV_SET_FILTER_GR94DBCS:
|
||||
if(!(FROM_U_GET_LENGTH(value)==2 && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=FROM_U_GET_DATA(value)) - 0xa1a1))<=(0xfefe - 0xa1a1)
|
||||
&& (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1))<= (0xfe - 0xa1))){
|
||||
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case UCNV_SET_FILTER_HZ:
|
||||
if(!(FROM_U_GET_LENGTH(value)==2 && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=FROM_U_GET_DATA(value)) - 0xa1a1))<=(0xfdfe - 0xa1a1)
|
||||
&& (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1))<= (0xfe - 0xa1))){
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
/*
|
||||
* UCNV_SET_FILTER_NONE,
|
||||
* or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
|
||||
*/
|
||||
break;
|
||||
}
|
||||
setFillIn.add(c);
|
||||
|
||||
}
|
||||
}while((++c&0xf) != 0);
|
||||
|
||||
} else {
|
||||
c+=16; /* emplty stage3 block */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
c+=1024; /* empty stage 2 block*/
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MBCSGetUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which){
|
||||
MBCSGetFilteredUnicodeSetForUnicode(data, setFillIn, which, this.sharedData.mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
|
||||
UCNV_SET_FILTER_DBCS_ONLY :
|
||||
UCNV_SET_FILTER_NONE );
|
||||
}
|
||||
|
||||
void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
|
||||
if((options & MBCS_OPTION_GB18030)!=0){
|
||||
setFillIn.add(0, 0xd7ff);
|
||||
setFillIn.add(0xe000, 0x10ffff);
|
||||
}
|
||||
else {
|
||||
this.MBCSGetUnicodeSetForUnicode(sharedData, setFillIn, which);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -14,6 +14,7 @@ import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* @author Niti Hantaweepant
|
||||
@ -280,4 +281,8 @@ class CharsetUTF16 extends CharsetICU {
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderUTF16(this);
|
||||
}
|
||||
|
||||
void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
|
||||
getNonSurrogateUnicodeSet(setFillIn);
|
||||
}
|
||||
}
|
||||
|
@ -14,6 +14,7 @@ import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* @author Niti Hantaweepant
|
||||
@ -242,4 +243,9 @@ class CharsetUTF32 extends CharsetICU {
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderUTF32(this);
|
||||
}
|
||||
|
||||
|
||||
void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
|
||||
getNonSurrogateUnicodeSet(setFillIn);
|
||||
}
|
||||
}
|
||||
|
@ -13,6 +13,8 @@ import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* @author Michael Ow
|
||||
*
|
||||
@ -747,4 +749,8 @@ class CharsetUTF7 extends CharsetICU {
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderUTF7(this);
|
||||
}
|
||||
|
||||
void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
|
||||
getCompleteUnicodeSet(setFillIn);
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006-2007, International Business Machines Corporation and *
|
||||
* Copyright (C) 2006-2008, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
@ -17,6 +17,7 @@ import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* @author Niti Hantaweepant
|
||||
@ -684,4 +685,9 @@ class CharsetUTF8 extends CharsetICU {
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderUTF8(this);
|
||||
}
|
||||
|
||||
|
||||
void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
|
||||
getNonSurrogateUnicodeSet(setFillIn);
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* Copyright (C) 2008, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
|
@ -21,10 +21,12 @@ import java.util.Iterator;
|
||||
import com.ibm.icu.charset.CharsetCallback;
|
||||
import com.ibm.icu.charset.CharsetEncoderICU;
|
||||
import com.ibm.icu.charset.CharsetDecoderICU;
|
||||
import com.ibm.icu.charset.CharsetICU;
|
||||
import com.ibm.icu.charset.CharsetProviderICU;
|
||||
import com.ibm.icu.dev.test.ModuleTest;
|
||||
import com.ibm.icu.dev.test.TestDataModule.DataMap;
|
||||
import com.ibm.icu.impl.ICUResourceBundle;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* This maps to convtest.c which tests the test file for data-driven conversion tests.
|
||||
@ -828,6 +830,7 @@ public class TestConversion extends ModuleTest {
|
||||
return checkResultsToUnicode(cc, cc.unicode, cc.toUnicodeResult);
|
||||
}
|
||||
|
||||
|
||||
private void TestGetUnicodeSet(DataMap testcase) {
|
||||
/*
|
||||
* charset - will be opened, and ucnv_getUnicodeSet() called on it //
|
||||
@ -836,20 +839,94 @@ public class TestConversion extends ModuleTest {
|
||||
* returned set // which - numeric UConverterUnicodeSet value Headers {
|
||||
* "charset", "map", "mapnot", "which" }
|
||||
*/
|
||||
|
||||
|
||||
// retrieve test case data
|
||||
ConversionCase cc = new ConversionCase();
|
||||
// retrieve test case data
|
||||
CharsetProviderICU provider = new CharsetProviderICU();
|
||||
CharsetICU charset ;
|
||||
|
||||
|
||||
UnicodeSet mapset = new UnicodeSet();
|
||||
UnicodeSet mapnotset = new UnicodeSet();
|
||||
UnicodeSet unicodeset = new UnicodeSet();
|
||||
String ellipsis = "0x2e";
|
||||
cc.charset = ((ICUResourceBundle) testcase.getObject("charset"))
|
||||
.getString();
|
||||
cc.map = ((ICUResourceBundle) testcase.getObject("map")).getString();
|
||||
cc.mapnot = ((ICUResourceBundle) testcase.getObject("mapnot"))
|
||||
.getString();
|
||||
cc.which = ((ICUResourceBundle) testcase.getObject("which")).getUInt();
|
||||
|
||||
// create charset and encoder for each test case
|
||||
logln("TestGetUnicodeSet not supported at this time");
|
||||
|
||||
|
||||
|
||||
int which = 1; // only checking for ROUNDTRIP_SET
|
||||
try{
|
||||
// if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata
|
||||
charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*')
|
||||
? (CharsetICU) provider.charsetForName(cc.charset.substring(1), "../dev/data/testdata")
|
||||
: (CharsetICU) provider.charsetForName(cc.charset);
|
||||
|
||||
|
||||
|
||||
|
||||
//checking for converter that are not supported at this point
|
||||
try{
|
||||
if(charset.name()=="BOCU-1" ||charset.name()== "SCSU"|| charset.name()=="lmbcs1" || charset.name()== "lmbcs2"
|
||||
|
||||
|| charset.name()== "lmbcs3" || charset.name()== "lmbcs4" || charset.name()=="lmbcs5" || charset.name()=="lmbcs6" ||
|
||||
charset.name()== "lmbcs8" || charset.name()=="lmbcs11" || charset.name()=="lmbcs16" || charset.name()=="lmbcs17" || charset.name()=="lmbcs18"
|
||||
|| charset.name()=="lmbcs19"){
|
||||
logln("Converter not supported at this point :" +charset.displayName());
|
||||
}
|
||||
|
||||
}catch(Exception e){
|
||||
return;
|
||||
}
|
||||
mapset.clear();
|
||||
mapnotset.clear();
|
||||
|
||||
mapset.applyPattern(cc.map.toString(),false);
|
||||
mapnotset.applyPattern(cc.mapnot,false);
|
||||
charset.getUnicodeSet(unicodeset, which);
|
||||
UnicodeSet diffset = new UnicodeSet();
|
||||
|
||||
//are there items that must be in unicodeset but are not?
|
||||
|
||||
(diffset = mapset).removeAll(unicodeset);
|
||||
|
||||
if(!diffset.isEmpty()){
|
||||
StringBuffer s = new StringBuffer(diffset.toPattern(true));
|
||||
if(s.length()>100){
|
||||
s.replace(0, 0x7fffffff, ellipsis);
|
||||
}
|
||||
logln("error in missing items - conversion/getUnicodeSet test case "+cc.charset);
|
||||
logln(s.toString());
|
||||
}
|
||||
|
||||
//are the items that must not be in unicodeset but are?
|
||||
|
||||
|
||||
(diffset=mapnotset).retainAll(unicodeset);
|
||||
|
||||
if(!diffset.isEmpty()){
|
||||
StringBuffer s = new StringBuffer(diffset.toPattern(true));
|
||||
if(s.length()>100){
|
||||
s.replace(0, 0x7fffffff, ellipsis);
|
||||
}
|
||||
logln("contains unexpected items - conversion/getUnicodeSet test case "+cc.charset);
|
||||
logln(s.toString());
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
logln("getUnicodeSet returned an error code");
|
||||
logln("ErrorCode expected is: " + cc.outErrorCode);
|
||||
logln("Error Result is: " + e.toString());
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
|
||||
* start of the stream for example U+FEFF (the Unicode BOM/signature
|
||||
|
Loading…
Reference in New Issue
Block a user