ICU-2404 implement ucnv_getUnicodeSet() for extensions

X-SVN-Rev: 13527
This commit is contained in:
Markus Scherer 2003-10-29 22:06:57 +00:00
parent f8248fe84d
commit f24d153bdb
6 changed files with 286 additions and 28 deletions

View File

@ -20,18 +20,12 @@
#if !UCONFIG_NO_LEGACY_CONVERSION
#include "unicode/uset.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "ucnv_ext.h"
#include "cmemory.h"
/*
* ### TODO
*
* implement getUnicodeSet for the extension table
* implement data swapping for it
*/
/*
* ### TODO: probably need pointer to baseTableSharedData
* and also copy the base table's pointers for the base table arrays etc.
@ -935,26 +929,139 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
}
}
/*
* ### TODO
*
* - test toU() functions
*
* - EBCDIC_STATEFUL: support extensions, but the charset string must be
* either one single-byte character or a sequence of double-byte ones,
* to avoid state transitions inside the mapping and to avoid having to
* store character boundaries.
* The extension functions will need an additional EBCDIC state in/out
* parameter and will have to be able to insert an SI or SO before writing
* the mapping result.
* - EBCDIC_STATEFUL: toU() may need to check if in DB mode, do nothing if in SB
* - EBCDIC_STATEFUL: fix prefix checking to keep SBCS & DBCS separate
* - make dbcsonly work with extensions
*
* - test |2 to <subchar1> for regular code point, prefix code point,
* multiple code points
* - test fallback from non-zero to 00
* - try a smaller U_CNV_SAFECLONE_BUFFERSIZE and try ccapitst/TestConvertSafeClone()
*/
/*
 * Recursive helper for ucnv_extGetUnicodeSet():
 * enumerate one section of the from-Unicode extension table for
 * multi-code-unit input and add the qualifying inputs to the set.
 *
 * A mapping qualifies if its roundtrip flag is set, none of its reserved bits
 * are set, and its output length is greater than 0. The same test is applied
 * to the first pair of the section and to each of its entries, so that
 * values with reserved bits are skipped everywhere.
 *
 * @param cnv          passed through to recursive calls, not otherwise used here
 * @param cx           extension indexes; base of the from-Unicode arrays
 * @param set          receives the code point and strings
 * @param which        passed through to recursive calls, not otherwise used here
 * @param c            the initial code point if s[0..length[ is exactly that
 *                     code point, or a negative value (U_SENTINEL) if the
 *                     input so far is a longer string
 * @param s            buffer with the input string so far; this function
 *                     overwrites s[length]
 * @param length       number of UChars in s that are part of the input so far
 * @param sectionIndex index of this section in the from-Unicode UChars and
 *                     values arrays
 * @param pErrorCode   passed through to recursive calls, not otherwise used here
 */
static void
ucnv_extGetUnicodeSetString(const UConverter *cnv,
                            const int32_t *cx,
                            USet *set,
                            UConverterUnicodeSet which,
                            UChar32 c,
                            UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
                            int32_t sectionIndex,
                            UErrorCode *pErrorCode) {
    const UChar *fromUSectionUChars;
    const uint32_t *fromUSectionValues;

    uint32_t value;
    int32_t i, count;

    fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex;
    fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex;

    /*
     * read first pair of the section:
     * the number of entries that follow, and the value for the input so far
     */
    count=*fromUSectionUChars++;
    value=*fromUSectionValues++;

    /*
     * Use the same test as for the section entries below:
     * roundtrip flag set, no reserved bits set, non-empty output.
     * (A value of 0 has no roundtrip flag and is rejected as well.)
     */
    if( ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
            UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
        UCNV_EXT_FROM_U_GET_LENGTH(value)>0
    ) {
        if(c>=0) {
            /* add the initial code point */
            uset_add(set, c);
        } else {
            /* add the string so far */
            uset_addString(set, s, length);
        }
    }

    /*
     * Do not write past the end of s[] if the table data is malformed
     * and describes input strings longer than UCNV_EXT_MAX_UCHARS.
     */
    if(length>=UCNV_EXT_MAX_UCHARS) {
        return;
    }

    /* enumerate the entries of this section */
    for(i=0; i<count; ++i) {
        /* append this code unit and recurse or add the string */
        s[length]=fromUSectionUChars[i];
        value=fromUSectionValues[i];

        if(value==0) {
            /* no mapping, do nothing */
        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
            /* longer inputs continue in another section */
            ucnv_extGetUnicodeSetString(
                cnv, cx, set, which,
                U_SENTINEL, s, length+1,
                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
                pErrorCode);
        } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
                        UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
                  UCNV_EXT_FROM_U_GET_LENGTH(value)>0
        ) {
            uset_addString(set, s, length+1);
        }
    }
}
/*
 * Add to the set the code points and strings for which the converter's
 * extension table has usable from-Unicode mappings.
 * Does nothing if the converter has no extension indexes.
 *
 * The from-Unicode trie is walked in code point order; the walk is almost
 * the same as in _MBCSGetUnicodeSet() for MBCS_OUTPUT_1.
 * The which and pErrorCode arguments are only handed on to
 * ucnv_extGetUnicodeSetString().
 */
U_CFUNC void
ucnv_extGetUnicodeSet(const UConverter *cnv,
                      USet *set,
                      UConverterUnicodeSet which,
                      UErrorCode *pErrorCode) {
    const int32_t *extIndexes;
    const uint16_t *stage12, *stage3, *stage2Block, *stage3Block;
    const uint32_t *stage3b;

    uint32_t result;
    int32_t i1, i2, stage1Length, stage2Offset, stage3Offset;

    UChar prefix[UCNV_EXT_MAX_UCHARS];
    UChar32 cp;
    int32_t prefixLength;

    extIndexes=cnv->sharedData->table->mbcs.extIndexes;
    if(extIndexes==NULL) {
        return;
    }

    stage1Length=extIndexes[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
    stage12=UCNV_EXT_ARRAY(extIndexes, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
    stage3=UCNV_EXT_ARRAY(extIndexes, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
    stage3b=UCNV_EXT_ARRAY(extIndexes, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);

    /* cp tracks the code point that belongs to the current trie position */
    cp=0;

    for(i1=0; i1<stage1Length; ++i1) {
        stage2Offset=stage12[i1];
        if(stage2Offset<=stage1Length) {
            /* empty stage 2 block: skip its 1024 code points */
            cp+=1024;
            continue;
        }

        stage2Block=stage12+stage2Offset;
        for(i2=0; i2<64; ++i2) {
            stage3Offset=(int32_t)stage2Block[i2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT;
            if(stage3Offset==0) {
                /* empty stage 3 block: skip its 16 code points */
                cp+=16;
                continue;
            }

            /* walk the 16 entries of this stage 3 block */
            stage3Block=stage3+stage3Offset;
            do {
                result=stage3b[*stage3Block++];
                if(result!=0) {
                    if(UCNV_EXT_FROM_U_IS_PARTIAL(result)) {
                        /*
                         * cp starts longer inputs: write it into the
                         * string buffer and enumerate its section
                         */
                        prefixLength=0;
                        U16_APPEND_UNSAFE(prefix, prefixLength, cp);
                        ucnv_extGetUnicodeSetString(
                            cnv, extIndexes, set, which,
                            cp, prefix, prefixLength,
                            (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(result),
                            pErrorCode);
                    } else if(((result&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
                                    UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
                              UCNV_EXT_FROM_U_GET_LENGTH(result)>0
                    ) {
                        /*
                         * roundtrip mapping with real output:
                         * <subchar1> and other pseudo-entries with an output
                         * length of 0, and entries with reserved bits set,
                         * do not get here
                         */
                        uset_add(set, cp);
                    }
                }
                /* else: no mapping for cp, nothing to add */
                ++cp;
            } while((cp&0xf)!=0);
        }
    }
}
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */

View File

@ -350,6 +350,12 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
UErrorCode *pErrorCode);
/*
 * Add to the set the code points and strings that the converter's extension
 * table maps from Unicode with roundtrip mappings that have a non-empty
 * result and no reserved bits set.
 * Does nothing if the converter has no extension table.
 * Called from _MBCSGetUnicodeSet() after the base table has been enumerated.
 *
 * @param cnv        the converter whose extension table is enumerated
 * @param set        the set that receives the code points and strings
 * @param which      the kind of set requested; the implementation only
 *                   passes it along to its helper function
 * @param pErrorCode ICU error code; the implementation only
 *                   passes it along to its helper function
 */
U_CFUNC void
ucnv_extGetUnicodeSet(const UConverter *cnv,
USet *set,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);
/* toUnicode helpers -------------------------------------------------------- */
#define UCNV_EXT_TO_U_BYTE_SHIFT 24

View File

@ -519,6 +519,8 @@ _MBCSGetUnicodeSet(const UConverter *cnv,
}
}
}
ucnv_extGetUnicodeSet(cnv, set, which, pErrorCode);
}
/* conversion extensions for input not in the main table -------------------- */

View File

@ -19,6 +19,8 @@
#include "unicode/utypes.h"
#include "unicode/ucnv.h"
#include "unicode/unistr.h"
#include "unicode/parsepos.h"
#include "unicode/uniset.h"
#include "unicode/ustring.h"
#include "unicode/ures.h"
#include "convtest.h"
@ -44,6 +46,7 @@ ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
switch (index) {
case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
default: name=""; break; //needed to end loop
}
}
@ -294,6 +297,120 @@ ConversionTest::TestFromUnicode() {
}
}
void
ConversionTest::TestGetUnicodeSet() {
char charset[100];
UnicodeString s, map, mapnot;
int32_t which;
ParsePosition pos;
UnicodeSet cnvSet, mapSet, mapnotSet, diffSet;
UConverter *cnv;
TestLog testLog;
TestDataModule *dataModule;
TestData *testData;
const DataMap *testCase;
UErrorCode errorCode;
int32_t i;
errorCode=U_ZERO_ERROR;
dataModule=TestDataModule::getTestDataModule("conversion", testLog, errorCode);
if(U_SUCCESS(errorCode)) {
testData=dataModule->createTestData("getUnicodeSet", errorCode);
if(U_SUCCESS(errorCode)) {
for(i=0; testData->nextCase(testCase, errorCode); ++i) {
if(U_FAILURE(errorCode)) {
errln("error retrieving conversion/getUnicodeSet test case %d - %s",
i, u_errorName(errorCode));
errorCode=U_ZERO_ERROR;
continue;
}
s=testCase->getString("charset", errorCode);
s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
map=testCase->getString("map", errorCode);
mapnot=testCase->getString("mapnot", errorCode);
which=testCase->getInt28("which", errorCode);
if(U_FAILURE(errorCode)) {
errln("error parsing conversion/getUnicodeSet test case %d - %s",
i, u_errorName(errorCode));
errorCode=U_ZERO_ERROR;
continue;
}
// test this test case
mapSet.clear();
mapnotSet.clear();
pos.setIndex(0);
mapSet.applyPattern(map, pos, 0, NULL, errorCode);
if(U_FAILURE(errorCode) || pos.getIndex()!=map.length()) {
errln("error creating the map set for conversion/getUnicodeSet test case %d - %s\n"
" error index %d index %d U+%04x",
i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), map.char32At(pos.getIndex()));
errorCode=U_ZERO_ERROR;
continue;
}
pos.setIndex(0);
mapnotSet.applyPattern(mapnot, pos, 0, NULL, errorCode);
if(U_FAILURE(errorCode) || pos.getIndex()!=mapnot.length()) {
errln("error creating the mapnot set for conversion/getUnicodeSet test case %d - %s\n"
" error index %d index %d U+%04x",
i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), mapnot.char32At(pos.getIndex()));
errorCode=U_ZERO_ERROR;
continue;
}
cnv=cnv_open(charset, errorCode);
if(U_FAILURE(errorCode)) {
errln("error opening \"%s\" for conversion/getUnicodeSet test case %d - %s",
charset, i, u_errorName(errorCode));
errorCode=U_ZERO_ERROR;
continue;
}
ucnv_getUnicodeSet(cnv, (USet *)&cnvSet, (UConverterUnicodeSet)which, &errorCode);
ucnv_close(cnv);
if(U_FAILURE(errorCode)) {
errln("error in ucnv_getUnicodeSet(\"%s\") for conversion/getUnicodeSet test case %d - %s",
charset, i, u_errorName(errorCode));
errorCode=U_ZERO_ERROR;
continue;
}
// are there items that must be in cnvSet but are not?
(diffSet=mapSet).removeAll(cnvSet);
if(!diffSet.isEmpty()) {
diffSet.toPattern(s, TRUE);
errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d",
charset, i);
errln(s);
}
// are there items that must not be in cnvSet but are?
(diffSet=mapnotSet).retainAll(cnvSet);
if(!diffSet.isEmpty()) {
diffSet.toPattern(s, TRUE);
errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d",
charset, i);
errln(s);
}
}
delete testData;
}
delete dataModule;
}
else {
errln("Failed: could not load test conversion data");
}
}
// open testdata or ICU data converter ------------------------------------- ***
UConverter *

View File

@ -58,6 +58,7 @@ public:
void TestToUnicode();
void TestFromUnicode();
void TestGetUnicodeSet();
private:
UBool

View File

@ -280,5 +280,30 @@ conversion {
{ "UTF-8", "a\U0010FFFF", :bin{ 61F48FBFBF }, :intvector{ 0, 1, 1, 1, 1 }, :int{1}, :int{0}, "", "", "" }
}
}
// Test cases for ucnv_getUnicodeSet(), read by ConversionTest::TestGetUnicodeSet().
getUnicodeSet {
// charset - will be opened, and ucnv_getUnicodeSet() called on it
// map - set of code points and strings that must be in the returned set
// mapnot - set of code points and strings that must *not* be in the returned set
// which - numeric UConverterUnicodeSet value
//
// map and mapnot are UnicodeSet patterns; the test requires that each pattern
// parses completely. Strings of several code points are written as {...}.
// The two sets need not cover everything: items in neither set are not checked.
Headers { "charset", "map", "mapnot", "which" }
Cases {
// ibm-1390: map contains single code points and two-code point strings
// NOTE(review): which=0 is presumably UCNV_ROUNDTRIP_SET - confirm against unicode/ucnv.h
{
"ibm-1390",
"[\x00-\x0d\x10-\u017f\u0254\u309b-\u30ff\u4e00-\u4e05\U00023d00\U000243bc\U0002a6b2"
"{\u0254\u0300}{\u0254\u0301}{\u304b\u309a}{\u30ad\u309a}{\u30af\u309a}]",
"[\x0e\x0f\u0200-\u024f\U00010000-\U0001ffff\U0002a61b-\U0002a6b1]",
:int{0}
}
// *test3: map contains supplementary code points and strings of two to four code points
// NOTE(review): the leading '*' presumably selects a converter from the test data
// rather than from the ICU data - confirm against cnv_open() in convtest.cpp
{
"*test3",
"[\x05\x0b\xc0\u20ac\U00023456\U00101234"
"{\U00101234\U00050005\U00060006}{\U00101234\U00050005}{\U00101234\U00060006}{\xc4\xc4\U00101234\x05}]",
"[\x06\x0e\U00034567\U000febcd{\U00101234\U00070007}]",
:int{0}
}
}
}
}
}