ICU-2404 implement ucnv_getUnicodeSet() for extensions
X-SVN-Rev: 13527
This commit is contained in:
parent
f8248fe84d
commit
f24d153bdb
@ -20,18 +20,12 @@
|
||||
|
||||
#if !UCONFIG_NO_LEGACY_CONVERSION
|
||||
|
||||
#include "unicode/uset.h"
|
||||
#include "ucnv_bld.h"
|
||||
#include "ucnv_cnv.h"
|
||||
#include "ucnv_ext.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
/*
|
||||
* ### TODO
|
||||
*
|
||||
* implement getUnicodeSet for the extension table
|
||||
* implement data swapping for it
|
||||
*/
|
||||
|
||||
/*
|
||||
* ### TODO: probably need pointer to baseTableSharedData
|
||||
* and also copy the base table's pointers for the base table arrays etc.
|
||||
@ -935,26 +929,139 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* ### TODO
|
||||
*
|
||||
* - test toU() functions
|
||||
*
|
||||
* - EBCDIC_STATEFUL: support extensions, but the charset string must be
|
||||
* either one single-byte character or a sequence of double-byte ones,
|
||||
* to avoid state transitions inside the mapping and to avoid having to
|
||||
* store character boundaries.
|
||||
* The extension functions will need an additional EBCDIC state in/out
|
||||
* parameter and will have to be able to insert an SI or SO before writing
|
||||
* the mapping result.
|
||||
* - EBCDIC_STATEFUL: toU() may need to check if in DB mode, do nothing if in SB
|
||||
* - EBCDIC_STATEFUL: fix prefix checking to keep SBCS & DBCS separate
|
||||
* - make dbcsonly work with extensions
|
||||
*
|
||||
* - test |2 to <subchar1> for regular code point, prefix code point,
|
||||
* multiple code points
|
||||
* - test fallback from non-zero to 00
|
||||
* - try a smaller U_CNV_SAFECLONE_BUFFERSIZE and try ccapitst/TestConvertSafeClone()
|
||||
*/
|
||||
static void
|
||||
ucnv_extGetUnicodeSetString(const UConverter *cnv,
|
||||
const int32_t *cx,
|
||||
USet *set,
|
||||
UConverterUnicodeSet which,
|
||||
UChar32 c,
|
||||
UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
|
||||
int32_t sectionIndex,
|
||||
UErrorCode *pErrorCode) {
|
||||
const UChar *fromUSectionUChars;
|
||||
const uint32_t *fromUSectionValues;
|
||||
|
||||
uint32_t value;
|
||||
int32_t i, count;
|
||||
|
||||
fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex;
|
||||
fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex;
|
||||
|
||||
/* read first pair of the section */
|
||||
count=*fromUSectionUChars++;
|
||||
value=*fromUSectionValues++;
|
||||
|
||||
if( value!=0 &&
|
||||
UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) &&
|
||||
UCNV_EXT_FROM_U_GET_LENGTH(value)>0
|
||||
) {
|
||||
if(c>=0) {
|
||||
/* add the initial code point */
|
||||
uset_add(set, c);
|
||||
} else {
|
||||
/* add the string so far */
|
||||
uset_addString(set, s, length);
|
||||
}
|
||||
}
|
||||
|
||||
for(i=0; i<count; ++i) {
|
||||
/* append this code unit and recurse or add the string */
|
||||
s[length]=fromUSectionUChars[i];
|
||||
value=fromUSectionValues[i];
|
||||
|
||||
if(value==0) {
|
||||
/* no mapping, do nothing */
|
||||
} else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
|
||||
ucnv_extGetUnicodeSetString(
|
||||
cnv, cx, set, which,
|
||||
U_SENTINEL, s, length+1,
|
||||
(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
|
||||
pErrorCode);
|
||||
} else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
|
||||
UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
|
||||
UCNV_EXT_FROM_U_GET_LENGTH(value)>0
|
||||
) {
|
||||
uset_addString(set, s, length+1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_extGetUnicodeSet(const UConverter *cnv,
|
||||
USet *set,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode) {
|
||||
const int32_t *cx;
|
||||
const uint16_t *stage12, *stage3, *ps2, *ps3;
|
||||
const uint32_t *stage3b;
|
||||
|
||||
uint32_t value;
|
||||
int32_t st1, stage1Length, st2, st3;
|
||||
|
||||
UChar s[UCNV_EXT_MAX_UCHARS];
|
||||
UChar32 c;
|
||||
int32_t length;
|
||||
|
||||
cx=cnv->sharedData->table->mbcs.extIndexes;
|
||||
if(cx==NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
|
||||
stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
|
||||
stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
|
||||
|
||||
stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
|
||||
|
||||
/* enumerate the from-Unicode trie table */
|
||||
c=0; /* keep track of the current code point while enumerating */
|
||||
|
||||
/*
|
||||
* the trie enumeration is almost the same as
|
||||
* in _MBCSGetUnicodeSet() for MBCS_OUTPUT_1
|
||||
*/
|
||||
for(st1=0; st1<stage1Length; ++st1) {
|
||||
st2=stage12[st1];
|
||||
if(st2>stage1Length) {
|
||||
ps2=stage12+st2;
|
||||
for(st2=0; st2<64; ++st2) {
|
||||
if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) {
|
||||
/* read the stage 3 block */
|
||||
ps3=stage3+st3;
|
||||
|
||||
/*
|
||||
* Add code points for which the roundtrip flag is set.
|
||||
* Do not add <subchar1> entries or other (future?) pseudo-entries
|
||||
* with an output length of 0, or entries with reserved bits set.
|
||||
* Recurse for partial results.
|
||||
*/
|
||||
do {
|
||||
value=stage3b[*ps3++];
|
||||
if(value==0) {
|
||||
/* no mapping, do nothing */
|
||||
} else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
|
||||
length=0;
|
||||
U16_APPEND_UNSAFE(s, length, c);
|
||||
ucnv_extGetUnicodeSetString(
|
||||
cnv, cx, set, which,
|
||||
c, s, length,
|
||||
(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
|
||||
pErrorCode);
|
||||
} else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
|
||||
UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
|
||||
UCNV_EXT_FROM_U_GET_LENGTH(value)>0
|
||||
) {
|
||||
uset_add(set, c);
|
||||
}
|
||||
} while((++c&0xf)!=0);
|
||||
} else {
|
||||
c+=16; /* empty stage 3 block */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
c+=1024; /* empty stage 2 block */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
|
||||
|
@ -350,6 +350,12 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
|
||||
UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/*
 * Add to the set the code points and strings for which the converter's
 * extension table has a roundtrip mapping with a non-empty result and
 * without reserved bits set.
 * Returns without touching the set if the converter's shared data
 * has no extension table.
 */
U_CFUNC void
|
||||
ucnv_extGetUnicodeSet(const UConverter *cnv,
|
||||
USet *set,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/* toUnicode helpers -------------------------------------------------------- */
|
||||
|
||||
#define UCNV_EXT_TO_U_BYTE_SHIFT 24
|
||||
|
@ -519,6 +519,8 @@ _MBCSGetUnicodeSet(const UConverter *cnv,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ucnv_extGetUnicodeSet(cnv, set, which, pErrorCode);
|
||||
}
|
||||
|
||||
/* conversion extensions for input not in the main table -------------------- */
|
||||
|
@ -19,6 +19,8 @@
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/parsepos.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "convtest.h"
|
||||
@ -44,6 +46,7 @@ ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
|
||||
switch (index) {
|
||||
case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
|
||||
case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
|
||||
case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
|
||||
default: name=""; break; //needed to end loop
|
||||
}
|
||||
}
|
||||
@ -294,6 +297,120 @@ ConversionTest::TestFromUnicode() {
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ConversionTest::TestGetUnicodeSet() {
|
||||
char charset[100];
|
||||
UnicodeString s, map, mapnot;
|
||||
int32_t which;
|
||||
|
||||
ParsePosition pos;
|
||||
UnicodeSet cnvSet, mapSet, mapnotSet, diffSet;
|
||||
UConverter *cnv;
|
||||
|
||||
TestLog testLog;
|
||||
TestDataModule *dataModule;
|
||||
TestData *testData;
|
||||
const DataMap *testCase;
|
||||
UErrorCode errorCode;
|
||||
int32_t i;
|
||||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
dataModule=TestDataModule::getTestDataModule("conversion", testLog, errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
testData=dataModule->createTestData("getUnicodeSet", errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
for(i=0; testData->nextCase(testCase, errorCode); ++i) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
errln("error retrieving conversion/getUnicodeSet test case %d - %s",
|
||||
i, u_errorName(errorCode));
|
||||
errorCode=U_ZERO_ERROR;
|
||||
continue;
|
||||
}
|
||||
|
||||
s=testCase->getString("charset", errorCode);
|
||||
s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
|
||||
|
||||
map=testCase->getString("map", errorCode);
|
||||
mapnot=testCase->getString("mapnot", errorCode);
|
||||
|
||||
which=testCase->getInt28("which", errorCode);
|
||||
|
||||
if(U_FAILURE(errorCode)) {
|
||||
errln("error parsing conversion/getUnicodeSet test case %d - %s",
|
||||
i, u_errorName(errorCode));
|
||||
errorCode=U_ZERO_ERROR;
|
||||
continue;
|
||||
}
|
||||
|
||||
// test this test case
|
||||
mapSet.clear();
|
||||
mapnotSet.clear();
|
||||
|
||||
pos.setIndex(0);
|
||||
mapSet.applyPattern(map, pos, 0, NULL, errorCode);
|
||||
if(U_FAILURE(errorCode) || pos.getIndex()!=map.length()) {
|
||||
errln("error creating the map set for conversion/getUnicodeSet test case %d - %s\n"
|
||||
" error index %d index %d U+%04x",
|
||||
i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), map.char32At(pos.getIndex()));
|
||||
errorCode=U_ZERO_ERROR;
|
||||
continue;
|
||||
}
|
||||
|
||||
pos.setIndex(0);
|
||||
mapnotSet.applyPattern(mapnot, pos, 0, NULL, errorCode);
|
||||
if(U_FAILURE(errorCode) || pos.getIndex()!=mapnot.length()) {
|
||||
errln("error creating the mapnot set for conversion/getUnicodeSet test case %d - %s\n"
|
||||
" error index %d index %d U+%04x",
|
||||
i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), mapnot.char32At(pos.getIndex()));
|
||||
errorCode=U_ZERO_ERROR;
|
||||
continue;
|
||||
}
|
||||
|
||||
cnv=cnv_open(charset, errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
errln("error opening \"%s\" for conversion/getUnicodeSet test case %d - %s",
|
||||
charset, i, u_errorName(errorCode));
|
||||
errorCode=U_ZERO_ERROR;
|
||||
continue;
|
||||
}
|
||||
|
||||
ucnv_getUnicodeSet(cnv, (USet *)&cnvSet, (UConverterUnicodeSet)which, &errorCode);
|
||||
ucnv_close(cnv);
|
||||
|
||||
if(U_FAILURE(errorCode)) {
|
||||
errln("error in ucnv_getUnicodeSet(\"%s\") for conversion/getUnicodeSet test case %d - %s",
|
||||
charset, i, u_errorName(errorCode));
|
||||
errorCode=U_ZERO_ERROR;
|
||||
continue;
|
||||
}
|
||||
|
||||
// are there items that must be in cnvSet but are not?
|
||||
(diffSet=mapSet).removeAll(cnvSet);
|
||||
if(!diffSet.isEmpty()) {
|
||||
diffSet.toPattern(s, TRUE);
|
||||
errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d",
|
||||
charset, i);
|
||||
errln(s);
|
||||
}
|
||||
|
||||
// are there items that must not be in cnvSet but are?
|
||||
(diffSet=mapnotSet).retainAll(cnvSet);
|
||||
if(!diffSet.isEmpty()) {
|
||||
diffSet.toPattern(s, TRUE);
|
||||
errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d",
|
||||
charset, i);
|
||||
errln(s);
|
||||
}
|
||||
}
|
||||
delete testData;
|
||||
}
|
||||
delete dataModule;
|
||||
}
|
||||
else {
|
||||
errln("Failed: could not load test conversion data");
|
||||
}
|
||||
}
|
||||
|
||||
// open testdata or ICU data converter ------------------------------------- ***
|
||||
|
||||
UConverter *
|
||||
|
@ -58,6 +58,7 @@ public:
|
||||
|
||||
void TestToUnicode();
|
||||
void TestFromUnicode();
|
||||
void TestGetUnicodeSet();
|
||||
|
||||
private:
|
||||
UBool
|
||||
|
25
icu4c/source/test/testdata/conversion.txt
vendored
25
icu4c/source/test/testdata/conversion.txt
vendored
@ -280,5 +280,30 @@ conversion {
|
||||
{ "UTF-8", "a\U0010FFFF", :bin{ 61F48FBFBF }, :intvector{ 0, 1, 1, 1, 1 }, :int{1}, :int{0}, "", "", "" }
|
||||
}
|
||||
}
|
||||
|
||||
getUnicodeSet {
|
||||
// charset - will be opened, and ucnv_getUnicodeSet() called on it
|
||||
// map - set of code points and strings that must be in the returned set
|
||||
// mapnot - set of code points and strings that must *not* be in the returned set
|
||||
// which - numeric UConverterUnicodeSet value
|
||||
Headers { "charset", "map", "mapnot", "which" }
|
||||
Cases {
|
||||
{
|
||||
"ibm-1390",
|
||||
"[\x00-\x0d\x10-\u017f\u0254\u309b-\u30ff\u4e00-\u4e05\U00023d00\U000243bc\U0002a6b2"
|
||||
"{\u0254\u0300}{\u0254\u0301}{\u304b\u309a}{\u30ad\u309a}{\u30af\u309a}]",
|
||||
"[\x0e\x0f\u0200-\u024f\U00010000-\U0001ffff\U0002a61b-\U0002a6b1]",
|
||||
:int{0}
|
||||
}
|
||||
|
||||
{
|
||||
"*test3",
|
||||
"[\x05\x0b\xc0\u20ac\U00023456\U00101234"
|
||||
"{\U00101234\U00050005\U00060006}{\U00101234\U00050005}{\U00101234\U00060006}{\xc4\xc4\U00101234\x05}]",
|
||||
"[\x06\x0e\U00034567\U000febcd{\U00101234\U00070007}]",
|
||||
:int{0}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user