From f24d153bdbead9b6687ae98621bd071a2ef7b725 Mon Sep 17 00:00:00 2001
From: Markus Scherer
Date: Wed, 29 Oct 2003 22:06:57 +0000
Subject: [PATCH] ICU-2404 implement ucnv_getUnicodeSet() for extensions

X-SVN-Rev: 13527
---
NOTE(review): this patch was recovered from a whitespace-collapsed copy in
which every "<name...>" span had been stripped as markup.  Line structure was
restored and checked against the hunk headers (all old/new line counts match).
Reconstructed, not copied -- confirm against the original commit:
  - ucnv_ext.c: the body of the for(i...) loop in ucnv_extGetUnicodeSetString()
  - ucnv_ext.c: the st1 loop header, the st2 load, the stage-2 shift and the
    first lines of the "Add code points..." comment in ucnv_extGetUnicodeSet()
  - ucnv_ext.c: "<subchar1>" in the removed TODO comment
Not recoverable: original indentation, runs of spaces inside string literals
and comments, and the author's e-mail address (left out rather than guessed).

 icu4c/source/common/ucnv_ext.c            | 163 ++++++++++++++++++----
 icu4c/source/common/ucnv_ext.h            |   6 +
 icu4c/source/common/ucnvmbcs.c            |   2 +
 icu4c/source/test/intltest/convtest.cpp   | 117 ++++++++++++++++
 icu4c/source/test/intltest/convtest.h     |   1 +
 icu4c/source/test/testdata/conversion.txt |  25 ++++
 6 files changed, 286 insertions(+), 28 deletions(-)

diff --git a/icu4c/source/common/ucnv_ext.c b/icu4c/source/common/ucnv_ext.c
index e358a8b2fe..cd30ba2ca2 100644
--- a/icu4c/source/common/ucnv_ext.c
+++ b/icu4c/source/common/ucnv_ext.c
@@ -20,18 +20,12 @@
 
 #if !UCONFIG_NO_LEGACY_CONVERSION
 
+#include "unicode/uset.h"
 #include "ucnv_bld.h"
 #include "ucnv_cnv.h"
 #include "ucnv_ext.h"
 #include "cmemory.h"
 
-/*
- * ### TODO
- *
- * implement getUnicodeSet for the extension table
- * implement data swapping for it
- */
-
 /*
  * ### TODO: probably need pointer to baseTableSharedData
  * and also copy the base table's pointers for the base table arrays etc.
@@ -935,26 +929,139 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
     }
 }
 
-/*
- * ### TODO
- *
- * - test toU() functions
- *
- * - EBCDIC_STATEFUL: support extensions, but the charset string must be
- *   either one single-byte character or a sequence of double-byte ones,
- *   to avoid state transitions inside the mapping and to avoid having to
- *   store character boundaries.
- *   The extension functions will need an additional EBCDIC state in/out
- *   parameter and will have to be able to insert an SI or SO before writing
- *   the mapping result.
- * - EBCDIC_STATEFUL: toU() may need to check if in DB mode, do nothing if in SB
- * - EBCDIC_STATEFUL: fix prefix checking to keep SBCS & DBCS separate
- * - make dbcsonly work with extensions
- *
- * - test |2 to <subchar1> for regular code point, prefix code point,
- *   multiple code points
- * - test fallback from non-zero to 00
- * - try a smaller U_CNV_SAFECLONE_BUFFERSIZE and try ccapitst/TestConvertSafeClone()
- */
+static void
+ucnv_extGetUnicodeSetString(const UConverter *cnv,
+                            const int32_t *cx,
+                            USet *set,
+                            UConverterUnicodeSet which,
+                            UChar32 c,
+                            UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
+                            int32_t sectionIndex,
+                            UErrorCode *pErrorCode) {
+    const UChar *fromUSectionUChars;
+    const uint32_t *fromUSectionValues;
+
+    uint32_t value;
+    int32_t i, count;
+
+    fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex;
+    fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex;
+
+    /* read first pair of the section */
+    count=*fromUSectionUChars++;
+    value=*fromUSectionValues++;
+
+    if( value!=0 &&
+        UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) &&
+        UCNV_EXT_FROM_U_GET_LENGTH(value)>0
+    ) {
+        if(c>=0) {
+            /* add the initial code point */
+            uset_add(set, c);
+        } else {
+            /* add the string so far */
+            uset_addString(set, s, length);
+        }
+    }
+
+    for(i=0; i<count; ++i) {
+        /* append this code unit and recurse or add the string */
+        s[length]=fromUSectionUChars[i];
+        value=fromUSectionValues[i];
+
+        if(value==0) {
+            /* no mapping, do nothing */
+        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
+            ucnv_extGetUnicodeSetString(
+                cnv, cx, set, which,
+                U_SENTINEL, s, length+1,
+                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
+                pErrorCode);
+        } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
+                          UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
+                  UCNV_EXT_FROM_U_GET_LENGTH(value)>0
+        ) {
+            uset_addString(set, s, length+1);
+        }
+    }
+}
+
+U_CFUNC void
+ucnv_extGetUnicodeSet(const UConverter *cnv,
+                      USet *set,
+                      UConverterUnicodeSet which,
+                      UErrorCode *pErrorCode) {
+    const int32_t *cx;
+    const uint16_t *stage12, *stage3, *ps2, *ps3;
+    const uint32_t *stage3b;
+
+    uint32_t value;
+    int32_t st1, stage1Length, st2, st3;
+
+    UChar s[UCNV_EXT_MAX_UCHARS];
+    UChar32 c;
+    int32_t length;
+
+    cx=cnv->sharedData->table->mbcs.extIndexes;
+    if(cx==NULL) {
+        return;
+    }
+
+    stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
+    stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
+    stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
+
+    stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
+
+    /* enumerate the from-Unicode trie table */
+    c=0; /* keep track of the current code point while enumerating */
+
+    /*
+     * the trie enumeration is almost the same as
+     * in _MBCSGetUnicodeSet() for MBCS_OUTPUT_1
+     */
+    for(st1=0; st1<stage1Length; ++st1) {
+        st2=stage12[st1];
+        if(st2>stage1Length) {
+            ps2=stage12+st2;
+            for(st2=0; st2<64; ++st2) {
+                if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) {
+                    /* read the stage 3 block */
+                    ps3=stage3+st3;
+
+                    /*
+                     * Add code points for which the roundtrip flag is set.
+                     * Do not add <subchar1> entries or other (future?) pseudo-entries
+                     * with an output length of 0, or entries with reserved bits set.
+                     * Recurse for partial results.
+                     */
+                    do {
+                        value=stage3b[*ps3++];
+                        if(value==0) {
+                            /* no mapping, do nothing */
+                        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
+                            length=0;
+                            U16_APPEND_UNSAFE(s, length, c);
+                            ucnv_extGetUnicodeSetString(
+                                cnv, cx, set, which,
+                                c, s, length,
+                                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
+                                pErrorCode);
+                        } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
+                                          UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
+                                  UCNV_EXT_FROM_U_GET_LENGTH(value)>0
+                        ) {
+                            uset_add(set, c);
+                        }
+                    } while((++c&0xf)!=0);
+                } else {
+                    c+=16; /* empty stage 3 block */
+                }
+            }
+        } else {
+            c+=1024; /* empty stage 2 block */
+        }
+    }
+}
 
 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
diff --git a/icu4c/source/common/ucnv_ext.h b/icu4c/source/common/ucnv_ext.h
index 361ba73948..b74e411a60 100644
--- a/icu4c/source/common/ucnv_ext.h
+++ b/icu4c/source/common/ucnv_ext.h
@@ -350,6 +350,12 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
                            UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
                            UErrorCode *pErrorCode);
 
+U_CFUNC void
+ucnv_extGetUnicodeSet(const UConverter *cnv,
+                      USet *set,
+                      UConverterUnicodeSet which,
+                      UErrorCode *pErrorCode);
+
 /* toUnicode helpers -------------------------------------------------------- */
 
 #define UCNV_EXT_TO_U_BYTE_SHIFT 24
diff --git a/icu4c/source/common/ucnvmbcs.c b/icu4c/source/common/ucnvmbcs.c
index d0b805b4c2..e6086c914b 100644
--- a/icu4c/source/common/ucnvmbcs.c
+++ b/icu4c/source/common/ucnvmbcs.c
@@ -519,6 +519,8 @@ _MBCSGetUnicodeSet(const UConverter *cnv,
             }
         }
     }
+
+    ucnv_extGetUnicodeSet(cnv, set, which, pErrorCode);
 }
 
 /* conversion extensions for input not in the main table -------------------- */
diff --git a/icu4c/source/test/intltest/convtest.cpp b/icu4c/source/test/intltest/convtest.cpp
index 4eef73e52b..82db9ac518 100644
--- a/icu4c/source/test/intltest/convtest.cpp
+++ b/icu4c/source/test/intltest/convtest.cpp
@@ -19,6 +19,8 @@
 #include "unicode/utypes.h"
 #include "unicode/ucnv.h"
 #include "unicode/unistr.h"
+#include "unicode/parsepos.h"
+#include "unicode/uniset.h"
 #include "unicode/ustring.h"
 #include "unicode/ures.h"
 #include "convtest.h"
@@ -44,6 +46,7 @@ ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
     switch (index) {
         case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
         case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
+        case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
         default: name=""; break; //needed to end loop
     }
 }
@@ -294,6 +297,120 @@ ConversionTest::TestFromUnicode() {
     }
 }
 
+void
+ConversionTest::TestGetUnicodeSet() {
+    char charset[100];
+    UnicodeString s, map, mapnot;
+    int32_t which;
+
+    ParsePosition pos;
+    UnicodeSet cnvSet, mapSet, mapnotSet, diffSet;
+    UConverter *cnv;
+
+    TestLog testLog;
+    TestDataModule *dataModule;
+    TestData *testData;
+    const DataMap *testCase;
+    UErrorCode errorCode;
+    int32_t i;
+
+    errorCode=U_ZERO_ERROR;
+    dataModule=TestDataModule::getTestDataModule("conversion", testLog, errorCode);
+    if(U_SUCCESS(errorCode)) {
+        testData=dataModule->createTestData("getUnicodeSet", errorCode);
+        if(U_SUCCESS(errorCode)) {
+            for(i=0; testData->nextCase(testCase, errorCode); ++i) {
+                if(U_FAILURE(errorCode)) {
+                    errln("error retrieving conversion/getUnicodeSet test case %d - %s",
+                            i, u_errorName(errorCode));
+                    errorCode=U_ZERO_ERROR;
+                    continue;
+                }
+
+                s=testCase->getString("charset", errorCode);
+                s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
+
+                map=testCase->getString("map", errorCode);
+                mapnot=testCase->getString("mapnot", errorCode);
+
+                which=testCase->getInt28("which", errorCode);
+
+                if(U_FAILURE(errorCode)) {
+                    errln("error parsing conversion/getUnicodeSet test case %d - %s",
+                            i, u_errorName(errorCode));
+                    errorCode=U_ZERO_ERROR;
+                    continue;
+                }
+
+                // test this test case
+                mapSet.clear();
+                mapnotSet.clear();
+
+                pos.setIndex(0);
+                mapSet.applyPattern(map, pos, 0, NULL, errorCode);
+                if(U_FAILURE(errorCode) || pos.getIndex()!=map.length()) {
+                    errln("error creating the map set for conversion/getUnicodeSet test case %d - %s\n"
+                          " error index %d index %d U+%04x",
+                            i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), map.char32At(pos.getIndex()));
+                    errorCode=U_ZERO_ERROR;
+                    continue;
+                }
+
+                pos.setIndex(0);
+                mapnotSet.applyPattern(mapnot, pos, 0, NULL, errorCode);
+                if(U_FAILURE(errorCode) || pos.getIndex()!=mapnot.length()) {
+                    errln("error creating the mapnot set for conversion/getUnicodeSet test case %d - %s\n"
+                          " error index %d index %d U+%04x",
+                            i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), mapnot.char32At(pos.getIndex()));
+                    errorCode=U_ZERO_ERROR;
+                    continue;
+                }
+
+                cnv=cnv_open(charset, errorCode);
+                if(U_FAILURE(errorCode)) {
+                    errln("error opening \"%s\" for conversion/getUnicodeSet test case %d - %s",
+                            charset, i, u_errorName(errorCode));
+                    errorCode=U_ZERO_ERROR;
+                    continue;
+                }
+
+                ucnv_getUnicodeSet(cnv, (USet *)&cnvSet, (UConverterUnicodeSet)which, &errorCode);
+                ucnv_close(cnv);
+
+                if(U_FAILURE(errorCode)) {
+                    errln("error in ucnv_getUnicodeSet(\"%s\") for conversion/getUnicodeSet test case %d - %s",
+                            charset, i, u_errorName(errorCode));
+                    errorCode=U_ZERO_ERROR;
+                    continue;
+                }
+
+                // are there items that must be in cnvSet but are not?
+                (diffSet=mapSet).removeAll(cnvSet);
+                if(!diffSet.isEmpty()) {
+                    diffSet.toPattern(s, TRUE);
+                    errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d",
+                            charset, i);
+                    errln(s);
+                }
+
+                // are there items that must not be in cnvSet but are?
+                (diffSet=mapnotSet).retainAll(cnvSet);
+                if(!diffSet.isEmpty()) {
+                    diffSet.toPattern(s, TRUE);
+                    errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d",
+                            charset, i);
+                    errln(s);
+                }
+            }
+            delete testData;
+        }
+        delete dataModule;
+    }
+    else {
+        errln("Failed: could not load test conversion data");
+    }
+}
+
 // open testdata or ICU data converter ------------------------------------- ***
 
 UConverter *
diff --git a/icu4c/source/test/intltest/convtest.h b/icu4c/source/test/intltest/convtest.h
index eb55da0f65..d7805d97d7 100644
--- a/icu4c/source/test/intltest/convtest.h
+++ b/icu4c/source/test/intltest/convtest.h
@@ -58,6 +58,7 @@ public:
 
     void TestToUnicode();
     void TestFromUnicode();
+    void TestGetUnicodeSet();
 
 private:
     UBool
diff --git a/icu4c/source/test/testdata/conversion.txt b/icu4c/source/test/testdata/conversion.txt
index b2f6ef7c49..d0fb549563 100644
--- a/icu4c/source/test/testdata/conversion.txt
+++ b/icu4c/source/test/testdata/conversion.txt
@@ -280,5 +280,30 @@ conversion {
                 { "UTF-8", "a\U0010FFFF", :bin{ 61F48FBFBF }, :intvector{ 0, 1, 1, 1, 1 }, :int{1}, :int{0}, "", "", "" }
             }
         }
+
+        getUnicodeSet {
+            // charset - will be opened, and ucnv_getUnicodeSet() called on it
+            // map - set of code points and strings that must be in the returned set
+            // mapnot - set of code points and strings that must *not* be in the returned set
+            // which - numeric UConverterUnicodeSet value
+            Headers { "charset", "map", "mapnot", "which" }
+            Cases {
+                {
+                    "ibm-1390",
+                    "[\x00-\x0d\x10-\u017f\u0254\u309b-\u30ff\u4e00-\u4e05\U00023d00\U000243bc\U0002a6b2"
+                    "{\u0254\u0300}{\u0254\u0301}{\u304b\u309a}{\u30ad\u309a}{\u30af\u309a}]",
+                    "[\x0e\x0f\u0200-\u024f\U00010000-\U0001ffff\U0002a61b-\U0002a6b1]",
+                    :int{0}
+                }
+
+                {
+                    "*test3",
+                    "[\x05\x0b\xc0\u20ac\U00023456\U00101234"
+                    "{\U00101234\U00050005\U00060006}{\U00101234\U00050005}{\U00101234\U00060006}{\xc4\xc4\U00101234\x05}]",
+                    "[\x06\x0e\U00034567\U000febcd{\U00101234\U00070007}]",
+                    :int{0}
+                }
+            }
+        }
     }
 }