From f24d153bdbead9b6687ae98621bd071a2ef7b725 Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Wed, 29 Oct 2003 22:06:57 +0000
Subject: [PATCH] ICU-2404 implement ucnv_getUnicodeSet() for extensions

X-SVN-Rev: 13527
---
 icu4c/source/common/ucnv_ext.c            | 163 ++++++++++++++++++----
 icu4c/source/common/ucnv_ext.h            |   6 +
 icu4c/source/common/ucnvmbcs.c            |   2 +
 icu4c/source/test/intltest/convtest.cpp   | 117 ++++++++++++++++
 icu4c/source/test/intltest/convtest.h     |   1 +
 icu4c/source/test/testdata/conversion.txt |  25 ++++
 6 files changed, 286 insertions(+), 28 deletions(-)
diff --git a/icu4c/source/common/ucnv_ext.c b/icu4c/source/common/ucnv_ext.c
index e358a8b2fe..cd30ba2ca2 100644
--- a/icu4c/source/common/ucnv_ext.c
+++ b/icu4c/source/common/ucnv_ext.c
@@ -20,18 +20,12 @@
 
 #if !UCONFIG_NO_LEGACY_CONVERSION
 
+#include "unicode/uset.h"
 #include "ucnv_bld.h"
 #include "ucnv_cnv.h"
 #include "ucnv_ext.h"
 #include "cmemory.h"
 
-/*
- * ### TODO
- *
- * implement getUnicodeSet for the extension table
- * implement data swapping for it
- */
-
 /*
  * ### TODO: probably need pointer to baseTableSharedData
  * and also copy the base table's pointers for the base table arrays etc.
@@ -935,26 +929,139 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
     }
 }
 
-/*
- * ### TODO
- *
- * - test toU() functions
- *
- * - EBCDIC_STATEFUL: support extensions, but the charset string must be
- *   either one single-byte character or a sequence of double-byte ones,
- *   to avoid state transitions inside the mapping and to avoid having to
- *   store character boundaries.
- *   The extension functions will need an additional EBCDIC state in/out
- *   parameter and will have to be able to insert an SI or SO before writing
- *   the mapping result.
- * - EBCDIC_STATEFUL: toU() may need to check if in DB mode, do nothing if in SB
- * - EBCDIC_STATEFUL: fix prefix checking to keep SBCS & DBCS separate
- * - make dbcsonly work with extensions
- *
- * - test |2 to <subchar1> for regular code point, prefix code point,
- *   multiple code points
- * - test fallback from non-zero to 00
- * - try a smaller U_CNV_SAFECLONE_BUFFERSIZE and try ccapitst/TestConvertSafeClone()
- */
+/*
+ * Add to the set the mappings found in the from-Unicode section at sectionIndex.
+ * s[0..length[ holds the code units matched so far. c is the initial code point
+ * on the first call; recursive calls pass U_SENTINEL (expected to be negative)
+ * so the string is added instead. cnv, which, pErrorCode are only passed on.
+ */
+static void
+ucnv_extGetUnicodeSetString(const UConverter *cnv, const int32_t *cx,
+                            USet *set, UConverterUnicodeSet which,
+                            UChar32 c,
+                            UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
+                            int32_t sectionIndex, UErrorCode *pErrorCode) {
+    const UChar *fromUSectionUChars;
+    const uint32_t *fromUSectionValues;
+    uint32_t value;
+    int32_t i, count;
+
+    fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex;
+    fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex;
+
+    /* read first pair of the section: the entry count and the value for the input so far */
+    count=*fromUSectionUChars++;
+    value=*fromUSectionValues++;
+    /* same filter as for the entries below: roundtrip, no reserved bits, non-empty output */
+    if( (value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
+            UCNV_EXT_FROM_U_ROUNDTRIP_FLAG &&
+        UCNV_EXT_FROM_U_GET_LENGTH(value)>0
+    ) {
+        if(c>=0) {
+            /* add the initial code point */
+            uset_add(set, c);
+        } else {
+            /* add the string so far */
+            uset_addString(set, s, length);
+        }
+    }
+
+    for(i=0; i<count; ++i) {
+        /* append this code unit and recurse or add the string */
+        s[length]=fromUSectionUChars[i];
+        value=fromUSectionValues[i];
+
+        if(value==0) {
+            /* no mapping, do nothing */
+        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
+            /* partial match: recurse into the section that this value points to */
+            ucnv_extGetUnicodeSetString(cnv, cx, set, which, U_SENTINEL, s, length+1,
+                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), pErrorCode);
+        } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
+                           UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
+                  UCNV_EXT_FROM_U_GET_LENGTH(value)>0
+        ) {
+            uset_addString(set, s, length+1);
+        }
+    }
+}
+/* Add to set the code points and strings that cnv's extension from-Unicode table maps as roundtrips. */
+U_CFUNC void
+ucnv_extGetUnicodeSet(const UConverter *cnv,
+                      USet *set,
+                      UConverterUnicodeSet which,
+                      UErrorCode *pErrorCode) {
+    const int32_t *cx;
+    const uint16_t *stage12, *stage3, *ps2, *ps3;
+    const uint32_t *stage3b;
+
+    uint32_t value;
+    int32_t st1, stage1Length, st2, st3;
+
+    UChar s[UCNV_EXT_MAX_UCHARS];
+    UChar32 c;
+    int32_t length;
+    /* which and pErrorCode are not examined here; they are only passed on to the string helper */
+    cx=cnv->sharedData->table->mbcs.extIndexes;
+    if(cx==NULL) {
+        return; /* no extension indexes: the set is left unchanged */
+    }
+
+    stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
+    stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
+    stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
+
+    stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
+
+    /* enumerate the from-Unicode trie table */
+    c=0; /* keep track of the current code point while enumerating */
+
+    /*
+     * the trie enumeration is almost the same as
+     * in _MBCSGetUnicodeSet() for MBCS_OUTPUT_1
+     */
+    for(st1=0; st1<stage1Length; ++st1) {
+        st2=stage12[st1];
+        if(st2>stage1Length) { /* otherwise the stage 2 block is empty, see the else branch */
+            ps2=stage12+st2;
+            for(st2=0; st2<64; ++st2) { /* 64 entries per stage 2 block, 16 code points each (64*16=1024) */
+                if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) {
+                    /* read the stage 3 block */
+                    ps3=stage3+st3;
+
+                    /*
+                     * Add code points for which the roundtrip flag is set.
+                     * Do not add <subchar1> entries or other (future?) pseudo-entries
+                     * with an output length of 0, or entries with reserved bits set.
+                     * Recurse for partial results.
+                     */
+                    do {
+                        value=stage3b[*ps3++]; /* stage 3 entries are indexes into stage3b */
+                        if(value==0) {
+                            /* no mapping, do nothing */
+                        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
+                            length=0;
+                            U16_APPEND_UNSAFE(s, length, c); /* s=c in UTF-16, length=its number of code units */
+                            ucnv_extGetUnicodeSetString(
+                                cnv, cx, set, which,
+                                c, s, length,
+                                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
+                                pErrorCode);
+                        } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
+                                           UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
+                                  UCNV_EXT_FROM_U_GET_LENGTH(value)>0
+                        ) {
+                            uset_add(set, c);
+                        }
+                    } while((++c&0xf)!=0); /* advance c; stop when it reaches the next multiple of 16 */
+                } else {
+                    c+=16; /* empty stage 3 block */
+                }
+            }
+        } else {
+            c+=1024; /* empty stage 2 block */
+        }
+    }
+}
 
 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
diff --git a/icu4c/source/common/ucnv_ext.h b/icu4c/source/common/ucnv_ext.h
index 361ba73948..b74e411a60 100644
--- a/icu4c/source/common/ucnv_ext.h
+++ b/icu4c/source/common/ucnv_ext.h
@@ -350,6 +350,12 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
                            UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
                            UErrorCode *pErrorCode);
 
+U_CFUNC void
+ucnv_extGetUnicodeSet(const UConverter *cnv,        /* converter whose extension mappings are enumerated */
+                      USet *set,                    /* roundtrip code points and strings are added here */
+                      UConverterUnicodeSet which,   /* passed through; not examined by the extension code */
+                      UErrorCode *pErrorCode);      /* passed through; not set by the extension code */
+
 /* toUnicode helpers -------------------------------------------------------- */
 
 #define UCNV_EXT_TO_U_BYTE_SHIFT 24
diff --git a/icu4c/source/common/ucnvmbcs.c b/icu4c/source/common/ucnvmbcs.c
index d0b805b4c2..e6086c914b 100644
--- a/icu4c/source/common/ucnvmbcs.c
+++ b/icu4c/source/common/ucnvmbcs.c
@@ -519,6 +519,8 @@ _MBCSGetUnicodeSet(const UConverter *cnv,
             }
         }
     }
+
+    ucnv_extGetUnicodeSet(cnv, set, which, pErrorCode);
 }
 
 /* conversion extensions for input not in the main table -------------------- */
diff --git a/icu4c/source/test/intltest/convtest.cpp b/icu4c/source/test/intltest/convtest.cpp
index 4eef73e52b..82db9ac518 100644
--- a/icu4c/source/test/intltest/convtest.cpp
+++ b/icu4c/source/test/intltest/convtest.cpp
@@ -19,6 +19,8 @@
 #include "unicode/utypes.h"
 #include "unicode/ucnv.h"
 #include "unicode/unistr.h"
+#include "unicode/parsepos.h"
+#include "unicode/uniset.h"
 #include "unicode/ustring.h"
 #include "unicode/ures.h"
 #include "convtest.h"
@@ -44,6 +46,7 @@ ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
     switch (index) {
         case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
         case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
+        case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
         default: name=""; break; //needed to end loop
     }
 }
@@ -294,6 +297,120 @@ ConversionTest::TestFromUnicode() {
     }
 }
 
+/*
+ * Data-driven test for ucnv_getUnicodeSet(): for each conversion/getUnicodeSet
+ * test case, open the charset, get its Unicode set, and verify that all items
+ * of "map" are in the set and that no items of "mapnot" are.
+ */
+void
+ConversionTest::TestGetUnicodeSet() {
+    char charset[100];
+    UnicodeString s, map, mapnot;
+    int32_t which;
+    ParsePosition pos;
+    UnicodeSet cnvSet, mapSet, mapnotSet, diffSet;
+    UConverter *cnv;
+    TestLog testLog;
+    TestDataModule *dataModule;
+    TestData *testData;
+    const DataMap *testCase;
+    UErrorCode errorCode;
+    int32_t i;
+
+    errorCode=U_ZERO_ERROR;
+    dataModule=TestDataModule::getTestDataModule("conversion", testLog, errorCode);
+    if(U_SUCCESS(errorCode)) {
+        testData=dataModule->createTestData("getUnicodeSet", errorCode);
+        if(U_SUCCESS(errorCode)) {
+            for(i=0; testData->nextCase(testCase, errorCode); ++i) {
+                if(U_FAILURE(errorCode)) {
+                    errln("error retrieving conversion/getUnicodeSet test case %d - %s", i, u_errorName(errorCode));
+                    errorCode=U_ZERO_ERROR;
+                    continue;
+                }
+
+                s=testCase->getString("charset", errorCode);
+                s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
+
+                map=testCase->getString("map", errorCode);
+                mapnot=testCase->getString("mapnot", errorCode);
+                which=testCase->getInt28("which", errorCode);
+
+                if(U_FAILURE(errorCode)) {
+                    errln("error parsing conversion/getUnicodeSet test case %d - %s", i, u_errorName(errorCode));
+                    errorCode=U_ZERO_ERROR;
+                    continue;
+                }
+
+                // build the expected sets; each pattern must be consumed up to its end
+                mapSet.clear();
+                mapnotSet.clear();
+
+                pos.setIndex(0);
+                mapSet.applyPattern(map, pos, 0, NULL, errorCode);
+                if(U_FAILURE(errorCode) || pos.getIndex()!=map.length()) {
+                    errln("error creating the map set for conversion/getUnicodeSet test case %d - %s\n"
+                          "    error index %d  index %d  U+%04x",
+                            i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), map.char32At(pos.getIndex()));
+                    errorCode=U_ZERO_ERROR;
+                    continue;
+                }
+
+                pos.setIndex(0);
+                mapnotSet.applyPattern(mapnot, pos, 0, NULL, errorCode);
+                if(U_FAILURE(errorCode) || pos.getIndex()!=mapnot.length()) {
+                    errln("error creating the mapnot set for conversion/getUnicodeSet test case %d - %s\n"
+                          "    error index %d  index %d  U+%04x",
+                            i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), mapnot.char32At(pos.getIndex()));
+                    errorCode=U_ZERO_ERROR;
+                    continue;
+                }
+
+                cnv=cnv_open(charset, errorCode);
+                if(U_FAILURE(errorCode)) {
+                    errln("error opening \"%s\" for conversion/getUnicodeSet test case %d - %s",
+                            charset, i, u_errorName(errorCode));
+                    errorCode=U_ZERO_ERROR;
+                    continue;
+                }
+
+                ucnv_getUnicodeSet(cnv, (USet *)&cnvSet, (UConverterUnicodeSet)which, &errorCode);
+                ucnv_close(cnv);
+
+                if(U_FAILURE(errorCode)) {
+                    errln("error in ucnv_getUnicodeSet(\"%s\") for conversion/getUnicodeSet test case %d - %s",
+                            charset, i, u_errorName(errorCode));
+                    errorCode=U_ZERO_ERROR;
+                    continue;
+                }
+
+                // are there items that must be in cnvSet but are not?
+                (diffSet=mapSet).removeAll(cnvSet);
+                if(!diffSet.isEmpty()) {
+                    diffSet.toPattern(s, TRUE);
+                    errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d", charset, i);
+                    errln(s);
+                }
+
+                // are there items that must not be in cnvSet but are?
+                (diffSet=mapnotSet).retainAll(cnvSet);
+                if(!diffSet.isEmpty()) {
+                    diffSet.toPattern(s, TRUE);
+                    errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d", charset, i);
+                    errln(s);
+                }
+            }
+            delete testData;
+        } else {
+            // report missing test data instead of silently skipping every test case
+            errln("Failed: could not create conversion/getUnicodeSet test data - %s", u_errorName(errorCode));
+        }
+        delete dataModule;
+    } else {
+        errln("Failed: could not load test conversion data");
+    }
+}
+
 // open testdata or ICU data converter ------------------------------------- ***
 
 UConverter *
diff --git a/icu4c/source/test/intltest/convtest.h b/icu4c/source/test/intltest/convtest.h
index eb55da0f65..d7805d97d7 100644
--- a/icu4c/source/test/intltest/convtest.h
+++ b/icu4c/source/test/intltest/convtest.h
@@ -58,6 +58,7 @@ public:
 
     void TestToUnicode();
     void TestFromUnicode();
+    void TestGetUnicodeSet();
 
 private:
     UBool
diff --git a/icu4c/source/test/testdata/conversion.txt b/icu4c/source/test/testdata/conversion.txt
index b2f6ef7c49..d0fb549563 100644
--- a/icu4c/source/test/testdata/conversion.txt
+++ b/icu4c/source/test/testdata/conversion.txt
@@ -280,5 +280,30 @@ conversion {
         { "UTF-8", "a\U0010FFFF", :bin{ 61F48FBFBF }, :intvector{ 0, 1, 1, 1, 1 }, :int{1}, :int{0}, "", "", "" }
       }
     }
+
+    getUnicodeSet {
+      // charset - will be opened, and ucnv_getUnicodeSet() called on it
+      // map - set of code points and strings that must be in the returned set
+      // mapnot - set of code points and strings that must *not* be in the returned set
+      // which - numeric UConverterUnicodeSet value
+      Headers { "charset", "map", "mapnot", "which" }
+      Cases {
+        {
+          "ibm-1390",
+          "[\x00-\x0d\x10-\u017f\u0254\u309b-\u30ff\u4e00-\u4e05\U00023d00\U000243bc\U0002a6b2"
+              "{\u0254\u0300}{\u0254\u0301}{\u304b\u309a}{\u30ad\u309a}{\u30af\u309a}]",
+          "[\x0e\x0f\u0200-\u024f\U00010000-\U0001ffff\U0002a61b-\U0002a6b1]",
+          :int{0}
+        }
+
+        {
+          "*test3",
+          "[\x05\x0b\xc0\u20ac\U00023456\U00101234"
+              "{\U00101234\U00050005\U00060006}{\U00101234\U00050005}{\U00101234\U00060006}{\xc4\xc4\U00101234\x05}]",
+          "[\x06\x0e\U00034567\U000febcd{\U00101234\U00070007}]",
+          :int{0}
+        }
+      }
+    }
   }
 }