From f7b7183d7a7daf29c2e97252e4ff19ab45240d32 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 13 Sep 2004 23:33:22 +0000 Subject: [PATCH] ICU-4078 enable building a UnicodeSet from a property using/loading only the relevant data file X-SVN-Rev: 16313 --- icu4c/source/common/uchar.c | 108 ++++++++------ icu4c/source/common/uniset_props.cpp | 85 ++++++----- icu4c/source/common/uprops.c | 201 +++++++++++++++++---------- icu4c/source/common/uprops.h | 39 ++++++ 4 files changed, 271 insertions(+), 162 deletions(-) diff --git a/icu4c/source/common/uchar.c b/icu4c/source/common/uchar.c index 397253366f..5f5b896346 100644 --- a/icu4c/source/common/uchar.c +++ b/icu4c/source/common/uchar.c @@ -930,7 +930,7 @@ uprv_getMaxValues(int32_t column) { /* * get Hangul Syllable Type - * implemented here so that uchar.c (uchar_addPropertyStarts()) + * implemented here so that uchar.c (uhst_addPropertyStarts()) * does not depend on uprops.c (u_getIntPropertyValue(c, UCHAR_HANGUL_SYLLABLE_TYPE)) */ U_CFUNC UHangulSyllableType @@ -995,6 +995,69 @@ ublock_getCode(UChar32 c) { /* property starts for UnicodeSet ------------------------------------------- */ +/* for Hangul_Syllable_Type */ +U_CAPI void U_EXPORT2 +uhst_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) { + UChar32 c; + int32_t value, value2; + + if(U_FAILURE(*pErrorCode)) { + return; + } + + if(!HAVE_DATA) { + *pErrorCode=dataErrorCode; + return; + } + + /* add code points with hardcoded properties, plus the ones following them */ + + /* + * Add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE. + * First, we add fixed boundaries for the blocks of Jamos. + * Then we check in loops to see where the current Unicode version + * actually stops assigning such Jamos. We start each loop + * at the end of the per-Jamo-block assignments in Unicode 4 or earlier. + * (These have not changed since Unicode 2.) + */ + sa->add(sa->set, 0x1100); + value=U_HST_LEADING_JAMO; + for(c=0x115a; c<=0x115f; ++c) { + value2=uchar_getHST(c); + if(value!=value2) { + value=value2; + sa->add(sa->set, c); + } + } + + sa->add(sa->set, 0x1160); + value=U_HST_VOWEL_JAMO; + for(c=0x11a3; c<=0x11a7; ++c) { + value2=uchar_getHST(c); + if(value!=value2) { + value=value2; + sa->add(sa->set, c); + } + } + + sa->add(sa->set, 0x11a8); + value=U_HST_TRAILING_JAMO; + for(c=0x11fa; c<=0x11ff; ++c) { + value2=uchar_getHST(c); + if(value!=value2) { + value=value2; + sa->add(sa->set, c); + } + } + + /* Add Hangul type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE. */ + for(c=HANGUL_BASE; c<(HANGUL_BASE+HANGUL_COUNT); c+=JAMO_T_COUNT) { + sa->add(sa->set, c); + sa->add(sa->set, c+1); + } + sa->add(sa->set, c); +} + static UBool U_CALLCONV _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) { /* add the start code point to the USet */ @@ -1007,8 +1070,9 @@ _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint U_CAPI void U_EXPORT2 uchar_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) { - UChar32 c; - int32_t value, value2; + if(U_FAILURE(*pErrorCode)) { + return; + } if(!HAVE_DATA) { *pErrorCode=dataErrorCode; @@ -1072,42 +1136,4 @@ uchar_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) { /* add for UCHAR_JOINING_TYPE */ sa->add(sa->set, ZWNJ); /* range ZWNJ..ZWJ */ sa->add(sa->set, ZWJ+1); - - /* - * Add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE. - * First, we add fixed boundaries for the blocks of Jamos. - * Then we check in loops to see where the current Unicode version - * actually stops assigning such Jamos. We start each loop - * at the end of the per-Jamo-block assignments in Unicode 4 or earlier. - * (These have not changed since Unicode 2.) - */ - sa->add(sa->set, 0x1100); - value=U_HST_LEADING_JAMO; - for(c=0x115a; c<=0x115f; ++c) { - value2=uchar_getHST(c); - if(value!=value2) { - value=value2; - sa->add(sa->set, c); - } - } - - sa->add(sa->set, 0x1160); - value=U_HST_VOWEL_JAMO; - for(c=0x11a3; c<=0x11a7; ++c) { - value2=uchar_getHST(c); - if(value!=value2) { - value=value2; - sa->add(sa->set, c); - } - } - - sa->add(sa->set, 0x11a8); - value=U_HST_TRAILING_JAMO; - for(c=0x11fa; c<=0x11ff; ++c) { - value2=uchar_getHST(c); - if(value!=value2) { - value=value2; - sa->add(sa->set, c); - } - } } diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index da9ad5bd01..282835b083 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -31,6 +31,8 @@ #include "uvector.h" #include "uprops.h" #include "propname.h" +#include "unormimp.h" +#include "ucase.h" #include "charstr.h" #include "ustrfmt.h" #include "mutex.h" @@ -149,7 +151,7 @@ static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ U_NAMESPACE_BEGIN -static UnicodeSet* INCLUSIONS = NULL; // cached uprv_getInclusions() +static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions() static Hashtable* CASE_EQUIV_HASH = NULL; // for closeOver(USET_CASE) @@ -1016,6 +1018,7 @@ static UBool intPropertyFilter(UChar32 ch, void* context) { */ void UnicodeSet::applyFilter(UnicodeSet::Filter filter, void* context, + int32_t src, UErrorCode &status) { // Walk through all Unicode characters, noting the start // and end of each range for which filter.contain(c) is @@ -1031,7 +1034,7 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter, // those properties. Scanning code points is slow. if (U_FAILURE(status)) return; - const UnicodeSet* inclusions = getInclusions(status); + const UnicodeSet* inclusions = getInclusions(src, status); if (U_FAILURE(status)) { return; } @@ -1101,38 +1104,10 @@ UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) if (U_FAILURE(ec)) return *this; if (prop == UCHAR_GENERAL_CATEGORY_MASK) { - applyFilter(generalCategoryMaskFilter, &value, ec); -#if UCONFIG_NO_NORMALIZATION - } else if(prop == UCHAR_HANGUL_SYLLABLE_TYPE) { - /* - * Special code for when normalization is off. - * HST is still available because it is hardcoded in uprops.c, but - * the inclusions set does not have the necessary code points - * for normalization properties. - * I am hardcoding HST in this case because it is the only property - * that prevents genbrk from compiling char.txt when normalization is off. - * This saves me from turning off break iteration or making more - * complicated changes in genbrk. - * - * This code is not efficient. For efficiency turn on normalization. - * - * markus 20030505 - */ - UChar32 c; - - clear(); - for(c=0x1100; c<=0xd7a3; ++c) { - if(c==0x1200) { - c=0xac00; - } - if(value == u_getIntPropertyValue(c, UCHAR_HANGUL_SYLLABLE_TYPE)) { - add(c); - } - } -#endif + applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec); } else { IntPropertyContext c = {prop, value}; - applyFilter(intPropertyFilter, &c, ec); + applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec); } return *this; } @@ -1205,7 +1180,7 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop, if (*end != 0) { FAIL(ec); } - applyFilter(numericValueFilter, &value, ec); + applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec); return *this; } break; @@ -1236,7 +1211,7 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop, if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec); UVersionInfo version; u_versionFromString(version, buf); - applyFilter(versionFilter, &version, ec); + applyFilter(versionFilter, &version, UPROPS_SRC_CHAR, ec); return *this; } break; @@ -1274,7 +1249,7 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop, for (int32_t i=0; i