diff --git a/icu4c/source/common/uniset.cpp b/icu4c/source/common/uniset.cpp index d865119010..0ba1ab7acc 100644 --- a/icu4c/source/common/uniset.cpp +++ b/icu4c/source/common/uniset.cpp @@ -661,10 +661,13 @@ int32_t UnicodeSet::findCodePoint(UChar32 c) const { if (c < list[0]) return 0; int32_t lo = 0; int32_t hi = len - 1; + // High runner test. c is often after the last range, so an + // initial check for this condition pays off. + if (len >= 2 && c >= list[len-2]) return len-1; // invariant: c >= list[lo] // invariant: c < list[hi] for (;;) { - int32_t i = (lo + hi) / 2; + int32_t i = (lo + hi) >> 1; if (i == lo) return hi; if (c < list[i]) { hi = i; @@ -1039,20 +1042,147 @@ UChar32 UnicodeSet::charAt(int32_t index) const { * to this set. */ UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { - if (start <= end) { + if (start < end) { UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; add(range, 2, 0); + } else if (start == end) { + add(start); } return *this; } +// #define DEBUG_US_ADD + +#ifdef DEBUG_US_ADD +#include +void dump(UChar32 c) { + if (c <= 0xFF) { + printf("%c", (char)c); + } else { + printf((c<0x10000)?"U+%04X":"U+%06X", c); + } +} +void dump(const UChar32* list, int32_t len) { + printf("["); + for (int32_t i=0; i "); +#endif + + if (c == list[i]-1) { + // c is before start of next range + list[i] = c; + // if we touched the HIGH mark, then add a new one + if (c == (UNICODESET_HIGH - 1)) { + ensureCapacity(len+1); + list[len++] = UNICODESET_HIGH; + } + if (i > 0 && c == list[i-1]) { + // collapse adjacent ranges + + // [..., start_i-1, c, c, limit_i, ..., HIGH] + // ^ + // list[i] + + //for (int32_t k=i-1; k 0 && c == list[i-1]) { + // c is after end of prior range + list[i-1]++; + // no need to check for collapse here + } + + else { + // At this point we know the new char is not adjacent to + // any existing ranges, and it is not 10FFFF. 
+ + + // [..., start_i-1, limit_i-1, start_i, limit_i, ..., HIGH] + // ^ + // list[i] + + // [..., start_i-1, limit_i-1, c, c+1, start_i, limit_i, ..., HIGH] + // ^ + // list[i] + + ensureCapacity(len+2); + + //for (int32_t k=len-1; k>=i; --k) { + // list[k+2] = list[k]; + //} + UChar32* src = list + len; + UChar32* dst = src + 2; + UChar32* srclimit = list + i; + while (src > srclimit) *(--dst) = *(--src); + + list[i] = c; + list[i+1] = c+1; + len += 2; + } + +#ifdef DEBUG_US_ADD + dump(list, len); + printf("\n"); + + for (i=1; i