ICU-5585 fast, freezable UnicodeSet with span() with string support; svn merge -r 21018:22033 http://source.icu-project.org/repos/icu/icu/branches/markus/fastset and revert source/test/perf/unisetperf/draft/* and source/common/utrie.* and source/allinone/allinone.sln (experimental code)
X-SVN-Rev: 22053
This commit is contained in:
parent
4d282a7e02
commit
19446aeeba
11
.gitignore
vendored
11
.gitignore
vendored
@ -47,7 +47,9 @@ icu4c/source/config/Makefile.inc
|
||||
icu4c/source/config/icu-config
|
||||
icu4c/source/config/icu-config.1
|
||||
icu4c/source/data/*.plg
|
||||
icu4c/source/data/Debug
|
||||
icu4c/source/data/Makefile
|
||||
icu4c/source/data/Release
|
||||
icu4c/source/data/icupkg.inc
|
||||
icu4c/source/data/in
|
||||
icu4c/source/data/makedata.vcproj.*.*.user
|
||||
@ -305,6 +307,15 @@ icu4c/source/test/perf/ubrkperf/debug
|
||||
icu4c/source/test/perf/ubrkperf/release
|
||||
icu4c/source/test/perf/ubrkperf/ubrkperf
|
||||
icu4c/source/test/perf/ubrkperf/ubrkperf.vcproj.*.*.user
|
||||
icu4c/source/test/perf/unisetperf/*.d
|
||||
icu4c/source/test/perf/unisetperf/*.o
|
||||
icu4c/source/test/perf/unisetperf/Debug
|
||||
icu4c/source/test/perf/unisetperf/Makefile
|
||||
icu4c/source/test/perf/unisetperf/Release
|
||||
icu4c/source/test/perf/unisetperf/debug
|
||||
icu4c/source/test/perf/unisetperf/release
|
||||
icu4c/source/test/perf/unisetperf/unisetperf
|
||||
icu4c/source/test/perf/unisetperf/unisetperf.vcproj.*.*.user
|
||||
icu4c/source/test/perf/usetperf/*.d
|
||||
icu4c/source/test/perf/usetperf/*.o
|
||||
icu4c/source/test/perf/usetperf/Debug
|
||||
|
@ -80,7 +80,7 @@ utf_impl.o ustring.o ustrcase.o ucasemap.o cstring.o ustrfmt.o ustrtrns.o ustr_w
|
||||
normlzr.o unorm.o unormcmp.o unorm_it.o chariter.o schriter.o uchriter.o uiter.o \
|
||||
uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o \
|
||||
uscript.o usc_impl.o unames.o \
|
||||
utrie.o uset_props.o uniset_props.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \
|
||||
utrie.o bmpset.o unisetspan.o uset_props.o uniset_props.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \
|
||||
uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o triedict.o \
|
||||
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
|
||||
serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o \
|
||||
|
714
icu4c/source/common/bmpset.cpp
Normal file
714
icu4c/source/common/bmpset.cpp
Normal file
@ -0,0 +1,714 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: bmpset.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2007jan29
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "cmemory.h"
|
||||
#include "bmpset.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
 * Build the lookup tables for the inversion list parentList[0..parentListLength-1].
 * The list pointer is stored, not copied.
 */
BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
        list(parentList), listLength(parentListLength) {
    // Start with all-zero tables; initBits() and overrideIllegal() fill them in.
    uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
    uprv_memset(table7FF, 0, sizeof(table7FF));
    uprv_memset(asciiBytes, 0, sizeof(asciiBytes));

    /*
     * Record the list indexes that bound the binary searches for
     * U+0800, U+1000, U+2000, .., U+F000, U+10000.
     * U+0800 is the first 3-byte-UTF-8 code point; lower code points are
     * looked up in the bit tables.
     * The last pair of indexes brackets the supplementary code points.
     */
    const int32_t lastIndex=listLength-1;
    int32_t searchStart=findCodePoint(0x800, 0, lastIndex);
    list4kStarts[0]=searchStart;
    for(int32_t block=1; block<=0x10; ++block) {
        // Each search starts where the previous one ended.
        searchStart=findCodePoint(block<<12, searchStart, lastIndex);
        list4kStarts[block]=searchStart;
    }
    list4kStarts[0x11]=lastIndex;

    initBits();
    overrideIllegal();
}
|
||||
|
||||
/*
 * Copy the precomputed tables of otherBMPSet without recomputing them,
 * and attach this object to the inversion list newParentList.
 * NOTE(review): presumably newParentList has the same contents as the list
 * that otherBMPSet was built from -- confirm against the callers.
 */
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
        list(newParentList), listLength(newParentListLength) {
    uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
    uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
    uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
    uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
}
|
||||
|
||||
/*
 * Set bits in a bit rectangle in "vertical" bit organization:
 * the value v corresponds to bit (v>>6) of table[v&0x3f].
 * Sets the bits for all values in the half-open range start..limit-1.
 * Preconditions: 0<=start<limit<=0x800.
 *
 * All masks are computed with unsigned arithmetic, and no shift by 32 is
 * ever evaluated, so that the function has defined behavior up to and
 * including limit==0x800.
 */
static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
    int32_t lead=start>>6;      // Bit position (column).
    int32_t trail=start&0x3f;   // Table index (row).

    // The one bit for the column that contains start.
    uint32_t bits=(uint32_t)1<<lead;
    if((start+1)==limit) {  // Single-character shortcut.
        table[trail]|=bits;
        return;
    }

    int32_t limitLead=limit>>6;
    int32_t limitTrail=limit&0x3f;

    if(lead==limitLead) {
        // Partial vertical bit column.
        while(trail<limitTrail) {
            table[trail++]|=bits;
        }
    } else {
        // Partial vertical bit column,
        // followed by a bit rectangle,
        // followed by another partial vertical bit column.
        if(trail>0) {
            do {
                table[trail++]|=bits;
            } while(trail<64);
            ++lead;
        }
        if(lead<limitLead) {
            // Here lead<=31 because limitLead<=32.
            // Bits lead..31, then cut off at limitLead unless that is 32.
            bits=~(((uint32_t)1<<lead)-1);
            if(limitLead<0x20) {
                bits&=((uint32_t)1<<limitLead)-1;
            }
            for(trail=0; trail<64; ++trail) {
                table[trail]|=bits;
            }
        }
        // If limit==0x800 then limitLead==32 and limitTrail==0:
        // There is no trailing partial column, and 1<<32 must not be evaluated.
        if(limitTrail>0) {
            bits=(uint32_t)1<<limitLead;
            for(trail=0; trail<limitTrail; ++trail) {
                table[trail]|=bits;
            }
        }
    }
}
|
||||
|
||||
void BMPSet::initBits() {
|
||||
UChar32 start, limit;
|
||||
int32_t listIndex=0;
|
||||
|
||||
// Set asciiBytes[].
|
||||
do {
|
||||
start=list[listIndex++];
|
||||
if(listIndex<listLength) {
|
||||
limit=list[listIndex++];
|
||||
}
|
||||
if(start>=0x80) {
|
||||
break;
|
||||
}
|
||||
do {
|
||||
asciiBytes[start++]=1;
|
||||
} while(start<limit && start<0x80);
|
||||
} while(limit<=0x80);
|
||||
|
||||
// Set table7FF[].
|
||||
while(start<0x800) {
|
||||
set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
|
||||
if(limit>0x800) {
|
||||
start=0x800;
|
||||
break;
|
||||
}
|
||||
|
||||
start=list[listIndex++];
|
||||
if(listIndex<listLength) {
|
||||
limit=list[listIndex++];
|
||||
}
|
||||
}
|
||||
|
||||
// Set bmpBlockBits[].
|
||||
int32_t minStart=0x800;
|
||||
while(start<0x10000) {
|
||||
if(limit>0x10000) {
|
||||
limit=0x10000;
|
||||
}
|
||||
|
||||
if(start<minStart) {
|
||||
start=minStart;
|
||||
}
|
||||
if(start<limit) { // Else: Another range entirely in a known mixed-value block.
|
||||
if(start&0x3f) {
|
||||
// Mixed-value block of 64 code points.
|
||||
start>>=6;
|
||||
bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
|
||||
start=(start+1)<<6; // Round up to the next block boundary.
|
||||
minStart=start; // Ignore further ranges in this block.
|
||||
}
|
||||
if(start<limit) {
|
||||
if(start<(limit&~0x3f)) {
|
||||
// Multiple all-ones blocks of 64 code points each.
|
||||
set32x64Bits(bmpBlockBits, start>>6, limit>>6);
|
||||
}
|
||||
|
||||
if(limit&0x3f) {
|
||||
// Mixed-value block of 64 code points.
|
||||
limit>>=6;
|
||||
bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
|
||||
limit=(limit+1)<<6; // Round up to the next block boundary.
|
||||
minStart=limit; // Ignore further ranges in this block.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(limit==0x10000) {
|
||||
break;
|
||||
}
|
||||
|
||||
start=list[listIndex++];
|
||||
if(listIndex<listLength) {
|
||||
limit=list[listIndex++];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Override some bits and bytes to the result of contains(FFFD)
|
||||
* for faster validity checking at runtime.
|
||||
* No need to set 0 values where they were reset to 0 in the constructor
|
||||
* and not modified by initBits().
|
||||
* (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
|
||||
* Need to set 0 values for surrogates D800..DFFF.
|
||||
*/
|
||||
void BMPSet::overrideIllegal() {
|
||||
uint32_t bits, mask;
|
||||
int32_t i;
|
||||
|
||||
if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
|
||||
// contains(FFFD)==TRUE
|
||||
for(i=0x80; i<0xc0; ++i) {
|
||||
asciiBytes[i]=1;
|
||||
}
|
||||
|
||||
bits=3; // Lead bytes 0xC0 and 0xC1.
|
||||
for(i=0; i<64; ++i) {
|
||||
table7FF[i]|=bits;
|
||||
}
|
||||
|
||||
bits=1; // Lead byte 0xE0.
|
||||
for(i=0; i<32; ++i) { // First half of 4k block.
|
||||
bmpBlockBits[i]|=bits;
|
||||
}
|
||||
|
||||
mask=~(0x10001<<0xd); // Lead byte 0xED.
|
||||
bits=1<<0xd;
|
||||
for(i=32; i<64; ++i) { // Second half of 4k block.
|
||||
bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
|
||||
}
|
||||
} else {
|
||||
// contains(FFFD)==FALSE
|
||||
mask=~(0x10001<<0xd); // Lead byte 0xED.
|
||||
for(i=32; i<64; ++i) { // Second half of 4k block.
|
||||
bmpBlockBits[i]&=mask;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Binary search in list[lo..hi] for the smallest index i with c<list[i].
 * The result is odd if and only if c lies inside a range of the list.
 *
 * Examples (unrestricted search):
 *                                    findCodePoint(c)
 *   set              list[]         c=0 1 3 4 7 8
 *   ===              ============== ===========
 *   []               [110000]         0 0 0 0 0 0
 *   [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
 *   [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
 *   [:Any:]          [0, 110000]      1 1 1 1 1 1
 *
 * Assumes that list[listLength-1]==0x110000 and that c is a legal
 * code point (0..0x10ffff).
 */
int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
    // Below the first searched entry.
    if (c < list[lo]) {
        return lo;
    }
    // High runner test. c is often after the last range, so an
    // initial check for this condition pays off.
    if (lo >= hi || c >= list[hi-1]) {
        return hi;
    }
    // Loop invariants: list[lo] <= c < list[hi]
    int32_t mid;
    while ((mid = (lo + hi) >> 1) != lo) {
        if (c < list[mid]) {
            hi = mid;
        } else {
            lo = mid;
        }
    }
    // lo and hi are adjacent: hi is the answer.
    return hi;
}
|
||||
|
||||
/*
 * Test whether the set contains c.
 * Code points below U+0800 are looked up in asciiBytes[] or table7FF[].
 * Other non-surrogate BMP code points are looked up in bmpBlockBits[], with a
 * binary search limited to the 4k block if the 64-block is mixed.
 * Surrogates and supplementary code points always use the binary search.
 * Values outside 0..0x10ffff yield FALSE.
 */
UBool
BMPSet::contains(UChar32 c) const {
    const uint32_t cp=(uint32_t)c;  // Negative values become large.

    if(cp<=0x7f) {
        return (UBool)asciiBytes[c];
    }
    if(cp<=0x7ff) {
        const uint32_t leadBit=(uint32_t)1<<(c>>6);
        return (UBool)((table7FF[c&0x3f]&leadBit)!=0);
    }
    if(cp<0xd800 || (c>=0xe000 && c<=0xffff)) {
        const int lead=c>>12;
        const uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
        if(twoBits>1) {
            // Mixed block:
            // Look up the code point in its 4k block of code points.
            return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
        }
        // All 64 code points with the same bits 15..6
        // are either in the set or not.
        return (UBool)twoBits;
    }
    if(cp<=0x10ffff) {
        // surrogate or supplementary code point
        return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
    }
    // Out-of-range code points get FALSE, consistent with long-standing
    // behavior of UnicodeSet::contains(c).
    return FALSE;
}
|
||||
|
||||
/*
|
||||
* Check for sufficient length for trail unit for each surrogate pair.
|
||||
* Handle single surrogates as surrogate code points as usual in ICU.
|
||||
*/
|
||||
const UChar *
|
||||
BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
|
||||
UChar c, c2;
|
||||
|
||||
if(spanCondition) {
|
||||
// span
|
||||
do {
|
||||
c=*s;
|
||||
if(c<=0x7f) {
|
||||
if(!asciiBytes[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
|
||||
break;
|
||||
}
|
||||
} else if(c<0xd800 || c>=0xe000) {
|
||||
int lead=c>>12;
|
||||
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
|
||||
if(twoBits<=1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
if(twoBits==0) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
|
||||
// surrogate code point
|
||||
if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// surrogate pair
|
||||
if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
|
||||
break;
|
||||
}
|
||||
++s;
|
||||
}
|
||||
} while(++s<limit);
|
||||
} else {
|
||||
// span not
|
||||
do {
|
||||
c=*s;
|
||||
if(c<=0x7f) {
|
||||
if(asciiBytes[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
|
||||
break;
|
||||
}
|
||||
} else if(c<0xd800 || c>=0xe000) {
|
||||
int lead=c>>12;
|
||||
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
|
||||
if(twoBits<=1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
if(twoBits!=0) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
|
||||
// surrogate code point
|
||||
if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// surrogate pair
|
||||
if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
|
||||
break;
|
||||
}
|
||||
++s;
|
||||
}
|
||||
} while(++s<limit);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
/* Symmetrical with span(). */
|
||||
const UChar *
|
||||
BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
|
||||
UChar c, c2;
|
||||
|
||||
if(spanCondition) {
|
||||
// span
|
||||
for(;;) {
|
||||
c=*(--limit);
|
||||
if(c<=0x7f) {
|
||||
if(!asciiBytes[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
|
||||
break;
|
||||
}
|
||||
} else if(c<0xd800 || c>=0xe000) {
|
||||
int lead=c>>12;
|
||||
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
|
||||
if(twoBits<=1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
if(twoBits==0) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
|
||||
// surrogate code point
|
||||
if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// surrogate pair
|
||||
if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
|
||||
break;
|
||||
}
|
||||
--limit;
|
||||
}
|
||||
if(s==limit) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// span not
|
||||
for(;;) {
|
||||
c=*(--limit);
|
||||
if(c<=0x7f) {
|
||||
if(asciiBytes[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
|
||||
break;
|
||||
}
|
||||
} else if(c<0xd800 || c>=0xe000) {
|
||||
int lead=c>>12;
|
||||
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
|
||||
if(twoBits<=1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
if(twoBits!=0) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
|
||||
// surrogate code point
|
||||
if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// surrogate pair
|
||||
if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
|
||||
break;
|
||||
}
|
||||
--limit;
|
||||
}
|
||||
if(s==limit) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
}
|
||||
return limit+1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Precheck for sufficient trail bytes at end of string only once per span.
|
||||
* Check validity.
|
||||
*/
|
||||
const uint8_t *
|
||||
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
|
||||
const uint8_t *limit=s+length;
|
||||
uint8_t b=*s;
|
||||
if((int8_t)b>=0) {
|
||||
// Initial all-ASCII span.
|
||||
if(spanCondition) {
|
||||
do {
|
||||
if(!asciiBytes[b] || ++s==limit) {
|
||||
return s;
|
||||
}
|
||||
b=*s;
|
||||
} while((int8_t)b>=0);
|
||||
} else {
|
||||
do {
|
||||
if(asciiBytes[b] || ++s==limit) {
|
||||
return s;
|
||||
}
|
||||
b=*s;
|
||||
} while((int8_t)b>=0);
|
||||
}
|
||||
length=(int32_t)(limit-s);
|
||||
}
|
||||
|
||||
if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
|
||||
spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
|
||||
}
|
||||
|
||||
const uint8_t *limit0=limit;
|
||||
|
||||
/*
|
||||
* Make sure that the last 1/2/3/4-byte sequence before limit is complete
|
||||
* or runs into a lead byte.
|
||||
* In the span loop compare s with limit only once
|
||||
* per multi-byte character.
|
||||
*
|
||||
* Give a trailing illegal sequence the same value as the result of contains(FFFD),
|
||||
* including it if that is part of the span, otherwise set limit0 to before
|
||||
* the truncated sequence.
|
||||
*/
|
||||
b=*(limit-1);
|
||||
if((int8_t)b<0) {
|
||||
// b>=0x80: lead or trail byte
|
||||
if(b<0xc0) {
|
||||
// single trail byte, check for preceding 3- or 4-byte lead byte
|
||||
if(length>=2 && (b=*(limit-2))>=0xe0) {
|
||||
limit-=2;
|
||||
if(asciiBytes[0x80]!=spanCondition) {
|
||||
limit0=limit;
|
||||
}
|
||||
} else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
|
||||
// 4-byte lead byte with only two trail bytes
|
||||
limit-=3;
|
||||
if(asciiBytes[0x80]!=spanCondition) {
|
||||
limit0=limit;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// lead byte with no trail bytes
|
||||
--limit;
|
||||
if(asciiBytes[0x80]!=spanCondition) {
|
||||
limit0=limit;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t t1, t2, t3;
|
||||
|
||||
while(s<limit) {
|
||||
b=*s;
|
||||
if(b<0xc0) {
|
||||
// ASCII; or trail bytes with the result of contains(FFFD).
|
||||
if(spanCondition) {
|
||||
do {
|
||||
if(!asciiBytes[b]) {
|
||||
return s;
|
||||
} else if(++s==limit) {
|
||||
return limit0;
|
||||
}
|
||||
b=*s;
|
||||
} while(b<0xc0);
|
||||
} else {
|
||||
do {
|
||||
if(asciiBytes[b]) {
|
||||
return s;
|
||||
} else if(++s==limit) {
|
||||
return limit0;
|
||||
}
|
||||
b=*s;
|
||||
} while(b<0xc0);
|
||||
}
|
||||
}
|
||||
++s; // Advance past the lead byte.
|
||||
if(b>=0xe0) {
|
||||
if(b<0xf0) {
|
||||
if( /* handle U+0000..U+FFFF inline */
|
||||
(t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
|
||||
(t2=(uint8_t)(s[1]-0x80)) <= 0x3f
|
||||
) {
|
||||
b&=0xf;
|
||||
uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
|
||||
if(twoBits<=1) {
|
||||
// All 64 code points with this lead byte and middle trail byte
|
||||
// are either in the set or not.
|
||||
if(twoBits!=spanCondition) {
|
||||
return s-1;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
UChar32 c=(b<<12)|(t1<<6)|t2;
|
||||
if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
|
||||
return s-1;
|
||||
}
|
||||
}
|
||||
s+=2;
|
||||
continue;
|
||||
}
|
||||
} else if( /* handle U+10000..U+10FFFF inline */
|
||||
(t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
|
||||
(t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
|
||||
(t3=(uint8_t)(s[2]-0x80)) <= 0x3f
|
||||
) {
|
||||
// Give an illegal sequence the same value as the result of contains(FFFD).
|
||||
UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
|
||||
if( ( (0x10000<=c && c<=0x10ffff) ?
|
||||
containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
|
||||
asciiBytes[0x80]
|
||||
) != spanCondition
|
||||
) {
|
||||
return s-1;
|
||||
}
|
||||
s+=3;
|
||||
continue;
|
||||
}
|
||||
} else /* 0xc0<=b<0xe0 */ {
|
||||
if( /* handle U+0000..U+07FF inline */
|
||||
(t1=(uint8_t)(*s-0x80)) <= 0x3f
|
||||
) {
|
||||
if(((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
|
||||
return s-1;
|
||||
}
|
||||
++s;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Give an illegal sequence the same value as the result of contains(FFFD).
|
||||
// Handle each byte of an illegal sequence separately to simplify the code;
|
||||
// no need to optimize error handling.
|
||||
if(asciiBytes[0x80]!=spanCondition) {
|
||||
return s-1;
|
||||
}
|
||||
}
|
||||
|
||||
return limit0;
|
||||
}
|
||||
|
||||
/*
|
||||
* While going backwards through UTF-8 optimize only for ASCII.
|
||||
* Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
|
||||
* possible to tell from the last byte in a multi-byte sequence how many
|
||||
* preceding bytes there should be. Therefore, going backwards through UTF-8
|
||||
* is much harder than going forward.
|
||||
*/
|
||||
int32_t
|
||||
BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
|
||||
if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
|
||||
spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
|
||||
}
|
||||
|
||||
uint8_t b;
|
||||
|
||||
do {
|
||||
b=s[--length];
|
||||
if((int8_t)b>=0) {
|
||||
// ASCII sub-span
|
||||
if(spanCondition) {
|
||||
do {
|
||||
if(!asciiBytes[b]) {
|
||||
return length+1;
|
||||
} else if(length==0) {
|
||||
return 0;
|
||||
}
|
||||
b=s[--length];
|
||||
} while((int8_t)b>=0);
|
||||
} else {
|
||||
do {
|
||||
if(asciiBytes[b]) {
|
||||
return length+1;
|
||||
} else if(length==0) {
|
||||
return 0;
|
||||
}
|
||||
b=s[--length];
|
||||
} while((int8_t)b>=0);
|
||||
}
|
||||
}
|
||||
|
||||
int32_t prev=length;
|
||||
UChar32 c;
|
||||
if(b<0xc0) {
|
||||
// trail byte: collect a multi-byte character
|
||||
c=utf8_prevCharSafeBody(s, 0, &length, b, -1);
|
||||
if(c<0) {
|
||||
c=0xfffd;
|
||||
}
|
||||
} else {
|
||||
// lead byte in last-trail position
|
||||
c=0xfffd;
|
||||
}
|
||||
// c is a valid code point, not ASCII, not a surrogate
|
||||
if(c<=0x7ff) {
|
||||
if(((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
|
||||
return prev+1;
|
||||
}
|
||||
} else if(c<=0xffff) {
|
||||
int lead=c>>12;
|
||||
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
|
||||
if(twoBits<=1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
if(twoBits!=spanCondition) {
|
||||
return prev+1;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
|
||||
return prev+1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
|
||||
return prev+1;
|
||||
}
|
||||
}
|
||||
} while(length>0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
160
icu4c/source/common/bmpset.h
Normal file
160
icu4c/source/common/bmpset.h
Normal file
@ -0,0 +1,160 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: bmpset.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2007jan29
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __BMPSET_H__
|
||||
#define __BMPSET_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
|
||||
* Helper class for frozen UnicodeSets, implements contains() and span()
|
||||
* optimized for BMP code points. Structured to be UTF-8-friendly.
|
||||
*
|
||||
* ASCII: Look up bytes.
|
||||
* 2-byte characters: Bits organized vertically.
|
||||
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
|
||||
* with mixed for illegal ranges.
|
||||
* Supplementary characters: Call contains() on the parent set.
|
||||
*/
|
||||
class BMPSet : public UMemory {
|
||||
public:
|
||||
BMPSet(const int32_t *parentList, int32_t parentListLength);
|
||||
BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength);
|
||||
|
||||
virtual UBool contains(UChar32 c) const;
|
||||
|
||||
/*
|
||||
* Span the initial substring for which each character c has spanCondition==contains(c).
|
||||
* It must be s<limit and spanCondition==0 or 1.
|
||||
* @return The string pointer which limits the span.
|
||||
*/
|
||||
const UChar *span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const;
|
||||
|
||||
/*
|
||||
* Span the trailing substring for which each character c has spanCondition==contains(c).
|
||||
* It must be s<limit and spanCondition==0 or 1.
|
||||
* @return The string pointer which starts the span.
|
||||
*/
|
||||
const UChar *spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const;
|
||||
|
||||
/*
|
||||
* Span the initial substring for which each character c has spanCondition==contains(c).
|
||||
* It must be length>0 and spanCondition==0 or 1.
|
||||
* @return The string pointer which limits the span.
|
||||
*/
|
||||
const uint8_t *spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
|
||||
|
||||
/*
|
||||
* Span the trailing substring for which each character c has spanCondition==contains(c).
|
||||
* It must be length>0 and spanCondition==0 or 1.
|
||||
* @return The start of the span.
|
||||
*/
|
||||
int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
|
||||
|
||||
private:
|
||||
void initBits();
|
||||
void overrideIllegal();
|
||||
|
||||
/**
|
||||
* Same as UnicodeSet::findCodePoint(UChar32 c) const except that the
|
||||
* binary search is restricted for finding code points in a certain range.
|
||||
*
|
||||
* For restricting the search for finding in the range start..end,
|
||||
* pass in
|
||||
* lo=findCodePoint(start) and
|
||||
* hi=findCodePoint(end)
|
||||
* with 0<=lo<=hi<len.
|
||||
* findCodePoint(c) defaults to lo=0 and hi=len-1.
|
||||
*
|
||||
* @param c a character in a subrange of MIN_VALUE..MAX_VALUE
|
||||
* @param lo The lowest index to be returned.
|
||||
* @param hi The highest index to be returned.
|
||||
* @return the smallest integer i in the range lo..hi,
|
||||
* inclusive, such that c < list[i]
|
||||
*/
|
||||
int32_t findCodePoint(UChar32 c, int32_t lo, int32_t hi) const;
|
||||
|
||||
inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;
|
||||
|
||||
/*
|
||||
* One byte per ASCII character, or trail byte in lead position.
|
||||
* 0 or 1 for ASCII characters.
|
||||
* The value for trail bytes is the result of contains(FFFD)
|
||||
* for faster validity checking at runtime.
|
||||
*/
|
||||
UBool asciiBytes[0xc0];
|
||||
|
||||
/*
|
||||
* One bit per code point from U+0000..U+07FF.
|
||||
* The bits are organized vertically; consecutive code points
|
||||
* correspond to the same bit positions in consecutive table words.
|
||||
* With code point parts
|
||||
* lead=c{10..6}
|
||||
* trail=c{5..0}
|
||||
* it is set.contains(c)==(table7FF[trail] bit lead)
|
||||
*
|
||||
* Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD)
|
||||
* for faster validity checking at runtime.
|
||||
*/
|
||||
uint32_t table7FF[64];
|
||||
|
||||
/*
|
||||
* One bit per 64 BMP code points.
|
||||
* The bits are organized vertically; consecutive 64-code point blocks
|
||||
* correspond to the same bit position in consecutive table words.
|
||||
* With code point parts
|
||||
* lead=c{15..12}
|
||||
* t1=c{11..6}
|
||||
* test bits (lead+16) and lead in bmpBlockBits[t1].
|
||||
* If the upper bit is 0, then the lower bit indicates if contains(c)
|
||||
* for all code points in the 64-block.
|
||||
* If the upper bit is 1, then the block is mixed and set.contains(c)
|
||||
* must be called.
|
||||
*
|
||||
* Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to
|
||||
* the result of contains(FFFD) for faster validity checking at runtime.
|
||||
*/
|
||||
uint32_t bmpBlockBits[64];
|
||||
|
||||
/*
|
||||
* Inversion list indexes for restricted binary searches in
|
||||
* findCodePoint(), from
|
||||
* findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000).
|
||||
* U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
|
||||
* always looked up in the bit tables.
|
||||
* The last pair of indexes is for finding supplementary code points.
|
||||
*/
|
||||
int32_t list4kStarts[18];
|
||||
|
||||
/*
|
||||
* The inversion list of the parent set, for the slower contains() implementation
|
||||
* for mixed BMP blocks and for supplementary code points.
|
||||
* The list is terminated with list[listLength-1]=0x110000.
|
||||
*/
|
||||
const int32_t *list;
|
||||
int32_t listLength;
|
||||
};
|
||||
|
||||
/*
 * Test whether c is in the set, with a binary search in list[lo..hi].
 * An odd index from findCodePoint() means that c is inside a range.
 */
inline UBool BMPSet::containsSlow(UChar32 c, int32_t lo, int32_t hi) const {
    const int32_t index=findCodePoint(c, lo, hi);
    return (UBool)(index & 1);
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
@ -1771,6 +1771,14 @@
|
||||
<Filter
|
||||
Name="properties & sets"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\bmpset.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\bmpset.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\propname.cpp"
|
||||
>
|
||||
@ -1951,6 +1959,14 @@
|
||||
RelativePath=".\uniset_props.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unisetspan.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unisetspan.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\uprops.c"
|
||||
>
|
||||
|
@ -22,8 +22,10 @@
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class BMPSet;
|
||||
class ParsePosition;
|
||||
class SymbolTable;
|
||||
class UnicodeSetStringSpan;
|
||||
class UVector;
|
||||
class RuleCharacterIterator;
|
||||
|
||||
@ -263,6 +265,7 @@ class U_COMMON_API UnicodeSet : public UnicodeFilter {
|
||||
int32_t len; // length of list used; 0 <= len <= capacity
|
||||
int32_t capacity; // capacity of list
|
||||
UChar32* list; // MUST be terminated with HIGH
|
||||
BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
|
||||
UChar32* buffer; // internal buffer, may be NULL
|
||||
int32_t bufferCapacity; // capacity of buffer
|
||||
int32_t patLen;
|
||||
@ -278,6 +281,7 @@ class U_COMMON_API UnicodeSet : public UnicodeFilter {
|
||||
*/
|
||||
UChar *pat;
|
||||
UVector* strings; // maintained in sorted order
|
||||
UnicodeSetStringSpan *stringSpan;
|
||||
|
||||
public:
|
||||
|
||||
@ -377,6 +381,7 @@ public:
|
||||
|
||||
/**
|
||||
* Assigns this object to be a copy of another.
|
||||
* A frozen set will not be modified.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
UnicodeSet& operator=(const UnicodeSet& o);
|
||||
@ -405,6 +410,9 @@ public:
|
||||
* Returns a copy of this object. All UnicodeFunctor objects have
|
||||
* to support cloning in order to allow classes using
|
||||
* UnicodeFunctors, such as Transliterator, to implement cloning.
|
||||
* If this set is frozen, then the clone will be frozen as well.
|
||||
* Use cloneAsThawed() for a mutable clone of a frozen set.
|
||||
* @see cloneAsThawed
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
virtual UnicodeFunctor* clone() const;
|
||||
@ -418,6 +426,45 @@ public:
|
||||
*/
|
||||
virtual int32_t hashCode(void) const;
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Freezable API
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Determines whether the set has been frozen (made immutable) or not.
|
||||
* See the ICU4J Freezable interface for details.
|
||||
* @return TRUE/FALSE for whether the set has been frozen
|
||||
* @see freeze
|
||||
* @see cloneAsThawed
|
||||
* @draft ICU 3.8
|
||||
*/
|
||||
inline UBool isFrozen() const;
|
||||
|
||||
/**
|
||||
* Freeze the set (make it immutable).
|
||||
* Once frozen, it cannot be unfrozen and is therefore thread-safe
|
||||
* until it is deleted.
|
||||
* See the ICU4J Freezable interface for details.
|
||||
* Freezing the set may also make some operations faster, for example
|
||||
* contains() and span().
|
||||
* A frozen set will not be modified. (It remains frozen.)
|
||||
* @return this set.
|
||||
* @see isFrozen
|
||||
* @see cloneAsThawed
|
||||
* @draft ICU 3.8
|
||||
*/
|
||||
UnicodeFunctor *freeze();
|
||||
|
||||
/**
|
||||
* Clone the set and make the clone mutable.
|
||||
* See the ICU4J Freezable interface for details.
|
||||
* @return the mutable clone
|
||||
* @see freeze
|
||||
* @see isFrozen
|
||||
* @draft ICU 3.8
|
||||
*/
|
||||
UnicodeFunctor *cloneAsThawed() const;
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Public API
|
||||
//----------------------------------------------------------------
|
||||
@ -426,6 +473,7 @@ public:
|
||||
* Make this object represent the range <code>start - end</code>.
|
||||
* If <code>start > end</code> then this object is set to an
|
||||
* empty range.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param start first character in the set, inclusive
|
||||
* @param end last character in the set, inclusive
|
||||
@ -445,6 +493,7 @@ public:
|
||||
* Modifies this set to represent the set specified by the given
|
||||
* pattern, optionally ignoring white space. See the class
|
||||
* description for the syntax of the pattern language.
|
||||
* A frozen set will not be modified.
|
||||
* @param pattern a string specifying what characters are in the set
|
||||
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
|
||||
* contains a syntax error.
|
||||
@ -459,6 +508,7 @@ public:
|
||||
* Modifies this set to represent the set specified by the given
|
||||
* pattern, optionally ignoring white space. See the class
|
||||
* description for the syntax of the pattern language.
|
||||
* A frozen set will not be modified.
|
||||
* @param pattern a string specifying what characters are in the set
|
||||
* @param options bitmask for options to apply to the pattern.
|
||||
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
|
||||
@ -486,6 +536,7 @@ public:
|
||||
* pairs list for the parsed pattern is returned. This method calls
|
||||
* itself recursively to parse embedded subpatterns.
|
||||
*<em> Empties the set passed before applying the pattern.</em>
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param pattern the string containing the pattern to be parsed.
|
||||
* The portion of the string from pos.getIndex(), which must be a
|
||||
@ -515,6 +566,7 @@ public:
|
||||
* Returns a string representation of this set. If the result of
|
||||
* calling this function is passed to a UnicodeSet constructor, it
|
||||
* will produce another set that is equal to this one.
|
||||
* A frozen set will not be modified.
|
||||
* @param result the string to receive the rules. Previous
|
||||
* contents will be deleted.
|
||||
* @param escapeUnprintable if TRUE then convert unprintable
|
||||
@ -530,6 +582,7 @@ public:
|
||||
* Modifies this set to contain those code points which have the given value
|
||||
* for the given binary or enumerated property, as returned by
|
||||
* u_getIntPropertyValue. Prior contents of this set are lost.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
|
||||
* or UCHAR_INT_START..UCHAR_INT_LIMIT-1
|
||||
@ -555,6 +608,7 @@ public:
|
||||
* Modifies this set to contain those code points which have the
|
||||
* given value for the given property. Prior contents of this
|
||||
* set are lost.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param prop a property alias, either short or long. The name is matched
|
||||
* loosely. See PropertyAliases.txt for names and a description of loose
|
||||
@ -603,6 +657,7 @@ public:
|
||||
|
||||
/**
|
||||
* Returns true if this set contains the given character.
|
||||
* This function works faster with a frozen set.
|
||||
* @param c character to be checked for containment
|
||||
* @return true if the test condition is met
|
||||
* @stable ICU 2.0
|
||||
@ -702,6 +757,84 @@ public:
|
||||
*/
|
||||
inline UBool containsSome(const UnicodeString& s) const;
|
||||
|
||||
/**
|
||||
* Returns the length of the initial substring of the input string which
|
||||
* consists only of characters and strings that are contained in this set
|
||||
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
|
||||
* or only of characters and strings that are not contained
|
||||
* in this set (USET_SPAN_NOT_CONTAINED).
|
||||
* See USetSpanCondition for details.
|
||||
* Similar to the strspn() C library function.
|
||||
* Unpaired surrogates are treated according to contains() of their surrogate code points.
|
||||
* This function works faster with a frozen set and with a non-negative string length argument.
|
||||
* @param s start of the string
|
||||
* @param length of the string; can be -1 for NUL-terminated
|
||||
* @param spanCondition specifies the containment condition
|
||||
* @return the length of the initial substring according to the spanCondition;
|
||||
* 0 if the start of the string does not fit the spanCondition
|
||||
* @draft ICU 3.8
|
||||
* @see USetSpanCondition
|
||||
*/
|
||||
int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
|
||||
|
||||
/**
|
||||
* Returns the start of the trailing substring of the input string which
|
||||
* consists only of characters and strings that are contained in this set
|
||||
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
|
||||
* or only of characters and strings that are not contained
|
||||
* in this set (USET_SPAN_NOT_CONTAINED).
|
||||
* See USetSpanCondition for details.
|
||||
* Unpaired surrogates are treated according to contains() of their surrogate code points.
|
||||
* This function works faster with a frozen set and with a non-negative string length argument.
|
||||
* @param s start of the string
|
||||
* @param length of the string; can be -1 for NUL-terminated
|
||||
* @param spanCondition specifies the containment condition
|
||||
* @return the start of the trailing substring according to the spanCondition;
|
||||
* the string length if the end of the string does not fit the spanCondition
|
||||
* @draft ICU 3.8
|
||||
* @see USetSpanCondition
|
||||
*/
|
||||
int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
|
||||
|
||||
/**
|
||||
* Returns the length of the initial substring of the input string which
|
||||
* consists only of characters and strings that are contained in this set
|
||||
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
|
||||
* or only of characters and strings that are not contained
|
||||
* in this set (USET_SPAN_NOT_CONTAINED).
|
||||
* See USetSpanCondition for details.
|
||||
* Similar to the strspn() C library function.
|
||||
* Malformed byte sequences are treated according to contains(0xfffd).
|
||||
* This function works faster with a frozen set and with a non-negative string length argument.
|
||||
* @param s start of the string (UTF-8)
|
||||
* @param length of the string; can be -1 for NUL-terminated
|
||||
* @param spanCondition specifies the containment condition
|
||||
* @return the length of the initial substring according to the spanCondition;
|
||||
* 0 if the start of the string does not fit the spanCondition
|
||||
* @draft ICU 3.8
|
||||
* @see USetSpanCondition
|
||||
*/
|
||||
int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
|
||||
|
||||
/**
|
||||
* Returns the start of the trailing substring of the input string which
|
||||
* consists only of characters and strings that are contained in this set
|
||||
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
|
||||
* or only of characters and strings that are not contained
|
||||
* in this set (USET_SPAN_NOT_CONTAINED).
|
||||
* See USetSpanCondition for details.
|
||||
* Malformed byte sequences are treated according to contains(0xfffd).
|
||||
* This function works faster with a frozen set and with a non-negative string length argument.
|
||||
* @param s start of the string (UTF-8)
|
||||
* @param length of the string; can be -1 for NUL-terminated
|
||||
* @param spanCondition specifies the containment condition
|
||||
* @return the start of the trailing substring according to the spanCondition;
|
||||
* the string length if the end of the string does not fit the spanCondition
|
||||
* @draft ICU 3.8
|
||||
* @see USetSpanCondition
|
||||
*/
|
||||
int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher::matches()
|
||||
* @stable ICU 2.4
|
||||
@ -786,6 +919,7 @@ public:
|
||||
* the call leaves this set unchanged. If <code>start > end</code>
|
||||
* then an empty range is added, leaving the set unchanged.
|
||||
* This is equivalent to a boolean logic OR, or a set UNION.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param start first character, inclusive, of range to be added
|
||||
* to this set.
|
||||
@ -799,6 +933,7 @@ public:
|
||||
* Adds the specified character to this set if it is not already
|
||||
* present. If this set already contains the specified character,
|
||||
* the call leaves this set unchanged.
|
||||
* A frozen set will not be modified.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
UnicodeSet& add(UChar32 c);
|
||||
@ -809,6 +944,7 @@ public:
|
||||
* the call leaves this set unchanged.
|
||||
* Thus "ch" => {"ch"}
|
||||
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
|
||||
* A frozen set will not be modified.
|
||||
* @param s the source string
|
||||
* @return this object, for chaining
|
||||
* @stable ICU 2.4
|
||||
@ -829,6 +965,7 @@ public:
|
||||
/**
|
||||
* Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
|
||||
* If this set already contains any particular character, it has no effect on that character.
|
||||
* A frozen set will not be modified.
|
||||
* @param s the source string
|
||||
* @return this object, for chaining
|
||||
* @stable ICU 2.4
|
||||
@ -838,6 +975,7 @@ public:
|
||||
/**
|
||||
* Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
|
||||
* If this set already contains any particular character, it has no effect on that character.
|
||||
* A frozen set will not be modified.
|
||||
* @param s the source string
|
||||
* @return this object, for chaining
|
||||
* @stable ICU 2.4
|
||||
@ -847,6 +985,7 @@ public:
|
||||
/**
|
||||
* Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
|
||||
* If this set already contains any particular character, it has no effect on that character.
|
||||
* A frozen set will not be modified.
|
||||
* @param s the source string
|
||||
* @return this object, for chaining
|
||||
* @stable ICU 2.4
|
||||
@ -856,6 +995,7 @@ public:
|
||||
/**
|
||||
* Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
|
||||
* If this set already contains any particular character, it has no effect on that character.
|
||||
* A frozen set will not be modified.
|
||||
* @param s the source string
|
||||
* @return this object, for chaining
|
||||
* @stable ICU 2.4
|
||||
@ -887,6 +1027,7 @@ public:
|
||||
* specified range. If <code>start > end</code> then an empty range is
|
||||
* retained, leaving the set empty. This is equivalent to
|
||||
* a boolean logic AND, or a set INTERSECTION.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param start first character, inclusive, of range to be retained
|
||||
* to this set.
|
||||
@ -899,6 +1040,7 @@ public:
|
||||
|
||||
/**
|
||||
* Retain the specified character from this set if it is present.
|
||||
* A frozen set will not be modified.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
UnicodeSet& retain(UChar32 c);
|
||||
@ -908,6 +1050,7 @@ public:
|
||||
* The set will not contain the specified range once the call
|
||||
* returns. If <code>start > end</code> then an empty range is
|
||||
* removed, leaving the set unchanged.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param start first character, inclusive, of range to be removed
|
||||
* from this set.
|
||||
@ -921,6 +1064,7 @@ public:
|
||||
* Removes the specified character from this set if it is present.
|
||||
* The set will not contain the specified character once the call
|
||||
* returns.
|
||||
* A frozen set will not be modified.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
UnicodeSet& remove(UChar32 c);
|
||||
@ -929,6 +1073,7 @@ public:
|
||||
* Removes the specified string from this set if it is present.
|
||||
* The set will not contain the specified string once the call
|
||||
* returns.
|
||||
* A frozen set will not be modified.
|
||||
* @param s the source string
|
||||
* @return this object, for chaining
|
||||
* @stable ICU 2.4
|
||||
@ -939,6 +1084,7 @@ public:
|
||||
* Inverts this set. This operation modifies this set so that
|
||||
* its value is its complement. This is equivalent to
|
||||
* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
|
||||
* A frozen set will not be modified.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
virtual UnicodeSet& complement(void);
|
||||
@ -949,6 +1095,7 @@ public:
|
||||
* added if it is not in this set. If <code>start > end</code>
|
||||
* then an empty range is complemented, leaving the set unchanged.
|
||||
* This is equivalent to a boolean logic XOR.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param start first character, inclusive, of range to be removed
|
||||
* from this set.
|
||||
@ -962,6 +1109,7 @@ public:
|
||||
* Complements the specified character in this set. The character
|
||||
* will be removed if it is in this set, or will be added if it is
|
||||
* not in this set.
|
||||
* A frozen set will not be modified.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
UnicodeSet& complement(UChar32 c);
|
||||
@ -971,6 +1119,7 @@ public:
|
||||
* The set will not contain the specified string once the call
|
||||
* returns.
|
||||
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
|
||||
* A frozen set will not be modified.
|
||||
* @param s the string to complement
|
||||
* @return this object, for chaining
|
||||
* @stable ICU 2.4
|
||||
@ -983,6 +1132,7 @@ public:
|
||||
* modifies this set so that its value is the <i>union</i> of the two
|
||||
* sets. The behavior of this operation is unspecified if the specified
|
||||
* collection is modified while the operation is in progress.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param c set whose elements are to be added to this set.
|
||||
* @see #add(char, char)
|
||||
@ -996,6 +1146,7 @@ public:
|
||||
* its elements that are not contained in the specified set. This
|
||||
* operation effectively modifies this set so that its value is
|
||||
* the <i>intersection</i> of the two sets.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param c set that defines which elements this set will retain.
|
||||
* @stable ICU 2.0
|
||||
@ -1007,6 +1158,7 @@ public:
|
||||
* specified set. This operation effectively modifies this
|
||||
* set so that its value is the <i>asymmetric set difference</i> of
|
||||
* the two sets.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param c set that defines which elements will be removed from
|
||||
* this set.
|
||||
@ -1018,6 +1170,7 @@ public:
|
||||
* Complements in this set all elements contained in the specified
|
||||
* set. Any character in the other set will be removed if it is
|
||||
* in this set, or will be added if it is not in this set.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param c set that defines which elements will be xor'ed from
|
||||
* this set.
|
||||
@ -1028,6 +1181,7 @@ public:
|
||||
/**
|
||||
* Removes all of the elements from this set. This set will be
|
||||
* empty after this call returns.
|
||||
* A frozen set will not be modified.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
virtual UnicodeSet& clear(void);
|
||||
@ -1049,6 +1203,8 @@ public:
|
||||
* == b denotes that the contents are the same, not pointer
|
||||
* comparison.)
|
||||
*
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param attribute bitmask for attributes to close over.
|
||||
* Currently only the USET_CASE bit is supported. Any undefined bits
|
||||
* are ignored.
|
||||
@ -1137,6 +1293,7 @@ public:
|
||||
/**
|
||||
* Reallocate this object's internal structures to take up the least
|
||||
* possible space, without changing this object's value.
|
||||
* A frozen set will not be modified.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
virtual UnicodeSet& compact();
|
||||
@ -1189,6 +1346,12 @@ private:
|
||||
|
||||
private:
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Implementation: Clone as thawed (see ICU4J Freezable)
|
||||
//----------------------------------------------------------------
|
||||
|
||||
UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Implementation: Pattern parsing
|
||||
//----------------------------------------------------------------
|
||||
@ -1324,6 +1487,10 @@ inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
|
||||
return !operator==(o);
|
||||
}
|
||||
|
||||
// Reports whether this set is frozen: it is frozen unless both the bmpSet
// and the stringSpan pointers are NULL.
inline UBool UnicodeSet::isFrozen() const {
    return (UBool)!(bmpSet==NULL && stringSpan==NULL);
}
|
||||
|
||||
// Returns the logical negation of containsNone(start, end).
inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
    const UBool hasNone = containsNone(start, end);
    return !hasNone;
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002-2006, International Business Machines
|
||||
* Copyright (C) 2002-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -97,6 +97,120 @@ enum {
|
||||
USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8
|
||||
};
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Argument values for whether span() and similar functions continue while
|
||||
* the current character is contained vs. not contained in the set.
|
||||
*
|
||||
* The functionality is straightforward for sets with only single code points,
|
||||
* without strings (which is the common case):
|
||||
* - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE
|
||||
* work the same.
|
||||
* - span() and spanBack() partition any string the same way when
|
||||
* alternating between span(USET_SPAN_NOT_CONTAINED) and
|
||||
* span(either "contained" condition).
|
||||
* - Using a complemented (inverted) set and the opposite span conditions
|
||||
* yields the same results.
|
||||
*
|
||||
* When a set contains multi-code point strings, then these statements may not
|
||||
* be true, depending on the strings in the set (for example, whether they
|
||||
* overlap with each other) and the string that is processed.
|
||||
* For a set with strings:
|
||||
* - The complement of the set contains the opposite set of code points,
|
||||
* but the same set of strings.
|
||||
* Therefore, complementing both the set and the span conditions
|
||||
* may yield different results.
|
||||
* - When starting spans at different positions in a string
|
||||
* (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
|
||||
* because a set string may start before the later position.
|
||||
* - span(USET_SPAN_SIMPLE) may be shorter than
|
||||
* span(USET_SPAN_CONTAINED) because it will not recursively try
|
||||
* all possible paths.
|
||||
* For example, with a set which contains the three strings "xy", "xya" and "ax",
|
||||
* span("xyax", USET_SPAN_CONTAINED) will return 4 but
|
||||
* span("xyax", USET_SPAN_SIMPLE) will return 3.
|
||||
* span(USET_SPAN_SIMPLE) will never be longer than
|
||||
* span(USET_SPAN_CONTAINED).
|
||||
* - With either "contained" condition, span() and spanBack() may partition
|
||||
* a string in different ways.
|
||||
* For example, with a set which contains the two strings "ab" and "ba",
|
||||
* and when processing the string "aba",
|
||||
* span() will yield contained/not-contained boundaries of { 0, 2, 3 }
|
||||
* while spanBack() will yield boundaries of { 0, 1, 3 }.
|
||||
*
|
||||
* Note: If it is important to get the same boundaries whether iterating forward
|
||||
* or backward through a string, then either only span() should be used and
|
||||
* the boundaries cached for backward operation, or an ICU BreakIterator
|
||||
* could be used.
|
||||
*
|
||||
* Note: Unpaired surrogates are treated like surrogate code points.
|
||||
* Similarly, set strings match only on code point boundaries,
|
||||
* never in the middle of a surrogate pair.
|
||||
* Illegal UTF-8 sequences are treated like U+FFFD.
|
||||
* When processing UTF-8 strings, malformed set strings
|
||||
* (strings with unpaired surrogates which cannot be converted to UTF-8)
|
||||
* are ignored.
|
||||
*
|
||||
* @draft ICU 3.8
|
||||
*/
|
||||
enum USetSpanCondition {
|
||||
/**
|
||||
* Continue a span() while there is no set element at the current position.
|
||||
* Stops before the first set element (character or string).
|
||||
* (For code points only, this is like while contains(current)==FALSE).
|
||||
*
|
||||
* When span() returns, the substring between where it started and the position
|
||||
* it returned consists only of characters that are not in the set,
|
||||
* and none of its strings overlap with the span.
|
||||
*
|
||||
* @draft ICU 3.8
|
||||
*/
|
||||
USET_SPAN_NOT_CONTAINED = 0,
|
||||
/**
|
||||
* Continue a span() while there is a set element at the current position.
|
||||
* (For characters only, this is like while contains(current)==TRUE).
|
||||
*
|
||||
* When span() returns, the substring between where it started and the position
|
||||
* it returned consists only of set elements (characters or strings) that are in the set.
|
||||
*
|
||||
* If a set contains strings, then the span will be the longest substring
|
||||
* matching any of the possible concatenations of set elements (characters or strings).
|
||||
* (There must be a single, non-overlapping concatenation of characters or strings.)
|
||||
* This is equivalent to a POSIX regular expression for (OR of each set element)*.
|
||||
*
|
||||
* @draft ICU 3.8
|
||||
*/
|
||||
USET_SPAN_CONTAINED = 1,
|
||||
/**
|
||||
* Continue a span() while there is a set element at the current position.
|
||||
* (For characters only, this is like while contains(current)==TRUE).
|
||||
*
|
||||
* When span() returns, the substring between where it started and the position
|
||||
* it returned consists only of set elements (characters or strings) that are in the set.
|
||||
*
|
||||
* If a set only contains single characters, then this is the same
|
||||
* as USET_SPAN_CONTAINED.
|
||||
*
|
||||
* If a set contains strings, then the span will be the longest substring
|
||||
* with a match at each position with the longest single set element (character or string).
|
||||
*
|
||||
* Use this span condition together with other longest-match algorithms,
|
||||
* such as ICU converters (ucnv_getUnicodeSet()).
|
||||
*
|
||||
* @draft ICU 3.8
|
||||
*/
|
||||
USET_SPAN_SIMPLE = 2,
|
||||
/**
|
||||
* One more than the last span condition.
|
||||
* @draft ICU 3.8
|
||||
*/
|
||||
USET_SPAN_CONDITION_COUNT
|
||||
};
|
||||
typedef enum USetSpanCondition USetSpanCondition;
|
||||
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
|
||||
/**
|
||||
* A serialized form of a Unicode set. Limited manipulations are
|
||||
* possible directly on a serialized set. See below.
|
||||
@ -179,9 +293,72 @@ uset_openPatternOptions(const UChar* pattern, int32_t patternLength,
|
||||
U_STABLE void U_EXPORT2
|
||||
uset_close(USet* set);
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Returns a copy of this object.
|
||||
* If this set is frozen, then the clone will be frozen as well.
|
||||
* Use uset_cloneAsThawed() for a mutable clone of a frozen set.
|
||||
* @param set the original set
|
||||
* @return the newly allocated copy of the set
|
||||
* @see uset_cloneAsThawed
|
||||
* @draft ICU 3.8
|
||||
*/
|
||||
U_DRAFT USet * U_EXPORT2
|
||||
uset_clone(const USet *set);
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Freezable API
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Determines whether the set has been frozen (made immutable) or not.
|
||||
* See the ICU4J Freezable interface for details.
|
||||
* @param set the set
|
||||
* @return TRUE/FALSE for whether the set has been frozen
|
||||
* @see uset_freeze
|
||||
* @see uset_cloneAsThawed
|
||||
* @draft ICU 3.8
|
||||
*/
|
||||
U_DRAFT UBool U_EXPORT2
|
||||
uset_isFrozen(const USet *set);
|
||||
|
||||
/**
|
||||
* Freeze the set (make it immutable).
|
||||
* Once frozen, it cannot be unfrozen and is therefore thread-safe
|
||||
* until it is deleted.
|
||||
* See the ICU4J Freezable interface for details.
|
||||
* Freezing the set may also make some operations faster, for example
|
||||
* uset_contains() and uset_span().
|
||||
* A frozen set will not be modified. (It remains frozen.)
|
||||
* @param set the set
|
||||
* @return the same set, now frozen
|
||||
* @see uset_isFrozen
|
||||
* @see uset_cloneAsThawed
|
||||
* @draft ICU 3.8
|
||||
*/
|
||||
U_DRAFT void U_EXPORT2
|
||||
uset_freeze(USet *set);
|
||||
|
||||
/**
|
||||
* Clone the set and make the clone mutable.
|
||||
* See the ICU4J Freezable interface for details.
|
||||
* @param set the set
|
||||
* @return the mutable clone
|
||||
* @see uset_freeze
|
||||
* @see uset_isFrozen
|
||||
* @see uset_clone
|
||||
* @draft ICU 3.8
|
||||
*/
|
||||
U_DRAFT USet * U_EXPORT2
|
||||
uset_cloneAsThawed(const USet *set);
|
||||
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
|
||||
/**
|
||||
* Causes the USet object to represent the range <code>start - end</code>.
|
||||
* If <code>start > end</code> then this USet is set to an empty range.
|
||||
* A frozen set will not be modified.
|
||||
* @param set the object to set to the given range
|
||||
* @param start first character in the set, inclusive
|
||||
* @param end last character in the set, inclusive
|
||||
@ -196,6 +373,7 @@ uset_set(USet* set,
|
||||
* pattern. See the UnicodeSet class description for the syntax of
|
||||
* the pattern language. See also the User Guide chapter about UnicodeSet.
|
||||
* <em>Empties the set passed before applying the pattern.</em>
|
||||
* A frozen set will not be modified.
|
||||
* @param set The set to which the pattern is to be applied.
|
||||
* @param pattern A pointer to UChar string specifying what characters are in the set.
|
||||
* The character at pattern[0] must be a '['.
|
||||
@ -221,6 +399,7 @@ uset_applyPattern(USet *set,
|
||||
* Modifies the set to contain those code points which have the given value
|
||||
* for the given binary or enumerated property, as returned by
|
||||
* u_getIntPropertyValue. Prior contents of this set are lost.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param set the object to contain the code points defined by the property
|
||||
*
|
||||
@ -246,6 +425,7 @@ uset_applyIntPropertyValue(USet* set,
|
||||
* Modifies the set to contain those code points which have the
|
||||
* given value for the given property. Prior contents of this
|
||||
* set are lost.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param set the object to contain the code points defined by the given
|
||||
* property and value alias
|
||||
@ -319,6 +499,7 @@ uset_toPattern(const USet* set,
|
||||
/**
|
||||
* Adds the given character to the given USet. After this call,
|
||||
* uset_contains(set, c) will return TRUE.
|
||||
* A frozen set will not be modified.
|
||||
* @param set the object to which to add the character
|
||||
* @param c the character to add
|
||||
* @stable ICU 2.4
|
||||
@ -332,6 +513,7 @@ uset_add(USet* set, UChar32 c);
|
||||
* modifies this set so that its value is the <i>union</i> of the two
|
||||
* sets. The behavior of this operation is unspecified if the specified
|
||||
* collection is modified while the operation is in progress.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param set the object to which to add the set
|
||||
* @param additionalSet the source set whose elements are to be added to this set.
|
||||
@ -343,6 +525,7 @@ uset_addAll(USet* set, const USet *additionalSet);
|
||||
/**
|
||||
* Adds the given range of characters to the given USet. After this call,
|
||||
* uset_contains(set, start, end) will return TRUE.
|
||||
* A frozen set will not be modified.
|
||||
* @param set the object to which to add the character
|
||||
* @param start the first character of the range to add, inclusive
|
||||
* @param end the last character of the range to add, inclusive
|
||||
@ -354,6 +537,7 @@ uset_addRange(USet* set, UChar32 start, UChar32 end);
|
||||
/**
|
||||
* Adds the given string to the given USet. After this call,
|
||||
* uset_containsString(set, str, strLen) will return TRUE.
|
||||
* A frozen set will not be modified.
|
||||
* @param set the object to which to add the character
|
||||
* @param str the string to add
|
||||
* @param strLen the length of the string or -1 if null terminated.
|
||||
@ -365,6 +549,7 @@ uset_addString(USet* set, const UChar* str, int32_t strLen);
|
||||
/**
|
||||
* Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
|
||||
* If this set already contains any particular character, it has no effect on that character.
|
||||
* A frozen set will not be modified.
|
||||
* @param set the object to which to add the character
|
||||
* @param str the source string
|
||||
* @param strLen the length of the string or -1 if null terminated.
|
||||
@ -376,6 +561,7 @@ uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen);
|
||||
/**
|
||||
* Removes the given character from the given USet. After this call,
|
||||
* uset_contains(set, c) will return FALSE.
|
||||
* A frozen set will not be modified.
|
||||
* @param set the object from which to remove the character
|
||||
* @param c the character to remove
|
||||
* @stable ICU 2.4
|
||||
@ -386,6 +572,7 @@ uset_remove(USet* set, UChar32 c);
|
||||
/**
|
||||
* Removes the given range of characters from the given USet. After this call,
|
||||
* uset_contains(set, start, end) will return FALSE.
|
||||
* A frozen set will not be modified.
|
||||
* @param set the object to which to add the character
|
||||
* @param start the first character of the range to remove, inclusive
|
||||
* @param end the last character of the range to remove, inclusive
|
||||
@ -397,6 +584,7 @@ uset_removeRange(USet* set, UChar32 start, UChar32 end);
|
||||
/**
|
||||
* Removes the given string from the given USet. After this call,
|
||||
* uset_containsString(set, str, strLen) will return FALSE.
|
||||
* A frozen set will not be modified.
|
||||
* @param set the object from which to remove the string
|
||||
* @param str the string to remove
|
||||
* @param strLen the length of the string or -1 if null terminated.
|
||||
@ -410,6 +598,7 @@ uset_removeString(USet* set, const UChar* str, int32_t strLen);
|
||||
* specified set. This operation effectively modifies this
|
||||
* set so that its value is the <i>asymmetric set difference</i> of
|
||||
* the two sets.
|
||||
* A frozen set will not be modified.
|
||||
* @param set the object from which the elements are to be removed
|
||||
* @param removeSet the object that defines which elements will be
|
||||
* removed from this set
|
||||
@ -423,6 +612,7 @@ uset_removeAll(USet* set, const USet* removeSet);
|
||||
* specified range. If <code>start > end</code> then an empty range is
|
||||
* retained, leaving the set empty. This is equivalent to
|
||||
* a boolean logic AND, or a set INTERSECTION.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param set the object for which to retain only the specified range
|
||||
* @param start first character, inclusive, of range to be retained
|
||||
@ -440,6 +630,7 @@ uset_retain(USet* set, UChar32 start, UChar32 end);
|
||||
* its elements that are not contained in the specified set. This
|
||||
* operation effectively modifies this set so that its value is
|
||||
* the <i>intersection</i> of the two sets.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param set the object on which to perform the retain
|
||||
* @param retain set that defines which elements this set will retain
|
||||
@ -451,6 +642,7 @@ uset_retainAll(USet* set, const USet* retain);
|
||||
/**
|
||||
* Reallocate this object's internal structures to take up the least
|
||||
* possible space, without changing this object's value.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param set the object on which to perform the compact
|
||||
* @stable ICU 3.2
|
||||
@ -462,6 +654,7 @@ uset_compact(USet* set);
|
||||
* Inverts this set. This operation modifies this set so that
|
||||
* its value is its complement. This operation does not affect
|
||||
* the multicharacter strings, if any.
|
||||
* A frozen set will not be modified.
|
||||
* @param set the set
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
@ -472,6 +665,7 @@ uset_complement(USet* set);
|
||||
* Complements in this set all elements contained in the specified
|
||||
* set. Any character in the other set will be removed if it is
|
||||
* in this set, or will be added if it is not in this set.
|
||||
* A frozen set will not be modified.
|
||||
*
|
||||
* @param set the set with which to complement
|
||||
* @param complement set that defines which elements will be xor'ed
|
||||
@ -484,6 +678,7 @@ uset_complementAll(USet* set, const USet* complement);
|
||||
/**
|
||||
* Removes all of the elements from this set. This set will be
|
||||
* empty after this call returns.
|
||||
* A frozen set will not be modified.
|
||||
* @param set the set
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
@ -502,6 +697,7 @@ uset_isEmpty(const USet* set);
|
||||
|
||||
/**
|
||||
* Returns TRUE if the given USet contains the given character.
|
||||
* This function works faster with a frozen set.
|
||||
* @param set the set
|
||||
* @param c The codepoint to check for within the set
|
||||
* @return true if set contains c
|
||||
@ -651,6 +847,96 @@ uset_containsNone(const USet* set1, const USet* set2);
|
||||
U_STABLE UBool U_EXPORT2
|
||||
uset_containsSome(const USet* set1, const USet* set2);
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Returns the length of the initial substring of the input string which
|
||||
* consists only of characters and strings that are contained in this set
|
||||
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
|
||||
* or only of characters and strings that are not contained
|
||||
* in this set (USET_SPAN_NOT_CONTAINED).
|
||||
* See USetSpanCondition for details.
|
||||
* Similar to the strspn() C library function.
|
||||
* Unpaired surrogates are treated according to contains() of their surrogate code points.
|
||||
* This function works faster with a frozen set and with a non-negative string length argument.
|
||||
* @param set the set
|
||||
* @param s start of the string
|
||||
* @param length of the string; can be -1 for NUL-terminated
|
||||
* @param spanCondition specifies the containment condition
|
||||
* @return the length of the initial substring according to the spanCondition;
|
||||
* 0 if the start of the string does not fit the spanCondition
|
||||
* @draft ICU 3.8
|
||||
* @see USetSpanCondition
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
|
||||
|
||||
/**
|
||||
* Returns the start of the trailing substring of the input string which
|
||||
* consists only of characters and strings that are contained in this set
|
||||
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
|
||||
* or only of characters and strings that are not contained
|
||||
* in this set (USET_SPAN_NOT_CONTAINED).
|
||||
* See USetSpanCondition for details.
|
||||
* Unpaired surrogates are treated according to contains() of their surrogate code points.
|
||||
* This function works faster with a frozen set and with a non-negative string length argument.
|
||||
* @param set the set
|
||||
* @param s start of the string
|
||||
* @param length of the string; can be -1 for NUL-terminated
|
||||
* @param spanCondition specifies the containment condition
|
||||
* @return the start of the trailing substring according to the spanCondition;
|
||||
* the string length if the end of the string does not fit the spanCondition
|
||||
* @draft ICU 3.8
|
||||
* @see USetSpanCondition
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
|
||||
|
||||
/**
|
||||
* Returns the length of the initial substring of the input string which
|
||||
* consists only of characters and strings that are contained in this set
|
||||
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
|
||||
* or only of characters and strings that are not contained
|
||||
* in this set (USET_SPAN_NOT_CONTAINED).
|
||||
* See USetSpanCondition for details.
|
||||
* Similar to the strspn() C library function.
|
||||
* Malformed byte sequences are treated according to contains(0xfffd).
|
||||
* This function works faster with a frozen set and with a non-negative string length argument.
|
||||
* @param set the set
|
||||
* @param s start of the string (UTF-8)
|
||||
* @param length of the string; can be -1 for NUL-terminated
|
||||
* @param spanCondition specifies the containment condition
|
||||
* @return the length of the initial substring according to the spanCondition;
|
||||
* 0 if the start of the string does not fit the spanCondition
|
||||
* @draft ICU 3.8
|
||||
* @see USetSpanCondition
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
|
||||
|
||||
/**
|
||||
* Returns the start of the trailing substring of the input string which
|
||||
* consists only of characters and strings that are contained in this set
|
||||
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
|
||||
* or only of characters and strings that are not contained
|
||||
* in this set (USET_SPAN_NOT_CONTAINED).
|
||||
* See USetSpanCondition for details.
|
||||
* Malformed byte sequences are treated according to contains(0xfffd).
|
||||
* This function works faster with a frozen set and with a non-negative string length argument.
|
||||
* @param set the set
|
||||
* @param s start of the string (UTF-8)
|
||||
* @param length of the string; can be -1 for NUL-terminated
|
||||
* @param spanCondition specifies the containment condition
|
||||
* @return the start of the trailing substring according to the spanCondition;
|
||||
* the string length if the end of the string does not fit the spanCondition
|
||||
* @draft ICU 3.8
|
||||
* @see USetSpanCondition
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
|
||||
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
|
||||
/**
|
||||
* Returns true if set1 contains all of the characters and strings
|
||||
* of set2, and vice versa. It answers the question, 'Is set1 equal to set2?'
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include "unicode/symtable.h"
|
||||
#include "ruleiter.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "uhash.h"
|
||||
#include "util.h"
|
||||
#include "uvector.h"
|
||||
@ -21,6 +22,8 @@
|
||||
#include "ustrfmt.h"
|
||||
#include "uassert.h"
|
||||
#include "hash.h"
|
||||
#include "bmpset.h"
|
||||
#include "unisetspan.h"
|
||||
|
||||
// Define UChar constants using hex for EBCDIC compatibility
|
||||
// Used #define to reduce private static exports and memory access time.
|
||||
@ -138,8 +141,8 @@ static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) {
|
||||
* Constructs an empty set.
|
||||
*/
|
||||
UnicodeSet::UnicodeSet() :
|
||||
len(1), capacity(1 + START_EXTRA), list(0), buffer(0),
|
||||
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL)
|
||||
len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
|
||||
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
|
||||
{
|
||||
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
|
||||
if(list!=NULL){
|
||||
@ -158,8 +161,8 @@ UnicodeSet::UnicodeSet() :
|
||||
* @param end last character, inclusive, of range
|
||||
*/
|
||||
UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
|
||||
len(1), capacity(1 + START_EXTRA), list(0), buffer(0),
|
||||
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL)
|
||||
len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
|
||||
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
|
||||
{
|
||||
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
|
||||
if(list!=NULL){
|
||||
@ -177,8 +180,10 @@ UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
|
||||
*/
|
||||
UnicodeSet::UnicodeSet(const UnicodeSet& o) :
|
||||
UnicodeFilter(o),
|
||||
len(0), capacity(o.len + GROW_EXTRA), list(0), buffer(0),
|
||||
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL)
|
||||
len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
|
||||
bmpSet(0),
|
||||
buffer(0), bufferCapacity(0),
|
||||
patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
|
||||
{
|
||||
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
|
||||
if(list!=NULL){
|
||||
@ -189,16 +194,41 @@ UnicodeSet::UnicodeSet(const UnicodeSet& o) :
|
||||
_dbgct(this);
|
||||
}
|
||||
|
||||
// Copy-construct as thawed.
|
||||
UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
|
||||
UnicodeFilter(o),
|
||||
len(0), capacity(o.len + GROW_EXTRA), list(0),
|
||||
bmpSet(0),
|
||||
buffer(0), bufferCapacity(0),
|
||||
patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
|
||||
{
|
||||
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
|
||||
if(list!=NULL){
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
allocateStrings(status);
|
||||
// *this = o except for bmpSet and stringSpan
|
||||
len = o.len;
|
||||
uprv_memcpy(list, o.list, len*sizeof(UChar32));
|
||||
strings->assign(*o.strings, cloneUnicodeString, status);
|
||||
if (o.pat) {
|
||||
setPattern(UnicodeString(o.pat, o.patLen));
|
||||
}
|
||||
}
|
||||
_dbgct(this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Destructs the set.
|
||||
*/
|
||||
UnicodeSet::~UnicodeSet() {
|
||||
_dbgdt(this); // first!
|
||||
uprv_free(list);
|
||||
delete bmpSet;
|
||||
if (buffer) {
|
||||
uprv_free(buffer);
|
||||
}
|
||||
delete strings;
|
||||
delete stringSpan;
|
||||
releasePattern();
|
||||
}
|
||||
|
||||
@ -206,11 +236,24 @@ UnicodeSet::~UnicodeSet() {
|
||||
* Assigns this object to be a copy of another.
|
||||
*/
|
||||
UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
|
||||
if (isFrozen()) {
|
||||
return *this;
|
||||
}
|
||||
ensureCapacity(o.len);
|
||||
len = o.len;
|
||||
uprv_memcpy(list, o.list, len*sizeof(UChar32));
|
||||
if (o.bmpSet == NULL) {
|
||||
bmpSet = NULL;
|
||||
} else {
|
||||
bmpSet = new BMPSet(*o.bmpSet, list, len);
|
||||
}
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
strings->assign(*o.strings, cloneUnicodeString, ec);
|
||||
if (o.stringSpan == NULL) {
|
||||
stringSpan = NULL;
|
||||
} else {
|
||||
stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
|
||||
}
|
||||
releasePattern();
|
||||
if (o.pat) {
|
||||
setPattern(UnicodeString(o.pat, o.patLen));
|
||||
@ -218,6 +261,19 @@ UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a copy of this object. All UnicodeMatcher objects have
|
||||
* to support cloning in order to allow classes using
|
||||
* UnicodeMatchers, such as Transliterator, to implement cloning.
|
||||
*/
|
||||
UnicodeFunctor* UnicodeSet::clone() const {
|
||||
return new UnicodeSet(*this);
|
||||
}
|
||||
|
||||
/**
 * Returns a newly allocated copy of this set, made with the
 * "copy-construct as thawed" constructor.
 *
 * @return the copy, or NULL if the allocation failed
 */
UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
    UnicodeSet *thawedCopy = new UnicodeSet(*this, TRUE);
    return thawedCopy;
}
|
||||
|
||||
/**
|
||||
* Compares the specified object with this set for equality. Returns
|
||||
* <tt>true</tt> if the two sets
|
||||
@ -237,15 +293,6 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a copy of this object. All UnicodeMatcher objects have
|
||||
* to support cloning in order to allow classes using
|
||||
* UnicodeMatchers, such as Transliterator, to implement cloning.
|
||||
*/
|
||||
UnicodeFunctor* UnicodeSet::clone() const {
|
||||
return new UnicodeSet(*this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the hash code value for this set.
|
||||
*
|
||||
@ -265,20 +312,6 @@ int32_t UnicodeSet::hashCode(void) const {
|
||||
// Public API
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Make this object represent the range <code>start - end</code>.
|
||||
* If <code>end > start</code> then this object is set to an
|
||||
* an empty range.
|
||||
*
|
||||
* @param start first character in the set, inclusive
|
||||
* @rparam end last character in the set, inclusive
|
||||
*/
|
||||
UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
|
||||
clear();
|
||||
complement(start, end);
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of elements in this set (its cardinality),
|
||||
* Note than the elements of a set may include both individual
|
||||
@ -317,11 +350,17 @@ UBool UnicodeSet::contains(UChar32 c) const {
|
||||
//for (;;) {
|
||||
// if (c < list[++i]) break;
|
||||
//}
|
||||
if (bmpSet != NULL) {
|
||||
return bmpSet->contains(c);
|
||||
}
|
||||
if (stringSpan != NULL) {
|
||||
return stringSpan->contains(c);
|
||||
}
|
||||
if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
|
||||
return FALSE;
|
||||
}
|
||||
int32_t i = findCodePoint(c);
|
||||
return ((i & 1) != 0); // return true if odd
|
||||
return (UBool)(i & 1); // return true if odd
|
||||
}
|
||||
|
||||
/**
|
||||
@ -350,10 +389,10 @@ int32_t UnicodeSet::findCodePoint(UChar32 c) const {
|
||||
return 0;
|
||||
// High runner test. c is often after the last range, so an
|
||||
// initial check for this condition pays off.
|
||||
if (len >= 2 && c >= list[len-2])
|
||||
return len-1;
|
||||
int32_t lo = 0;
|
||||
int32_t hi = len - 1;
|
||||
if (lo >= hi || c >= list[hi-1])
|
||||
return hi;
|
||||
// invariant: c >= list[lo]
|
||||
// invariant: c < list[hi]
|
||||
for (;;) {
|
||||
@ -428,12 +467,8 @@ UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
|
||||
* @return true if the test condition is met
|
||||
*/
|
||||
UBool UnicodeSet::containsAll(const UnicodeString& s) const {
|
||||
UChar32 cp;
|
||||
for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
|
||||
cp = s.char32At(i);
|
||||
if (!contains(cp)) return FALSE;
|
||||
}
|
||||
return TRUE;
|
||||
return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
|
||||
s.length());
|
||||
}
|
||||
|
||||
/**
|
||||
@ -479,12 +514,8 @@ UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
|
||||
* @return true if the test condition is met
|
||||
*/
|
||||
UBool UnicodeSet::containsNone(const UnicodeString& s) const {
|
||||
UChar32 cp;
|
||||
for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
|
||||
cp = s.char32At(i);
|
||||
if (contains(cp)) return FALSE;
|
||||
}
|
||||
return TRUE;
|
||||
return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
|
||||
s.length());
|
||||
}
|
||||
|
||||
/**
|
||||
@ -723,6 +754,20 @@ UChar32 UnicodeSet::charAt(int32_t index) const {
|
||||
return (UChar32)-1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Make this object represent the range <code>start - end</code>.
|
||||
* If <code>start > end</code> then this object is set to
|
||||
* an empty range.
|
||||
*
|
||||
* @param start first character in the set, inclusive
|
||||
* @param end last character in the set, inclusive
|
||||
*/
|
||||
UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
    // Empty the set first, then complement the requested range so that
    // exactly start..end ends up in the set.
    clear();
    complement(start, end);

    return *this;
}
|
||||
|
||||
/**
|
||||
* Adds the specified range to this set if it is not already
|
||||
* present. If this set already contains the specified range,
|
||||
@ -777,7 +822,7 @@ UnicodeSet& UnicodeSet::add(UChar32 c) {
|
||||
int32_t i = findCodePoint(pinCodePoint(c));
|
||||
|
||||
// already in set?
|
||||
if ((i & 1) != 0) return *this;
|
||||
if ((i & 1) != 0 || isFrozen()) return *this;
|
||||
|
||||
// HIGH is 0x110000
|
||||
// assert(list[len-1] == HIGH);
|
||||
@ -888,7 +933,7 @@ UnicodeSet& UnicodeSet::add(UChar32 c) {
|
||||
* @return the modified set, for chaining
|
||||
*/
|
||||
UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
|
||||
if (s.length() == 0) return *this;
|
||||
if (s.length() == 0 || isFrozen()) return *this;
|
||||
int32_t cp = getSingleCP(s);
|
||||
if (cp < 0) {
|
||||
if (!strings->contains((void*) &s)) {
|
||||
@ -896,7 +941,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
|
||||
releasePattern();
|
||||
}
|
||||
} else {
|
||||
add((UChar32)cp, (UChar32)cp);
|
||||
add((UChar32)cp);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
@ -907,6 +952,9 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
|
||||
* already be in 'strings'.
|
||||
*/
|
||||
void UnicodeSet::_add(const UnicodeString& s) {
|
||||
if (isFrozen()) {
|
||||
return;
|
||||
}
|
||||
UnicodeString* t = new UnicodeString(s);
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
strings->sortedInsert(t, compareUnicodeString, ec);
|
||||
@ -942,7 +990,7 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
|
||||
UChar32 cp;
|
||||
for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
|
||||
cp = s.char32At(i);
|
||||
add(cp, cp);
|
||||
add(cp);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
@ -1070,7 +1118,7 @@ UnicodeSet& UnicodeSet::remove(UChar32 c) {
|
||||
* @return the modified set, for chaining
|
||||
*/
|
||||
UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
|
||||
if (s.length() == 0) return *this;
|
||||
if (s.length() == 0 || isFrozen()) return *this;
|
||||
int32_t cp = getSingleCP(s);
|
||||
if (cp < 0) {
|
||||
strings->removeElement((void*) &s);
|
||||
@ -1093,6 +1141,9 @@ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
|
||||
* from this set.
|
||||
*/
|
||||
UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
|
||||
if (isFrozen()) {
|
||||
return *this;
|
||||
}
|
||||
if (pinCodePoint(start) <= pinCodePoint(end)) {
|
||||
UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
|
||||
exclusiveOr(range, 2, 0);
|
||||
@ -1110,6 +1161,9 @@ UnicodeSet& UnicodeSet::complement(UChar32 c) {
|
||||
* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
|
||||
*/
|
||||
UnicodeSet& UnicodeSet::complement(void) {
|
||||
if (isFrozen()) {
|
||||
return *this;
|
||||
}
|
||||
if (list[0] == UNICODESET_LOW) {
|
||||
ensureBufferCapacity(len-1);
|
||||
uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
|
||||
@ -1134,7 +1188,7 @@ UnicodeSet& UnicodeSet::complement(void) {
|
||||
* @return this object, for chaining
|
||||
*/
|
||||
UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
|
||||
if (s.length() == 0) return *this;
|
||||
if (s.length() == 0 || isFrozen()) return *this;
|
||||
int32_t cp = getSingleCP(s);
|
||||
if (cp < 0) {
|
||||
if (strings->contains((void*) &s)) {
|
||||
@ -1182,6 +1236,9 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
|
||||
* @param c set that defines which elements this set will retain.
|
||||
*/
|
||||
UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
|
||||
if (isFrozen()) {
|
||||
return *this;
|
||||
}
|
||||
retain(c.list, c.len, 0);
|
||||
strings->retainAll(*c.strings);
|
||||
return *this;
|
||||
@ -1197,6 +1254,9 @@ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
|
||||
* this set.
|
||||
*/
|
||||
UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
|
||||
if (isFrozen()) {
|
||||
return *this;
|
||||
}
|
||||
retain(c.list, c.len, 2);
|
||||
strings->removeAll(*c.strings);
|
||||
return *this;
|
||||
@ -1211,6 +1271,9 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
|
||||
* this set.
|
||||
*/
|
||||
UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
|
||||
if (isFrozen()) {
|
||||
return *this;
|
||||
}
|
||||
exclusiveOr(c.list, c.len, 0);
|
||||
|
||||
for (int32_t i=0; i<c.strings->size(); ++i) {
|
||||
@ -1227,6 +1290,9 @@ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
|
||||
* empty after this call returns.
|
||||
*/
|
||||
UnicodeSet& UnicodeSet::clear(void) {
|
||||
if (isFrozen()) {
|
||||
return *this;
|
||||
}
|
||||
list[0] = UNICODESET_HIGH;
|
||||
len = 1;
|
||||
releasePattern();
|
||||
@ -1277,9 +1343,14 @@ const UnicodeString* UnicodeSet::getString(int32_t index) const {
|
||||
* possible space, without changing this object's value.
|
||||
*/
|
||||
UnicodeSet& UnicodeSet::compact() {
|
||||
if (isFrozen()) {
|
||||
return *this;
|
||||
}
|
||||
// Delete buffer first to defragment memory less.
|
||||
uprv_free(buffer);
|
||||
buffer = NULL;
|
||||
if (buffer != NULL) {
|
||||
uprv_free(buffer);
|
||||
buffer = NULL;
|
||||
}
|
||||
if (len < capacity) {
|
||||
// Make the capacity equal to len or 1.
|
||||
// We don't want to realloc of 0 size.
|
||||
@ -1437,6 +1508,9 @@ static inline UChar32 max(UChar32 a, UChar32 b) {
|
||||
// polarity = 1, 2: x xor ~y == x === y
|
||||
|
||||
void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
|
||||
if (isFrozen()) {
|
||||
return;
|
||||
}
|
||||
ensureBufferCapacity(len + otherLen);
|
||||
int32_t i = 0, j = 0, k = 0;
|
||||
UChar32 a = list[i++];
|
||||
@ -1479,6 +1553,9 @@ void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t pola
|
||||
// polarity = 3: ~x union ~y
|
||||
|
||||
void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
|
||||
if (isFrozen()) {
|
||||
return;
|
||||
}
|
||||
ensureBufferCapacity(len + otherLen);
|
||||
int32_t i = 0, j = 0, k = 0;
|
||||
UChar32 a = list[i++];
|
||||
@ -1584,6 +1661,9 @@ void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
|
||||
// polarity = 3: ~x intersect ~y
|
||||
|
||||
void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
|
||||
if (isFrozen()) {
|
||||
return;
|
||||
}
|
||||
ensureBufferCapacity(len + otherLen);
|
||||
int32_t i = 0, j = 0, k = 0;
|
||||
UChar32 a = list[i++];
|
||||
@ -1864,4 +1944,199 @@ void UnicodeSet::setPattern(const UnicodeString& newPat) {
|
||||
// We can regenerate an equivalent pattern later when requested.
|
||||
}
|
||||
|
||||
/**
 * Freezes this set. Does nothing if the set is already frozen.
 *
 * Before freezing, the scratch buffer is released and the inversion list is
 * shrunk when that saves more than GROW_EXTRA entries. Then one of two
 * helpers is built to speed up contains() and span():
 *  - a UnicodeSetStringSpan, if the set has strings that matter for spans;
 *  - otherwise a BMPSet over the code point list.
 *
 * @return this set
 */
UnicodeFunctor *UnicodeSet::freeze() {
    if(!isFrozen()) {
        // Do most of what compact() does before freezing because
        // compact() will not work when the set is frozen.
        // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).

        // Delete buffer first to defragment memory less.
        if (buffer != NULL) {
            uprv_free(buffer);
            buffer = NULL;
            // Keep the recorded capacity in step with the released buffer.
            bufferCapacity = 0;
        }
        if (capacity > (len + GROW_EXTRA)) {
            // Make the capacity equal to len or 1.
            // We don't want to realloc of 0 size.
            int32_t newCapacity = len + (len == 0);
            // Shrink through a temporary: if the reallocation fails the
            // original list is still valid and must not be lost.
            UChar32 *shrunk = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
            if (shrunk != NULL) {
                list = shrunk;
                capacity = newCapacity;
            }
        }

        // Optimize contains() and span() and similar functions.
        if (!strings->isEmpty()) {
            stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
            if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
                // All strings are irrelevant for span() etc. because
                // all of each string's code points are contained in this set.
                // Do not check needsStringSpanUTF8() because UTF-8 has at most as
                // many relevant strings as UTF-16.
                // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
                delete stringSpan;
                stringSpan = NULL;
            }
        }
        if (stringSpan == NULL) {
            // No span-relevant strings: Optimize for code point spans.
            bmpSet=new BMPSet(list, len);
        }
    }
    return this;
}
|
||||
|
||||
/**
 * Returns the length of the initial part of the UTF-16 string s that
 * satisfies spanCondition with respect to this set.
 *
 * @param s             start of the string
 * @param length        number of UChars in s, or negative for NUL-terminated
 * @param spanCondition USET_SPAN_NOT_CONTAINED, or any other value for a
 *                      "contained" span
 * @return the span length; 0 for an empty string
 */
int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
    // Fast path: a BMPSet is available and the length is known.
    if (bmpSet != NULL && length > 0) {
        return (int32_t)(bmpSet->span(s, s + length, spanCondition) - s);
    }

    if (length < 0) {
        length = u_strlen(s);
    }
    if (length == 0) {
        return 0;
    }

    if (stringSpan != NULL) {
        return stringSpan->span(s, length, spanCondition);
    }
    if (!strings->isEmpty()) {
        // Build a temporary string-span helper for just this direction,
        // encoding and condition.
        uint32_t which;
        if (spanCondition == USET_SPAN_NOT_CONTAINED) {
            which = UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED;
        } else {
            which = UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
        }
        UnicodeSetStringSpan strSpan(*this, *strings, which);
        if (strSpan.needsStringSpanUTF16()) {
            return strSpan.span(s, length, spanCondition);
        }
    }

    // Plain code point loop. Reduce the condition to a 0/1 flag that can be
    // compared directly with the result of contains().
    const UBool wantContained = (UBool)(spanCondition != USET_SPAN_NOT_CONTAINED);

    int32_t accepted = 0;   // end of the last code point that matched
    int32_t pos = 0;
    while (pos < length) {
        UChar32 cp;
        U16_NEXT(s, pos, length, cp);
        if (contains(cp) != wantContained) {
            break;
        }
        accepted = pos;
    }
    return accepted;
}
|
||||
|
||||
/**
 * Returns the start index of the trailing part of the UTF-16 string s that
 * satisfies spanCondition with respect to this set.
 *
 * @param s             start of the string
 * @param length        number of UChars in s, or negative for NUL-terminated
 * @param spanCondition USET_SPAN_NOT_CONTAINED, or any other value for a
 *                      "contained" span
 * @return the start of the trailing span; 0 for an empty string
 */
int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
    // Fast path: a BMPSet is available and the length is known.
    if (bmpSet != NULL && length > 0) {
        return (int32_t)(bmpSet->spanBack(s, s + length, spanCondition) - s);
    }

    if (length < 0) {
        length = u_strlen(s);
    }
    if (length == 0) {
        return 0;
    }

    if (stringSpan != NULL) {
        return stringSpan->spanBack(s, length, spanCondition);
    }
    if (!strings->isEmpty()) {
        // Build a temporary string-span helper for just this direction,
        // encoding and condition.
        uint32_t which;
        if (spanCondition == USET_SPAN_NOT_CONTAINED) {
            which = UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED;
        } else {
            which = UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
        }
        UnicodeSetStringSpan strSpan(*this, *strings, which);
        if (strSpan.needsStringSpanUTF16()) {
            return strSpan.spanBack(s, length, spanCondition);
        }
    }

    // Plain code point loop, walking backwards from the end. Reduce the
    // condition to a 0/1 flag comparable with the result of contains().
    const UBool wantContained = (UBool)(spanCondition != USET_SPAN_NOT_CONTAINED);

    int32_t accepted = length;  // start of the last code point that matched
    int32_t pos = length;
    while (pos > 0) {
        UChar32 cp;
        U16_PREV(s, 0, pos, cp);
        if (contains(cp) != wantContained) {
            break;
        }
        accepted = pos;
    }
    return accepted;
}
|
||||
|
||||
/**
 * Returns the length in bytes of the initial part of the UTF-8 string s
 * that satisfies spanCondition with respect to this set. In the plain code
 * point loop, a sequence that decodes to a negative value is treated as
 * U+FFFD.
 *
 * @param s             start of the string (UTF-8)
 * @param length        number of bytes in s, or negative for NUL-terminated
 * @param spanCondition USET_SPAN_NOT_CONTAINED, or any other value for a
 *                      "contained" span
 * @return the span length; 0 for an empty string
 */
int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
    // Fast path: a BMPSet is available and the length is known.
    if (bmpSet != NULL && length > 0) {
        const uint8_t *bytes = (const uint8_t *)s;
        return (int32_t)(bmpSet->spanUTF8(bytes, length, spanCondition) - bytes);
    }

    if (length < 0) {
        length = (int32_t)uprv_strlen(s);
    }
    if (length == 0) {
        return 0;
    }

    if (stringSpan != NULL) {
        return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
    }
    if (!strings->isEmpty()) {
        // Build a temporary string-span helper for just this direction,
        // encoding and condition.
        uint32_t which;
        if (spanCondition == USET_SPAN_NOT_CONTAINED) {
            which = UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED;
        } else {
            which = UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
        }
        UnicodeSetStringSpan strSpan(*this, *strings, which);
        if (strSpan.needsStringSpanUTF8()) {
            return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
        }
    }

    // Plain code point loop. Reduce the condition to a 0/1 flag that can be
    // compared directly with the result of contains().
    const UBool wantContained = (UBool)(spanCondition != USET_SPAN_NOT_CONTAINED);

    int32_t accepted = 0;   // end of the last code point that matched
    int32_t pos = 0;
    while (pos < length) {
        UChar32 cp;
        U8_NEXT(s, pos, length, cp);
        if (cp < 0) {
            cp = 0xfffd;
        }
        if (contains(cp) != wantContained) {
            break;
        }
        accepted = pos;
    }
    return accepted;
}
|
||||
|
||||
/**
 * Returns the byte index where the trailing part of the UTF-8 string s that
 * satisfies spanCondition with respect to this set begins. In the plain
 * code point loop, a sequence that decodes to a negative value is treated
 * as U+FFFD.
 *
 * @param s             start of the string (UTF-8)
 * @param length        number of bytes in s, or negative for NUL-terminated
 * @param spanCondition USET_SPAN_NOT_CONTAINED, or any other value for a
 *                      "contained" span
 * @return the start of the trailing span; 0 for an empty string
 */
int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
    // Fast path: a BMPSet is available and the length is known.
    if (bmpSet != NULL && length > 0) {
        const uint8_t *bytes = (const uint8_t *)s;
        return bmpSet->spanBackUTF8(bytes, length, spanCondition);
    }

    if (length < 0) {
        length = (int32_t)uprv_strlen(s);
    }
    if (length == 0) {
        return 0;
    }

    if (stringSpan != NULL) {
        return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
    }
    if (!strings->isEmpty()) {
        // Build a temporary string-span helper for just this direction,
        // encoding and condition.
        uint32_t which;
        if (spanCondition == USET_SPAN_NOT_CONTAINED) {
            which = UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED;
        } else {
            which = UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
        }
        UnicodeSetStringSpan strSpan(*this, *strings, which);
        if (strSpan.needsStringSpanUTF8()) {
            return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
        }
    }

    // Plain code point loop, walking backwards from the end. Reduce the
    // condition to a 0/1 flag comparable with the result of contains().
    const UBool wantContained = (UBool)(spanCondition != USET_SPAN_NOT_CONTAINED);

    int32_t accepted = length;  // start of the last code point that matched
    int32_t pos = length;
    while (pos > 0) {
        UChar32 cp;
        U8_PREV(s, 0, pos, cp);
        if (cp < 0) {
            cp = 0xfffd;
        }
        if (contains(cp) != wantContained) {
            break;
        }
        accepted = pos;
    }
    return accepted;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
@ -248,8 +248,8 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
|
||||
*/
|
||||
UnicodeSet::UnicodeSet(const UnicodeString& pattern,
|
||||
UErrorCode& status) :
|
||||
len(0), capacity(START_EXTRA), list(0), buffer(0),
|
||||
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL)
|
||||
len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
|
||||
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
|
||||
{
|
||||
if(U_SUCCESS(status)){
|
||||
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
|
||||
@ -276,8 +276,8 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern,
|
||||
uint32_t options,
|
||||
const SymbolTable* symbols,
|
||||
UErrorCode& status) :
|
||||
len(0), capacity(START_EXTRA), list(0), buffer(0),
|
||||
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL)
|
||||
len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
|
||||
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
|
||||
{
|
||||
if(U_SUCCESS(status)){
|
||||
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
|
||||
@ -296,8 +296,8 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
|
||||
uint32_t options,
|
||||
const SymbolTable* symbols,
|
||||
UErrorCode& status) :
|
||||
len(0), capacity(START_EXTRA), list(0), buffer(0),
|
||||
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL)
|
||||
len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
|
||||
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
|
||||
{
|
||||
if(U_SUCCESS(status)){
|
||||
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
|
||||
@ -348,7 +348,7 @@ UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
|
||||
uint32_t options,
|
||||
const SymbolTable* symbols,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
if (U_FAILURE(status) || isFrozen()) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -374,7 +374,7 @@ UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
|
||||
uint32_t options,
|
||||
const SymbolTable* symbols,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
if (U_FAILURE(status) || isFrozen()) {
|
||||
return *this;
|
||||
}
|
||||
// Need to build the pattern in a temporary string because
|
||||
@ -938,7 +938,7 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
|
||||
|
||||
UnicodeSet&
|
||||
UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
|
||||
if (U_FAILURE(ec)) return *this;
|
||||
if (U_FAILURE(ec) || isFrozen()) return *this;
|
||||
|
||||
if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
|
||||
applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
|
||||
@ -953,7 +953,7 @@ UnicodeSet&
|
||||
UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
|
||||
const UnicodeString& value,
|
||||
UErrorCode& ec) {
|
||||
if (U_FAILURE(ec)) return *this;
|
||||
if (U_FAILURE(ec) || isFrozen()) return *this;
|
||||
|
||||
// prop and value used to be converted to char * using the default
|
||||
// converter instead of the invariant conversion.
|
||||
@ -1293,6 +1293,9 @@ addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString
|
||||
}
|
||||
|
||||
UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
|
||||
if (isFrozen()) {
|
||||
return *this;
|
||||
}
|
||||
if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
const UCaseProps *csp = ucase_getSingleton(&status);
|
||||
|
1508
icu4c/source/common/unisetspan.cpp
Normal file
1508
icu4c/source/common/unisetspan.cpp
Normal file
File diff suppressed because it is too large
Load Diff
155
icu4c/source/common/unisetspan.h
Normal file
155
icu4c/source/common/unisetspan.h
Normal file
@ -0,0 +1,155 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: unisetspan.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2007mar01
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __UNISETSPAN_H__
|
||||
#define __UNISETSPAN_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
|
||||
* Implement span() etc. for a set with strings.
|
||||
* Avoid recursion because of its exponential complexity.
|
||||
* Instead, try multiple paths at once and track them with an IndexList.
|
||||
*/
|
||||
class UnicodeSetStringSpan : public UMemory {
|
||||
public:
|
||||
/*
|
||||
* Which span() variant will be used?
|
||||
* The object is either built for one variant and used once,
|
||||
* or built for all and may be used many times.
|
||||
*/
|
||||
enum {
|
||||
FWD = 0x20,
|
||||
BACK = 0x10,
|
||||
UTF16 = 8,
|
||||
UTF8 = 4,
|
||||
CONTAINED = 2,
|
||||
NOT_CONTAINED = 1,
|
||||
|
||||
ALL = 0x3f,
|
||||
|
||||
FWD_UTF16_CONTAINED = FWD | UTF16 | CONTAINED,
|
||||
FWD_UTF16_NOT_CONTAINED = FWD | UTF16 | NOT_CONTAINED,
|
||||
FWD_UTF8_CONTAINED = FWD | UTF8 | CONTAINED,
|
||||
FWD_UTF8_NOT_CONTAINED = FWD | UTF8 | NOT_CONTAINED,
|
||||
BACK_UTF16_CONTAINED = BACK | UTF16 | CONTAINED,
|
||||
BACK_UTF16_NOT_CONTAINED= BACK | UTF16 | NOT_CONTAINED,
|
||||
BACK_UTF8_CONTAINED = BACK | UTF8 | CONTAINED,
|
||||
BACK_UTF8_NOT_CONTAINED = BACK | UTF8 | NOT_CONTAINED
|
||||
};
|
||||
|
||||
UnicodeSetStringSpan(const UnicodeSet &set, const UVector &setStrings, uint32_t which);
|
||||
|
||||
// Copy constructor. Assumes which==ALL for a frozen set.
|
||||
UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, const UVector &newParentSetStrings);
|
||||
|
||||
~UnicodeSetStringSpan();
|
||||
|
||||
/*
|
||||
* Do the strings need to be checked in span() etc.?
|
||||
* @return TRUE if strings need to be checked (call span() here),
|
||||
* FALSE if not (use a BMPSet for best performance).
|
||||
*/
|
||||
inline UBool needsStringSpanUTF16();
|
||||
inline UBool needsStringSpanUTF8();
|
||||
|
||||
// For fast UnicodeSet::contains(c).
|
||||
inline UBool contains(UChar32 c) const;
|
||||
|
||||
int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
|
||||
|
||||
int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
|
||||
|
||||
int32_t spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
|
||||
|
||||
int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
|
||||
|
||||
private:
|
||||
// Special spanLength byte values.
|
||||
enum {
|
||||
// The spanLength is >=0xfe.
|
||||
LONG_SPAN=0xfe,
|
||||
// All code points in the string are contained in the parent set.
|
||||
ALL_CP_CONTAINED=0xff
|
||||
};
|
||||
|
||||
// Add a starting or ending string character to the spanNotSet
|
||||
// so that a character span ends before any string.
|
||||
void addToSpanNotSet(UChar32 c);
|
||||
|
||||
int32_t spanNot(const UChar *s, int32_t length) const;
|
||||
int32_t spanNotBack(const UChar *s, int32_t length) const;
|
||||
int32_t spanNotUTF8(const uint8_t *s, int32_t length) const;
|
||||
int32_t spanNotBackUTF8(const uint8_t *s, int32_t length) const;
|
||||
|
||||
// Set for span(). Same as parent but without strings.
|
||||
UnicodeSet spanSet;
|
||||
|
||||
// Set for span(not contained).
|
||||
// Same as spanSet, plus characters that start or end strings.
|
||||
UnicodeSet *pSpanNotSet;
|
||||
|
||||
// The strings of the parent set.
|
||||
const UVector &strings;
|
||||
|
||||
// Pointer to the UTF-8 string lengths.
|
||||
// Also pointer to further allocated storage for meta data and
|
||||
// UTF-8 string contents as necessary.
|
||||
int32_t *utf8Lengths;
|
||||
|
||||
// Pointer to the part of the (utf8Lengths) memory block that stores
|
||||
// the lengths of span(), spanBack() etc. for each string.
|
||||
uint8_t *spanLengths;
|
||||
|
||||
// Pointer to the part of the (utf8Lengths) memory block that stores
|
||||
// the UTF-8 versions of the parent set's strings.
|
||||
uint8_t *utf8;
|
||||
|
||||
// Number of bytes for all UTF-8 versions of strings together.
|
||||
int32_t utf8Length;
|
||||
|
||||
// Maximum lengths of relevant strings.
|
||||
int32_t maxLength16;
|
||||
int32_t maxLength8;
|
||||
|
||||
// Set up for all variants of span()?
|
||||
UBool all;
|
||||
|
||||
// Memory for small numbers and lengths of strings.
|
||||
// For example, for 8 strings:
|
||||
// 8 UTF-8 lengths, 8*4 bytes span lengths, 8*2 3-byte UTF-8 characters
|
||||
// = 112 bytes = int32_t[28].
|
||||
int32_t staticLengths[32];
|
||||
};
|
||||
|
||||
// TRUE when UTF-16 span functions must check the strings,
// which is exactly when maxLength16 is nonzero.
UBool UnicodeSetStringSpan::needsStringSpanUTF16() {
    const UBool stringsMatter=(UBool)(0!=maxLength16);
    return stringsMatter;
}
|
||||
|
||||
// TRUE when UTF-8 span functions must check the strings,
// which is exactly when maxLength8 is nonzero.
UBool UnicodeSetStringSpan::needsStringSpanUTF8() {
    const UBool stringsMatter=(UBool)(0!=maxLength8);
    return stringsMatter;
}
|
||||
|
||||
// Code point membership test, answered by spanSet.
UBool UnicodeSetStringSpan::contains(UChar32 c) const {
    const UnicodeSet &codePoints=spanSet;
    return codePoints.contains(c);
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002-2006, International Business Machines
|
||||
* Copyright (C) 2002-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -41,6 +41,26 @@ uset_close(USet* set) {
|
||||
delete (UnicodeSet*) set;
|
||||
}
|
||||
|
||||
// C API wrapper: returns the result of UnicodeSet::clone() as an opaque USet pointer.
U_DRAFT USet * U_EXPORT2
uset_clone(const USet *set) {
    const UnicodeSet *uniset=(const UnicodeSet*) set;
    return (USet*) (uniset->UnicodeSet::clone());
}
|
||||
|
||||
// C API wrapper: forwards to UnicodeSet::isFrozen().
U_DRAFT UBool U_EXPORT2
uset_isFrozen(const USet *set) {
    const UnicodeSet *uniset=(const UnicodeSet*) set;
    return uniset->UnicodeSet::isFrozen();
}
|
||||
|
||||
// C API wrapper: forwards to UnicodeSet::freeze(); its return value is not used.
U_DRAFT void U_EXPORT2
uset_freeze(USet *set) {
    UnicodeSet *uniset=(UnicodeSet*) set;
    uniset->UnicodeSet::freeze();
}
|
||||
|
||||
// C API wrapper: returns the result of UnicodeSet::cloneAsThawed() as an opaque USet pointer.
U_DRAFT USet * U_EXPORT2
uset_cloneAsThawed(const USet *set) {
    const UnicodeSet *uniset=(const UnicodeSet*) set;
    return (USet*) (uniset->UnicodeSet::cloneAsThawed());
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
uset_set(USet* set,
|
||||
UChar32 start, UChar32 end) {
|
||||
@ -64,12 +84,8 @@ uset_addRange(USet* set, UChar32 start, UChar32 end) {
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
uset_addString(USet* set, const UChar* str, int32_t strLen) {
|
||||
// WRONG! Do not alias, it will stay aliased, even after
|
||||
// copying. TODO: do we need a copy ctor that unaliases
|
||||
//UnicodeString s(strLen==-1, str, strLen);
|
||||
|
||||
// UnicodeString handles -1 for strLen
|
||||
UnicodeString s(str, strLen);
|
||||
UnicodeString s(strLen<0, str, strLen);
|
||||
((UnicodeSet*) set)->UnicodeSet::add(s);
|
||||
}
|
||||
|
||||
@ -174,6 +190,26 @@ uset_containsSome(const USet* set1, const USet* set2) {
|
||||
return ((const UnicodeSet*) set1)->UnicodeSet::containsSome(* (const UnicodeSet*) set2);
|
||||
}
|
||||
|
||||
// C API wrapper: forwards to UnicodeSet::span() for UTF-16 text.
U_DRAFT int32_t U_EXPORT2
uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) {
    const UnicodeSet *uniset=(const UnicodeSet*) set;
    return uniset->UnicodeSet::span(s, length, spanCondition);
}
|
||||
|
||||
// C API wrapper: forwards to UnicodeSet::spanBack() for UTF-16 text.
U_DRAFT int32_t U_EXPORT2
uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) {
    const UnicodeSet *uniset=(const UnicodeSet*) set;
    return uniset->UnicodeSet::spanBack(s, length, spanCondition);
}
|
||||
|
||||
// C API wrapper: forwards to UnicodeSet::spanUTF8() for UTF-8 text.
U_DRAFT int32_t U_EXPORT2
uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) {
    const UnicodeSet *uniset=(const UnicodeSet*) set;
    return uniset->UnicodeSet::spanUTF8(s, length, spanCondition);
}
|
||||
|
||||
// C API wrapper: forwards to UnicodeSet::spanBackUTF8() for UTF-8 text.
U_DRAFT int32_t U_EXPORT2
uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) {
    const UnicodeSet *uniset=(const UnicodeSet*) set;
    return uniset->UnicodeSet::spanBackUTF8(s, length, spanCondition);
}
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
uset_equals(const USet* set1, const USet* set2) {
|
||||
return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2002-2005, International Business Machines
|
||||
* Copyright (c) 2002-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
@ -19,6 +19,8 @@ static void Testj2269(void);
|
||||
static void TestSerialized(void);
|
||||
static void TestNonInvariantPattern(void);
|
||||
static void TestBadPattern(void);
|
||||
static void TestFreezable(void);
|
||||
static void TestSpan(void);
|
||||
|
||||
void addUSetTest(TestNode** root);
|
||||
|
||||
@ -40,6 +42,8 @@ addUSetTest(TestNode** root) {
|
||||
TEST(TestSerialized);
|
||||
TEST(TestNonInvariantPattern);
|
||||
TEST(TestBadPattern);
|
||||
TEST(TestFreezable);
|
||||
TEST(TestSpan);
|
||||
}
|
||||
|
||||
/*------------------------------------------------------------------
|
||||
@ -529,4 +533,80 @@ static void TestBadPattern(void) {
|
||||
}
|
||||
}
|
||||
|
||||
static USet *openIDSet() {
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
U_STRING_DECL(pattern, "[:ID_Continue:]", 15);
|
||||
U_STRING_INIT(pattern, "[:ID_Continue:]", 15);
|
||||
return uset_openPattern(pattern, 15, &errorCode);
|
||||
}
|
||||
|
||||
static void TestFreezable() {
|
||||
USet *idSet=openIDSet();
|
||||
USet *frozen=uset_clone(idSet);
|
||||
USet *thawed;
|
||||
if(!uset_equals(frozen, idSet)) {
|
||||
log_err("uset_clone() did not make an equal copy\n");
|
||||
}
|
||||
uset_freeze(frozen);
|
||||
uset_addRange(frozen, 0xd802, 0xd805);
|
||||
if(uset_isFrozen(idSet) || !uset_isFrozen(frozen) || !uset_equals(frozen, idSet)) {
|
||||
log_err("uset_freeze() or uset_isFrozen() does not work\n");
|
||||
}
|
||||
thawed=uset_cloneAsThawed(frozen);
|
||||
uset_addRange(thawed, 0xd802, 0xd805);
|
||||
if(uset_isFrozen(thawed) || uset_equals(thawed, idSet) || !uset_containsRange(thawed, 0xd802, 0xd805)) {
|
||||
log_err("uset_cloneAsThawed() does not work\n");
|
||||
}
|
||||
uset_close(idSet);
|
||||
uset_close(frozen);
|
||||
uset_close(thawed);
|
||||
}
|
||||
|
||||
static void TestSpan() {
|
||||
static const UChar s16[2]={ 0xe01, 0x3000 };
|
||||
static const char* s8="\xE0\xB8\x81\xE3\x80\x80";
|
||||
|
||||
USet *idSet=openIDSet();
|
||||
|
||||
if(
|
||||
1!=uset_span(idSet, s16, 2, USET_SPAN_CONTAINED) ||
|
||||
0!=uset_span(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) ||
|
||||
2!=uset_spanBack(idSet, s16, 2, USET_SPAN_CONTAINED) ||
|
||||
1!=uset_spanBack(idSet, s16, 2, USET_SPAN_NOT_CONTAINED)
|
||||
) {
|
||||
log_err("uset_span() or uset_spanBack() does not work\n");
|
||||
}
|
||||
|
||||
if(
|
||||
3!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
|
||||
0!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) ||
|
||||
6!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
|
||||
3!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED)
|
||||
) {
|
||||
log_err("uset_spanUTF8() or uset_spanBackUTF8() does not work\n");
|
||||
}
|
||||
|
||||
uset_freeze(idSet);
|
||||
|
||||
if(
|
||||
1!=uset_span(idSet, s16, 2, USET_SPAN_CONTAINED) ||
|
||||
0!=uset_span(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) ||
|
||||
2!=uset_spanBack(idSet, s16, 2, USET_SPAN_CONTAINED) ||
|
||||
1!=uset_spanBack(idSet, s16, 2, USET_SPAN_NOT_CONTAINED)
|
||||
) {
|
||||
log_err("uset_span(frozen) or uset_spanBack(frozen) does not work\n");
|
||||
}
|
||||
|
||||
if(
|
||||
3!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
|
||||
0!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) ||
|
||||
6!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
|
||||
3!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED)
|
||||
) {
|
||||
log_err("uset_spanUTF8(frozen) or uset_spanBackUTF8(frozen) does not work\n");
|
||||
}
|
||||
|
||||
uset_close(idSet);
|
||||
}
|
||||
|
||||
/*eof*/
|
||||
|
@ -1,6 +1,6 @@
|
||||
/***************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000-2004, International Business Machines
|
||||
* Copyright (C) 2000-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
************************************************************************
|
||||
@ -21,7 +21,6 @@
|
||||
#include "transapi.h"
|
||||
#include "cpdtrtst.h"
|
||||
#include "transrt.h"
|
||||
#include "usettest.h"
|
||||
#include "jamotest.h"
|
||||
#include "trnserr.h"
|
||||
#include "reptest.h"
|
||||
@ -29,7 +28,7 @@
|
||||
#define CASE(id,test) case id: \
|
||||
name = #test; \
|
||||
if (exec) { \
|
||||
logln(#test "---"); logln(""); \
|
||||
logln(#test "---"); logln(); \
|
||||
test t; \
|
||||
callTest(t, par); \
|
||||
} \
|
||||
@ -43,12 +42,11 @@ void IntlTestTransliterator::runIndexedTest( int32_t index, UBool exec, const ch
|
||||
CASE(1, TransliteratorAPITest);
|
||||
CASE(2, CompoundTransliteratorTest);
|
||||
CASE(3, TransliteratorRoundTripTest);
|
||||
CASE(4, UnicodeSetTest);
|
||||
CASE(5, JamoTest);
|
||||
CASE(6, TransliteratorErrorTest);
|
||||
CASE(7, ReplaceableTest);
|
||||
CASE(4, JamoTest);
|
||||
CASE(5, TransliteratorErrorTest);
|
||||
CASE(6, ReplaceableTest);
|
||||
#if !UCONFIG_NO_TRANSLITERATION && defined(U_USE_UNICODE_FILTER_LOGIC_OBSOLETE_2_8)
|
||||
CASE(10, UnicodeFilterLogicTest);
|
||||
CASE(7, UnicodeFilterLogicTest);
|
||||
#endif
|
||||
|
||||
default: name=""; break;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2005, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2007, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
@ -25,141 +25,41 @@
|
||||
#include "v32test.h"
|
||||
#include "uvectest.h"
|
||||
#include "aliastst.h"
|
||||
#include "usettest.h"
|
||||
//#include "custrtest.h"
|
||||
//#include "ccitrtst.h"
|
||||
//#include "cloctest.h"
|
||||
//#include "ctres.h"
|
||||
//#include "ctucd.h"
|
||||
|
||||
#define CASE(id, test) case id: \
|
||||
name = #test; \
|
||||
if (exec) { \
|
||||
logln(#test "---"); logln(); \
|
||||
test t; \
|
||||
callTest(t, par); \
|
||||
} \
|
||||
break
|
||||
|
||||
void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
|
||||
{
|
||||
if (exec) logln("TestSuite Utilities: ");
|
||||
switch (index) {
|
||||
case 0:
|
||||
name = "MultithreadTest";
|
||||
if (exec) {
|
||||
logln("MultithreadTest---"); logln("");
|
||||
MultithreadTest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;
|
||||
|
||||
case 1:
|
||||
name = "StringTest";
|
||||
if (exec) {
|
||||
logln("StringTest---"); logln("");
|
||||
StringTest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;
|
||||
|
||||
case 2:
|
||||
name = "UnicodeStringTest";
|
||||
if (exec) {
|
||||
logln("UnicodeStringTest---"); logln("");
|
||||
UnicodeStringTest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;
|
||||
|
||||
case 3:
|
||||
name = "LocaleTest";
|
||||
if (exec) {
|
||||
logln("LocaleTest---"); logln("");
|
||||
LocaleTest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;
|
||||
|
||||
case 4:
|
||||
name = "CharIterTest";
|
||||
if (exec) {
|
||||
logln("CharIterTest---"); logln("");
|
||||
CharIterTest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;
|
||||
|
||||
case 5:
|
||||
name = "UnicodeTest";
|
||||
if (exec) {
|
||||
logln("UnicodeTest---"); logln("");
|
||||
UnicodeTest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;
|
||||
|
||||
case 6:
|
||||
name = "ResourceBundleTest";
|
||||
if (exec) {
|
||||
logln("ResourceBundleTest---"); logln("");
|
||||
ResourceBundleTest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;
|
||||
case 7:
|
||||
name = "NewResourceBundleTest";
|
||||
if (exec) {
|
||||
logln("NewResourceBundleTest---"); logln("");
|
||||
NewResourceBundleTest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;
|
||||
|
||||
case 8:
|
||||
name = "PUtilTest";
|
||||
if (exec) {
|
||||
logln("PUtilTest---"); logln("");
|
||||
PUtilTest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;
|
||||
|
||||
case 9:
|
||||
name = "UObjectTest";
|
||||
if(exec) {
|
||||
logln ("UObjectTest---"); logln("");
|
||||
UObjectTest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;;
|
||||
|
||||
case 10:
|
||||
name = "UVector32Test";
|
||||
if(exec) {
|
||||
logln ("UVector32Test---"); logln("");
|
||||
UVector32Test test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;;
|
||||
|
||||
case 11:
|
||||
name = "UVectorTest";
|
||||
if(exec) {
|
||||
logln ("UVectorTest---"); logln("");
|
||||
UVectorTest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;;
|
||||
|
||||
case 12:
|
||||
name = "UTextTest";
|
||||
if(exec) {
|
||||
logln ("UTextTest---"); logln("");
|
||||
UTextTest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;
|
||||
|
||||
case 13:
|
||||
name = "LocaleAliasTest";
|
||||
if (exec) {
|
||||
logln("LocaleAliasTest---"); logln("");
|
||||
LocaleAliasTest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
break;
|
||||
|
||||
CASE(0, MultithreadTest);
CASE(1, StringTest);
CASE(2, UnicodeStringTest);
CASE(3, LocaleTest);
CASE(4, CharIterTest);
CASE(5, UnicodeTest);
CASE(6, ResourceBundleTest);
CASE(7, NewResourceBundleTest);
CASE(8, PUtilTest);
CASE(9, UObjectTest);
CASE(10, UVector32Test);
CASE(11, UVectorTest);
CASE(12, UTextTest);
// Index 13 runs LocaleAliasTest (declared in aliastst.h), as the hand-written
// case 13 did; MultithreadTest already runs as index 0.
CASE(13, LocaleAliasTest);
CASE(14, UnicodeSetTest);
|
||||
default: name = ""; break; //needed to end loop
|
||||
}
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,7 +1,7 @@
|
||||
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2006, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2007, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************
|
||||
**********************************************************************
|
||||
@ -16,16 +16,21 @@
|
||||
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/ucnv_err.h"
|
||||
#include "intltest.h"
|
||||
|
||||
class UnicodeSetWithStrings;
|
||||
|
||||
/**
|
||||
* UnicodeSet test
|
||||
*/
|
||||
class UnicodeSetTest: public IntlTest {
|
||||
public:
|
||||
UnicodeSetTest();
|
||||
~UnicodeSetTest();
|
||||
|
||||
void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par=NULL);
|
||||
|
||||
private:
|
||||
void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par=NULL);
|
||||
|
||||
void Testj2268();
|
||||
|
||||
@ -76,6 +81,12 @@ private:
|
||||
|
||||
void TestPosixClasses();
|
||||
|
||||
void TestFreezable();
|
||||
|
||||
void TestSpan();
|
||||
|
||||
void TestStringSpan();
|
||||
|
||||
private:
|
||||
|
||||
UBool toPatternAux(UChar32 start, UChar32 end);
|
||||
@ -152,6 +163,26 @@ private:
|
||||
const UnicodeSet& set,
|
||||
UChar32 start, UChar32 end);
|
||||
void doAssert(UBool, const char*);
|
||||
|
||||
void testSpan(const UnicodeSetWithStrings *sets[4], const void *s, int32_t length, UBool isUTF16,
|
||||
uint32_t whichSpans,
|
||||
int32_t expectLimits[], int32_t &expectCount,
|
||||
const char *testName, int32_t index);
|
||||
void testSpan(const UnicodeSetWithStrings *sets[4], const void *s, int32_t length, UBool isUTF16,
|
||||
uint32_t whichSpans,
|
||||
const char *testName, int32_t index);
|
||||
void testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
|
||||
const UChar *s16, int32_t length16,
|
||||
uint32_t whichSpans,
|
||||
const char *testName, int32_t index);
|
||||
void testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName);
|
||||
void testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName);
|
||||
void testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName);
|
||||
|
||||
UConverter *openUTF8Converter();
|
||||
|
||||
UConverter *utf8Cnv;
|
||||
|
||||
public:
|
||||
static UnicodeString escape(const UnicodeString& s);
|
||||
};
|
||||
|
@ -70,7 +70,7 @@
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
AdditionalDependencies="../../../lib/icule.lib ../../../lib/icuuc.lib odbc32.lib odbccp32.lib"
|
||||
AdditionalDependencies="../../../lib/icule.lib ../../../lib/icuuc.lib"
|
||||
OutputFile=".\Release/letest.exe"
|
||||
LinkIncremental="1"
|
||||
SuppressStartupBanner="true"
|
||||
|
78
icu4c/source/test/perf/unisetperf/Makefile.in
Normal file
78
icu4c/source/test/perf/unisetperf/Makefile.in
Normal file
@ -0,0 +1,78 @@
|
||||
## Makefile.in for ICU - test/perf/unisetperf
|
||||
## Copyright (c) 2001-2007, International Business Machines Corporation and
|
||||
## others. All Rights Reserved.
|
||||
|
||||
## Source directory information
|
||||
srcdir = @srcdir@
|
||||
top_srcdir = @top_srcdir@
|
||||
|
||||
top_builddir = ../../..
|
||||
|
||||
include $(top_builddir)/icudefs.mk
|
||||
|
||||
## Build directory information
|
||||
subdir = test/perf/unisetperf
|
||||
|
||||
## Extra files to remove for 'make clean'
|
||||
CLEANFILES = *~ $(DEPS)
|
||||
|
||||
## Target information
|
||||
TARGET = unisetperf
|
||||
|
||||
CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(top_srcdir)/tools/toolutil -I$(top_srcdir)/tools/ctestfw
|
||||
LIBS = $(LIBCTESTFW) $(LIBICUI18N) $(LIBICUUC) $(LIBICUTOOLUTIL) $(DEFAULT_LIBS) $(LIB_M)
|
||||
|
||||
OBJECTS = unisetperf.o
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
||||
## List of phony targets
|
||||
.PHONY : all all-local install install-local clean clean-local \
|
||||
distclean distclean-local dist dist-local check check-local
|
||||
|
||||
## Clear suffix list
|
||||
.SUFFIXES :
|
||||
|
||||
## List of standard targets
|
||||
all: all-local
|
||||
install: install-local
|
||||
clean: clean-local
|
||||
distclean : distclean-local
|
||||
dist: dist-local
|
||||
check: all check-local
|
||||
|
||||
all-local: $(TARGET)
|
||||
|
||||
install-local:
|
||||
|
||||
dist-local:
|
||||
|
||||
clean-local:
|
||||
test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
|
||||
$(RMV) $(OBJECTS) $(TARGET)
|
||||
|
||||
distclean-local: clean-local
|
||||
$(RMV) Makefile
|
||||
|
||||
check-local: all-local
|
||||
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
cd $(top_builddir) \
|
||||
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||
|
||||
$(TARGET) : $(OBJECTS)
|
||||
$(LINK.cc) -o $@ $^ $(LIBS)
|
||||
|
||||
invoke:
|
||||
ICU_DATA=$${ICU_DATA:-$(top_builddir)/data/} TZ=PST8PDT $(INVOKE) $(INVOCATION)
|
||||
|
||||
ifeq (,$(MAKECMDGOALS))
|
||||
-include $(DEPS)
|
||||
else
|
||||
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
|
||||
ifneq ($(patsubst %install,,$(MAKECMDGOALS)),)
|
||||
-include $(DEPS)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
197
icu4c/source/test/perf/unisetperf/draft/bitset.cpp
Normal file
197
icu4c/source/test/perf/unisetperf/draft/bitset.cpp
Normal file
@ -0,0 +1,197 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: bitset.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2007jan15
|
||||
* created by: Markus Scherer
|
||||
*
|
||||
* Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
|
||||
* using a folded bit set consisting of a 1k-entry index table and a
|
||||
* compacted array of 64-bit words.
|
||||
* Uses a simple hash table for compaction.
|
||||
* Uses the original set for supplementary code points.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicont.h"
|
||||
|
||||
/*
|
||||
* Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
|
||||
* Hashes 64-bit words and maps them to 16-bit integers which are
|
||||
* assigned in order of new incoming words for subsequent storage
|
||||
* in a contiguous array.
|
||||
*/
|
||||
struct BMPBitHash : public UObject {
|
||||
int64_t keys[0x800]; // 2k
|
||||
uint16_t values[0x800];
|
||||
uint16_t reverse[0x400];
|
||||
uint16_t count;
|
||||
const int32_t prime=1301; // Less than 2k.
|
||||
|
||||
BMPBitHash() : count(0) {
|
||||
// Fill values[] with 0xffff.
|
||||
uprv_memset(values, 0xff, sizeof(values));
|
||||
}
|
||||
|
||||
/*
|
||||
* Map a key to an integer count.
|
||||
* Map at most 1k=0x400 different keys with this data structure.
|
||||
*/
|
||||
uint16_t map(int64_t key) {
|
||||
int32_t hash=(int32_t)(key>>55)&0x1ff;
|
||||
hash^=(int32_t)(key>>44)&0x7ff;
|
||||
hash^=(int32_t)(key>>33)&0x7ff;
|
||||
hash^=(int32_t)(key>>22)&0x7ff;
|
||||
hash^=(int32_t)(key>>11)&0x7ff;
|
||||
hash^=(int32_t)key&0x7ff;
|
||||
for(;;) {
|
||||
if(values[hash]==0xffff) {
|
||||
// Unused slot.
|
||||
keys[hash]=key;
|
||||
reverse[count]=hash;
|
||||
return values[hash]=count++;
|
||||
} else if(keys[hash]==key) {
|
||||
// Found a slot with this key.
|
||||
return values[hash];
|
||||
} else {
|
||||
// Used slot with a different key, move to another slot.
|
||||
hash=(hash+prime)&0x7ff;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint16_t countKeys() const { return count; }
|
||||
|
||||
/*
|
||||
* Invert the hash map: Fill an array of length countKeys() with the keys
|
||||
* indexed by their mapped values.
|
||||
*/
|
||||
void invert(int64_t *k) const {
|
||||
uint16_t i;
|
||||
|
||||
for(i=0; i<count; ++i) {
|
||||
k[i]=keys[reverse[i]];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class BitSet : public UObject, public UnicodeContainable {
|
||||
public:
|
||||
BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
BMPBitHash *bitHash=new BMPBitHash;
|
||||
if(bitHash==NULL || restSet==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
UnicodeSetIterator iter(set);
|
||||
int64_t b;
|
||||
UChar32 start, end;
|
||||
int32_t prevIndex, i, j;
|
||||
|
||||
b=0; // Not necessary but makes compilers happy.
|
||||
prevIndex=-1;
|
||||
for(;;) {
|
||||
if(iter.nextRange() && !iter.isString()) {
|
||||
start=iter.getCodepoint();
|
||||
end=iter.getCodepointEnd();
|
||||
} else {
|
||||
start=0x10000;
|
||||
}
|
||||
i=start>>6;
|
||||
if(prevIndex!=i) {
|
||||
// Finish the end of the previous range.
|
||||
if(prevIndex<0) {
|
||||
prevIndex=0;
|
||||
} else {
|
||||
index[prevIndex++]=bitHash->map(b);
|
||||
}
|
||||
// Fill all-zero entries between ranges.
|
||||
if(prevIndex<i) {
|
||||
uint16_t zero=bitHash->map(0);
|
||||
do {
|
||||
index[prevIndex++]=zero;
|
||||
} while(prevIndex<i);
|
||||
}
|
||||
b=0;
|
||||
}
|
||||
if(start>0xffff) {
|
||||
break;
|
||||
}
|
||||
b|=~((INT64_C(1)<<(start&0x3f))-1);
|
||||
j=end>>6;
|
||||
if(i<j) {
|
||||
// Set bits for the start of the range.
|
||||
index[i++]=bitHash->map(b);
|
||||
// Fill all-one entries inside the range.
|
||||
if(i<j) {
|
||||
uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
|
||||
do {
|
||||
index[i++]=all;
|
||||
} while(i<j);
|
||||
}
|
||||
b=INT64_C(0xffffffffffffffff);
|
||||
}
|
||||
/* i==j */
|
||||
b&=(INT64_C(1)<<(end&0x3f))-1;
|
||||
prevIndex=j;
|
||||
}
|
||||
|
||||
if(bitHash->countKeys()>LENGTHOF(shortBits)) {
|
||||
bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
|
||||
}
|
||||
if(bits!=NULL) {
|
||||
bitHash->invert(bits);
|
||||
} else {
|
||||
bits=shortBits;
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
latin1Set[0]=(uint32_t)bits[0];
|
||||
latin1Set[1]=(uint32_t)(bits[0]>>32);
|
||||
latin1Set[2]=(uint32_t)bits[1];
|
||||
latin1Set[3]=(uint32_t)(bits[1]>>32);
|
||||
latin1Set[4]=(uint32_t)bits[2];
|
||||
latin1Set[5]=(uint32_t)(bits[2]>>32);
|
||||
latin1Set[6]=(uint32_t)bits[3];
|
||||
latin1Set[7]=(uint32_t)(bits[3]>>32);
|
||||
|
||||
restSet.remove(0, 0xffff);
|
||||
}
|
||||
|
||||
~BitSet() {
|
||||
if(bits!=shortBits) {
|
||||
uprv_free(bits);
|
||||
}
|
||||
delete restSet;
|
||||
}
|
||||
|
||||
UBool contains(UChar32 c) const {
|
||||
if((uint32_t)c<=0xff) {
|
||||
return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
|
||||
} else if((uint32_t)c<0xffff) {
|
||||
return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
|
||||
} else {
|
||||
return restSet->contains(c);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
uint16_t index[0x400];
|
||||
int64_t shortBits[32];
|
||||
int64_t *bits;
|
||||
|
||||
uint32_t latin1Bits[8];
|
||||
|
||||
UnicodeSet *restSet;
|
||||
};
|
19
icu4c/source/test/perf/unisetperf/draft/contperf.bat
Executable file
19
icu4c/source/test/perf/unisetperf/draft/contperf.bat
Executable file
@ -0,0 +1,19 @@
|
||||
rem Copyright (c) 2007, International Business Machines Corporation and
rem others. All Rights Reserved.

set PERF=c:\svn\icuproj\icu\ucnvutf8\source\test\perf\unisetperf\release\unisetperf
rem Runs the Contains test for every UDHR sample file and every set type.
rem types: slow Bv Bv0 B0
rem --pattern [:White_Space:]

for %%f in (udhr_eng.txt udhr_deu.txt udhr_fra.txt udhr_rus.txt udhr_tha.txt udhr_jpn.txt udhr_cmn.txt udhr_jpn.html) do (
  for %%t in (slow Bv Bv0 B0) do (
    %PERF% Contains --type %%t -f \temp\udhr\%%f --pattern [:White_Space:] -v -e UTF-8 --passes 3 --iterations 10000
  )
)
|
23
icu4c/source/test/perf/unisetperf/draft/contperf.sh
Executable file
23
icu4c/source/test/perf/unisetperf/draft/contperf.sh
Executable file
@ -0,0 +1,23 @@
|
||||
#!/bin/sh
# Copyright (c) 2007, International Business Machines Corporation and
# others. All Rights Reserved.

# Trace every command and stop at the first failure.
set -ex

PERF=test/perf/unisetperf/unisetperf
# Runs the Contains test for each UDHR sample file and each set type.
# slow Bv Bv0 B0
# --pattern [:White_Space:]

for file in udhr_eng.txt udhr_deu.txt udhr_fra.txt udhr_rus.txt \
            udhr_tha.txt udhr_jpn.txt udhr_cmn.txt udhr_jpn.html
do
  for type in slow Bv Bv0
  do
    $PERF Contains --type $type -f ~/udhr/$file -v -e UTF-8 --passes 3 --iterations 10000
  done
done
|
19
icu4c/source/test/perf/unisetperf/draft/span16perf.bat
Executable file
19
icu4c/source/test/perf/unisetperf/draft/span16perf.bat
Executable file
@ -0,0 +1,19 @@
|
||||
rem Copyright (c) 2007, International Business Machines Corporation and
rem others. All Rights Reserved.

set PERF=c:\svn\icuproj\icu\ucnvutf8\source\test\perf\unisetperf\release\unisetperf
rem Runs the SpanUTF16 test for every UDHR sample file and every set type.
rem types: slow Bv Bv0 B0
rem --pattern [:White_Space:]

for %%f in (udhr_eng.txt udhr_deu.txt udhr_fra.txt udhr_rus.txt udhr_tha.txt udhr_jpn.txt udhr_cmn.txt udhr_jpn.html) do (
  for %%t in (slow Bv Bv0) do (
    %PERF% SpanUTF16 --type %%t -f \temp\udhr\%%f --pattern [:White_Space:] -v -e UTF-8 --passes 3 --iterations 10000
  )
)
|
23
icu4c/source/test/perf/unisetperf/draft/span16perf.sh
Executable file
23
icu4c/source/test/perf/unisetperf/draft/span16perf.sh
Executable file
@ -0,0 +1,23 @@
|
||||
#!/bin/sh
# Copyright (c) 2007, International Business Machines Corporation and
# others. All Rights Reserved.

# Trace every command and stop at the first failure.
set -ex

PERF=test/perf/unisetperf/unisetperf
# Runs the SpanUTF16 test for each UDHR sample file and each set type.
# slow Bv Bv0 B0
# --pattern [:White_Space:]

for file in udhr_eng.txt udhr_deu.txt udhr_fra.txt udhr_rus.txt \
            udhr_tha.txt udhr_jpn.txt udhr_cmn.txt udhr_jpn.html
do
  for type in slow Bv Bv0
  do
    $PERF SpanUTF16 --type $type -f ~/udhr/$file -v -e UTF-8 --passes 3 --iterations 10000
  done
done
|
19
icu4c/source/test/perf/unisetperf/draft/span8perf.bat
Executable file
19
icu4c/source/test/perf/unisetperf/draft/span8perf.bat
Executable file
@ -0,0 +1,19 @@
|
||||
rem Copyright (c) 2007, International Business Machines Corporation and
rem others. All Rights Reserved.

set PERF=c:\svn\icuproj\icu\ucnvutf8\source\test\perf\unisetperf\release\unisetperf
rem Runs the SpanUTF8 test for every UDHR sample file and every listed set type.
rem types: slow Bh bh Bv Bv0 B0 BvF Bvp BvpF L Bvl BvL
rem --pattern [:White_Space:]

for %%f in (udhr_eng.txt udhr_deu.txt udhr_fra.txt udhr_rus.txt udhr_tha.txt udhr_jpn.txt udhr_cmn.txt udhr_jpn.html) do (
  for %%t in (slow BvF BvpF Bvl BvL) do (
    %PERF% SpanUTF8 --type %%t -f \temp\udhr\%%f --pattern [:White_Space:] -v -e UTF-8 --passes 3 --iterations 10000
  )
)
|
23
icu4c/source/test/perf/unisetperf/draft/span8perf.sh
Executable file
23
icu4c/source/test/perf/unisetperf/draft/span8perf.sh
Executable file
@ -0,0 +1,23 @@
|
||||
#!/bin/sh
# Copyright (c) 2007, International Business Machines Corporation and
# others. All Rights Reserved.

# Trace every command and stop at the first failure.
set -ex

PERF=test/perf/unisetperf/unisetperf
# Runs the SpanUTF8 test for each UDHR sample file and each listed set type.
# slow Bh bh Bv Bv0 B0 BvF Bvp BvpF L Bvl BvL
# --pattern [:White_Space:]

for file in udhr_eng.txt udhr_deu.txt udhr_fra.txt udhr_rus.txt \
            udhr_tha.txt udhr_jpn.txt udhr_cmn.txt udhr_jpn.html
do
  for type in slow BvF BvpF Bvl BvL
  do
    $PERF SpanUTF8 --type $type -f ~/udhr/$file -v -e UTF-8 --passes 3 --iterations 10000
  done
done
|
111
icu4c/source/test/perf/unisetperf/draft/trieset.cpp
Normal file
111
icu4c/source/test/perf/unisetperf/draft/trieset.cpp
Normal file
@ -0,0 +1,111 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: trieset.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2007jan15
|
||||
* created by: Markus Scherer
|
||||
*
|
||||
* Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
|
||||
* using a UTrie with 8-bit (byte) results per code point.
|
||||
* Modifies the trie index to make the BMP linear, and uses the original set
|
||||
* for supplementary code points.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicont.h"
|
||||
|
||||
#define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
|
||||
|
||||
#define UTRIE_GET8_FROM_LEAD(trie, c16) \
|
||||
((const uint8_t *)(trie)->data32)[ \
|
||||
((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \
|
||||
((c16)&UTRIE_MASK) \
|
||||
]
|
||||
|
||||
class TrieSet : public UObject, public UnicodeContainable {
|
||||
public:
|
||||
TrieSet(const UnicodeSet &set, UErrorCode &errorCode)
|
||||
: trieData(NULL), latin1(NULL), restSet(set.clone()) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
if(restSet==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE);
|
||||
UChar32 start, end;
|
||||
|
||||
UnicodeSetIterator iter(set);
|
||||
|
||||
while(iter.nextRange() && !iter.isString()) {
|
||||
start=iter.getCodepoint();
|
||||
end=iter.getCodepointEnd();
|
||||
if(start>0xffff) {
|
||||
break;
|
||||
}
|
||||
if(end>0xffff) {
|
||||
end=0xffff;
|
||||
}
|
||||
if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) {
|
||||
errorCode=U_INTERNAL_PROGRAM_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Preflight the trie length.
|
||||
int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode);
|
||||
if(errorCode!=U_BUFFER_OVERFLOW_ERROR) {
|
||||
return;
|
||||
}
|
||||
|
||||
trieData=(uint32_t *)uprv_malloc(length);
|
||||
if(trieData==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode);
|
||||
utrie_unserialize(&trie, trieData, length, &errorCode); // TODO: Implement for 8-bit UTrie!
|
||||
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
// Copy the indexes for surrogate code points into the BMP range
|
||||
// for simple access across the entire BMP.
|
||||
uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT),
|
||||
trie.index+UTRIE_BMP_INDEX_LENGTH,
|
||||
(0x800>>UTRIE_SHIFT)*2);
|
||||
latin1=UTRIE_GET8_LATIN1(&trie);
|
||||
}
|
||||
|
||||
restSet.remove(0, 0xffff);
|
||||
}
|
||||
|
||||
~TrieSet() {
|
||||
uprv_free(trieData);
|
||||
delete restSet;
|
||||
}
|
||||
|
||||
UBool contains(UChar32 c) const {
|
||||
if((uint32_t)c<=0xff) {
|
||||
return (UBool)latin1[c];
|
||||
} else if((uint32_t)c<0xffff) {
|
||||
return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c);
|
||||
} else {
|
||||
return restSet->contains(c);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
uint32_t *trieData;
|
||||
const uint8_t *latin1;
|
||||
UTrie trie;
|
||||
UnicodeSet *restSet;
|
||||
};
|
34
icu4c/source/test/perf/unisetperf/draft/unicont.h
Normal file
34
icu4c/source/test/perf/unisetperf/draft/unicont.h
Normal file
@ -0,0 +1,34 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: unicont.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2007jan15
|
||||
* created by: Markus Scherer
|
||||
*
|
||||
* Idea for new common interface underneath the normal UnicodeSet
|
||||
* and other classes, such as "compiled", fast, read-only (immutable)
|
||||
* versions of UnicodeSet.
|
||||
*/
|
||||
|
||||
// Draft interface for objects that can answer "is this code point contained?".
// Only contains() is pure virtual; the span functions and getDynamicClassID()
// are declared here without a body.
// NOTE(review): no definitions for them are visible in this header -- confirm
// they are provided elsewhere before deriving concrete classes.
class UnicodeContainable {
|
||||
public:
|
||||
// Virtual so that implementations can be deleted through this interface.
virtual ~UnicodeContainable() {}
|
||||
|
||||
// Returns whether code point c is contained; must be implemented by subclasses.
virtual UBool contains(UChar32 c) const = 0;
|
||||
|
||||
// NOTE(review): presumably returns the length of the initial span of
// contained characters in s[0..length[ -- not established here; confirm.
virtual int32_t span(const UChar *s, int32_t length);
|
||||
|
||||
// NOTE(review): presumably the complement of span() -- confirm.
virtual int32_t spanNot(const UChar *s, int32_t length);
|
||||
|
||||
// NOTE(review): the UTF-8 variants take const UChar * (16-bit units);
// a UTF-8 API would normally take const char * -- confirm the intent.
virtual int32_t spanUTF8(const UChar *s, int32_t length);
|
||||
|
||||
virtual int32_t spanNotUTF8(const UChar *s, int32_t length);
|
||||
|
||||
// Declared without a body in this header.
virtual UClassID getDynamicClassID(void) const;
|
||||
};
|
441
icu4c/source/test/perf/unisetperf/unisetperf.cpp
Normal file
441
icu4c/source/test/perf/unisetperf/unisetperf.cpp
Normal file
@ -0,0 +1,441 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: unisetperf.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2007jan31
|
||||
* created by: Markus Scherer
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "unicode/uperf.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "uoptions.h"
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
// Command-line options specific to unisetperf.
|
||||
// Options do not have abbreviations: Force readable command lines.
|
||||
// (Using U+0001 for abbreviation characters.)
|
||||
enum {
|
||||
SET_PATTERN,
|
||||
FAST_TYPE,
|
||||
UNISETPERF_OPTIONS_COUNT
|
||||
};
|
||||
|
||||
static UOption options[UNISETPERF_OPTIONS_COUNT]={
|
||||
UOPTION_DEF("pattern", '\x01', UOPT_REQUIRES_ARG),
|
||||
UOPTION_DEF("type", '\x01', UOPT_REQUIRES_ARG)
|
||||
};
|
||||
|
||||
static const char *const unisetperf_usage =
|
||||
"\t--pattern UnicodeSet pattern for instantiation.\n"
|
||||
"\t Default: [:ID_Continue:]\n"
|
||||
"\t--type Type of UnicodeSet: slow fast\n"
|
||||
"\t Default: slow\n";
|
||||
|
||||
// Test object with setup data.
|
||||
class UnicodeSetPerformanceTest : public UPerfTest {
|
||||
public:
|
||||
UnicodeSetPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status)
|
||||
: UPerfTest(argc, argv, options, LENGTHOF(options), unisetperf_usage, status),
|
||||
utf8(NULL), utf8Length(0), countInputCodePoints(0), spanCount(0) {
|
||||
if (U_SUCCESS(status)) {
|
||||
UnicodeString pattern=UnicodeString(options[SET_PATTERN].value, -1, US_INV).unescape();
|
||||
set.applyPattern(pattern, status);
|
||||
prefrozen=set;
|
||||
if(0==strcmp(options[FAST_TYPE].value, "fast")) {
|
||||
set.freeze();
|
||||
}
|
||||
|
||||
int32_t inputLength;
|
||||
UPerfTest::getBuffer(inputLength, status);
|
||||
if(U_SUCCESS(status) && inputLength>0) {
|
||||
countInputCodePoints = u_countChar32(buffer, bufferLen);
|
||||
|
||||
countSpans();
|
||||
|
||||
// Preflight the UTF-8 length and allocate utf8.
|
||||
u_strToUTF8(NULL, 0, &utf8Length, buffer, bufferLen, &status);
|
||||
if(status==U_BUFFER_OVERFLOW_ERROR) {
|
||||
utf8=(char *)malloc(utf8Length);
|
||||
if(utf8!=NULL) {
|
||||
status=U_ZERO_ERROR;
|
||||
u_strToUTF8(utf8, utf8Length, NULL, buffer, bufferLen, &status);
|
||||
} else {
|
||||
status=U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
if(verbose) {
|
||||
printf("code points:%ld len16:%ld len8:%ld spans:%ld "
|
||||
"cp/span:%.3g UChar/span:%.3g B/span:%.3g B/cp:%.3g\n",
|
||||
(long)countInputCodePoints, (long)bufferLen, (long)utf8Length, (long)spanCount,
|
||||
(double)countInputCodePoints/spanCount, (double)bufferLen/spanCount, (double)utf8Length/spanCount,
|
||||
(double)utf8Length/countInputCodePoints);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
virtual UPerfFunction* runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL);
|
||||
|
||||
// Count spans of characters that are in the set,
|
||||
// and spans of characters that are not in the set.
|
||||
// If the very first character is in the set, then one additional
|
||||
// not-span is counted.
|
||||
void countSpans() {
|
||||
const UChar *s=getBuffer();
|
||||
int32_t length=getBufferLen();
|
||||
int32_t i=0;
|
||||
UBool tf=FALSE;
|
||||
while(i<length) {
|
||||
i=span(s, length, i, tf);
|
||||
tf=(UBool)(!tf);
|
||||
++spanCount;
|
||||
}
|
||||
}
|
||||
int32_t span(const UChar *s, int32_t length, int32_t start, UBool tf) const {
|
||||
UChar32 c;
|
||||
int32_t prev;
|
||||
while((prev=start)<length) {
|
||||
U16_NEXT(s, start, length, c);
|
||||
if(tf!=set.contains(c)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return prev;
|
||||
}
|
||||
|
||||
const UChar *getBuffer() const { return buffer; }
|
||||
int32_t getBufferLen() const { return bufferLen; }
|
||||
|
||||
char *utf8;
|
||||
int32_t utf8Length;
|
||||
|
||||
// Number of code points in the input text.
|
||||
int32_t countInputCodePoints;
|
||||
int32_t spanCount;
|
||||
|
||||
UnicodeSet set;
|
||||
UnicodeSet prefrozen;
|
||||
};
|
||||
|
||||
// Performance test function object.
|
||||
class Command : public UPerfFunction {
|
||||
protected:
|
||||
Command(const UnicodeSetPerformanceTest &testcase) : testcase(testcase) {}
|
||||
|
||||
public:
|
||||
virtual ~Command() {}
|
||||
|
||||
// virtual void call(UErrorCode* pErrorCode) { ... }
|
||||
|
||||
virtual long getOperationsPerIteration() {
|
||||
// Number of code points tested:
|
||||
// Input code points, plus one for the end of each span except the last span.
|
||||
return testcase.countInputCodePoints+testcase.spanCount-1;
|
||||
}
|
||||
|
||||
virtual long getEventsPerIteration() {
|
||||
return testcase.spanCount;
|
||||
}
|
||||
|
||||
const UnicodeSetPerformanceTest &testcase;
|
||||
};
|
||||
|
||||
class Contains : public Command {
|
||||
protected:
|
||||
Contains(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
|
||||
// Verify that the frozen set is equal to the unfrozen one.
|
||||
UnicodeSet set;
|
||||
UChar32 c;
|
||||
|
||||
for(c=0; c<=0x10ffff; ++c) {
|
||||
if(testcase.set.contains(c)) {
|
||||
set.add(c);
|
||||
}
|
||||
}
|
||||
if(set!=testcase.set) {
|
||||
fprintf(stderr, "error: frozen set != original!\n");
|
||||
}
|
||||
}
|
||||
public:
|
||||
static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
|
||||
return new Contains(testcase);
|
||||
}
|
||||
virtual void call(UErrorCode* pErrorCode) {
|
||||
const UnicodeSet &set=testcase.set;
|
||||
const UChar *s=testcase.getBuffer();
|
||||
int32_t length=testcase.getBufferLen();
|
||||
int32_t count=0;
|
||||
int32_t i=0;
|
||||
UBool tf=FALSE;
|
||||
while(i<length) {
|
||||
i+=span(set, s+i, length-i, tf);
|
||||
tf=(UBool)(!tf);
|
||||
++count;
|
||||
}
|
||||
if(count!=testcase.spanCount) {
|
||||
fprintf(stderr, "error: Contains() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
|
||||
(long)count, (long)testcase.spanCount);
|
||||
}
|
||||
}
|
||||
static int32_t span(const UnicodeSet &set, const UChar *s, int32_t length, UBool tf) {
|
||||
UChar32 c;
|
||||
int32_t start=0, prev;
|
||||
while((prev=start)<length) {
|
||||
U16_NEXT(s, start, length, c);
|
||||
if(tf!=set.contains(c)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return prev;
|
||||
}
|
||||
};
|
||||
|
||||
class SpanUTF16 : public Command {
|
||||
protected:
|
||||
SpanUTF16(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
|
||||
// Verify that the frozen set is equal to the unfrozen one.
|
||||
UnicodeSet set;
|
||||
UChar utf16[2];
|
||||
UChar32 c, c2;
|
||||
|
||||
for(c=0; c<=0xffff; ++c) {
|
||||
utf16[0]=(UChar)c;
|
||||
if(testcase.set.span(utf16, 1, USET_SPAN_CONTAINED)>0) {
|
||||
set.add(c);
|
||||
}
|
||||
}
|
||||
for(c=0xd800; c<=0xdbff; ++c) {
|
||||
utf16[0]=(UChar)c;
|
||||
for(c2=0xdc00; c2<=0xdfff; ++c2) {
|
||||
utf16[1]=(UChar)c2;
|
||||
if(testcase.set.span(utf16, 2, USET_SPAN_CONTAINED)>0) {
|
||||
set.add(U16_GET_SUPPLEMENTARY(c, c2));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(set!=testcase.set) {
|
||||
fprintf(stderr, "error: frozen set != original!\n");
|
||||
}
|
||||
}
|
||||
public:
|
||||
static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
|
||||
return new SpanUTF16(testcase);
|
||||
}
|
||||
virtual void call(UErrorCode* pErrorCode) {
|
||||
const UnicodeSet &set=testcase.set;
|
||||
const UChar *s=testcase.getBuffer();
|
||||
int32_t length=testcase.getBufferLen();
|
||||
int32_t count=0;
|
||||
int32_t i=0;
|
||||
UBool tf=FALSE;
|
||||
while(i<length) {
|
||||
i+=set.span(s+i, length-i, (USetSpanCondition)tf);
|
||||
tf=(UBool)(!tf);
|
||||
++count;
|
||||
}
|
||||
if(count!=testcase.spanCount) {
|
||||
fprintf(stderr, "error: SpanUTF16() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
|
||||
(long)count, (long)testcase.spanCount);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class SpanBackUTF16 : public Command {
|
||||
protected:
|
||||
SpanBackUTF16(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
|
||||
// Verify that the frozen set is equal to the unfrozen one.
|
||||
UnicodeSet set;
|
||||
UChar utf16[2];
|
||||
UChar32 c, c2;
|
||||
|
||||
for(c=0; c<=0xffff; ++c) {
|
||||
utf16[0]=(UChar)c;
|
||||
if(testcase.set.spanBack(utf16, 1, USET_SPAN_CONTAINED)==0) {
|
||||
set.add(c);
|
||||
}
|
||||
}
|
||||
for(c=0xd800; c<=0xdbff; ++c) {
|
||||
utf16[0]=(UChar)c;
|
||||
for(c2=0xdc00; c2<=0xdfff; ++c2) {
|
||||
utf16[1]=(UChar)c2;
|
||||
if(testcase.set.spanBack(utf16, 2, USET_SPAN_CONTAINED)==0) {
|
||||
set.add(U16_GET_SUPPLEMENTARY(c, c2));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(set!=testcase.set) {
|
||||
fprintf(stderr, "error: frozen set != original!\n");
|
||||
}
|
||||
}
|
||||
public:
|
||||
static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
|
||||
return new SpanBackUTF16(testcase);
|
||||
}
|
||||
virtual void call(UErrorCode* pErrorCode) {
|
||||
const UnicodeSet &set=testcase.set;
|
||||
const UChar *s=testcase.getBuffer();
|
||||
int32_t length=testcase.getBufferLen();
|
||||
int32_t count=0;
|
||||
/*
|
||||
* Get the same spans as with span() where we always start with a not-contained span.
|
||||
* If testcase.spanCount is an odd number, then the last span() was not-contained.
|
||||
* The last spanBack() must be not-contained to match the first span().
|
||||
*/
|
||||
UBool tf=(UBool)((testcase.spanCount&1)==0);
|
||||
while(length>0 || !tf) {
|
||||
length=set.spanBack(s, length, (USetSpanCondition)tf);
|
||||
tf=(UBool)(!tf);
|
||||
++count;
|
||||
}
|
||||
if(count!=testcase.spanCount) {
|
||||
fprintf(stderr, "error: SpanBackUTF16() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
|
||||
(long)count, (long)testcase.spanCount);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class SpanUTF8 : public Command {
|
||||
protected:
|
||||
SpanUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
|
||||
// Verify that the frozen set is equal to the unfrozen one.
|
||||
UnicodeSet set;
|
||||
char utf8[4];
|
||||
UChar32 c;
|
||||
int32_t length;
|
||||
|
||||
for(c=0; c<=0x10ffff; ++c) {
|
||||
if(c==0xd800) {
|
||||
c=0xe000;
|
||||
}
|
||||
length=0;
|
||||
U8_APPEND_UNSAFE(utf8, length, c);
|
||||
if(testcase.set.spanUTF8(utf8, length, USET_SPAN_CONTAINED)>0) {
|
||||
set.add(c);
|
||||
}
|
||||
}
|
||||
if(set!=testcase.set) {
|
||||
fprintf(stderr, "error: frozen set != original!\n");
|
||||
}
|
||||
}
|
||||
public:
|
||||
static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
|
||||
return new SpanUTF8(testcase);
|
||||
}
|
||||
virtual void call(UErrorCode* pErrorCode) {
|
||||
const UnicodeSet &set=testcase.set;
|
||||
const char *s=testcase.utf8;
|
||||
int32_t length=testcase.utf8Length;
|
||||
int32_t count=0;
|
||||
int32_t i=0;
|
||||
UBool tf=FALSE;
|
||||
while(i<length) {
|
||||
i+=set.spanUTF8(s+i, length-i, (USetSpanCondition)tf);
|
||||
tf=(UBool)(!tf);
|
||||
++count;
|
||||
}
|
||||
if(count!=testcase.spanCount) {
|
||||
fprintf(stderr, "error: SpanUTF8() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
|
||||
(long)count, (long)testcase.spanCount);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class SpanBackUTF8 : public Command {
|
||||
protected:
|
||||
SpanBackUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
|
||||
// Verify that the frozen set is equal to the unfrozen one.
|
||||
UnicodeSet set;
|
||||
char utf8[4];
|
||||
UChar32 c;
|
||||
int32_t length;
|
||||
|
||||
for(c=0; c<=0x10ffff; ++c) {
|
||||
if(c==0xd800) {
|
||||
c=0xe000;
|
||||
}
|
||||
length=0;
|
||||
U8_APPEND_UNSAFE(utf8, length, c);
|
||||
if(testcase.set.spanBackUTF8(utf8, length, USET_SPAN_CONTAINED)==0) {
|
||||
set.add(c);
|
||||
}
|
||||
}
|
||||
if(set!=testcase.set) {
|
||||
fprintf(stderr, "error: frozen set != original!\n");
|
||||
}
|
||||
}
|
||||
public:
|
||||
static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
|
||||
return new SpanBackUTF8(testcase);
|
||||
}
|
||||
virtual void call(UErrorCode* pErrorCode) {
|
||||
const UnicodeSet &set=testcase.set;
|
||||
const char *s=testcase.utf8;
|
||||
int32_t length=testcase.utf8Length;
|
||||
int32_t count=0;
|
||||
/*
|
||||
* Get the same spans as with span() where we always start with a not-contained span.
|
||||
* If testcase.spanCount is an odd number, then the last span() was not-contained.
|
||||
* The last spanBack() must be not-contained to match the first span().
|
||||
*/
|
||||
UBool tf=(UBool)((testcase.spanCount&1)==0);
|
||||
while(length>0 || !tf) {
|
||||
length=set.spanBackUTF8(s, length, (USetSpanCondition)tf);
|
||||
tf=(UBool)(!tf);
|
||||
++count;
|
||||
}
|
||||
if(count!=testcase.spanCount) {
|
||||
fprintf(stderr, "error: SpanBackUTF8() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
|
||||
(long)count, (long)testcase.spanCount);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Maps a test index to its name and, when exec is set, to a new test object.
// Sets name to "" and returns NULL for an index outside the table; also
// returns NULL when exec is FALSE (name lookup only). par is not used.
UPerfFunction* UnicodeSetPerformanceTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* par) {
    typedef UPerfFunction *Factory(const UnicodeSetPerformanceTest &);
    static const struct {
        const char *name;
        Factory *create;
    } tests[]={
        { "Contains",      Contains::get },
        { "SpanUTF16",     SpanUTF16::get },
        { "SpanBackUTF16", SpanBackUTF16::get },
        { "SpanUTF8",      SpanUTF8::get },
        { "SpanBackUTF8",  SpanBackUTF8::get }
    };

    if(index<0 || index>=LENGTHOF(tests)) {
        name = "";
        return NULL;
    }
    name = tests[index].name;
    if(!exec) {
        return NULL;
    }
    return tests[index].create(*this);
}
|
||||
|
||||
int main(int argc, const char *argv[])
|
||||
{
|
||||
// Default values for command-line options.
|
||||
options[SET_PATTERN].value = "[:ID_Continue:]";
|
||||
options[FAST_TYPE].value = "slow";
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeSetPerformanceTest test(argc, argv, status);
|
||||
|
||||
if (U_FAILURE(status)){
|
||||
printf("The error is %s\n", u_errorName(status));
|
||||
test.usage();
|
||||
return status;
|
||||
}
|
||||
|
||||
if (test.run() == FALSE){
|
||||
fprintf(stderr, "FAILED: Tests could not be run, please check the "
|
||||
"arguments.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
76
icu4c/source/test/perf/unisetperf/unisetperf.pl
Executable file
76
icu4c/source/test/perf/unisetperf/unisetperf.pl
Executable file
@ -0,0 +1,76 @@
|
||||
#!/usr/bin/perl -w
#  ********************************************************************
#  * COPYRIGHT:
#  * Copyright (c) 2005-2007, International Business Machines Corporation and
#  * others. All Rights Reserved.
#  ********************************************************************

use strict;
use lib '../perldriver';
use PerfFramework;

# Builds the option set for one runTests() call; only the column headers
# differ between the two runs below.
sub make_options {
    my ($headers) = @_;
    return {
        "title"       => "UnicodeSet span()/contains() performance",
        "headers"     => $headers,
        "operationIs" => "tested Unicode code point",
        "passes"      => "3",
        "time"        => "2",
        #"outputType"  => "HTML",
        "dataDir"     => "/temp/udhr",
        "outputDir"   => "../results"
    };
}

# programs
# tests will be done for all the programs. Results will be stored and connected
my $p   = "Release/unisetperf.exe -e UTF-8";
my $pc  = "$p Contains";
my $p16 = "$p SpanUTF16";
my $p8  = "$p SpanUTF8";

# Input files, relative to dataDir.
my $dataFiles = {
    "" => [
        qw(udhr_eng.txt udhr_deu.txt udhr_fra.txt udhr_rus.txt
           udhr_tha.txt udhr_jpn.txt udhr_cmn.txt udhr_jpn.html)
    ]
};

# First run: contains() and UTF-16 span().
my $options = make_options("Bv Bv0");

my $tests = {
    "Contains"  => [ map { "$pc --type $_" }  qw(Bv Bv0) ],
    "SpanUTF16" => [ map { "$p16 --type $_" } qw(Bv Bv0) ]
};

runTests($options, $tests, $dataFiles);

# Second run: UTF-8 span() with more set types.
$options = make_options("Bv BvF Bvp BvpF L Bvl");

$tests = {
    "SpanUTF8" => [ map { "$p8 --type $_" } qw(Bv BvF Bvp BvpF L Bvl) ]
};

runTests($options, $tests, $dataFiles);
|
209
icu4c/source/test/perf/unisetperf/unisetperf.vcproj
Normal file
209
icu4c/source/test/perf/unisetperf/unisetperf.vcproj
Normal file
@ -0,0 +1,209 @@
|
||||
<?xml version="1.0" encoding="Windows-1252"?>
|
||||
<VisualStudioProject
|
||||
ProjectType="Visual C++"
|
||||
Version="8.00"
|
||||
Name="unisetperf"
|
||||
ProjectGUID="{E7728E98-0469-AF37-43F4-4529A3D52C6B}"
|
||||
>
|
||||
<Platforms>
|
||||
<Platform
|
||||
Name="Win32"
|
||||
/>
|
||||
</Platforms>
|
||||
<ToolFiles>
|
||||
</ToolFiles>
|
||||
<Configurations>
|
||||
<Configuration
|
||||
Name="Debug|Win32"
|
||||
OutputDirectory=".\Debug"
|
||||
IntermediateDirectory=".\Debug"
|
||||
ConfigurationType="1"
|
||||
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
|
||||
UseOfMFC="0"
|
||||
ATLMinimizesCRunTimeLibraryUsage="false"
|
||||
CharacterSet="2"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
TypeLibraryName=".\Debug/unisetperf.tlb"
|
||||
HeaderFileName=""
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
Optimization="0"
|
||||
AdditionalIncludeDirectories="..\..\..\..\include;..\..\..\tools\toolutil;..\..\..\common;..\..\..\tools\ctestfw"
|
||||
PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
|
||||
BasicRuntimeChecks="3"
|
||||
RuntimeLibrary="3"
|
||||
PrecompiledHeaderFile=".\Debug/unisetperf.pch"
|
||||
AssemblerListingLocation=".\Debug/"
|
||||
ObjectFile=".\Debug/"
|
||||
ProgramDataBaseFileName=".\Debug/"
|
||||
WarningLevel="3"
|
||||
SuppressStartupBanner="true"
|
||||
DebugInformationFormat="4"
|
||||
CompileAs="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
PreprocessorDefinitions="_DEBUG"
|
||||
Culture="1033"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
AdditionalDependencies="icuucd.lib icutud.lib winmm.lib icutestd.lib"
|
||||
OutputFile=".\Debug/unisetperf.exe"
|
||||
LinkIncremental="1"
|
||||
SuppressStartupBanner="true"
|
||||
AdditionalLibraryDirectories="..\..\..\..\lib\"
|
||||
GenerateDebugInformation="true"
|
||||
ProgramDatabaseFile=".\Debug/unisetperf.pdb"
|
||||
SubSystem="1"
|
||||
TargetMachine="1"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebDeploymentTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
<Configuration
|
||||
Name="Release|Win32"
|
||||
OutputDirectory=".\Release"
|
||||
IntermediateDirectory=".\Release"
|
||||
ConfigurationType="1"
|
||||
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
|
||||
UseOfMFC="0"
|
||||
ATLMinimizesCRunTimeLibraryUsage="false"
|
||||
CharacterSet="2"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
TypeLibraryName=".\Release/unisetperf.tlb"
|
||||
HeaderFileName=""
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
Optimization="2"
|
||||
InlineFunctionExpansion="1"
|
||||
AdditionalIncludeDirectories="..\..\..\..\include;..\..\..\tools\toolutil;..\..\..\common;..\..\..\tools\ctestfw"
|
||||
PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
|
||||
StringPooling="true"
|
||||
RuntimeLibrary="2"
|
||||
EnableFunctionLevelLinking="true"
|
||||
PrecompiledHeaderFile=".\Release/unisetperf.pch"
|
||||
AssemblerListingLocation=".\Release/"
|
||||
ObjectFile=".\Release/"
|
||||
ProgramDataBaseFileName=".\Release/"
|
||||
WarningLevel="3"
|
||||
SuppressStartupBanner="true"
|
||||
CompileAs="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
PreprocessorDefinitions="NDEBUG"
|
||||
Culture="1033"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
AdditionalDependencies="icuuc.lib icutu.lib icutest.lib winmm.lib"
|
||||
OutputFile=".\Release/unisetperf.exe"
|
||||
LinkIncremental="1"
|
||||
SuppressStartupBanner="true"
|
||||
AdditionalLibraryDirectories="..\..\..\..\lib\"
|
||||
ProgramDatabaseFile=".\Release/unisetperf.pdb"
|
||||
SubSystem="1"
|
||||
TargetMachine="1"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebDeploymentTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
</Configurations>
|
||||
<References>
|
||||
</References>
|
||||
<Files>
|
||||
<File
|
||||
RelativePath=".\unisetperf.cpp"
|
||||
>
|
||||
</File>
|
||||
</Files>
|
||||
<Globals>
|
||||
</Globals>
|
||||
</VisualStudioProject>
|
@ -341,6 +341,7 @@ UBool UPerfTest::runTestLoop( char* testname, char* par )
|
||||
int32_t loops = 0;
|
||||
double t=0;
|
||||
int32_t n = 1;
|
||||
long ops;
|
||||
do {
|
||||
this->runIndexedTest( index, FALSE, name );
|
||||
if (!name || (name[0] == 0))
|
||||
@ -358,7 +359,8 @@ UBool UPerfTest::runTestLoop( char* testname, char* par )
|
||||
fprintf(stderr,"%s function returned NULL", name);
|
||||
return FALSE;
|
||||
}
|
||||
if (testFunction->getOperationsPerIteration() < 1) {
|
||||
ops = testFunction->getOperationsPerIteration();
|
||||
if (ops < 1) {
|
||||
fprintf(stderr, "%s returned an illegal operations/iteration()\n", name);
|
||||
return FALSE;
|
||||
}
|
||||
@ -396,8 +398,10 @@ UBool UPerfTest::runTestLoop( char* testname, char* par )
|
||||
loops = iterations;
|
||||
}
|
||||
|
||||
double min_t=1000000.0, sum_t=0.0;
|
||||
long events = -1;
|
||||
|
||||
for(int32_t ps =0; ps < passes; ps++){
|
||||
long events = -1;
|
||||
fprintf(stdout,"= %s begin " ,name);
|
||||
if(verbose==TRUE){
|
||||
if(iterations > 0) {
|
||||
@ -413,36 +417,40 @@ UBool UPerfTest::runTestLoop( char* testname, char* par )
|
||||
printf("Performance test failed with error: %s \n", u_errorName(status));
|
||||
break;
|
||||
}
|
||||
sum_t+=t;
|
||||
if(t<min_t) {
|
||||
min_t=t;
|
||||
}
|
||||
events = testFunction->getEventsPerIteration();
|
||||
//print info only in verbose mode
|
||||
if(verbose==TRUE){
|
||||
/*
|
||||
if(events == -1){
|
||||
fprintf(stdout,"= %s end %f %i %i\n",name , t , loops, testFunction->getOperationsPerIteration());
|
||||
fprintf(stdout, "= %s end: %f loops: %i operations: %li \n", name, t, (int)loops, ops);
|
||||
}else{
|
||||
fprintf(stdout,"= %s end %f %i %i %i\n",name , t , loops, testFunction->getOperationsPerIteration(), events);
|
||||
}
|
||||
*/
|
||||
if(events == -1){
|
||||
fprintf(stdout, "= %s end: %f loops: %i operations: %li \n", name, t, (int)loops, testFunction->getOperationsPerIteration());
|
||||
}else{
|
||||
fprintf(stdout, "= %s end: %f loops: %i operations: %li events: %li\n", name, t, (int)loops, testFunction->getOperationsPerIteration(), events);
|
||||
fprintf(stdout, "= %s end: %f loops: %i operations: %li events: %li\n", name, t, (int)loops, ops, events);
|
||||
}
|
||||
}else{
|
||||
/*
|
||||
if(events == -1){
|
||||
fprintf(stdout,"= %f %i %i \n", t , loops, testFunction->getOperationsPerIteration());
|
||||
fprintf(stdout,"= %s end %f %i %li\n", name, t, (int)loops, ops);
|
||||
}else{
|
||||
fprintf(stdout,"= %f %i %i %i\n", t , loops, testFunction->getOperationsPerIteration(), events);
|
||||
}
|
||||
*/
|
||||
if(events == -1){
|
||||
fprintf(stdout,"= %s end %f %i %li\n", name, t, (int)loops, testFunction->getOperationsPerIteration());
|
||||
}else{
|
||||
fprintf(stdout,"= %s end %f %i %li %li\n", name, t, (int)loops, testFunction->getOperationsPerIteration(), events);
|
||||
fprintf(stdout,"= %s end %f %i %li %li\n", name, t, (int)loops, ops, events);
|
||||
}
|
||||
}
|
||||
}
|
||||
if(verbose && U_SUCCESS(status)) {
|
||||
double avg_t = sum_t/passes;
|
||||
if(events == -1) {
|
||||
fprintf(stdout, "%%= %s avg: %.4g loops: %i avg/op: %.4g ns\n",
|
||||
name, avg_t, (int)loops, (avg_t*1E9)/(loops*ops));
|
||||
fprintf(stdout, "_= %s min: %.4g loops: %i min/op: %.4g ns\n",
|
||||
name, min_t, (int)loops, (min_t*1E9)/(loops*ops));
|
||||
} else {
|
||||
fprintf(stdout, "%%= %s avg: %.4g loops: %i avg/op: %.4g ns avg/event: %.4g ns\n",
|
||||
name, avg_t, (int)loops, (avg_t*1E9)/(loops*ops), (avg_t*1E9)/(loops*events));
|
||||
fprintf(stdout, "_= %s min: %.4g loops: %i min/op: %.4g ns min/event: %.4g ns\n",
|
||||
name, min_t, (int)loops, (min_t*1E9)/(loops*ops), (min_t*1E9)/(loops*events));
|
||||
}
|
||||
}
|
||||
delete testFunction;
|
||||
}
|
||||
index++;
|
||||
|
Loading…
Reference in New Issue
Block a user