ICU-5585 fast, freezable UnicodeSet with span() with string support; svn merge -r 21018:22033 http://source.icu-project.org/repos/icu/icu/branches/markus/fastset and revert source/test/perf/unisetperf/draft/* and source/common/utrie.* and source/allinone/allinone.sln (experimental code)

X-SVN-Rev: 22053
This commit is contained in:
Markus Scherer 2007-07-19 04:33:20 +00:00
parent 4d282a7e02
commit 19446aeeba
32 changed files with 6562 additions and 231 deletions

11
.gitignore vendored
View File

@ -47,7 +47,9 @@ icu4c/source/config/Makefile.inc
icu4c/source/config/icu-config
icu4c/source/config/icu-config.1
icu4c/source/data/*.plg
icu4c/source/data/Debug
icu4c/source/data/Makefile
icu4c/source/data/Release
icu4c/source/data/icupkg.inc
icu4c/source/data/in
icu4c/source/data/makedata.vcproj.*.*.user
@ -305,6 +307,15 @@ icu4c/source/test/perf/ubrkperf/debug
icu4c/source/test/perf/ubrkperf/release
icu4c/source/test/perf/ubrkperf/ubrkperf
icu4c/source/test/perf/ubrkperf/ubrkperf.vcproj.*.*.user
icu4c/source/test/perf/unisetperf/*.d
icu4c/source/test/perf/unisetperf/*.o
icu4c/source/test/perf/unisetperf/Debug
icu4c/source/test/perf/unisetperf/Makefile
icu4c/source/test/perf/unisetperf/Release
icu4c/source/test/perf/unisetperf/debug
icu4c/source/test/perf/unisetperf/release
icu4c/source/test/perf/unisetperf/unisetperf
icu4c/source/test/perf/unisetperf/unisetperf.vcproj.*.*.user
icu4c/source/test/perf/usetperf/*.d
icu4c/source/test/perf/usetperf/*.o
icu4c/source/test/perf/usetperf/Debug

View File

@ -80,7 +80,7 @@ utf_impl.o ustring.o ustrcase.o ucasemap.o cstring.o ustrfmt.o ustrtrns.o ustr_w
normlzr.o unorm.o unormcmp.o unorm_it.o chariter.o schriter.o uchriter.o uiter.o \
uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o \
uscript.o usc_impl.o unames.o \
utrie.o uset_props.o uniset_props.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \
utrie.o bmpset.o unisetspan.o uset_props.o uniset_props.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \
uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o triedict.o \
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o \

View File

@ -0,0 +1,714 @@
/*
******************************************************************************
*
* Copyright (C) 2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: bmpset.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2007jan29
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#include "unicode/uniset.h"
#include "cmemory.h"
#include "bmpset.h"
U_NAMESPACE_BEGIN
/*
 * Build the fast lookup tables for a parent set's inversion list.
 * parentList must be terminated with parentList[parentListLength-1]=0x110000.
 */
BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
        list(parentList), listLength(parentListLength) {
    // All lookup tables start out empty; initBits() turns on what the set contains.
    uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
    uprv_memset(table7FF, 0, sizeof(table7FF));
    uprv_memset(asciiBytes, 0, sizeof(asciiBytes));

    /*
     * Record where each 4k block starts in the inversion list so that later
     * binary searches can be restricted to one block:
     * U+0800, U+1000, U+2000, .., U+F000, U+10000.
     * U+0800 is the first code point that needs three UTF-8 bytes; everything
     * below it is answered from the bit tables.
     * The final pair of entries brackets the supplementary code points.
     */
    const int32_t lastIndex=listLength-1;
    list4kStarts[0]=findCodePoint(0x800, 0, lastIndex);
    int32_t block;
    for(block=1; block<=0x10; ++block) {
        // Each search starts where the previous block's search ended.
        list4kStarts[block]=findCodePoint(block<<12, list4kStarts[block-1], lastIndex);
    }
    list4kStarts[0x11]=lastIndex;

    initBits();
    overrideIllegal();
}
/*
 * Clone the precomputed tables of otherBMPSet, but point at a different
 * (new parent's) inversion list. Nothing is recomputed here.
 */
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
        list(newParentList), listLength(newParentListLength) {
    uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
    uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
    uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
    uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
}
/*
 * Set bits in a bit rectangle in "vertical" bit organization.
 * start<limit<=0x800
 *
 * The table holds 64 words of 32 bits each. For a value v in [start, limit),
 * the bit with index (v>>6) is set in word table[v&0x3f].
 *
 * All shifts are done on uint32_t so that bit 31 can be produced without
 * signed overflow, and no shift by 32 is ever evaluated.
 */
static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
    int32_t lead=start>>6;      // Bit index (0..31).
    int32_t trail=start&0x3f;   // Word index (0..63).

    // Set one bit indicating an all-one block.
    uint32_t bits=(uint32_t)1<<lead;
    if((start+1)==limit) {  // Single-character shortcut.
        table[trail]|=bits;
        return;
    }

    int32_t limitLead=limit>>6;
    int32_t limitTrail=limit&0x3f;

    if(lead==limitLead) {
        // Partial vertical bit column.
        while(trail<limitTrail) {
            table[trail++]|=bits;
        }
    } else {
        // Partial vertical bit column,
        // followed by a bit rectangle,
        // followed by another partial vertical bit column.
        if(trail>0) {
            do {
                table[trail++]|=bits;
            } while(trail<64);
            ++lead;
        }
        if(lead<limitLead) {
            // Here lead<=31, so the shift is in range.
            bits=~(((uint32_t)1<<lead)-1);
            if(limitLead<0x20) {
                bits&=((uint32_t)1<<limitLead)-1;
            }
            for(trail=0; trail<64; ++trail) {
                table[trail]|=bits;
            }
        }
        // limit<=0x800: if limit==0x800 then limitLead==32 and limitTrail==0.
        // Only compute the final column's bit when there is a final column,
        // which also guarantees limitLead<=31.
        if(limitTrail>0) {
            bits=(uint32_t)1<<limitLead;
            for(trail=0; trail<limitTrail; ++trail) {
                table[trail]|=bits;
            }
        }
    }
}
/*
 * Fill asciiBytes[], table7FF[] and bmpBlockBits[] from the inversion list.
 * The list is read as consecutive [start, limit) pairs; when the terminator
 * is consumed as a start value, limit is set explicitly to 0x110000 so that
 * it never holds an uninitialized or stale value.
 */
void BMPSet::initBits() {
    UChar32 start, limit;
    int32_t listIndex=0;

    // Set asciiBytes[].
    do {
        start=list[listIndex++];
        if(listIndex<listLength) {
            limit=list[listIndex++];
        } else {
            limit=0x110000;
        }
        if(start>=0x80) {
            break;
        }
        do {
            asciiBytes[start++]=1;
        } while(start<limit && start<0x80);
    } while(limit<=0x80);

    // Set table7FF[].
    while(start<0x800) {
        set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
        if(limit>0x800) {
            // The current range continues into the 3-byte area.
            start=0x800;
            break;
        }

        start=list[listIndex++];
        if(listIndex<listLength) {
            limit=list[listIndex++];
        } else {
            limit=0x110000;
        }
    }

    // Set bmpBlockBits[].
    int32_t minStart=0x800;
    while(start<0x10000) {
        if(limit>0x10000) {
            limit=0x10000;
        }

        if(start<minStart) {
            start=minStart;
        }
        if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
            if(start&0x3f) {
                // Mixed-value block of 64 code points.
                start>>=6;
                // Unsigned shift: bit 31 is reachable when the lead nibble is 0xf.
                bmpBlockBits[start&0x3f]|=(uint32_t)0x10001<<(start>>6);
                start=(start+1)<<6;  // Round up to the next block boundary.
                minStart=start;      // Ignore further ranges in this block.
            }
            if(start<limit) {
                if(start<(limit&~0x3f)) {
                    // Multiple all-ones blocks of 64 code points each.
                    set32x64Bits(bmpBlockBits, start>>6, limit>>6);
                }

                if(limit&0x3f) {
                    // Mixed-value block of 64 code points.
                    limit>>=6;
                    bmpBlockBits[limit&0x3f]|=(uint32_t)0x10001<<(limit>>6);
                    limit=(limit+1)<<6;  // Round up to the next block boundary.
                    minStart=limit;      // Ignore further ranges in this block.
                }
            }
        }

        if(limit==0x10000) {
            break;
        }

        start=list[listIndex++];
        if(listIndex<listLength) {
            limit=list[listIndex++];
        } else {
            limit=0x110000;
        }
    }
}
/*
 * Make the table entries that correspond to illegal UTF-8 (trail bytes in
 * lead position, non-shortest forms, surrogates) answer with the value of
 * contains(FFFD), so that runtime validity checks come for free.
 *
 * Entries that were zeroed in the constructor and never touched by initBits()
 * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
 * already hold 0 and need no work when contains(FFFD) is false.
 * The surrogate range D800..DFFF always has to be rewritten.
 */
void BMPSet::overrideIllegal() {
    // Clears both the value bit and the "mixed" bit for lead byte 0xED.
    const uint32_t edMask=~(0x10001<<0xd);
    int32_t idx;

    if(!containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
        // contains(FFFD)==FALSE
        for(idx=32; idx<64; ++idx) {  // Second half of 4k block.
            bmpBlockBits[idx]&=edMask;
        }
        return;
    }

    // contains(FFFD)==TRUE
    for(idx=0x80; idx<0xc0; ++idx) {
        asciiBytes[idx]=1;
    }

    const uint32_t c0c1Bits=3;  // Lead bytes 0xC0 and 0xC1.
    for(idx=0; idx<64; ++idx) {
        table7FF[idx]|=c0c1Bits;
    }

    const uint32_t e0Bit=1;  // Lead byte 0xE0.
    for(idx=0; idx<32; ++idx) {  // First half of 4k block.
        bmpBlockBits[idx]|=e0Bit;
    }

    const uint32_t edBit=1<<0xd;  // Lead byte 0xED.
    for(idx=32; idx<64; ++idx) {  // Second half of 4k block.
        bmpBlockBits[idx]=(bmpBlockBits[idx]&edMask)|edBit;
    }
}
/*
 * Restricted binary search in the inversion list.
 * Returns the smallest index i in lo..hi such that c<list[i].
 * Assumes list[listLength-1]==HIGH and that c is a legal code point.
 *
 * Examples (unrestricted search):
 *                                   findCodePoint(c)
 *   set              list[]         c=0 1 3 4 7 8
 *   ===              ============== ===========
 *   []               [110000]         0 0 0 0 0 0
 *   [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
 *   [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
 *   [:Any:]          [0, 110000]      1 1 1 1 1 1
 */
int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
    // c lies before the first candidate boundary.
    if(c<list[lo]) {
        return lo;
    }
    // Empty search window, or c lies at/after the last real boundary.
    // The latter is common enough to be worth testing up front.
    if(lo>=hi || c>=list[hi-1]) {
        return hi;
    }
    // Loop invariants: list[lo]<=c<list[hi]
    int32_t mid;
    while((mid=(lo+hi)>>1)!=lo) {
        if(c<list[mid]) {
            hi=mid;
        } else {
            lo=mid;
        }
    }
    // lo and hi are adjacent now: hi is the answer.
    return hi;
}
/*
 * Test one code point, choosing the cheapest lookup for its range:
 * byte table for ASCII, bit table below U+0800, per-64-block bits for the
 * rest of the BMP, and a restricted binary search otherwise.
 */
UBool
BMPSet::contains(UChar32 c) const {
    const uint32_t uc=(uint32_t)c;  // Negative values become large and fall through.

    if(uc<=0x7f) {
        return (UBool)asciiBytes[c];
    }
    if(uc<=0x7ff) {
        const uint32_t leadBit=(uint32_t)1<<(c>>6);
        return (UBool)((table7FF[c&0x3f]&leadBit)!=0);
    }
    if(uc<0xd800 || (c>=0xe000 && c<=0xffff)) {
        int lead=c>>12;
        uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
        if(twoBits>1) {
            // Mixed block: look up the code point in its 4k block of code points.
            return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
        }
        // All 64 code points with the same bits 15..6
        // are either in the set or not.
        return (UBool)twoBits;
    }
    if(uc<=0x10ffff) {
        // surrogate or supplementary code point
        return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
    }
    // Out-of-range code points get FALSE, consistent with long-standing
    // behavior of UnicodeSet::contains(c).
    return FALSE;
}
/*
 * Span the initial part of the UTF-16 string [s, limit) in which every code
 * point matches spanCondition: non-zero means "contained in the set",
 * zero means "not contained".
 *
 * The caller must pass s<limit; the first code unit is read before any
 * comparison against limit.
 *
 * Check for sufficient length for trail unit for each surrogate pair.
 * Handle single surrogates as surrogate code points as usual in ICU.
 *
 * Returns a pointer to the first code unit of the first code point that does
 * not match, or limit if all code points match.
 */
const UChar *
BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
UChar c, c2;
// The two branches are mirror images; the condition is tested once, outside the loops.
if(spanCondition) {
// span
do {
c=*s;
if(c<=0x7f) {
if(!asciiBytes[c]) {
break;
}
} else if(c<=0x7ff) {
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
break;
}
} else if(c<0xd800 || c>=0xe000) {
int lead=c>>12;
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
if(twoBits<=1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if(twoBits==0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
break;
}
}
} else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
// surrogate code point
if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
break;
}
} else {
// surrogate pair
if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
// Skip the lead surrogate here; the loop's ++s skips the trail surrogate.
++s;
}
} while(++s<limit);
} else {
// span not
do {
c=*s;
if(c<=0x7f) {
if(asciiBytes[c]) {
break;
}
} else if(c<=0x7ff) {
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
break;
}
} else if(c<0xd800 || c>=0xe000) {
int lead=c>>12;
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
if(twoBits<=1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if(twoBits!=0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
break;
}
}
} else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
// surrogate code point
if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
break;
}
} else {
// surrogate pair
if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
// Skip the lead surrogate here; the loop's ++s skips the trail surrogate.
++s;
}
} while(++s<limit);
}
return s;
}
/*
 * Symmetrical with span().
 *
 * Span the trailing part of the UTF-16 string [s, limit) in which every code
 * point matches spanCondition (non-zero: contained; zero: not contained),
 * walking backwards from limit.
 *
 * The caller must pass s<limit; the code unit before limit is read before
 * any comparison against s.
 *
 * Returns s if all code points match; otherwise a pointer just past the
 * last code point that does not match (that is, the start of the span).
 */
const UChar *
BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
UChar c, c2;
if(spanCondition) {
// span
for(;;) {
// From here on, limit points at the code unit being examined.
c=*(--limit);
if(c<=0x7f) {
if(!asciiBytes[c]) {
break;
}
} else if(c<=0x7ff) {
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
break;
}
} else if(c<0xd800 || c>=0xe000) {
int lead=c>>12;
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
if(twoBits<=1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if(twoBits==0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
break;
}
}
} else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
// surrogate code point
if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
break;
}
} else {
// surrogate pair
if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
// Also step over the lead surrogate of the matching pair.
--limit;
}
if(s==limit) {
return s;
}
}
} else {
// span not
for(;;) {
// From here on, limit points at the code unit being examined.
c=*(--limit);
if(c<=0x7f) {
if(asciiBytes[c]) {
break;
}
} else if(c<=0x7ff) {
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
break;
}
} else if(c<0xd800 || c>=0xe000) {
int lead=c>>12;
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
if(twoBits<=1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if(twoBits!=0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
break;
}
}
} else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
// surrogate code point
if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
break;
}
} else {
// surrogate pair
if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
// Also step over the lead surrogate of the matching pair.
--limit;
}
if(s==limit) {
return s;
}
}
}
// Reached via break: limit points at the last code unit of the code point
// that failed the condition, so the span starts right after it.
return limit+1;
}
/*
 * Span the initial part of the UTF-8 string s[0..length) in which every
 * code point matches spanCondition. The caller must pass length>0; the first
 * byte is read unconditionally.
 *
 * Precheck for sufficient trail bytes at end of string only once per span.
 * Check validity.
 *
 * Illegal byte sequences are treated like U+FFFD: asciiBytes[0x80] holds the
 * result of contains(FFFD) after overrideIllegal().
 *
 * Returns a pointer to the first byte of the first code point (or illegal
 * byte) that does not match, or the end of the string if everything matches.
 */
const uint8_t *
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
const uint8_t *limit=s+length;
uint8_t b=*s;
if((int8_t)b>=0) {
// Initial all-ASCII span.
if(spanCondition) {
do {
if(!asciiBytes[b] || ++s==limit) {
return s;
}
b=*s;
} while((int8_t)b>=0);
} else {
do {
if(asciiBytes[b] || ++s==limit) {
return s;
}
b=*s;
} while((int8_t)b>=0);
}
// Remaining length after the ASCII prefix, used by the end-of-string precheck below.
length=(int32_t)(limit-s);
}
if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
}
// limit0 is the value returned when the loop runs to the (possibly shortened) limit.
const uint8_t *limit0=limit;
/*
 * Make sure that the last 1/2/3/4-byte sequence before limit is complete
 * or runs into a lead byte.
 * In the span loop compare s with limit only once
 * per multi-byte character.
 *
 * Give a trailing illegal sequence the same value as the result of contains(FFFD),
 * including it if that is part of the span, otherwise set limit0 to before
 * the truncated sequence.
 */
b=*(limit-1);
if((int8_t)b<0) {
// b>=0x80: lead or trail byte
if(b<0xc0) {
// single trail byte, check for preceding 3- or 4-byte lead byte
if(length>=2 && (b=*(limit-2))>=0xe0) {
limit-=2;
if(asciiBytes[0x80]!=spanCondition) {
limit0=limit;
}
} else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
// 4-byte lead byte with only two trail bytes
limit-=3;
if(asciiBytes[0x80]!=spanCondition) {
limit0=limit;
}
}
} else {
// lead byte with no trail bytes
--limit;
if(asciiBytes[0x80]!=spanCondition) {
limit0=limit;
}
}
}
uint8_t t1, t2, t3;
while(s<limit) {
b=*s;
if(b<0xc0) {
// ASCII; or trail bytes with the result of contains(FFFD).
if(spanCondition) {
do {
if(!asciiBytes[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
} while(b<0xc0);
} else {
do {
if(asciiBytes[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
} while(b<0xc0);
}
}
// Here b>=0xc0 is a lead byte and s still points at it.
++s; // Advance past the lead byte.
if(b>=0xe0) {
if(b<0xf0) {
if( /* handle U+0000..U+FFFF inline */
(t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
(t2=(uint8_t)(s[1]-0x80)) <= 0x3f
) {
b&=0xf;
uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
if(twoBits<=1) {
// All 64 code points with this lead byte and middle trail byte
// are either in the set or not.
if(twoBits!=spanCondition) {
return s-1;
}
} else {
// Look up the code point in its 4k block of code points.
UChar32 c=(b<<12)|(t1<<6)|t2;
if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
return s-1;
}
}
s+=2;
continue;
}
} else if( /* handle U+10000..U+10FFFF inline */
(t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
(t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
(t3=(uint8_t)(s[2]-0x80)) <= 0x3f
) {
// Give an illegal sequence the same value as the result of contains(FFFD).
UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
if( ( (0x10000<=c && c<=0x10ffff) ?
containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
asciiBytes[0x80]
) != spanCondition
) {
return s-1;
}
s+=3;
continue;
}
} else /* 0xc0<=b<0xe0 */ {
if( /* handle U+0000..U+07FF inline */
(t1=(uint8_t)(*s-0x80)) <= 0x3f
) {
if(((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
return s-1;
}
++s;
continue;
}
}
// Reached only when the trail-byte checks above failed for this lead byte.
// Give an illegal sequence the same value as the result of contains(FFFD).
// Handle each byte of an illegal sequence separately to simplify the code;
// no need to optimize error handling.
if(asciiBytes[0x80]!=spanCondition) {
return s-1;
}
}
return limit0;
}
/*
 * Span the trailing part of the UTF-8 string s[0..length) in which every
 * code point matches spanCondition, walking backwards from the end.
 * The caller must pass length>0; s[length-1] is read unconditionally.
 *
 * While going backwards through UTF-8 optimize only for ASCII.
 * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
 * possible to tell from the last byte in a multi-byte sequence how many
 * preceding bytes there should be. Therefore, going backwards through UTF-8
 * is much harder than going forward.
 *
 * Anything that does not decode to a code point is tested as U+FFFD.
 *
 * Returns the byte index where the trailing span starts: 0 if everything
 * matches, otherwise the index just past the last code point (or illegal
 * byte) that does not match.
 */
int32_t
BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
}
uint8_t b;
do {
b=s[--length];
if((int8_t)b>=0) {
// ASCII sub-span
if(spanCondition) {
do {
if(!asciiBytes[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
} while((int8_t)b>=0);
} else {
do {
if(asciiBytes[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
} while((int8_t)b>=0);
}
}
// Index of the last byte of the current character; the decoding below may move length back.
int32_t prev=length;
UChar32 c;
if(b<0xc0) {
// trail byte: collect a multi-byte character
c=utf8_prevCharSafeBody(s, 0, &length, b, -1);
if(c<0) {
c=0xfffd;
}
} else {
// lead byte in last-trail position
c=0xfffd;
}
// c is a valid code point, not ASCII, not a surrogate
if(c<=0x7ff) {
if(((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
return prev+1;
}
} else if(c<=0xffff) {
int lead=c>>12;
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
if(twoBits<=1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if(twoBits!=spanCondition) {
return prev+1;
}
} else {
// Look up the code point in its 4k block of code points.
if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
return prev+1;
}
}
} else {
if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
return prev+1;
}
}
} while(length>0);
return 0;
}
U_NAMESPACE_END

View File

@ -0,0 +1,160 @@
/*
******************************************************************************
*
* Copyright (C) 2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: bmpset.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2007jan29
* created by: Markus W. Scherer
*/
#ifndef __BMPSET_H__
#define __BMPSET_H__
#include "unicode/utypes.h"
#include "unicode/uniset.h"
U_NAMESPACE_BEGIN
/*
 * Helper class for frozen UnicodeSets, implements contains() and span()
 * optimized for BMP code points. Structured to be UTF-8-friendly.
 *
 * ASCII: Look up bytes.
 * 2-byte characters: Bits organized vertically.
 * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
 *                    with mixed for illegal ranges.
 * Supplementary characters: Binary search in the parent set's inversion list,
 *                           restricted to the supplementary part of the list.
 *
 * The object does not own the inversion list; it only keeps the pointer that
 * is passed to its constructors.
 */
class BMPSet : public UMemory {
public:
    BMPSet(const int32_t *parentList, int32_t parentListLength);
    BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength);

    /*
     * The class has a virtual function, so give it a virtual destructor:
     * deleting through a BMPSet pointer stays well-defined even if a subclass
     * is ever added, and compilers do not warn about a polymorphic class
     * with a non-virtual destructor.
     */
    virtual ~BMPSet() {}

    virtual UBool contains(UChar32 c) const;

    /*
     * Span the initial substring for which each character c has spanCondition==contains(c).
     * It must be s<limit and spanCondition==0 or 1.
     * @return The string pointer which limits the span.
     */
    const UChar *span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const;

    /*
     * Span the trailing substring for which each character c has spanCondition==contains(c).
     * It must be s<limit and spanCondition==0 or 1.
     * @return The string pointer which starts the span.
     */
    const UChar *spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const;

    /*
     * Span the initial substring for which each character c has spanCondition==contains(c).
     * It must be length>0 and spanCondition==0 or 1.
     * @return The string pointer which limits the span.
     */
    const uint8_t *spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;

    /*
     * Span the trailing substring for which each character c has spanCondition==contains(c).
     * It must be length>0 and spanCondition==0 or 1.
     * @return The start of the span.
     */
    int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;

private:
    void initBits();
    void overrideIllegal();

    /**
     * Same as UnicodeSet::findCodePoint(UChar32 c) const except that the
     * binary search is restricted for finding code points in a certain range.
     *
     * For restricting the search for finding in the range start..end,
     * pass in
     *   lo=findCodePoint(start) and
     *   hi=findCodePoint(end)
     * with 0<=lo<=hi<len.
     * findCodePoint(c) defaults to lo=0 and hi=len-1.
     *
     * @param c a character in a subrange of MIN_VALUE..MAX_VALUE
     * @param lo The lowest index to be returned.
     * @param hi The highest index to be returned.
     * @return the smallest integer i in the range lo..hi,
     *         inclusive, such that c < list[i]
     */
    int32_t findCodePoint(UChar32 c, int32_t lo, int32_t hi) const;

    /* Restricted lookup: TRUE if findCodePoint(c, lo, hi) returns an odd index. */
    inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;

    /*
     * One byte per ASCII character, or trail byte in lead position.
     * 0 or 1 for ASCII characters.
     * The value for trail bytes is the result of contains(FFFD)
     * for faster validity checking at runtime.
     */
    UBool asciiBytes[0xc0];

    /*
     * One bit per code point from U+0000..U+07FF.
     * The bits are organized vertically; consecutive code points
     * correspond to the same bit positions in consecutive table words.
     * With code point parts
     *   lead=c{10..6}
     *   trail=c{5..0}
     * it is set.contains(c)==(table7FF[trail] bit lead)
     *
     * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD)
     * for faster validity checking at runtime.
     */
    uint32_t table7FF[64];

    /*
     * One bit per 64 BMP code points.
     * The bits are organized vertically; consecutive 64-code point blocks
     * correspond to the same bit position in consecutive table words.
     * With code point parts
     *   lead=c{15..12}
     *   t1=c{11..6}
     * test bits (lead+16) and lead in bmpBlockBits[t1].
     * If the upper bit is 0, then the lower bit indicates if contains(c)
     * for all code points in the 64-block.
     * If the upper bit is 1, then the block is mixed and the code point
     * must be looked up in the inversion list (containsSlow()).
     *
     * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to
     * the result of contains(FFFD) for faster validity checking at runtime.
     */
    uint32_t bmpBlockBits[64];

    /*
     * Inversion list indexes for restricted binary searches in
     * findCodePoint(), from
     * findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000).
     * U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
     * always looked up in the bit tables.
     * The last pair of indexes is for finding supplementary code points.
     */
    int32_t list4kStarts[18];

    /*
     * The inversion list of the parent set, for the slower contains() implementation
     * for mixed BMP blocks and for supplementary code points.
     * The list is terminated with list[listLength-1]=0x110000.
     */
    const int32_t *list;
    int32_t listLength;
};
/*
 * Containment test via the inversion list, searching only indexes lo..hi.
 * An odd result index means that c lies inside one of the set's ranges.
 */
inline UBool BMPSet::containsSlow(UChar32 c, int32_t lo, int32_t hi) const {
    const int32_t listIndex=findCodePoint(c, lo, hi);
    return (UBool)(listIndex & 1);
}
U_NAMESPACE_END
#endif

View File

@ -1771,6 +1771,14 @@
<Filter
Name="properties &amp; sets"
>
<File
RelativePath=".\bmpset.cpp"
>
</File>
<File
RelativePath=".\bmpset.h"
>
</File>
<File
RelativePath=".\propname.cpp"
>
@ -1951,6 +1959,14 @@
RelativePath=".\uniset_props.cpp"
>
</File>
<File
RelativePath=".\unisetspan.cpp"
>
</File>
<File
RelativePath=".\unisetspan.h"
>
</File>
<File
RelativePath=".\uprops.c"
>

View File

@ -22,8 +22,10 @@
U_NAMESPACE_BEGIN
class BMPSet;
class ParsePosition;
class SymbolTable;
class UnicodeSetStringSpan;
class UVector;
class RuleCharacterIterator;
@ -263,6 +265,7 @@ class U_COMMON_API UnicodeSet : public UnicodeFilter {
int32_t len; // length of list used; 0 <= len <= capacity
int32_t capacity; // capacity of list
UChar32* list; // MUST be terminated with HIGH
BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
UChar32* buffer; // internal buffer, may be NULL
int32_t bufferCapacity; // capacity of buffer
int32_t patLen;
@ -278,6 +281,7 @@ class U_COMMON_API UnicodeSet : public UnicodeFilter {
*/
UChar *pat;
UVector* strings; // maintained in sorted order
UnicodeSetStringSpan *stringSpan;
public:
@ -377,6 +381,7 @@ public:
/**
* Assigns this object to be a copy of another.
* A frozen set will not be modified.
* @stable ICU 2.0
*/
UnicodeSet& operator=(const UnicodeSet& o);
@ -405,6 +410,9 @@ public:
* Returns a copy of this object. All UnicodeFunctor objects have
* to support cloning in order to allow classes using
* UnicodeFunctors, such as Transliterator, to implement cloning.
* If this set is frozen, then the clone will be frozen as well.
* Use cloneAsThawed() for a mutable clone of a frozen set.
* @see cloneAsThawed
* @stable ICU 2.0
*/
virtual UnicodeFunctor* clone() const;
@ -418,6 +426,45 @@ public:
*/
virtual int32_t hashCode(void) const;
//----------------------------------------------------------------
// Freezable API
//----------------------------------------------------------------
/**
* Determines whether the set has been frozen (made immutable) or not.
* See the ICU4J Freezable interface for details.
* @return TRUE/FALSE for whether the set has been frozen
* @see freeze
* @see cloneAsThawed
* @draft ICU 3.8
*/
inline UBool isFrozen() const;
/**
* Freeze the set (make it immutable).
* Once frozen, it cannot be unfrozen and is therefore thread-safe
* until it is deleted.
* See the ICU4J Freezable interface for details.
* Freezing the set may also make some operations faster, for example
* contains() and span().
* A frozen set will not be modified. (It remains frozen.)
* @return this set.
* @see isFrozen
* @see cloneAsThawed
* @draft ICU 3.8
*/
UnicodeFunctor *freeze();
/**
* Clone the set and make the clone mutable.
* See the ICU4J Freezable interface for details.
* @return the mutable clone
* @see freeze
* @see isFrozen
* @draft ICU 3.8
*/
UnicodeFunctor *cloneAsThawed() const;
//----------------------------------------------------------------
// Public API
//----------------------------------------------------------------
@ -426,6 +473,7 @@ public:
* Make this object represent the range <code>start - end</code>.
* If <code>start > end</code> then this object is set to an
* empty range.
* A frozen set will not be modified.
*
* @param start first character in the set, inclusive
* @param end last character in the set, inclusive
@ -445,6 +493,7 @@ public:
* Modifies this set to represent the set specified by the given
* pattern, optionally ignoring white space. See the class
* description for the syntax of the pattern language.
* A frozen set will not be modified.
* @param pattern a string specifying what characters are in the set
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
* contains a syntax error.
@ -459,6 +508,7 @@ public:
* Modifies this set to represent the set specified by the given
* pattern, optionally ignoring white space. See the class
* description for the syntax of the pattern language.
* A frozen set will not be modified.
* @param pattern a string specifying what characters are in the set
* @param options bitmask for options to apply to the pattern.
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
@ -486,6 +536,7 @@ public:
* pairs list for the parsed pattern is returned. This method calls
* itself recursively to parse embedded subpatterns.
*<em> Empties the set passed before applying the pattern.</em>
* A frozen set will not be modified.
*
* @param pattern the string containing the pattern to be parsed.
* The portion of the string from pos.getIndex(), which must be a
@ -515,6 +566,7 @@ public:
* Returns a string representation of this set. If the result of
* calling this function is passed to a UnicodeSet constructor, it
* will produce another set that is equal to this one.
* A frozen set will not be modified.
* @param result the string to receive the rules. Previous
* contents will be deleted.
* @param escapeUnprintable if TRUE then convert unprintable
@ -530,6 +582,7 @@ public:
* Modifies this set to contain those code points which have the given value
* for the given binary or enumerated property, as returned by
* u_getIntPropertyValue. Prior contents of this set are lost.
* A frozen set will not be modified.
*
* @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
* or UCHAR_INT_START..UCHAR_INT_LIMIT-1
@ -555,6 +608,7 @@ public:
* Modifies this set to contain those code points which have the
* given value for the given property. Prior contents of this
* set are lost.
* A frozen set will not be modified.
*
* @param prop a property alias, either short or long. The name is matched
* loosely. See PropertyAliases.txt for names and a description of loose
@ -603,6 +657,7 @@ public:
/**
* Returns true if this set contains the given character.
* This function works faster with a frozen set.
* @param c character to be checked for containment
* @return true if the test condition is met
* @stable ICU 2.0
@ -702,6 +757,84 @@ public:
*/
inline UBool containsSome(const UnicodeString& s) const;
/**
* Returns the length of the initial substring of the input string which
* consists only of characters and strings that are contained in this set
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
* or only of characters and strings that are not contained
* in this set (USET_SPAN_NOT_CONTAINED).
* See USetSpanCondition for details.
* Similar to the strspn() C library function.
* Unpaired surrogates are treated according to contains() of their surrogate code points.
* This function works faster with a frozen set and with a non-negative string length argument.
* @param s start of the string
* @param length of the string; can be -1 for NUL-terminated
* @param spanCondition specifies the containment condition
* @return the length of the initial substring according to the spanCondition;
* 0 if the start of the string does not fit the spanCondition
* @draft ICU 3.8
* @see USetSpanCondition
*/
int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
/**
* Returns the start of the trailing substring of the input string which
* consists only of characters and strings that are contained in this set
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
* or only of characters and strings that are not contained
* in this set (USET_SPAN_NOT_CONTAINED).
* See USetSpanCondition for details.
* Unpaired surrogates are treated according to contains() of their surrogate code points.
* This function works faster with a frozen set and with a non-negative string length argument.
* @param s start of the string
* @param length of the string; can be -1 for NUL-terminated
* @param spanCondition specifies the containment condition
* @return the start of the trailing substring according to the spanCondition;
* the string length if the end of the string does not fit the spanCondition
* @draft ICU 3.8
* @see USetSpanCondition
*/
int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
/**
* Returns the length of the initial substring of the input string which
* consists only of characters and strings that are contained in this set
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
* or only of characters and strings that are not contained
* in this set (USET_SPAN_NOT_CONTAINED).
* See USetSpanCondition for details.
* Similar to the strspn() C library function.
* Malformed byte sequences are treated according to contains(0xfffd).
* This function works faster with a frozen set and with a non-negative string length argument.
* @param s start of the string (UTF-8)
* @param length of the string; can be -1 for NUL-terminated
* @param spanCondition specifies the containment condition
* @return the length of the initial substring according to the spanCondition;
* 0 if the start of the string does not fit the spanCondition
* @draft ICU 3.8
* @see USetSpanCondition
*/
int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
/**
* Returns the start of the trailing substring of the input string which
* consists only of characters and strings that are contained in this set
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
* or only of characters and strings that are not contained
* in this set (USET_SPAN_NOT_CONTAINED).
* See USetSpanCondition for details.
* Malformed byte sequences are treated according to contains(0xfffd).
* This function works faster with a frozen set and with a non-negative string length argument.
* @param s start of the string (UTF-8)
* @param length of the string; can be -1 for NUL-terminated
* @param spanCondition specifies the containment condition
* @return the start of the trailing substring according to the spanCondition;
* the string length if the end of the string does not fit the spanCondition
* @draft ICU 3.8
* @see USetSpanCondition
*/
int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
/**
* Implement UnicodeMatcher::matches()
* @stable ICU 2.4
@ -786,6 +919,7 @@ public:
* the call leaves this set unchanged. If <code>start > end</code>
* then an empty range is added, leaving the set unchanged.
* This is equivalent to a boolean logic OR, or a set UNION.
* A frozen set will not be modified.
*
* @param start first character, inclusive, of range to be added
* to this set.
@ -799,6 +933,7 @@ public:
* Adds the specified character to this set if it is not already
* present. If this set already contains the specified character,
* the call leaves this set unchanged.
* A frozen set will not be modified.
* @stable ICU 2.0
*/
UnicodeSet& add(UChar32 c);
@ -809,6 +944,7 @@ public:
* the call leaves this set unchanged.
* Thus "ch" => {"ch"}
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
* A frozen set will not be modified.
* @param s the source string
* @return this object, for chaining
* @stable ICU 2.4
@ -829,6 +965,7 @@ public:
/**
* Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
* If this set already contains any particular character, it has no effect on that character.
* A frozen set will not be modified.
* @param s the source string
* @return this object, for chaining
* @stable ICU 2.4
@ -838,6 +975,7 @@ public:
/**
* Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
* If this set already contains any particular character, it has no effect on that character.
* A frozen set will not be modified.
* @param s the source string
* @return this object, for chaining
* @stable ICU 2.4
@ -847,6 +985,7 @@ public:
/**
* Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
* If this set already contains any particular character, it has no effect on that character.
* A frozen set will not be modified.
* @param s the source string
* @return this object, for chaining
* @stable ICU 2.4
@ -856,6 +995,7 @@ public:
/**
* Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
* If this set already contains any particular character, it has no effect on that character.
* A frozen set will not be modified.
* @param s the source string
* @return this object, for chaining
* @stable ICU 2.4
@ -887,6 +1027,7 @@ public:
* specified range. If <code>start > end</code> then an empty range is
* retained, leaving the set empty. This is equivalent to
* a boolean logic AND, or a set INTERSECTION.
* A frozen set will not be modified.
*
* @param start first character, inclusive, of range to be retained
* to this set.
@ -899,6 +1040,7 @@ public:
/**
* Retain the specified character from this set if it is present.
* A frozen set will not be modified.
* @stable ICU 2.0
*/
UnicodeSet& retain(UChar32 c);
@ -908,6 +1050,7 @@ public:
* The set will not contain the specified range once the call
* returns. If <code>start > end</code> then an empty range is
* removed, leaving the set unchanged.
* A frozen set will not be modified.
*
* @param start first character, inclusive, of range to be removed
* from this set.
@ -921,6 +1064,7 @@ public:
* Removes the specified character from this set if it is present.
* The set will not contain the specified character once the call
* returns.
* A frozen set will not be modified.
* @stable ICU 2.0
*/
UnicodeSet& remove(UChar32 c);
@ -929,6 +1073,7 @@ public:
* Removes the specified string from this set if it is present.
* The set will not contain the specified string once the call
* returns.
* A frozen set will not be modified.
* @param s the source string
* @return this object, for chaining
* @stable ICU 2.4
@ -939,6 +1084,7 @@ public:
* Inverts this set. This operation modifies this set so that
* its value is its complement. This is equivalent to
* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
* A frozen set will not be modified.
* @stable ICU 2.0
*/
virtual UnicodeSet& complement(void);
@ -949,6 +1095,7 @@ public:
* added if it is not in this set. If <code>start > end</code>
* then an empty range is complemented, leaving the set unchanged.
* This is equivalent to a boolean logic XOR.
* A frozen set will not be modified.
*
* @param start first character, inclusive, of range to be removed
* from this set.
@ -962,6 +1109,7 @@ public:
* Complements the specified character in this set. The character
* will be removed if it is in this set, or will be added if it is
* not in this set.
* A frozen set will not be modified.
* @stable ICU 2.0
*/
UnicodeSet& complement(UChar32 c);
@ -971,6 +1119,7 @@ public:
* The set will not contain the specified string once the call
* returns.
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
* A frozen set will not be modified.
* @param s the string to complement
* @return this object, for chaining
* @stable ICU 2.4
@ -983,6 +1132,7 @@ public:
* modifies this set so that its value is the <i>union</i> of the two
* sets. The behavior of this operation is unspecified if the specified
* collection is modified while the operation is in progress.
* A frozen set will not be modified.
*
* @param c set whose elements are to be added to this set.
* @see #add(char, char)
@ -996,6 +1146,7 @@ public:
* its elements that are not contained in the specified set. This
* operation effectively modifies this set so that its value is
* the <i>intersection</i> of the two sets.
* A frozen set will not be modified.
*
* @param c set that defines which elements this set will retain.
* @stable ICU 2.0
@ -1007,6 +1158,7 @@ public:
* specified set. This operation effectively modifies this
* set so that its value is the <i>asymmetric set difference</i> of
* the two sets.
* A frozen set will not be modified.
*
* @param c set that defines which elements will be removed from
* this set.
@ -1018,6 +1170,7 @@ public:
* Complements in this set all elements contained in the specified
* set. Any character in the other set will be removed if it is
* in this set, or will be added if it is not in this set.
* A frozen set will not be modified.
*
* @param c set that defines which elements will be xor'ed from
* this set.
@ -1028,6 +1181,7 @@ public:
/**
* Removes all of the elements from this set. This set will be
* empty after this call returns.
* A frozen set will not be modified.
* @stable ICU 2.0
*/
virtual UnicodeSet& clear(void);
@ -1049,6 +1203,8 @@ public:
* == b denotes that the contents are the same, not pointer
* comparison.)
*
* A frozen set will not be modified.
*
* @param attribute bitmask for attributes to close over.
* Currently only the USET_CASE bit is supported. Any undefined bits
* are ignored.
@ -1137,6 +1293,7 @@ public:
/**
* Reallocate this objects internal structures to take up the least
* possible space, without changing this object's value.
* A frozen set will not be modified.
* @stable ICU 2.4
*/
virtual UnicodeSet& compact();
@ -1189,6 +1346,12 @@ private:
private:
//----------------------------------------------------------------
// Implementation: Clone as thawed (see ICU4J Freezable)
//----------------------------------------------------------------
UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
//----------------------------------------------------------------
// Implementation: Pattern parsing
//----------------------------------------------------------------
@ -1324,6 +1487,10 @@ inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
return !operator==(o);
}
/**
 * Reports whether this set is frozen.
 * The set counts as frozen as soon as either helper object
 * (bmpSet or stringSpan) exists.
 * @return TRUE if bmpSet or stringSpan is non-NULL, otherwise FALSE
 */
inline UBool UnicodeSet::isFrozen() const {
    if (bmpSet == NULL && stringSpan == NULL) {
        return FALSE;
    }
    return TRUE;
}
/**
 * Logical negation of containsNone(start, end) for the same range.
 * @param start first code point of the range
 * @param end last code point of the range
 * @return the negated result of containsNone(start, end)
 */
inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
    const UBool disjoint = containsNone(start, end);
    return !disjoint;
}

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2002-2006, International Business Machines
* Copyright (C) 2002-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -97,6 +97,120 @@ enum {
USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8
};
#ifndef U_HIDE_DRAFT_API
/**
* Argument values for whether span() and similar functions continue while
* the current character is contained vs. not contained in the set.
*
* The functionality is straightforward for sets with only single code points,
* without strings (which is the common case):
* - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE
* work the same.
* - span() and spanBack() partition any string the same way when
* alternating between span(USET_SPAN_NOT_CONTAINED) and
* span(either "contained" condition).
* - Using a complemented (inverted) set and the opposite span conditions
* yields the same results.
*
* When a set contains multi-code point strings, then these statements may not
* be true, depending on the strings in the set (for example, whether they
* overlap with each other) and the string that is processed.
* For a set with strings:
* - The complement of the set contains the opposite set of code points,
* but the same set of strings.
* Therefore, complementing both the set and the span conditions
* may yield different results.
* - When starting spans at different positions in a string
* (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
* because a set string may start before the later position.
* - span(USET_SPAN_SIMPLE) may be shorter than
* span(USET_SPAN_CONTAINED) because it will not recursively try
* all possible paths.
* For example, with a set which contains the three strings "xy", "xya" and "ax",
* span("xyax", USET_SPAN_CONTAINED) will return 4 but
* span("xyax", USET_SPAN_SIMPLE) will return 3.
* span(USET_SPAN_SIMPLE) will never be longer than
* span(USET_SPAN_CONTAINED).
* - With either "contained" condition, span() and spanBack() may partition
* a string in different ways.
* For example, with a set which contains the two strings "ab" and "ba",
* and when processing the string "aba",
* span() will yield contained/not-contained boundaries of { 0, 2, 3 }
* while spanBack() will yield boundaries of { 0, 1, 3 }.
*
* Note: If it is important to get the same boundaries whether iterating forward
* or backward through a string, then either only span() should be used and
* the boundaries cached for backward operation, or an ICU BreakIterator
* could be used.
*
* Note: Unpaired surrogates are treated like surrogate code points.
* Similarly, set strings match only on code point boundaries,
* never in the middle of a surrogate pair.
* Illegal UTF-8 sequences are treated like U+FFFD.
* When processing UTF-8 strings, malformed set strings
* (strings with unpaired surrogates which cannot be converted to UTF-8)
* are ignored.
*
* @draft ICU 3.8
*/
enum USetSpanCondition {
/**
* Continue a span() while there is no set element at the current position.
* Stops before the first set element (character or string).
* (For code points only, this is like while contains(current)==FALSE).
*
* When span() returns, the substring between where it started and the position
* it returned consists only of characters that are not in the set,
* and none of its strings overlap with the span.
*
* The numeric value is fixed at 0.
*
* @draft ICU 3.8
*/
USET_SPAN_NOT_CONTAINED = 0,
/**
* Continue a span() while there is a set element at the current position.
* (For characters only, this is like while contains(current)==TRUE).
*
* When span() returns, the substring between where it started and the position
* it returned consists only of set elements (characters or strings) that are in the set.
*
* If a set contains strings, then the span will be the longest substring
* matching any of the possible concatenations of set elements (characters or strings).
* (There must be a single, non-overlapping concatenation of characters or strings.)
* This is equivalent to a POSIX regular expression for (OR of each set element)*.
*
* The numeric value is fixed at 1.
*
* @draft ICU 3.8
*/
USET_SPAN_CONTAINED = 1,
/**
* Continue a span() while there is a set element at the current position.
* (For characters only, this is like while contains(current)==TRUE).
*
* When span() returns, the substring between where it started and the position
* it returned consists only of set elements (characters or strings) that are in the set.
*
* If a set only contains single characters, then this is the same
* as USET_SPAN_CONTAINED.
*
* If a set contains strings, then the span will be the longest substring
* with a match at each position with the longest single set element (character or string).
*
* Use this span condition together with other longest-match algorithms,
* such as ICU converters (ucnv_getUnicodeSet()).
*
* The numeric value is fixed at 2.
*
* @draft ICU 3.8
*/
USET_SPAN_SIMPLE = 2,
/**
* One more than the last span condition.
* This is a count, not a span condition of its own; with the explicit
* values assigned above it evaluates to 3.
* @draft ICU 3.8
*/
USET_SPAN_CONDITION_COUNT
};
/* Lets the type be written as USetSpanCondition without the enum keyword. */
typedef enum USetSpanCondition USetSpanCondition;
#endif /* U_HIDE_DRAFT_API */
/**
* A serialized form of a Unicode set. Limited manipulations are
* possible directly on a serialized set. See below.
@ -179,9 +293,72 @@ uset_openPatternOptions(const UChar* pattern, int32_t patternLength,
U_STABLE void U_EXPORT2
uset_close(USet* set);
#ifndef U_HIDE_DRAFT_API
/**
* Returns a copy of this object.
* If this set is frozen, then the clone will be frozen as well.
* Use uset_cloneAsThawed() for a mutable clone of a frozen set.
* @param set the original set
* @return the newly allocated copy of the set
* @see uset_cloneAsThawed
* @draft ICU 3.8
*/
U_DRAFT USet * U_EXPORT2
uset_clone(const USet *set);
/*
* Freezable API
*/
/**
* Determines whether the set has been frozen (made immutable) or not.
* See the ICU4J Freezable interface for details.
* @param set the set
* @return TRUE/FALSE for whether the set has been frozen
* @see uset_freeze
* @see uset_cloneAsThawed
* @draft ICU 3.8
*/
U_DRAFT UBool U_EXPORT2
uset_isFrozen(const USet *set);
/**
* Freeze the set (make it immutable).
* Once frozen, it cannot be unfrozen and is therefore thread-safe
* until it is deleted.
* See the ICU4J Freezable interface for details.
* Freezing the set may also make some operations faster, for example
* uset_contains() and uset_span().
* A frozen set will not be modified. (It remains frozen.)
* @param set the set
* The function returns no value; the set passed in is frozen in place.
* @see uset_isFrozen
* @see uset_cloneAsThawed
* @draft ICU 3.8
*/
U_DRAFT void U_EXPORT2
uset_freeze(USet *set);
/**
* Clone the set and make the clone mutable.
* See the ICU4J Freezable interface for details.
* @param set the set
* @return the mutable clone
* @see uset_freeze
* @see uset_isFrozen
* @see uset_clone
* @draft ICU 3.8
*/
U_DRAFT USet * U_EXPORT2
uset_cloneAsThawed(const USet *set);
#endif /* U_HIDE_DRAFT_API */
/**
* Causes the USet object to represent the range <code>start - end</code>.
* If <code>start > end</code> then this USet is set to an empty range.
* A frozen set will not be modified.
* @param set the object to set to the given range
* @param start first character in the set, inclusive
* @param end last character in the set, inclusive
@ -196,6 +373,7 @@ uset_set(USet* set,
* pattern. See the UnicodeSet class description for the syntax of
* the pattern language. See also the User Guide chapter about UnicodeSet.
* <em>Empties the set passed before applying the pattern.</em>
* A frozen set will not be modified.
* @param set The set to which the pattern is to be applied.
* @param pattern A pointer to UChar string specifying what characters are in the set.
* The character at pattern[0] must be a '['.
@ -221,6 +399,7 @@ uset_applyPattern(USet *set,
* Modifies the set to contain those code points which have the given value
* for the given binary or enumerated property, as returned by
* u_getIntPropertyValue. Prior contents of this set are lost.
* A frozen set will not be modified.
*
* @param set the object to contain the code points defined by the property
*
@ -246,6 +425,7 @@ uset_applyIntPropertyValue(USet* set,
* Modifies the set to contain those code points which have the
* given value for the given property. Prior contents of this
* set are lost.
* A frozen set will not be modified.
*
* @param set the object to contain the code points defined by the given
* property and value alias
@ -319,6 +499,7 @@ uset_toPattern(const USet* set,
/**
* Adds the given character to the given USet. After this call,
* uset_contains(set, c) will return TRUE.
* A frozen set will not be modified.
* @param set the object to which to add the character
* @param c the character to add
* @stable ICU 2.4
@ -332,6 +513,7 @@ uset_add(USet* set, UChar32 c);
* modifies this set so that its value is the <i>union</i> of the two
* sets. The behavior of this operation is unspecified if the specified
* collection is modified while the operation is in progress.
* A frozen set will not be modified.
*
* @param set the object to which to add the set
* @param additionalSet the source set whose elements are to be added to this set.
@ -343,6 +525,7 @@ uset_addAll(USet* set, const USet *additionalSet);
/**
* Adds the given range of characters to the given USet. After this call,
* uset_containsRange(set, start, end) will return TRUE.
* A frozen set will not be modified.
* @param set the object to which to add the character
* @param start the first character of the range to add, inclusive
* @param end the last character of the range to add, inclusive
@ -354,6 +537,7 @@ uset_addRange(USet* set, UChar32 start, UChar32 end);
/**
* Adds the given string to the given USet. After this call,
* uset_containsString(set, str, strLen) will return TRUE.
* A frozen set will not be modified.
* @param set the object to which to add the character
* @param str the string to add
* @param strLen the length of the string or -1 if null terminated.
@ -365,6 +549,7 @@ uset_addString(USet* set, const UChar* str, int32_t strLen);
/**
* Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
* If this set already contains any particular character, it has no effect on that character.
* A frozen set will not be modified.
* @param set the object to which to add the character
* @param str the source string
* @param strLen the length of the string or -1 if null terminated.
@ -376,6 +561,7 @@ uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen);
/**
* Removes the given character from the given USet. After this call,
* uset_contains(set, c) will return FALSE.
* A frozen set will not be modified.
* @param set the object from which to remove the character
* @param c the character to remove
* @stable ICU 2.4
@ -386,6 +572,7 @@ uset_remove(USet* set, UChar32 c);
/**
* Removes the given range of characters from the given USet. After this call,
* uset_containsRange(set, start, end) will return FALSE.
* A frozen set will not be modified.
* @param set the object to which to add the character
* @param start the first character of the range to remove, inclusive
* @param end the last character of the range to remove, inclusive
@ -397,6 +584,7 @@ uset_removeRange(USet* set, UChar32 start, UChar32 end);
/**
* Removes the given string from the given USet. After this call,
* uset_containsString(set, str, strLen) will return FALSE.
* A frozen set will not be modified.
* @param set the object to which to add the character
* @param str the string to remove
* @param strLen the length of the string or -1 if null terminated.
@ -410,6 +598,7 @@ uset_removeString(USet* set, const UChar* str, int32_t strLen);
* specified set. This operation effectively modifies this
* set so that its value is the <i>asymmetric set difference</i> of
* the two sets.
* A frozen set will not be modified.
* @param set the object from which the elements are to be removed
* @param removeSet the object that defines which elements will be
* removed from this set
@ -423,6 +612,7 @@ uset_removeAll(USet* set, const USet* removeSet);
* specified range. If <code>start > end</code> then an empty range is
* retained, leaving the set empty. This is equivalent to
* a boolean logic AND, or a set INTERSECTION.
* A frozen set will not be modified.
*
* @param set the object for which to retain only the specified range
* @param start first character, inclusive, of range to be retained
@ -440,6 +630,7 @@ uset_retain(USet* set, UChar32 start, UChar32 end);
* its elements that are not contained in the specified set. This
* operation effectively modifies this set so that its value is
* the <i>intersection</i> of the two sets.
* A frozen set will not be modified.
*
* @param set the object on which to perform the retain
* @param retain set that defines which elements this set will retain
@ -451,6 +642,7 @@ uset_retainAll(USet* set, const USet* retain);
/**
* Reallocate this objects internal structures to take up the least
* possible space, without changing this object's value.
* A frozen set will not be modified.
*
* @param set the object on which to perform the compact
* @stable ICU 3.2
@ -462,6 +654,7 @@ uset_compact(USet* set);
* Inverts this set. This operation modifies this set so that
* its value is its complement. This operation does not affect
* the multicharacter strings, if any.
* A frozen set will not be modified.
* @param set the set
* @stable ICU 2.4
*/
@ -472,6 +665,7 @@ uset_complement(USet* set);
* Complements in this set all elements contained in the specified
* set. Any character in the other set will be removed if it is
* in this set, or will be added if it is not in this set.
* A frozen set will not be modified.
*
* @param set the set with which to complement
* @param complement set that defines which elements will be xor'ed
@ -484,6 +678,7 @@ uset_complementAll(USet* set, const USet* complement);
/**
* Removes all of the elements from this set. This set will be
* empty after this call returns.
* A frozen set will not be modified.
* @param set the set
* @stable ICU 2.4
*/
@ -502,6 +697,7 @@ uset_isEmpty(const USet* set);
/**
* Returns TRUE if the given USet contains the given character.
* This function works faster with a frozen set.
* @param set the set
* @param c The codepoint to check for within the set
* @return true if set contains c
@ -651,6 +847,96 @@ uset_containsNone(const USet* set1, const USet* set2);
U_STABLE UBool U_EXPORT2
uset_containsSome(const USet* set1, const USet* set2);
#ifndef U_HIDE_DRAFT_API
/**
* Returns the length of the initial substring of the input string which
* consists only of characters and strings that are contained in this set
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
* or only of characters and strings that are not contained
* in this set (USET_SPAN_NOT_CONTAINED).
* See USetSpanCondition for details.
* Similar to the strspn() C library function.
* Unpaired surrogates are treated according to contains() of their surrogate code points.
* This function works faster with a frozen set and with a non-negative string length argument.
* @param set the set
* @param s start of the string
* @param length of the string; can be -1 for NUL-terminated
* @param spanCondition specifies the containment condition
* @return the length of the initial substring according to the spanCondition;
* 0 if the start of the string does not fit the spanCondition
* @draft ICU 3.8
* @see USetSpanCondition
*/
U_DRAFT int32_t U_EXPORT2
uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
/**
* Returns the start of the trailing substring of the input string which
* consists only of characters and strings that are contained in this set
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
* or only of characters and strings that are not contained
* in this set (USET_SPAN_NOT_CONTAINED).
* See USetSpanCondition for details.
* Unpaired surrogates are treated according to contains() of their surrogate code points.
* This function works faster with a frozen set and with a non-negative string length argument.
* @param set the set
* @param s start of the string
* @param length of the string; can be -1 for NUL-terminated
* @param spanCondition specifies the containment condition
* @return the start of the trailing substring according to the spanCondition;
* the string length if the end of the string does not fit the spanCondition
* @draft ICU 3.8
* @see USetSpanCondition
*/
U_DRAFT int32_t U_EXPORT2
uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
/**
* Returns the length of the initial substring of the input string which
* consists only of characters and strings that are contained in this set
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
* or only of characters and strings that are not contained
* in this set (USET_SPAN_NOT_CONTAINED).
* See USetSpanCondition for details.
* Similar to the strspn() C library function.
* Malformed byte sequences are treated according to contains(0xfffd).
* This function works faster with a frozen set and with a non-negative string length argument.
* @param set the set
* @param s start of the string (UTF-8)
* @param length of the string; can be -1 for NUL-terminated
* @param spanCondition specifies the containment condition
* @return the length of the initial substring according to the spanCondition;
* 0 if the start of the string does not fit the spanCondition
* @draft ICU 3.8
* @see USetSpanCondition
*/
U_DRAFT int32_t U_EXPORT2
uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
/**
* Returns the start of the trailing substring of the input string which
* consists only of characters and strings that are contained in this set
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
* or only of characters and strings that are not contained
* in this set (USET_SPAN_NOT_CONTAINED).
* See USetSpanCondition for details.
* Malformed byte sequences are treated according to contains(0xfffd).
* This function works faster with a frozen set and with a non-negative string length argument.
* @param set the set
* @param s start of the string (UTF-8)
* @param length of the string; can be -1 for NUL-terminated
* @param spanCondition specifies the containment condition
* @return the start of the trailing substring according to the spanCondition;
* the string length if the end of the string does not fit the spanCondition
* @draft ICU 3.8
* @see USetSpanCondition
*/
U_DRAFT int32_t U_EXPORT2
uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
#endif /* U_HIDE_DRAFT_API */
/**
* Returns true if set1 contains all of the characters and strings
* of set2, and vice versa. It answers the question, 'Is set1 equal to set2?'

View File

@ -14,6 +14,7 @@
#include "unicode/symtable.h"
#include "ruleiter.h"
#include "cmemory.h"
#include "cstring.h"
#include "uhash.h"
#include "util.h"
#include "uvector.h"
@ -21,6 +22,8 @@
#include "ustrfmt.h"
#include "uassert.h"
#include "hash.h"
#include "bmpset.h"
#include "unisetspan.h"
// Define UChar constants using hex for EBCDIC compatibility
// Used #define to reduce private static exports and memory access time.
@ -138,8 +141,8 @@ static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) {
* Constructs an empty set.
*/
UnicodeSet::UnicodeSet() :
len(1), capacity(1 + START_EXTRA), list(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL)
len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
{
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
if(list!=NULL){
@ -158,8 +161,8 @@ UnicodeSet::UnicodeSet() :
* @param end last character, inclusive, of range
*/
UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
len(1), capacity(1 + START_EXTRA), list(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL)
len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
{
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
if(list!=NULL){
@ -177,8 +180,10 @@ UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
*/
UnicodeSet::UnicodeSet(const UnicodeSet& o) :
UnicodeFilter(o),
len(0), capacity(o.len + GROW_EXTRA), list(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL)
len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
bmpSet(0),
buffer(0), bufferCapacity(0),
patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
{
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
if(list!=NULL){
@ -189,16 +194,41 @@ UnicodeSet::UnicodeSet(const UnicodeSet& o) :
_dbgct(this);
}
/**
 * Copy-constructs as thawed: copies the code point list, the strings and
 * the pattern of o, but leaves bmpSet and stringSpan NULL, so the new set
 * is not frozen even if o is.
 *
 * The UBool argument only selects this overload; its value is never read.
 *
 * If the list allocation fails, nothing is copied. If the strings container
 * could not be allocated (or o has none), the strings are not copied rather
 * than dereferencing a NULL pointer.
 *
 * @param o the set to copy
 */
UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
    UnicodeFilter(o),
    len(0), capacity(o.len + GROW_EXTRA), list(0),
    bmpSet(0),
    buffer(0), bufferCapacity(0),
    patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
{
    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
    if(list!=NULL){
        UErrorCode status = U_ZERO_ERROR;
        allocateStrings(status);
        // *this = o except for bmpSet and stringSpan
        len = o.len;
        uprv_memcpy(list, o.list, len*sizeof(UChar32));
        // Only touch the containers when both exist: allocateStrings() reports
        // failure through status, and strings starts out NULL.
        if (U_SUCCESS(status) && strings != NULL && o.strings != NULL) {
            strings->assign(*o.strings, cloneUnicodeString, status);
        }
        if (o.pat) {
            setPattern(UnicodeString(o.pat, o.patLen));
        }
    }
    _dbgct(this);
}
/**
* Destructs the set.
* Frees list and buffer with uprv_free(), deletes bmpSet, strings and
* stringSpan, and releases the pattern through releasePattern().
*/
UnicodeSet::~UnicodeSet() {
_dbgdt(this); // first! (debug hook runs before any member is released)
uprv_free(list);
delete bmpSet; // deleting a NULL pointer is a no-op
if (buffer) {
uprv_free(buffer);
}
// NOTE(review): stringSpan is constructed from *strings elsewhere in this
// file; confirm its destructor does not touch strings before reordering.
delete strings;
delete stringSpan;
releasePattern();
}
@ -206,11 +236,24 @@ UnicodeSet::~UnicodeSet() {
* Assigns this object to be a copy of another.
*/
UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
if (isFrozen()) {
return *this;
}
ensureCapacity(o.len);
len = o.len;
uprv_memcpy(list, o.list, len*sizeof(UChar32));
if (o.bmpSet == NULL) {
bmpSet = NULL;
} else {
bmpSet = new BMPSet(*o.bmpSet, list, len);
}
UErrorCode ec = U_ZERO_ERROR;
strings->assign(*o.strings, cloneUnicodeString, ec);
if (o.stringSpan == NULL) {
stringSpan = NULL;
} else {
stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
}
releasePattern();
if (o.pat) {
setPattern(UnicodeString(o.pat, o.patLen));
@ -218,6 +261,19 @@ UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
return *this;
}
/**
 * Creates a heap-allocated duplicate of this set through the regular
 * copy constructor. Every UnicodeMatcher has to be cloneable so that
 * classes built on UnicodeMatchers, such as Transliterator, can
 * implement cloning themselves.
 * @return the newly allocated copy
 */
UnicodeFunctor* UnicodeSet::clone() const {
    UnicodeSet *duplicate = new UnicodeSet(*this);
    return duplicate;
}
/**
 * Creates a heap-allocated duplicate of this set through the
 * two-argument ("as thawed") copy constructor.
 * @return the newly allocated copy
 */
UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
    UnicodeSet *thawedCopy = new UnicodeSet(*this, TRUE);
    return thawedCopy;
}
/**
* Compares the specified object with this set for equality. Returns
* <tt>true</tt> if the two sets
@ -237,15 +293,6 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const {
return TRUE;
}
/**
 * Creates a heap-allocated duplicate of this set through the regular
 * copy constructor. Every UnicodeMatcher has to be cloneable so that
 * classes built on UnicodeMatchers, such as Transliterator, can
 * implement cloning themselves.
 * @return the newly allocated copy
 */
UnicodeFunctor* UnicodeSet::clone() const {
    UnicodeSet *duplicate = new UnicodeSet(*this);
    return duplicate;
}
/**
* Returns the hash code value for this set.
*
@ -265,20 +312,6 @@ int32_t UnicodeSet::hashCode(void) const {
// Public API
//----------------------------------------------------------------
/**
* Make this object represent the range <code>start - end</code>.
* If <code>start > end</code> then this object is set to
* an empty range.
* Implemented by clearing the set and then complementing the range.
*
* @param start first character in the set, inclusive
* @param end last character in the set, inclusive
* @return this object, for chaining
*/
UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
clear(); // discard the previous contents
complement(start, end); // toggle the requested range on the cleared set
return *this;
}
/**
* Returns the number of elements in this set (its cardinality),
* Note than the elements of a set may include both individual
@ -317,11 +350,17 @@ UBool UnicodeSet::contains(UChar32 c) const {
//for (;;) {
// if (c < list[++i]) break;
//}
if (bmpSet != NULL) {
return bmpSet->contains(c);
}
if (stringSpan != NULL) {
return stringSpan->contains(c);
}
if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
return FALSE;
}
int32_t i = findCodePoint(c);
return ((i & 1) != 0); // return true if odd
return (UBool)(i & 1); // return true if odd
}
/**
@ -350,10 +389,10 @@ int32_t UnicodeSet::findCodePoint(UChar32 c) const {
return 0;
// High runner test. c is often after the last range, so an
// initial check for this condition pays off.
if (len >= 2 && c >= list[len-2])
return len-1;
int32_t lo = 0;
int32_t hi = len - 1;
if (lo >= hi || c >= list[hi-1])
return hi;
// invariant: c >= list[lo]
// invariant: c < list[hi]
for (;;) {
@ -428,12 +467,8 @@ UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
* @return true if the test condition is met
*/
UBool UnicodeSet::containsAll(const UnicodeString& s) const {
UChar32 cp;
for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
cp = s.char32At(i);
if (!contains(cp)) return FALSE;
}
return TRUE;
return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
s.length());
}
/**
@ -479,12 +514,8 @@ UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
* @return true if the test condition is met
*/
UBool UnicodeSet::containsNone(const UnicodeString& s) const {
UChar32 cp;
for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
cp = s.char32At(i);
if (contains(cp)) return FALSE;
}
return TRUE;
return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
s.length());
}
/**
@ -723,6 +754,20 @@ UChar32 UnicodeSet::charAt(int32_t index) const {
return (UChar32)-1;
}
/**
* Make this object represent the range <code>start - end</code>.
* If <code>start > end</code> then this object is set to an
* empty range.
*
* @param start first character in the set, inclusive
* @param end last character in the set, inclusive
*/
UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
clear();
complement(start, end);
return *this;
}
/**
* Adds the specified range to this set if it is not already
* present. If this set already contains the specified range,
@ -777,7 +822,7 @@ UnicodeSet& UnicodeSet::add(UChar32 c) {
int32_t i = findCodePoint(pinCodePoint(c));
// already in set?
if ((i & 1) != 0) return *this;
if ((i & 1) != 0 || isFrozen()) return *this;
// HIGH is 0x110000
// assert(list[len-1] == HIGH);
@ -888,7 +933,7 @@ UnicodeSet& UnicodeSet::add(UChar32 c) {
* @return the modified set, for chaining
*/
UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
if (s.length() == 0) return *this;
if (s.length() == 0 || isFrozen()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
if (!strings->contains((void*) &s)) {
@ -896,7 +941,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
releasePattern();
}
} else {
add((UChar32)cp, (UChar32)cp);
add((UChar32)cp);
}
return *this;
}
@ -907,6 +952,9 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
* already be in 'strings'.
*/
void UnicodeSet::_add(const UnicodeString& s) {
if (isFrozen()) {
return;
}
UnicodeString* t = new UnicodeString(s);
UErrorCode ec = U_ZERO_ERROR;
strings->sortedInsert(t, compareUnicodeString, ec);
@ -942,7 +990,7 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
UChar32 cp;
for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
cp = s.char32At(i);
add(cp, cp);
add(cp);
}
return *this;
}
@ -1070,7 +1118,7 @@ UnicodeSet& UnicodeSet::remove(UChar32 c) {
* @return the modified set, for chaining
*/
UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
if (s.length() == 0) return *this;
if (s.length() == 0 || isFrozen()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
strings->removeElement((void*) &s);
@ -1093,6 +1141,9 @@ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
* from this set.
*/
UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
if (isFrozen()) {
return *this;
}
if (pinCodePoint(start) <= pinCodePoint(end)) {
UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
exclusiveOr(range, 2, 0);
@ -1110,6 +1161,9 @@ UnicodeSet& UnicodeSet::complement(UChar32 c) {
* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
*/
UnicodeSet& UnicodeSet::complement(void) {
if (isFrozen()) {
return *this;
}
if (list[0] == UNICODESET_LOW) {
ensureBufferCapacity(len-1);
uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
@ -1134,7 +1188,7 @@ UnicodeSet& UnicodeSet::complement(void) {
* @return this object, for chaining
*/
UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
if (s.length() == 0) return *this;
if (s.length() == 0 || isFrozen()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
if (strings->contains((void*) &s)) {
@ -1182,6 +1236,9 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
* @param c set that defines which elements this set will retain.
*/
UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
if (isFrozen()) {
return *this;
}
retain(c.list, c.len, 0);
strings->retainAll(*c.strings);
return *this;
@ -1197,6 +1254,9 @@ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
* this set.
*/
UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
if (isFrozen()) {
return *this;
}
retain(c.list, c.len, 2);
strings->removeAll(*c.strings);
return *this;
@ -1211,6 +1271,9 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
* this set.
*/
UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
if (isFrozen()) {
return *this;
}
exclusiveOr(c.list, c.len, 0);
for (int32_t i=0; i<c.strings->size(); ++i) {
@ -1227,6 +1290,9 @@ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
* empty after this call returns.
*/
UnicodeSet& UnicodeSet::clear(void) {
if (isFrozen()) {
return *this;
}
list[0] = UNICODESET_HIGH;
len = 1;
releasePattern();
@ -1277,9 +1343,14 @@ const UnicodeString* UnicodeSet::getString(int32_t index) const {
* possible space, without changing this object's value.
*/
UnicodeSet& UnicodeSet::compact() {
if (isFrozen()) {
return *this;
}
// Delete buffer first to defragment memory less.
uprv_free(buffer);
buffer = NULL;
if (buffer != NULL) {
uprv_free(buffer);
buffer = NULL;
}
if (len < capacity) {
// Make the capacity equal to len or 1.
// We don't want to realloc of 0 size.
@ -1437,6 +1508,9 @@ static inline UChar32 max(UChar32 a, UChar32 b) {
// polarity = 1, 2: x xor ~y == x === y
void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
if (isFrozen()) {
return;
}
ensureBufferCapacity(len + otherLen);
int32_t i = 0, j = 0, k = 0;
UChar32 a = list[i++];
@ -1479,6 +1553,9 @@ void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t pola
// polarity = 3: ~x union ~y
void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
if (isFrozen()) {
return;
}
ensureBufferCapacity(len + otherLen);
int32_t i = 0, j = 0, k = 0;
UChar32 a = list[i++];
@ -1584,6 +1661,9 @@ void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
// polarity = 3: ~x intersect ~y
void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
if (isFrozen()) {
return;
}
ensureBufferCapacity(len + otherLen);
int32_t i = 0, j = 0, k = 0;
UChar32 a = list[i++];
@ -1864,4 +1944,199 @@ void UnicodeSet::setPattern(const UnicodeString& newPat) {
// We can regenerate an equivalent pattern later when requested.
}
/**
 * Prepares this set for fast read-only use and returns it.
 *
 * If the set is not frozen yet, this releases the scratch buffer, shrinks the
 * inversion list when that saves more than GROW_EXTRA entries, and then builds
 * one of the two lookup helpers that contains() and the span functions
 * consult first: a UnicodeSetStringSpan when some strings matter for spans,
 * otherwise a BMPSet.
 *
 * @return this object
 */
UnicodeFunctor *UnicodeSet::freeze() {
    if(!isFrozen()) {
        // Do most of what compact() does before freezing because
        // compact() will not work when the set is frozen.
        // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).

        // Delete buffer first to defragment memory less.
        if (buffer != NULL) {
            uprv_free(buffer);
            buffer = NULL;
            // Keep the recorded size in step with the (now absent) buffer,
            // matching the buffer(0), bufferCapacity(0) state set by the constructors.
            bufferCapacity = 0;
        }
        if (capacity > (len + GROW_EXTRA)) {
            // Make the capacity equal to len or 1.
            // We don't want to realloc of 0 size.
            int32_t newCapacity = len + (len == 0);
            UChar32 *shrunk = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
            if (shrunk != NULL) {
                list = shrunk;
                capacity = newCapacity;
            }
            // Otherwise keep the old, larger list and its capacity:
            // overwriting list with NULL would lose the set contents.
            // (Assumes uprv_realloc leaves the block untouched on failure,
            // like realloc - confirm.)
        }

        // Optimize contains() and span() and similar functions.
        if (!strings->isEmpty()) {
            stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
            if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
                // All strings are irrelevant for span() etc. because
                // all of each string's code points are contained in this set.
                // Do not check needsStringSpanUTF8() because UTF-8 has at most as
                // many relevant strings as UTF-16.
                // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
                delete stringSpan;
                stringSpan = NULL;
            }
        }
        if (stringSpan == NULL) {
            // No span-relevant strings: Optimize for code point spans.
            bmpSet=new BMPSet(list, len);
        }
    }
    return this;
}
/**
 * Returns the length of the initial part of the UTF-16 string s whose
 * contents match spanCondition with respect to this set.
 * A negative length means s is NUL-terminated.
 */
int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
    // Fast path: a BMPSet exists and the length is explicit.
    if(bmpSet!=NULL && length>0) {
        const UChar *spanLimit=bmpSet->span(s, s+length, spanCondition);
        return (int32_t)(spanLimit-s);
    }
    if(length<0) {
        length=u_strlen(s);
    }
    if(length==0) {
        return 0;
    }
    if(stringSpan!=NULL) {
        return stringSpan->span(s, length, spanCondition);
    }
    if(!strings->isEmpty()) {
        // Not frozen but the set has strings: build a one-shot helper
        // for exactly this span variant.
        uint32_t variant;
        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
            variant=UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED;
        } else {
            variant=UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
        }
        UnicodeSetStringSpan oneShot(*this, *strings, variant);
        if(oneShot.needsStringSpanUTF16()) {
            return oneShot.span(s, length, spanCondition);
        }
    }

    // Plain code point loop.
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
    }
    int32_t pos=0;
    while(pos<length) {
        int32_t next=pos;
        UChar32 c;
        U16_NEXT(s, next, length, c);
        if(spanCondition!=contains(c)) {
            break;
        }
        pos=next;
    }
    return pos;
}
/**
 * Scans the UTF-16 string s backwards from its end and returns the start
 * index of the trailing part whose contents match spanCondition with
 * respect to this set. A negative length means s is NUL-terminated.
 */
int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
    // Fast path: a BMPSet exists and the length is explicit.
    if(bmpSet!=NULL && length>0) {
        const UChar *spanStart=bmpSet->spanBack(s, s+length, spanCondition);
        return (int32_t)(spanStart-s);
    }
    if(length<0) {
        length=u_strlen(s);
    }
    if(length==0) {
        return 0;
    }
    if(stringSpan!=NULL) {
        return stringSpan->spanBack(s, length, spanCondition);
    }
    if(!strings->isEmpty()) {
        // Not frozen but the set has strings: build a one-shot helper
        // for exactly this span variant.
        uint32_t variant;
        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
            variant=UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED;
        } else {
            variant=UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
        }
        UnicodeSetStringSpan oneShot(*this, *strings, variant);
        if(oneShot.needsStringSpanUTF16()) {
            return oneShot.spanBack(s, length, spanCondition);
        }
    }

    // Plain code point loop, walking towards index 0.
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
    }
    int32_t pos=length;
    while(pos>0) {
        int32_t before=pos;
        UChar32 c;
        U16_PREV(s, 0, before, c);
        if(spanCondition!=contains(c)) {
            break;
        }
        pos=before;
    }
    return pos;
}
/**
 * UTF-8 counterpart of span(): returns the byte length of the initial part
 * of s whose contents match spanCondition with respect to this set.
 * A negative length means s is NUL-terminated. Byte sequences that
 * U8_NEXT reports as negative are treated as U+FFFD.
 */
int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
    // Fast path: a BMPSet exists and the length is explicit.
    if(bmpSet!=NULL && length>0) {
        const uint8_t *bytes=(const uint8_t *)s;
        const uint8_t *spanLimit=bmpSet->spanUTF8(bytes, length, spanCondition);
        return (int32_t)(spanLimit-bytes);
    }
    if(length<0) {
        length=(int32_t)uprv_strlen(s);
    }
    if(length==0) {
        return 0;
    }
    if(stringSpan!=NULL) {
        return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
    }
    if(!strings->isEmpty()) {
        // Not frozen but the set has strings: build a one-shot helper
        // for exactly this span variant.
        uint32_t variant;
        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
            variant=UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED;
        } else {
            variant=UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
        }
        UnicodeSetStringSpan oneShot(*this, *strings, variant);
        if(oneShot.needsStringSpanUTF8()) {
            return oneShot.spanUTF8((const uint8_t *)s, length, spanCondition);
        }
    }

    // Plain code point loop.
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
    }
    int32_t pos=0;
    while(pos<length) {
        int32_t next=pos;
        UChar32 c;
        U8_NEXT(s, next, length, c);
        if(c<0) {
            c=0xfffd;
        }
        if(spanCondition!=contains(c)) {
            break;
        }
        pos=next;
    }
    return pos;
}
/**
 * UTF-8 counterpart of spanBack(): scans s backwards from its end and
 * returns the byte index where the trailing part matching spanCondition
 * starts. A negative length means s is NUL-terminated. Byte sequences that
 * U8_PREV reports as negative are treated as U+FFFD.
 */
int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
    // Fast path: a BMPSet exists and the length is explicit.
    if(bmpSet!=NULL && length>0) {
        return bmpSet->spanBackUTF8((const uint8_t *)s, length, spanCondition);
    }
    if(length<0) {
        length=(int32_t)uprv_strlen(s);
    }
    if(length==0) {
        return 0;
    }
    if(stringSpan!=NULL) {
        return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
    }
    if(!strings->isEmpty()) {
        // Not frozen but the set has strings: build a one-shot helper
        // for exactly this span variant.
        uint32_t variant;
        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
            variant=UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED;
        } else {
            variant=UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
        }
        UnicodeSetStringSpan oneShot(*this, *strings, variant);
        if(oneShot.needsStringSpanUTF8()) {
            return oneShot.spanBackUTF8((const uint8_t *)s, length, spanCondition);
        }
    }

    // Plain code point loop, walking towards index 0.
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
    }
    int32_t pos=length;
    while(pos>0) {
        int32_t before=pos;
        UChar32 c;
        U8_PREV(s, 0, before, c);
        if(c<0) {
            c=0xfffd;
        }
        if(spanCondition!=contains(c)) {
            break;
        }
        pos=before;
    }
    return pos;
}
U_NAMESPACE_END

View File

@ -248,8 +248,8 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
*/
UnicodeSet::UnicodeSet(const UnicodeString& pattern,
UErrorCode& status) :
len(0), capacity(START_EXTRA), list(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL)
len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
{
if(U_SUCCESS(status)){
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
@ -276,8 +276,8 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern,
uint32_t options,
const SymbolTable* symbols,
UErrorCode& status) :
len(0), capacity(START_EXTRA), list(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL)
len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
{
if(U_SUCCESS(status)){
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
@ -296,8 +296,8 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
uint32_t options,
const SymbolTable* symbols,
UErrorCode& status) :
len(0), capacity(START_EXTRA), list(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL)
len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
{
if(U_SUCCESS(status)){
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
@ -348,7 +348,7 @@ UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
uint32_t options,
const SymbolTable* symbols,
UErrorCode& status) {
if (U_FAILURE(status)) {
if (U_FAILURE(status) || isFrozen()) {
return *this;
}
@ -374,7 +374,7 @@ UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
uint32_t options,
const SymbolTable* symbols,
UErrorCode& status) {
if (U_FAILURE(status)) {
if (U_FAILURE(status) || isFrozen()) {
return *this;
}
// Need to build the pattern in a temporary string because
@ -938,7 +938,7 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
UnicodeSet&
UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
if (U_FAILURE(ec)) return *this;
if (U_FAILURE(ec) || isFrozen()) return *this;
if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
@ -953,7 +953,7 @@ UnicodeSet&
UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
const UnicodeString& value,
UErrorCode& ec) {
if (U_FAILURE(ec)) return *this;
if (U_FAILURE(ec) || isFrozen()) return *this;
// prop and value used to be converted to char * using the default
// converter instead of the invariant conversion.
@ -1293,6 +1293,9 @@ addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString
}
UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
if (isFrozen()) {
return *this;
}
if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
UErrorCode status = U_ZERO_ERROR;
const UCaseProps *csp = ucase_getSingleton(&status);

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,155 @@
/*
******************************************************************************
*
* Copyright (C) 2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: unisetspan.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2007mar01
* created by: Markus W. Scherer
*/
#ifndef __UNISETSPAN_H__
#define __UNISETSPAN_H__
#include "unicode/utypes.h"
#include "unicode/uniset.h"
U_NAMESPACE_BEGIN
/*
* Implement span() etc. for a set with strings.
* Avoid recursion because of its exponential complexity.
* Instead, try multiple paths at once and track them with an IndexList.
*/
class UnicodeSetStringSpan : public UMemory {
public:
    /*
     * Which span() variant will be used?
     * The object is either built for one variant and used once,
     * or built for all and may be used many times.
     * The values are bit flags: direction | encoding | condition.
     */
    enum {
        FWD             = 0x20,
        BACK            = 0x10,
        UTF16           = 8,
        UTF8            = 4,
        CONTAINED       = 2,
        NOT_CONTAINED   = 1,

        ALL             = 0x3f,

        FWD_UTF16_CONTAINED     = FWD  | UTF16 |     CONTAINED,
        FWD_UTF16_NOT_CONTAINED = FWD  | UTF16 | NOT_CONTAINED,
        FWD_UTF8_CONTAINED      = FWD  | UTF8  |     CONTAINED,
        FWD_UTF8_NOT_CONTAINED  = FWD  | UTF8  | NOT_CONTAINED,
        BACK_UTF16_CONTAINED    = BACK | UTF16 |     CONTAINED,
        BACK_UTF16_NOT_CONTAINED= BACK | UTF16 | NOT_CONTAINED,
        BACK_UTF8_CONTAINED     = BACK | UTF8  |     CONTAINED,
        BACK_UTF8_NOT_CONTAINED = BACK | UTF8  | NOT_CONTAINED
    };

    UnicodeSetStringSpan(const UnicodeSet &set, const UVector &setStrings, uint32_t which);

    // Copy constructor. Assumes which==ALL for a frozen set.
    UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, const UVector &newParentSetStrings);

    ~UnicodeSetStringSpan();

    /*
     * Do the strings need to be checked in span() etc.?
     * These are pure queries (they only read maxLength16/maxLength8),
     * so they are const like contains().
     * @return TRUE if strings need to be checked (call span() here),
     *         FALSE if not (use a BMPSet for best performance).
     */
    inline UBool needsStringSpanUTF16() const;
    inline UBool needsStringSpanUTF8() const;

    // For fast UnicodeSet::contains(c).
    inline UBool contains(UChar32 c) const;

    int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;

    int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;

    int32_t spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;

    int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;

private:
    // Special spanLength byte values.
    enum {
        // The spanLength is >=0xfe.
        LONG_SPAN=0xfe,
        // All code points in the string are contained in the parent set.
        ALL_CP_CONTAINED=0xff
    };

    // Add a starting or ending string character to the spanNotSet
    // so that a character span ends before any string.
    void addToSpanNotSet(UChar32 c);

    int32_t spanNot(const UChar *s, int32_t length) const;
    int32_t spanNotBack(const UChar *s, int32_t length) const;
    int32_t spanNotUTF8(const uint8_t *s, int32_t length) const;
    int32_t spanNotBackUTF8(const uint8_t *s, int32_t length) const;

    // Set for span(). Same as parent but without strings.
    UnicodeSet spanSet;

    // Set for span(not contained).
    // Same as spanSet, plus characters that start or end strings.
    UnicodeSet *pSpanNotSet;

    // The strings of the parent set.
    const UVector &strings;

    // Pointer to the UTF-8 string lengths.
    // Also pointer to further allocated storage for meta data and
    // UTF-8 string contents as necessary.
    int32_t *utf8Lengths;

    // Pointer to the part of the (utf8Lengths) memory block that stores
    // the lengths of span(), spanBack() etc. for each string.
    uint8_t *spanLengths;

    // Pointer to the part of the (utf8Lengths) memory block that stores
    // the UTF-8 versions of the parent set's strings.
    uint8_t *utf8;

    // Number of bytes for all UTF-8 versions of strings together.
    int32_t utf8Length;

    // Maximum lengths of relevant strings.
    int32_t maxLength16;
    int32_t maxLength8;

    // Set up for all variants of span()?
    UBool all;

    // Memory for small numbers and lengths of strings.
    // For example, for 8 strings:
    // 8 UTF-8 lengths, 8*4 bytes span lengths, 8*2 3-byte UTF-8 characters
    // = 112 bytes = int32_t[28].
    int32_t staticLengths[32];
};

// TRUE if at least one string is relevant for UTF-16 spans.
UBool UnicodeSetStringSpan::needsStringSpanUTF16() const {
    return (UBool)(maxLength16!=0);
}

// TRUE if at least one string is relevant for UTF-8 spans.
UBool UnicodeSetStringSpan::needsStringSpanUTF8() const {
    return (UBool)(maxLength8!=0);
}

// Code point membership is answered by the string-free copy of the parent set.
UBool UnicodeSetStringSpan::contains(UChar32 c) const {
    return spanSet.contains(c);
}
U_NAMESPACE_END
#endif

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2002-2006, International Business Machines
* Copyright (C) 2002-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -41,6 +41,26 @@ uset_close(USet* set) {
delete (UnicodeSet*) set;
}
U_DRAFT USet * U_EXPORT2
uset_clone(const USet *set) {
    // C wrapper: forward to UnicodeSet::clone() and hand back the opaque handle.
    UnicodeSet *source = (UnicodeSet *) set;
    return (USet *) (source->UnicodeSet::clone());
}
U_DRAFT UBool U_EXPORT2
uset_isFrozen(const USet *set) {
    // C wrapper: forward to UnicodeSet::isFrozen().
    UnicodeSet *uniset = (UnicodeSet *) set;
    return uniset->UnicodeSet::isFrozen();
}
U_DRAFT void U_EXPORT2
uset_freeze(USet *set) {
    // C wrapper: forward to UnicodeSet::freeze(); its return value is not needed here.
    UnicodeSet *uniset = (UnicodeSet *) set;
    uniset->UnicodeSet::freeze();
}
U_DRAFT USet * U_EXPORT2
uset_cloneAsThawed(const USet *set) {
    // C wrapper: forward to UnicodeSet::cloneAsThawed() and hand back the opaque handle.
    UnicodeSet *source = (UnicodeSet *) set;
    return (USet *) (source->UnicodeSet::cloneAsThawed());
}
U_CAPI void U_EXPORT2
uset_set(USet* set,
UChar32 start, UChar32 end) {
@ -64,12 +84,8 @@ uset_addRange(USet* set, UChar32 start, UChar32 end) {
U_CAPI void U_EXPORT2
uset_addString(USet* set, const UChar* str, int32_t strLen) {
// WRONG! Do not alias, it will stay aliased, even after
// copying. TODO: do we need a copy ctor that unaliases
//UnicodeString s(strLen==-1, str, strLen);
// UnicodeString handles -1 for strLen
UnicodeString s(str, strLen);
UnicodeString s(strLen<0, str, strLen);
((UnicodeSet*) set)->UnicodeSet::add(s);
}
@ -174,6 +190,26 @@ uset_containsSome(const USet* set1, const USet* set2) {
return ((const UnicodeSet*) set1)->UnicodeSet::containsSome(* (const UnicodeSet*) set2);
}
U_DRAFT int32_t U_EXPORT2
uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) {
    // C wrapper around UnicodeSet::span().
    UnicodeSet *uniset = (UnicodeSet *) set;
    return uniset->UnicodeSet::span(s, length, spanCondition);
}
U_DRAFT int32_t U_EXPORT2
uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) {
    // C wrapper around UnicodeSet::spanBack().
    UnicodeSet *uniset = (UnicodeSet *) set;
    return uniset->UnicodeSet::spanBack(s, length, spanCondition);
}
U_DRAFT int32_t U_EXPORT2
uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) {
    // C wrapper around UnicodeSet::spanUTF8().
    UnicodeSet *uniset = (UnicodeSet *) set;
    return uniset->UnicodeSet::spanUTF8(s, length, spanCondition);
}
U_DRAFT int32_t U_EXPORT2
uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) {
    // C wrapper around UnicodeSet::spanBackUTF8().
    UnicodeSet *uniset = (UnicodeSet *) set;
    return uniset->UnicodeSet::spanBackUTF8(s, length, spanCondition);
}
U_CAPI UBool U_EXPORT2
uset_equals(const USet* set1, const USet* set2) {
return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2;

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (c) 2002-2005, International Business Machines
* Copyright (c) 2002-2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -19,6 +19,8 @@ static void Testj2269(void);
static void TestSerialized(void);
static void TestNonInvariantPattern(void);
static void TestBadPattern(void);
static void TestFreezable(void);
static void TestSpan(void);
void addUSetTest(TestNode** root);
@ -40,6 +42,8 @@ addUSetTest(TestNode** root) {
TEST(TestSerialized);
TEST(TestNonInvariantPattern);
TEST(TestBadPattern);
TEST(TestFreezable);
TEST(TestSpan);
}
/*------------------------------------------------------------------
@ -529,4 +533,80 @@ static void TestBadPattern(void) {
}
}
/*
 * Opens a USet from the pattern "[:ID_Continue:]" for the freeze/span tests.
 * Reports a test error if uset_openPattern() sets a failure code instead of
 * silently ignoring it; the result of uset_openPattern() is returned as is.
 */
static USet *openIDSet() {
    UErrorCode errorCode = U_ZERO_ERROR;
    USet *idSet;
    U_STRING_DECL(pattern, "[:ID_Continue:]", 15);
    U_STRING_INIT(pattern, "[:ID_Continue:]", 15);
    idSet = uset_openPattern(pattern, 15, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("uset_openPattern([:ID_Continue:]) failed\n");
    }
    return idSet;
}
/*
 * Checks the C freeze API: uset_clone(), uset_freeze(), uset_isFrozen()
 * and uset_cloneAsThawed().
 */
static void TestFreezable() {
USet *idSet=openIDSet();
USet *frozen=uset_clone(idSet);
USet *thawed;
/* A plain clone must compare equal to its source. */
if(!uset_equals(frozen, idSet)) {
log_err("uset_clone() did not make an equal copy\n");
}
uset_freeze(frozen);
/* Modification attempt on the frozen set; the check below expects it to have no effect. */
uset_addRange(frozen, 0xd802, 0xd805);
/* Only the clone may report frozen, and it must still equal the untouched original. */
if(uset_isFrozen(idSet) || !uset_isFrozen(frozen) || !uset_equals(frozen, idSet)) {
log_err("uset_freeze() or uset_isFrozen() does not work\n");
}
thawed=uset_cloneAsThawed(frozen);
/* The thawed clone must accept the same modification that the frozen set ignored. */
uset_addRange(thawed, 0xd802, 0xd805);
if(uset_isFrozen(thawed) || uset_equals(thawed, idSet) || !uset_containsRange(thawed, 0xd802, 0xd805)) {
log_err("uset_cloneAsThawed() does not work\n");
}
uset_close(idSet);
uset_close(frozen);
uset_close(thawed);
}
/*
 * Checks uset_span(), uset_spanBack(), uset_spanUTF8() and uset_spanBackUTF8()
 * on the same two-code-point text, first with an unfrozen set and then again
 * after uset_freeze(). The same results are expected in both states.
 */
static void TestSpan() {
/* U+0E01 followed by U+3000; the expectations below imply that only the first is in the set. */
static const UChar s16[2]={ 0xe01, 0x3000 };
/* The same two code points in UTF-8, three bytes each. */
static const char* s8="\xE0\xB8\x81\xE3\x80\x80";
USet *idSet=openIDSet();
/* UTF-16, unfrozen: forward spans return an end index, backward spans a start index. */
if(
1!=uset_span(idSet, s16, 2, USET_SPAN_CONTAINED) ||
0!=uset_span(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) ||
2!=uset_spanBack(idSet, s16, 2, USET_SPAN_CONTAINED) ||
1!=uset_spanBack(idSet, s16, 2, USET_SPAN_NOT_CONTAINED)
) {
log_err("uset_span() or uset_spanBack() does not work\n");
}
/* UTF-8, unfrozen: same expectations, expressed in byte offsets. */
if(
3!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
0!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) ||
6!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
3!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED)
) {
log_err("uset_spanUTF8() or uset_spanBackUTF8() does not work\n");
}
/* Repeat both groups of checks on the frozen set. */
uset_freeze(idSet);
if(
1!=uset_span(idSet, s16, 2, USET_SPAN_CONTAINED) ||
0!=uset_span(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) ||
2!=uset_spanBack(idSet, s16, 2, USET_SPAN_CONTAINED) ||
1!=uset_spanBack(idSet, s16, 2, USET_SPAN_NOT_CONTAINED)
) {
log_err("uset_span(frozen) or uset_spanBack(frozen) does not work\n");
}
if(
3!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
0!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) ||
6!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
3!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED)
) {
log_err("uset_spanUTF8(frozen) or uset_spanBackUTF8(frozen) does not work\n");
}
uset_close(idSet);
}
/*eof*/

View File

@ -1,6 +1,6 @@
/***************************************************************************
*
* Copyright (C) 2000-2004, International Business Machines
* Copyright (C) 2000-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
************************************************************************
@ -21,7 +21,6 @@
#include "transapi.h"
#include "cpdtrtst.h"
#include "transrt.h"
#include "usettest.h"
#include "jamotest.h"
#include "trnserr.h"
#include "reptest.h"
@ -29,7 +28,7 @@
#define CASE(id,test) case id: \
name = #test; \
if (exec) { \
logln(#test "---"); logln(""); \
logln(#test "---"); logln(); \
test t; \
callTest(t, par); \
} \
@ -43,12 +42,11 @@ void IntlTestTransliterator::runIndexedTest( int32_t index, UBool exec, const ch
CASE(1, TransliteratorAPITest);
CASE(2, CompoundTransliteratorTest);
CASE(3, TransliteratorRoundTripTest);
CASE(4, UnicodeSetTest);
CASE(5, JamoTest);
CASE(6, TransliteratorErrorTest);
CASE(7, ReplaceableTest);
CASE(4, JamoTest);
CASE(5, TransliteratorErrorTest);
CASE(6, ReplaceableTest);
#if !UCONFIG_NO_TRANSLITERATION && defined(U_USE_UNICODE_FILTER_LOGIC_OBSOLETE_2_8)
CASE(10, UnicodeFilterLogicTest);
CASE(7, UnicodeFilterLogicTest);
#endif
default: name=""; break;

View File

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2005, International Business Machines Corporation and
* Copyright (c) 1997-2007, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -25,141 +25,41 @@
#include "v32test.h"
#include "uvectest.h"
#include "aliastst.h"
#include "usettest.h"
//#include "custrtest.h"
//#include "ccitrtst.h"
//#include "cloctest.h"
//#include "ctres.h"
//#include "ctucd.h"
#define CASE(id, test) case id: \
name = #test; \
if (exec) { \
logln(#test "---"); logln(); \
test t; \
callTest(t, par); \
} \
break
void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
{
if (exec) logln("TestSuite Utilities: ");
switch (index) {
case 0:
name = "MultithreadTest";
if (exec) {
logln("MultithreadTest---"); logln("");
MultithreadTest test;
callTest( test, par );
}
break;
case 1:
name = "StringTest";
if (exec) {
logln("StringTest---"); logln("");
StringTest test;
callTest( test, par );
}
break;
case 2:
name = "UnicodeStringTest";
if (exec) {
logln("UnicodeStringTest---"); logln("");
UnicodeStringTest test;
callTest( test, par );
}
break;
case 3:
name = "LocaleTest";
if (exec) {
logln("LocaleTest---"); logln("");
LocaleTest test;
callTest( test, par );
}
break;
case 4:
name = "CharIterTest";
if (exec) {
logln("CharIterTest---"); logln("");
CharIterTest test;
callTest( test, par );
}
break;
case 5:
name = "UnicodeTest";
if (exec) {
logln("UnicodeTest---"); logln("");
UnicodeTest test;
callTest( test, par );
}
break;
case 6:
name = "ResourceBundleTest";
if (exec) {
logln("ResourceBundleTest---"); logln("");
ResourceBundleTest test;
callTest( test, par );
}
break;
case 7:
name = "NewResourceBundleTest";
if (exec) {
logln("NewResourceBundleTest---"); logln("");
NewResourceBundleTest test;
callTest( test, par );
}
break;
case 8:
name = "PUtilTest";
if (exec) {
logln("PUtilTest---"); logln("");
PUtilTest test;
callTest( test, par );
}
break;
case 9:
name = "UObjectTest";
if(exec) {
logln ("UObjectTest---"); logln("");
UObjectTest test;
callTest( test, par );
}
break;;
case 10:
name = "UVector32Test";
if(exec) {
logln ("UVector32Test---"); logln("");
UVector32Test test;
callTest( test, par );
}
break;;
case 11:
name = "UVectorTest";
if(exec) {
logln ("UVectorTest---"); logln("");
UVectorTest test;
callTest( test, par );
}
break;;
case 12:
name = "UTextTest";
if(exec) {
logln ("UTextTest---"); logln("");
UTextTest test;
callTest( test, par );
}
break;
case 13:
name = "LocaleAliasTest";
if (exec) {
logln("LocaleAliasTest---"); logln("");
LocaleAliasTest test;
callTest( test, par );
}
break;
// One CASE per test class; each index must name a distinct test.
CASE(0, MultithreadTest);
CASE(1, StringTest);
CASE(2, UnicodeStringTest);
CASE(3, LocaleTest);
CASE(4, CharIterTest);
CASE(5, UnicodeTest);
CASE(6, ResourceBundleTest);
CASE(7, NewResourceBundleTest);
CASE(8, PUtilTest);
CASE(9, UObjectTest);
CASE(10, UVector32Test);
CASE(11, UVectorTest);
CASE(12, UTextTest);
// Index 13 ran LocaleAliasTest in the switch body this list replaces.
CASE(13, LocaleAliasTest);
CASE(14, UnicodeSetTest);
default: name = ""; break; //needed to end loop
}
}

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,7 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2006, International Business Machines Corporation and
* Copyright (c) 1997-2007, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************
**********************************************************************
@ -16,16 +16,21 @@
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/ucnv_err.h"
#include "intltest.h"
class UnicodeSetWithStrings;
/**
* UnicodeSet test
*/
class UnicodeSetTest: public IntlTest {
public:
UnicodeSetTest();
~UnicodeSetTest();
void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par=NULL);
private:
void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par=NULL);
void Testj2268();
@ -76,6 +81,12 @@ private:
void TestPosixClasses();
void TestFreezable();
void TestSpan();
void TestStringSpan();
private:
UBool toPatternAux(UChar32 start, UChar32 end);
@ -152,6 +163,26 @@ private:
const UnicodeSet& set,
UChar32 start, UChar32 end);
void doAssert(UBool, const char*);
void testSpan(const UnicodeSetWithStrings *sets[4], const void *s, int32_t length, UBool isUTF16,
uint32_t whichSpans,
int32_t expectLimits[], int32_t &expectCount,
const char *testName, int32_t index);
void testSpan(const UnicodeSetWithStrings *sets[4], const void *s, int32_t length, UBool isUTF16,
uint32_t whichSpans,
const char *testName, int32_t index);
void testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
const UChar *s16, int32_t length16,
uint32_t whichSpans,
const char *testName, int32_t index);
void testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName);
void testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName);
void testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName);
UConverter *openUTF8Converter();
UConverter *utf8Cnv;
public:
static UnicodeString escape(const UnicodeString& s);
};

View File

@ -70,7 +70,7 @@
/>
<Tool
Name="VCLinkerTool"
AdditionalDependencies="../../../lib/icule.lib ../../../lib/icuuc.lib odbc32.lib odbccp32.lib"
AdditionalDependencies="../../../lib/icule.lib ../../../lib/icuuc.lib"
OutputFile=".\Release/letest.exe"
LinkIncremental="1"
SuppressStartupBanner="true"

View File

@ -0,0 +1,78 @@
## Makefile.in for ICU - test/perf/unisetperf
## Copyright (c) 2001-2007, International Business Machines Corporation and
## others. All Rights Reserved.
## Builds the single executable $(TARGET) from $(OBJECTS) and links it against
## the libraries listed in LIBS below.
## Source directory information
srcdir = @srcdir@
top_srcdir = @top_srcdir@
top_builddir = ../../..
include $(top_builddir)/icudefs.mk
## Build directory information
subdir = test/perf/unisetperf
## Extra files to remove for 'make clean'
## (DEPS is defined further down; this works because '=' assignments are expanded lazily.)
CLEANFILES = *~ $(DEPS)
## Target information
TARGET = unisetperf
## Header search path: common, tools/toolutil and tools/ctestfw.
CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(top_srcdir)/tools/toolutil -I$(top_srcdir)/tools/ctestfw
LIBS = $(LIBCTESTFW) $(LIBICUI18N) $(LIBICUUC) $(LIBICUTOOLUTIL) $(DEFAULT_LIBS) $(LIB_M)
OBJECTS = unisetperf.o
## One .d dependency file per object file.
DEPS = $(OBJECTS:.o=.d)
## List of phony targets
.PHONY : all all-local install install-local clean clean-local \
distclean distclean-local dist dist-local check check-local
## Clear suffix list
.SUFFIXES :
## List of standard targets
all: all-local
install: install-local
clean: clean-local
distclean : distclean-local
dist: dist-local
check: all check-local
all-local: $(TARGET)
## install-local and dist-local have no recipe: nothing is installed or packaged from here.
install-local:
dist-local:
clean-local:
test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
$(RMV) $(OBJECTS) $(TARGET)
distclean-local: clean-local
$(RMV) Makefile
## check-local only builds the program; it does not run it.
check-local: all-local
## Regenerate this Makefile through config.status when Makefile.in changes.
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
cd $(top_builddir) \
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
$(TARGET) : $(OBJECTS)
$(LINK.cc) -o $@ $^ $(LIBS)
## Run $(INVOKE) $(INVOCATION) with ICU_DATA defaulting to the build tree's data
## directory (unless already set in the environment) and a fixed TZ.
invoke:
ICU_DATA=$${ICU_DATA:-$(top_builddir)/data/} TZ=PST8PDT $(INVOKE) $(INVOCATION)
## Pull in the generated dependency files. They are included when no goal is
## given; otherwise they are skipped when every goal on the command line matches
## %clean or every goal matches %install.
ifeq (,$(MAKECMDGOALS))
-include $(DEPS)
else
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
ifneq ($(patsubst %install,,$(MAKECMDGOALS)),)
-include $(DEPS)
endif
endif
endif

View File

@ -0,0 +1,197 @@
/*
**********************************************************************
* Copyright (C) 2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: bitset.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2007jan15
* created by: Markus Scherer
*
* Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
* using a folded bit set consisting of a 1k-entry index table and a
* compacted array of 64-bit words.
* Uses a simple hash table for compaction.
* Uses the original set for supplementary code points.
*/
#include "unicode/utypes.h"
#include "unicont.h"
/*
* Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
* Hashes 64-bit words and maps them to 16-bit integers which are
* assigned in order of new incoming words for subsequent storage
* in a contiguous array.
*/
/*
 * Open-addressing hash table that assigns consecutive 16-bit numbers
 * (0, 1, 2, ...) to distinct 64-bit words in order of first appearance.
 *
 * Precondition: at most 0x400 distinct keys may be mapped. map() does not
 * check this; a 0x401st distinct key would write past the end of reverse[].
 */
struct BMPBitHash : public UObject {
    int64_t keys[0x800]; // 2k
    uint16_t values[0x800];     // values[slot]==0xffff marks an unused slot
    uint16_t reverse[0x400];    // reverse[value]==slot that holds the key for value
    uint16_t count;             // number of distinct keys mapped so far

    // Probe step for collision resolution. It is odd and therefore coprime to
    // the table size 0x800, so repeated stepping visits every slot.
    // Declared static: a non-static const data member cannot carry an in-class
    // initializer before C++11, and would also make the struct non-assignable.
    static const int32_t prime=1301; // Less than 2k.

    BMPBitHash() : count(0) {
        // Fill values[] with 0xffff.
        uprv_memset(values, 0xff, sizeof(values));
    }

    /*
     * Map a key to an integer count.
     * Map at most 1k=0x400 different keys with this data structure.
     * Returns the number previously assigned to key, or assigns and returns
     * the next unused number if key has not been seen before.
     */
    uint16_t map(int64_t key) {
        // Fold the 64 key bits into an 11-bit slot number.
        int32_t hash=(int32_t)(key>>55)&0x1ff;
        hash^=(int32_t)(key>>44)&0x7ff;
        hash^=(int32_t)(key>>33)&0x7ff;
        hash^=(int32_t)(key>>22)&0x7ff;
        hash^=(int32_t)(key>>11)&0x7ff;
        hash^=(int32_t)key&0x7ff;

        for(;;) {
            if(values[hash]==0xffff) {
                // Unused slot.
                keys[hash]=key;
                reverse[count]=(uint16_t)hash;
                return values[hash]=count++;
            } else if(keys[hash]==key) {
                // Found a slot with this key.
                return values[hash];
            } else {
                // Used slot with a different key, move to another slot.
                hash=(hash+prime)&0x7ff;
            }
        }
    }

    // Number of distinct keys mapped so far.
    uint16_t countKeys() const { return count; }

    /*
     * Invert the hash map: Fill an array of length countKeys() with the keys
     * indexed by their mapped values.
     */
    void invert(int64_t *k) const {
        uint16_t i;

        for(i=0; i<count; ++i) {
            k[i]=keys[reverse[i]];
        }
    }
};
class BitSet : public UObject, public UnicodeContainable {
public:
BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
if(U_FAILURE(errorCode)) {
return;
}
BMPBitHash *bitHash=new BMPBitHash;
if(bitHash==NULL || restSet==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return;
}
UnicodeSetIterator iter(set);
int64_t b;
UChar32 start, end;
int32_t prevIndex, i, j;
b=0; // Not necessary but makes compilers happy.
prevIndex=-1;
for(;;) {
if(iter.nextRange() && !iter.isString()) {
start=iter.getCodepoint();
end=iter.getCodepointEnd();
} else {
start=0x10000;
}
i=start>>6;
if(prevIndex!=i) {
// Finish the end of the previous range.
if(prevIndex<0) {
prevIndex=0;
} else {
index[prevIndex++]=bitHash->map(b);
}
// Fill all-zero entries between ranges.
if(prevIndex<i) {
uint16_t zero=bitHash->map(0);
do {
index[prevIndex++]=zero;
} while(prevIndex<i);
}
b=0;
}
if(start>0xffff) {
break;
}
b|=~((INT64_C(1)<<(start&0x3f))-1);
j=end>>6;
if(i<j) {
// Set bits for the start of the range.
index[i++]=bitHash->map(b);
// Fill all-one entries inside the range.
if(i<j) {
uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
do {
index[i++]=all;
} while(i<j);
}
b=INT64_C(0xffffffffffffffff);
}
/* i==j */
b&=(INT64_C(1)<<(end&0x3f))-1;
prevIndex=j;
}
if(bitHash->countKeys()>LENGTHOF(shortBits)) {
bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
}
if(bits!=NULL) {
bitHash->invert(bits);
} else {
bits=shortBits;
errorCode=U_MEMORY_ALLOCATION_ERROR;
return;
}
latin1Set[0]=(uint32_t)bits[0];
latin1Set[1]=(uint32_t)(bits[0]>>32);
latin1Set[2]=(uint32_t)bits[1];
latin1Set[3]=(uint32_t)(bits[1]>>32);
latin1Set[4]=(uint32_t)bits[2];
latin1Set[5]=(uint32_t)(bits[2]>>32);
latin1Set[6]=(uint32_t)bits[3];
latin1Set[7]=(uint32_t)(bits[3]>>32);
restSet.remove(0, 0xffff);
}
~BitSet() {
if(bits!=shortBits) {
uprv_free(bits);
}
delete restSet;
}
UBool contains(UChar32 c) const {
if((uint32_t)c<=0xff) {
return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
} else if((uint32_t)c<0xffff) {
return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
} else {
return restSet->contains(c);
}
}
private:
uint16_t index[0x400];
int64_t shortBits[32];
int64_t *bits;
uint32_t latin1Bits[8];
UnicodeSet *restSet;
};

View File

@ -0,0 +1,19 @@
rem Copyright (c) 2007, International Business Machines Corporation and
rem others. All Rights Reserved.
rem
rem Runs the unisetperf "Contains" test for every listed input file and for
rem every --type value in the inner loop, each time with the pattern
rem [:White_Space:], -e UTF-8, 3 passes and 10000 iterations.
rem NOTE: PERF is a hard-coded absolute path and the input files are read from
rem \temp\udhr on the current drive; adjust both before running elsewhere.
set PERF=c:\svn\icuproj\icu\ucnvutf8\source\test\perf\unisetperf\release\unisetperf
rem types: slow Bv Bv0 B0
rem --pattern [:White_Space:]
for %%f in (udhr_eng.txt
udhr_deu.txt
udhr_fra.txt
udhr_rus.txt
udhr_tha.txt
udhr_jpn.txt
udhr_cmn.txt
udhr_jpn.html) do (
for %%t in (slow Bv Bv0 B0) do (
%PERF% Contains --type %%t -f \temp\udhr\%%f --pattern [:White_Space:] -v -e UTF-8 --passes 3 --iterations 10000
)
)

View File

@ -0,0 +1,23 @@
#!/bin/sh
# Copyright (c) 2007, International Business Machines Corporation and
# others. All Rights Reserved.
#
# Runs the unisetperf "Contains" test for every listed input file from ~/udhr
# and for every set type in the inner loop, each time with -e UTF-8,
# 3 passes and 10000 iterations.
# PERF is a relative path, so the script has to be started from the directory
# that contains test/perf/unisetperf (presumably the ICU source/ build root).
# NOTE(review): no --pattern option is passed, so unisetperf falls back to its
# built-in default pattern; the Windows batch variant of this run passes
# --pattern [:White_Space:]. Confirm the difference is intended. If the option
# is added here, the pattern must be quoted to keep the shell from globbing it.
# Echo shell script commands.
# (-x echoes each command; -e stops the script at the first failing command.)
set -ex
PERF=test/perf/unisetperf/unisetperf
# slow Bv Bv0 B0
# --pattern [:White_Space:]
for file in udhr_eng.txt \
udhr_deu.txt \
udhr_fra.txt \
udhr_rus.txt \
udhr_tha.txt \
udhr_jpn.txt \
udhr_cmn.txt \
udhr_jpn.html; do
for type in slow Bv Bv0; do
$PERF Contains --type $type -f ~/udhr/$file -v -e UTF-8 --passes 3 --iterations 10000
done
done

View File

@ -0,0 +1,19 @@
rem Copyright (c) 2007, International Business Machines Corporation and
rem others. All Rights Reserved.
rem
rem Runs the unisetperf "SpanUTF16" test for every listed input file and for
rem every --type value in the inner loop, each time with the pattern
rem [:White_Space:], -e UTF-8, 3 passes and 10000 iterations.
rem NOTE: PERF is a hard-coded absolute path and the input files are read from
rem \temp\udhr on the current drive; adjust both before running elsewhere.
set PERF=c:\svn\icuproj\icu\ucnvutf8\source\test\perf\unisetperf\release\unisetperf
rem types: slow Bv Bv0 B0
rem --pattern [:White_Space:]
for %%f in (udhr_eng.txt
udhr_deu.txt
udhr_fra.txt
udhr_rus.txt
udhr_tha.txt
udhr_jpn.txt
udhr_cmn.txt
udhr_jpn.html) do (
for %%t in (slow Bv Bv0) do (
%PERF% SpanUTF16 --type %%t -f \temp\udhr\%%f --pattern [:White_Space:] -v -e UTF-8 --passes 3 --iterations 10000
)
)

View File

@ -0,0 +1,23 @@
#!/bin/sh
# Copyright (c) 2007, International Business Machines Corporation and
# others. All Rights Reserved.
#
# Runs the unisetperf "SpanUTF16" test for every listed input file from ~/udhr
# and for every set type in the inner loop, each time with -e UTF-8,
# 3 passes and 10000 iterations.
# PERF is a relative path, so the script has to be started from the directory
# that contains test/perf/unisetperf (presumably the ICU source/ build root).
# NOTE(review): no --pattern option is passed, so unisetperf falls back to its
# built-in default pattern; the Windows batch variant of this run passes
# --pattern [:White_Space:]. Confirm the difference is intended. If the option
# is added here, the pattern must be quoted to keep the shell from globbing it.
# Echo shell script commands.
# (-x echoes each command; -e stops the script at the first failing command.)
set -ex
PERF=test/perf/unisetperf/unisetperf
# slow Bv Bv0 B0
# --pattern [:White_Space:]
for file in udhr_eng.txt \
udhr_deu.txt \
udhr_fra.txt \
udhr_rus.txt \
udhr_tha.txt \
udhr_jpn.txt \
udhr_cmn.txt \
udhr_jpn.html; do
for type in slow Bv Bv0; do
$PERF SpanUTF16 --type $type -f ~/udhr/$file -v -e UTF-8 --passes 3 --iterations 10000
done
done

View File

@ -0,0 +1,19 @@
rem Copyright (c) 2007, International Business Machines Corporation and
rem others. All Rights Reserved.
rem
rem Runs the unisetperf "SpanUTF8" test for every listed input file and for
rem every --type value in the inner loop, each time with the pattern
rem [:White_Space:], -e UTF-8, 3 passes and 10000 iterations.
rem Only a subset of the types named in the "types:" remark below is run.
rem NOTE: PERF is a hard-coded absolute path and the input files are read from
rem \temp\udhr on the current drive; adjust both before running elsewhere.
set PERF=c:\svn\icuproj\icu\ucnvutf8\source\test\perf\unisetperf\release\unisetperf
rem types: slow Bh bh Bv Bv0 B0 BvF Bvp BvpF L Bvl BvL
rem --pattern [:White_Space:]
for %%f in (udhr_eng.txt
udhr_deu.txt
udhr_fra.txt
udhr_rus.txt
udhr_tha.txt
udhr_jpn.txt
udhr_cmn.txt
udhr_jpn.html) do (
for %%t in (slow BvF BvpF Bvl BvL) do (
%PERF% SpanUTF8 --type %%t -f \temp\udhr\%%f --pattern [:White_Space:] -v -e UTF-8 --passes 3 --iterations 10000
)
)

View File

@ -0,0 +1,23 @@
#!/bin/sh
# Copyright (c) 2007, International Business Machines Corporation and
# others. All Rights Reserved.
#
# Runs the unisetperf "SpanUTF8" test for every listed input file from ~/udhr
# and for every set type in the inner loop, each time with -e UTF-8,
# 3 passes and 10000 iterations.
# PERF is a relative path, so the script has to be started from the directory
# that contains test/perf/unisetperf (presumably the ICU source/ build root).
# NOTE(review): no --pattern option is passed, so unisetperf falls back to its
# built-in default pattern; the Windows batch variant of this run passes
# --pattern [:White_Space:]. Confirm the difference is intended. If the option
# is added here, the pattern must be quoted to keep the shell from globbing it.
# Echo shell script commands.
# (-x echoes each command; -e stops the script at the first failing command.)
set -ex
PERF=test/perf/unisetperf/unisetperf
# slow Bh bh Bv Bv0 B0 BvF Bvp BvpF L Bvl BvL
# --pattern [:White_Space:]
for file in udhr_eng.txt \
udhr_deu.txt \
udhr_fra.txt \
udhr_rus.txt \
udhr_tha.txt \
udhr_jpn.txt \
udhr_cmn.txt \
udhr_jpn.html; do
for type in slow BvF BvpF Bvl BvL; do
$PERF SpanUTF8 --type $type -f ~/udhr/$file -v -e UTF-8 --passes 3 --iterations 10000
done
done

View File

@ -0,0 +1,111 @@
/*
**********************************************************************
* Copyright (C) 2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: trieset.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2007jan15
* created by: Markus Scherer
*
* Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
* using a UTrie with 8-bit (byte) results per code point.
* Modifies the trie index to make the BMP linear, and uses the original set
* for supplementary code points.
*/
#include "unicode/utypes.h"
#include "unicont.h"
#define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
#define UTRIE_GET8_FROM_LEAD(trie, c16) \
((const uint8_t *)(trie)->data32)[ \
((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \
((c16)&UTRIE_MASK) \
]
class TrieSet : public UObject, public UnicodeContainable {
public:
TrieSet(const UnicodeSet &set, UErrorCode &errorCode)
: trieData(NULL), latin1(NULL), restSet(set.clone()) {
if(U_FAILURE(errorCode)) {
return;
}
if(restSet==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return;
}
UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE);
UChar32 start, end;
UnicodeSetIterator iter(set);
while(iter.nextRange() && !iter.isString()) {
start=iter.getCodepoint();
end=iter.getCodepointEnd();
if(start>0xffff) {
break;
}
if(end>0xffff) {
end=0xffff;
}
if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) {
errorCode=U_INTERNAL_PROGRAM_ERROR;
return;
}
}
// Preflight the trie length.
int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode);
if(errorCode!=U_BUFFER_OVERFLOW_ERROR) {
return;
}
trieData=(uint32_t *)uprv_malloc(length);
if(trieData==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return;
}
errorCode=U_ZERO_ERROR;
utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode);
utrie_unserialize(&trie, trieData, length, &errorCode); // TODO: Implement for 8-bit UTrie!
if(U_SUCCESS(errorCode)) {
// Copy the indexes for surrogate code points into the BMP range
// for simple access across the entire BMP.
uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT),
trie.index+UTRIE_BMP_INDEX_LENGTH,
(0x800>>UTRIE_SHIFT)*2);
latin1=UTRIE_GET8_LATIN1(&trie);
}
restSet.remove(0, 0xffff);
}
~TrieSet() {
uprv_free(trieData);
delete restSet;
}
UBool contains(UChar32 c) const {
if((uint32_t)c<=0xff) {
return (UBool)latin1[c];
} else if((uint32_t)c<0xffff) {
return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c);
} else {
return restSet->contains(c);
}
}
private:
uint32_t *trieData;
const uint8_t *latin1;
UTrie trie;
UnicodeSet *restSet;
};

View File

@ -0,0 +1,34 @@
/*
**********************************************************************
* Copyright (C) 2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: unicont.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2007jan15
* created by: Markus Scherer
*
* Idea for new common interface underneath the normal UnicodeSet
* and other classes, such as "compiled", fast, read-only (immutable)
* versions of UnicodeSet.
*/
/*
 * Interface for classes that can answer "is this code point contained?".
 * Only contains() is pure virtual. The span functions and getDynamicClassID()
 * are declared virtual without a body in this header, so a definition has to
 * be provided elsewhere before a program using this class can link.
 */
class UnicodeContainable {
public:
virtual ~UnicodeContainable() {}
// Returns whether c is contained.
virtual UBool contains(UChar32 c) const = 0;
// Span functions over UTF-16 text; presumably they return the length of the
// initial substring of contained (span) or not-contained (spanNot) characters
// -- TODO confirm, no implementation is visible here.
virtual int32_t span(const UChar *s, int32_t length);
virtual int32_t spanNot(const UChar *s, int32_t length);
// NOTE(review): the UTF-8 variants also take const UChar *; UTF-8 text is
// normally passed as const char * -- confirm the intended parameter type.
virtual int32_t spanUTF8(const UChar *s, int32_t length);
virtual int32_t spanNotUTF8(const UChar *s, int32_t length);
virtual UClassID getDynamicClassID(void) const;
};

View File

@ -0,0 +1,441 @@
/*
**********************************************************************
* Copyright (C) 2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: unisetperf.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2007jan31
* created by: Markus Scherer
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "unicode/uperf.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "uoptions.h"
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
// Command-line options specific to unisetperf.
// Options do not have abbreviations: Force readable command lines.
// (Using U+0001 for abbreviation characters.)
enum {
SET_PATTERN,
FAST_TYPE,
UNISETPERF_OPTIONS_COUNT
};
static UOption options[UNISETPERF_OPTIONS_COUNT]={
UOPTION_DEF("pattern", '\x01', UOPT_REQUIRES_ARG),
UOPTION_DEF("type", '\x01', UOPT_REQUIRES_ARG)
};
static const char *const unisetperf_usage =
"\t--pattern UnicodeSet pattern for instantiation.\n"
"\t Default: [:ID_Continue:]\n"
"\t--type Type of UnicodeSet: slow fast\n"
"\t Default: slow\n";
// Test object with setup data.
/*
 * Test fixture shared by all unisetperf tests.
 *
 * The constructor builds `set` from the --pattern option and freezes it when
 * --type is "fast"; `prefrozen` keeps an unfrozen copy. It then reads the
 * input text through UPerfTest, counts its code points and spans, and keeps
 * a UTF-8 copy of the text in utf8/utf8Length for the UTF-8 tests.
 * The object owns the utf8 buffer and releases it in its destructor.
 */
class UnicodeSetPerformanceTest : public UPerfTest {
public:
    UnicodeSetPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status)
            : UPerfTest(argc, argv, options, LENGTHOF(options), unisetperf_usage, status),
              utf8(NULL), utf8Length(0), countInputCodePoints(0), spanCount(0) {
        if (U_SUCCESS(status)) {
            UnicodeString pattern=UnicodeString(options[SET_PATTERN].value, -1, US_INV).unescape();
            set.applyPattern(pattern, status);
            prefrozen=set;
            if(0==strcmp(options[FAST_TYPE].value, "fast")) {
                set.freeze();
            }

            int32_t inputLength;
            UPerfTest::getBuffer(inputLength, status);
            if(U_SUCCESS(status) && inputLength>0) {
                countInputCodePoints = u_countChar32(buffer, bufferLen);

                countSpans();

                // Preflight the UTF-8 length and allocate utf8.
                u_strToUTF8(NULL, 0, &utf8Length, buffer, bufferLen, &status);
                if(status==U_BUFFER_OVERFLOW_ERROR) {
                    utf8=(char *)malloc(utf8Length);
                    if(utf8!=NULL) {
                        status=U_ZERO_ERROR;
                        u_strToUTF8(utf8, utf8Length, NULL, buffer, bufferLen, &status);
                    } else {
                        status=U_MEMORY_ALLOCATION_ERROR;
                    }
                }

                if(verbose) {
                    printf("code points:%ld  len16:%ld  len8:%ld  spans:%ld  "
                           "cp/span:%.3g  UChar/span:%.3g  B/span:%.3g  B/cp:%.3g\n",
                           (long)countInputCodePoints, (long)bufferLen, (long)utf8Length, (long)spanCount,
                           (double)countInputCodePoints/spanCount, (double)bufferLen/spanCount, (double)utf8Length/spanCount,
                           (double)utf8Length/countInputCodePoints);
                }
            }
        }
    }

    // Releases the UTF-8 copy of the input text (free(NULL) is a no-op).
    virtual ~UnicodeSetPerformanceTest() {
        free(utf8);
    }

    virtual UPerfFunction* runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL);

    // Count spans of characters that are in the set,
    // and spans of characters that are not in the set.
    // If the very first character is in the set, then one additional
    // not-span is counted.
    void countSpans() {
        const UChar *s=getBuffer();
        int32_t length=getBufferLen();
        int32_t i=0;
        UBool tf=FALSE;
        while(i<length) {
            i=span(s, length, i, tf);
            tf=(UBool)(!tf);
            ++spanCount;
        }
    }

    // Returns the index of the first code point at or after start whose
    // set membership differs from tf, or length if there is none.
    int32_t span(const UChar *s, int32_t length, int32_t start, UBool tf) const {
        UChar32 c;
        int32_t prev;
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(tf!=set.contains(c)) {
                break;
            }
        }
        return prev;
    }

    const UChar *getBuffer() const { return buffer; }
    int32_t getBufferLen() const { return bufferLen; }

    // UTF-8 copy of the input text; owned, allocated with malloc().
    char *utf8;
    int32_t utf8Length;

    // Number of code points in the input text.
    int32_t countInputCodePoints;
    // Number of spans found by countSpans().
    int32_t spanCount;

    UnicodeSet set;
    UnicodeSet prefrozen;

private:
    // Not implemented: a copy would free utf8 twice.
    UnicodeSetPerformanceTest(const UnicodeSetPerformanceTest &other);
    UnicodeSetPerformanceTest &operator=(const UnicodeSetPerformanceTest &other);
};
// Performance test function object.
class Command : public UPerfFunction {
protected:
Command(const UnicodeSetPerformanceTest &testcase) : testcase(testcase) {}
public:
virtual ~Command() {}
// virtual void call(UErrorCode* pErrorCode) { ... }
virtual long getOperationsPerIteration() {
// Number of code points tested:
// Input code points, plus one for the end of each span except the last span.
return testcase.countInputCodePoints+testcase.spanCount-1;
}
virtual long getEventsPerIteration() {
return testcase.spanCount;
}
const UnicodeSetPerformanceTest &testcase;
};
class Contains : public Command {
protected:
Contains(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
// Verify that the frozen set is equal to the unfrozen one.
UnicodeSet set;
UChar32 c;
for(c=0; c<=0x10ffff; ++c) {
if(testcase.set.contains(c)) {
set.add(c);
}
}
if(set!=testcase.set) {
fprintf(stderr, "error: frozen set != original!\n");
}
}
public:
static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
return new Contains(testcase);
}
virtual void call(UErrorCode* pErrorCode) {
const UnicodeSet &set=testcase.set;
const UChar *s=testcase.getBuffer();
int32_t length=testcase.getBufferLen();
int32_t count=0;
int32_t i=0;
UBool tf=FALSE;
while(i<length) {
i+=span(set, s+i, length-i, tf);
tf=(UBool)(!tf);
++count;
}
if(count!=testcase.spanCount) {
fprintf(stderr, "error: Contains() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
(long)count, (long)testcase.spanCount);
}
}
static int32_t span(const UnicodeSet &set, const UChar *s, int32_t length, UBool tf) {
UChar32 c;
int32_t start=0, prev;
while((prev=start)<length) {
U16_NEXT(s, start, length, c);
if(tf!=set.contains(c)) {
break;
}
}
return prev;
}
};
class SpanUTF16 : public Command {
protected:
SpanUTF16(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
// Verify that the frozen set is equal to the unfrozen one.
UnicodeSet set;
UChar utf16[2];
UChar32 c, c2;
for(c=0; c<=0xffff; ++c) {
utf16[0]=(UChar)c;
if(testcase.set.span(utf16, 1, USET_SPAN_CONTAINED)>0) {
set.add(c);
}
}
for(c=0xd800; c<=0xdbff; ++c) {
utf16[0]=(UChar)c;
for(c2=0xdc00; c2<=0xdfff; ++c2) {
utf16[1]=(UChar)c2;
if(testcase.set.span(utf16, 2, USET_SPAN_CONTAINED)>0) {
set.add(U16_GET_SUPPLEMENTARY(c, c2));
}
}
}
if(set!=testcase.set) {
fprintf(stderr, "error: frozen set != original!\n");
}
}
public:
static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
return new SpanUTF16(testcase);
}
virtual void call(UErrorCode* pErrorCode) {
const UnicodeSet &set=testcase.set;
const UChar *s=testcase.getBuffer();
int32_t length=testcase.getBufferLen();
int32_t count=0;
int32_t i=0;
UBool tf=FALSE;
while(i<length) {
i+=set.span(s+i, length-i, (USetSpanCondition)tf);
tf=(UBool)(!tf);
++count;
}
if(count!=testcase.spanCount) {
fprintf(stderr, "error: SpanUTF16() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
(long)count, (long)testcase.spanCount);
}
}
};
class SpanBackUTF16 : public Command {
protected:
SpanBackUTF16(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
// Verify that the frozen set is equal to the unfrozen one.
UnicodeSet set;
UChar utf16[2];
UChar32 c, c2;
for(c=0; c<=0xffff; ++c) {
utf16[0]=(UChar)c;
if(testcase.set.spanBack(utf16, 1, USET_SPAN_CONTAINED)==0) {
set.add(c);
}
}
for(c=0xd800; c<=0xdbff; ++c) {
utf16[0]=(UChar)c;
for(c2=0xdc00; c2<=0xdfff; ++c2) {
utf16[1]=(UChar)c2;
if(testcase.set.spanBack(utf16, 2, USET_SPAN_CONTAINED)==0) {
set.add(U16_GET_SUPPLEMENTARY(c, c2));
}
}
}
if(set!=testcase.set) {
fprintf(stderr, "error: frozen set != original!\n");
}
}
public:
static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
return new SpanBackUTF16(testcase);
}
virtual void call(UErrorCode* pErrorCode) {
const UnicodeSet &set=testcase.set;
const UChar *s=testcase.getBuffer();
int32_t length=testcase.getBufferLen();
int32_t count=0;
/*
* Get the same spans as with span() where we always start with a not-contained span.
* If testcase.spanCount is an odd number, then the last span() was not-contained.
* The last spanBack() must be not-contained to match the first span().
*/
UBool tf=(UBool)((testcase.spanCount&1)==0);
while(length>0 || !tf) {
length=set.spanBack(s, length, (USetSpanCondition)tf);
tf=(UBool)(!tf);
++count;
}
if(count!=testcase.spanCount) {
fprintf(stderr, "error: SpanBackUTF16() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
(long)count, (long)testcase.spanCount);
}
}
};
class SpanUTF8 : public Command {
protected:
SpanUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
// Verify that the frozen set is equal to the unfrozen one.
UnicodeSet set;
char utf8[4];
UChar32 c;
int32_t length;
for(c=0; c<=0x10ffff; ++c) {
if(c==0xd800) {
c=0xe000;
}
length=0;
U8_APPEND_UNSAFE(utf8, length, c);
if(testcase.set.spanUTF8(utf8, length, USET_SPAN_CONTAINED)>0) {
set.add(c);
}
}
if(set!=testcase.set) {
fprintf(stderr, "error: frozen set != original!\n");
}
}
public:
static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
return new SpanUTF8(testcase);
}
virtual void call(UErrorCode* pErrorCode) {
const UnicodeSet &set=testcase.set;
const char *s=testcase.utf8;
int32_t length=testcase.utf8Length;
int32_t count=0;
int32_t i=0;
UBool tf=FALSE;
while(i<length) {
i+=set.spanUTF8(s+i, length-i, (USetSpanCondition)tf);
tf=(UBool)(!tf);
++count;
}
if(count!=testcase.spanCount) {
fprintf(stderr, "error: SpanUTF8() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
(long)count, (long)testcase.spanCount);
}
}
};
class SpanBackUTF8 : public Command {
protected:
SpanBackUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
// Verify that the frozen set is equal to the unfrozen one.
UnicodeSet set;
char utf8[4];
UChar32 c;
int32_t length;
for(c=0; c<=0x10ffff; ++c) {
if(c==0xd800) {
c=0xe000;
}
length=0;
U8_APPEND_UNSAFE(utf8, length, c);
if(testcase.set.spanBackUTF8(utf8, length, USET_SPAN_CONTAINED)==0) {
set.add(c);
}
}
if(set!=testcase.set) {
fprintf(stderr, "error: frozen set != original!\n");
}
}
public:
static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
return new SpanBackUTF8(testcase);
}
virtual void call(UErrorCode* pErrorCode) {
const UnicodeSet &set=testcase.set;
const char *s=testcase.utf8;
int32_t length=testcase.utf8Length;
int32_t count=0;
/*
* Get the same spans as with span() where we always start with a not-contained span.
* If testcase.spanCount is an odd number, then the last span() was not-contained.
* The last spanBack() must be not-contained to match the first span().
*/
UBool tf=(UBool)((testcase.spanCount&1)==0);
while(length>0 || !tf) {
length=set.spanBackUTF8(s, length, (USetSpanCondition)tf);
tf=(UBool)(!tf);
++count;
}
if(count!=testcase.spanCount) {
fprintf(stderr, "error: SpanBackUTF8() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
(long)count, (long)testcase.spanCount);
}
}
};
// Test dispatcher: sets name to the name of test number index (or to "" when
// index is out of range) and, if exec is set, returns a new function object
// for that test. Returns NULL otherwise.
UPerfFunction* UnicodeSetPerformanceTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* par) {
    static const char *const testNames[]={
        "Contains",
        "SpanUTF16",
        "SpanBackUTF16",
        "SpanUTF8",
        "SpanBackUTF8"
    };

    if(index<0 || index>=LENGTHOF(testNames)) {
        name = "";
        return NULL;
    }
    name = testNames[index];
    if(!exec) {
        return NULL;
    }

    switch(index) {
    case 0:
        return Contains::get(*this);
    case 1:
        return SpanUTF16::get(*this);
    case 2:
        return SpanBackUTF16::get(*this);
    case 3:
        return SpanUTF8::get(*this);
    case 4:
        return SpanBackUTF8::get(*this);
    default:
        return NULL;
    }
}
int main(int argc, const char *argv[])
{
// Default values for command-line options.
options[SET_PATTERN].value = "[:ID_Continue:]";
options[FAST_TYPE].value = "slow";
UErrorCode status = U_ZERO_ERROR;
UnicodeSetPerformanceTest test(argc, argv, status);
if (U_FAILURE(status)){
printf("The error is %s\n", u_errorName(status));
test.usage();
return status;
}
if (test.run() == FALSE){
fprintf(stderr, "FAILED: Tests could not be run, please check the "
"arguments.\n");
return 1;
}
return 0;
}

View File

@ -0,0 +1,76 @@
#!/usr/bin/perl -w
# ********************************************************************
# * COPYRIGHT:
# * Copyright (c) 2005-2007, International Business Machines Corporation and
# * others. All Rights Reserved.
# ********************************************************************
# Driver script: hands two batches of unisetperf command lines to runTests()
# from the PerfFramework module, which is loaded from ../perldriver.
use strict;
use lib '../perldriver';
use PerfFramework;
# Options for the first batch (Contains and SpanUTF16).
# "headers" names one column per program listed for each test below.
my $options = {
"title"=>"UnicodeSet span()/contains() performance",
"headers"=>"Bv Bv0",
"operationIs"=>"tested Unicode code point",
"passes"=>"3",
"time"=>"2",
#"outputType"=>"HTML",
"dataDir"=>"/temp/udhr",
"outputDir"=>"../results"
};
# programs
# tests will be done for all the programs. Results will be stored and connected
# Common command prefix, then one prefix per unisetperf test name.
my $p = "Release/unisetperf.exe -e UTF-8";
my $pc = "$p Contains";
my $p16 = "$p SpanUTF16";
my $p8 = "$p SpanUTF8";
# Test name => list of command lines, one per --type value to compare.
my $tests = {
"Contains", ["$pc --type Bv",
"$pc --type Bv0"
],
"SpanUTF16", ["$p16 --type Bv",
"$p16 --type Bv0"
]
};
# Input files, listed under a single empty-string key; both batches use them.
my $dataFiles = {
"",
[
"udhr_eng.txt",
"udhr_deu.txt",
"udhr_fra.txt",
"udhr_rus.txt",
"udhr_tha.txt",
"udhr_jpn.txt",
"udhr_cmn.txt",
"udhr_jpn.html"
]
};
runTests($options, $tests, $dataFiles);
# Second batch (SpanUTF8): same title and directories, six --type values.
$options = {
"title"=>"UnicodeSet span()/contains() performance",
"headers"=>"Bv BvF Bvp BvpF L Bvl",
"operationIs"=>"tested Unicode code point",
"passes"=>"3",
"time"=>"2",
#"outputType"=>"HTML",
"dataDir"=>"/temp/udhr",
"outputDir"=>"../results"
};
$tests = {
"SpanUTF8", ["$p8 --type Bv",
"$p8 --type BvF",
"$p8 --type Bvp",
"$p8 --type BvpF",
"$p8 --type L",
"$p8 --type Bvl"
]
};
runTests($options, $tests, $dataFiles);

View File

@ -0,0 +1,209 @@
<?xml version="1.0" encoding="Windows-1252"?>
<VisualStudioProject
ProjectType="Visual C++"
Version="8.00"
Name="unisetperf"
ProjectGUID="{E7728E98-0469-AF37-43F4-4529A3D52C6B}"
>
<Platforms>
<Platform
Name="Win32"
/>
</Platforms>
<ToolFiles>
</ToolFiles>
<Configurations>
<Configuration
Name="Debug|Win32"
OutputDirectory=".\Debug"
IntermediateDirectory=".\Debug"
ConfigurationType="1"
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
UseOfMFC="0"
ATLMinimizesCRunTimeLibraryUsage="false"
CharacterSet="2"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
TypeLibraryName=".\Debug/unisetperf.tlb"
HeaderFileName=""
/>
<Tool
Name="VCCLCompilerTool"
Optimization="0"
AdditionalIncludeDirectories="..\..\..\..\include;..\..\..\tools\toolutil;..\..\..\common;..\..\..\tools\ctestfw"
PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
BasicRuntimeChecks="3"
RuntimeLibrary="3"
PrecompiledHeaderFile=".\Debug/unisetperf.pch"
AssemblerListingLocation=".\Debug/"
ObjectFile=".\Debug/"
ProgramDataBaseFileName=".\Debug/"
WarningLevel="3"
SuppressStartupBanner="true"
DebugInformationFormat="4"
CompileAs="0"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
PreprocessorDefinitions="_DEBUG"
Culture="1033"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLinkerTool"
AdditionalDependencies="icuucd.lib icutud.lib winmm.lib icutestd.lib"
OutputFile=".\Debug/unisetperf.exe"
LinkIncremental="1"
SuppressStartupBanner="true"
AdditionalLibraryDirectories="..\..\..\..\lib\"
GenerateDebugInformation="true"
ProgramDatabaseFile=".\Debug/unisetperf.pdb"
SubSystem="1"
TargetMachine="1"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCManifestTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCAppVerifierTool"
/>
<Tool
Name="VCWebDeploymentTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
<Configuration
Name="Release|Win32"
OutputDirectory=".\Release"
IntermediateDirectory=".\Release"
ConfigurationType="1"
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
UseOfMFC="0"
ATLMinimizesCRunTimeLibraryUsage="false"
CharacterSet="2"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
TypeLibraryName=".\Release/unisetperf.tlb"
HeaderFileName=""
/>
<Tool
Name="VCCLCompilerTool"
Optimization="2"
InlineFunctionExpansion="1"
AdditionalIncludeDirectories="..\..\..\..\include;..\..\..\tools\toolutil;..\..\..\common;..\..\..\tools\ctestfw"
PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
StringPooling="true"
RuntimeLibrary="2"
EnableFunctionLevelLinking="true"
PrecompiledHeaderFile=".\Release/unisetperf.pch"
AssemblerListingLocation=".\Release/"
ObjectFile=".\Release/"
ProgramDataBaseFileName=".\Release/"
WarningLevel="3"
SuppressStartupBanner="true"
CompileAs="0"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
PreprocessorDefinitions="NDEBUG"
Culture="1033"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLinkerTool"
AdditionalDependencies="icuuc.lib icutu.lib icutest.lib winmm.lib"
OutputFile=".\Release/unisetperf.exe"
LinkIncremental="1"
SuppressStartupBanner="true"
AdditionalLibraryDirectories="..\..\..\..\lib\"
ProgramDatabaseFile=".\Release/unisetperf.pdb"
SubSystem="1"
TargetMachine="1"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCManifestTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCAppVerifierTool"
/>
<Tool
Name="VCWebDeploymentTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
</Configurations>
<References>
</References>
<Files>
<File
RelativePath=".\unisetperf.cpp"
>
</File>
</Files>
<Globals>
</Globals>
</VisualStudioProject>

View File

@ -341,6 +341,7 @@ UBool UPerfTest::runTestLoop( char* testname, char* par )
int32_t loops = 0;
double t=0;
int32_t n = 1;
long ops;
do {
this->runIndexedTest( index, FALSE, name );
if (!name || (name[0] == 0))
@ -358,7 +359,8 @@ UBool UPerfTest::runTestLoop( char* testname, char* par )
fprintf(stderr,"%s function returned NULL", name);
return FALSE;
}
if (testFunction->getOperationsPerIteration() < 1) {
ops = testFunction->getOperationsPerIteration();
if (ops < 1) {
fprintf(stderr, "%s returned an illegal operations/iteration()\n", name);
return FALSE;
}
@ -396,8 +398,10 @@ UBool UPerfTest::runTestLoop( char* testname, char* par )
loops = iterations;
}
double min_t=1000000.0, sum_t=0.0;
long events = -1;
for(int32_t ps =0; ps < passes; ps++){
long events = -1;
fprintf(stdout,"= %s begin " ,name);
if(verbose==TRUE){
if(iterations > 0) {
@ -413,36 +417,40 @@ UBool UPerfTest::runTestLoop( char* testname, char* par )
printf("Performance test failed with error: %s \n", u_errorName(status));
break;
}
sum_t+=t;
if(t<min_t) {
min_t=t;
}
events = testFunction->getEventsPerIteration();
//print info only in verbose mode
if(verbose==TRUE){
/*
if(events == -1){
fprintf(stdout,"= %s end %f %i %i\n",name , t , loops, testFunction->getOperationsPerIteration());
fprintf(stdout, "= %s end: %f loops: %i operations: %li \n", name, t, (int)loops, ops);
}else{
fprintf(stdout,"= %s end %f %i %i %i\n",name , t , loops, testFunction->getOperationsPerIteration(), events);
}
*/
if(events == -1){
fprintf(stdout, "= %s end: %f loops: %i operations: %li \n", name, t, (int)loops, testFunction->getOperationsPerIteration());
}else{
fprintf(stdout, "= %s end: %f loops: %i operations: %li events: %li\n", name, t, (int)loops, testFunction->getOperationsPerIteration(), events);
fprintf(stdout, "= %s end: %f loops: %i operations: %li events: %li\n", name, t, (int)loops, ops, events);
}
}else{
/*
if(events == -1){
fprintf(stdout,"= %f %i %i \n", t , loops, testFunction->getOperationsPerIteration());
fprintf(stdout,"= %s end %f %i %li\n", name, t, (int)loops, ops);
}else{
fprintf(stdout,"= %f %i %i %i\n", t , loops, testFunction->getOperationsPerIteration(), events);
}
*/
if(events == -1){
fprintf(stdout,"= %s end %f %i %li\n", name, t, (int)loops, testFunction->getOperationsPerIteration());
}else{
fprintf(stdout,"= %s end %f %i %li %li\n", name, t, (int)loops, testFunction->getOperationsPerIteration(), events);
fprintf(stdout,"= %s end %f %i %li %li\n", name, t, (int)loops, ops, events);
}
}
}
if(verbose && U_SUCCESS(status)) {
double avg_t = sum_t/passes;
if(events == -1) {
fprintf(stdout, "%%= %s avg: %.4g loops: %i avg/op: %.4g ns\n",
name, avg_t, (int)loops, (avg_t*1E9)/(loops*ops));
fprintf(stdout, "_= %s min: %.4g loops: %i min/op: %.4g ns\n",
name, min_t, (int)loops, (min_t*1E9)/(loops*ops));
} else {
fprintf(stdout, "%%= %s avg: %.4g loops: %i avg/op: %.4g ns avg/event: %.4g ns\n",
name, avg_t, (int)loops, (avg_t*1E9)/(loops*ops), (avg_t*1E9)/(loops*events));
fprintf(stdout, "_= %s min: %.4g loops: %i min/op: %.4g ns min/event: %.4g ns\n",
name, min_t, (int)loops, (min_t*1E9)/(loops*ops), (min_t*1E9)/(loops*events));
}
}
delete testFunction;
}
index++;