ICU-1272 initial implementation of perl-ish character property syntax for UnicodeSet
X-SVN-Rev: 6281
This commit is contained in:
parent
a4a66fdc7f
commit
68744138aa
@ -71,7 +71,7 @@ cpdtrans.o hextouni.o rbt.o rbt_data.o rbt_pars.o rbt_rule.o rbt_set.o \
|
||||
dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o nultrans.o \
|
||||
remtrans.o titletrn.o tolowtrn.o toupptrn.o xformtrn.o \
|
||||
name2uni.o uni2name.o unitohex.o nortrans.o unifilt.o quant.o transreg.o \
|
||||
llong.o nfrs.o nfrule.o nfsubs.o rbnf.o
|
||||
llong.o nfrs.o nfrule.o nfsubs.o rbnf.o upropset.o
|
||||
|
||||
|
||||
|
||||
|
@ -370,6 +370,10 @@ SOURCE=.\unum.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\upropset.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\usearch.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -46,9 +46,15 @@ UMatchDegree Quantifier::matches(const Replaceable& text,
|
||||
int32_t start = offset;
|
||||
uint32_t count = 0;
|
||||
while (count < maxCount) {
|
||||
int32_t pos = offset;
|
||||
UMatchDegree m = matcher->matches(text, offset, limit, incremental);
|
||||
if (m == U_MATCH) {
|
||||
++count;
|
||||
if (pos == offset) {
|
||||
// If offset has not moved we have a zero-width match.
|
||||
// Don't keep matching it infinitely.
|
||||
break;
|
||||
}
|
||||
} else if (incremental && m == U_PARTIAL_MATCH) {
|
||||
return U_PARTIAL_MATCH;
|
||||
} else {
|
||||
|
@ -41,8 +41,6 @@
|
||||
#define SEGMENT_CLOSE ((UChar)0x0029) /*)*/
|
||||
#define CONTEXT_ANTE ((UChar)0x007B) /*{*/
|
||||
#define CONTEXT_POST ((UChar)0x007D) /*}*/
|
||||
#define SET_OPEN ((UChar)0x005B) /*[*/
|
||||
#define SET_CLOSE ((UChar)0x005D) /*]*/
|
||||
#define CURSOR_POS ((UChar)0x007C) /*|*/
|
||||
#define CURSOR_OFFSET ((UChar)0x0040) /*@*/
|
||||
#define ANCHOR_START ((UChar)0x005E) /*^*/
|
||||
@ -50,6 +48,13 @@
|
||||
#define ONE_OR_MORE ((UChar)0x002B) /*+*/
|
||||
#define ZERO_OR_ONE ((UChar)0x003F) /*?*/
|
||||
|
||||
#define DOT ((UChar)46) /*.*/
|
||||
|
||||
// UnicodeSet pattern for the set that '.' stands for in rule patterns:
//     [^[:Zp:][:Zl:]\r\n$]
// The pattern is spelled out as individual code units; "\r" and "\n" are
// each stored as a backslash followed by a letter.
static const UChar DOT_SET[] = {
    0x5B, 0x5E,                         /* [^     */
    0x5B, 0x3A, 0x5A, 0x70, 0x3A, 0x5D, /* [:Zp:] */
    0x5B, 0x3A, 0x5A, 0x6C, 0x3A, 0x5D, /* [:Zl:] */
    0x5C, 0x72,                         /* \r     */
    0x5C, 0x6E,                         /* \n     */
    0x24,                               /* $      */
    0x5D,                               /* ]      */
    0x00                                /* NUL terminator */
};
|
||||
|
||||
// By definition, the ANCHOR_END special character is a
|
||||
// trailing SymbolTable.SYMBOL_REF character.
|
||||
// private static final char ANCHOR_END = '$';
|
||||
@ -514,6 +519,15 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
||||
// Text after a presumed end anchor is a syntax err
|
||||
return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start);
|
||||
}
|
||||
if (UnicodeSet::resemblesPattern(rule, pos-1)) {
|
||||
pp.setIndex(pos-1); // Backup to opening '['
|
||||
buf.append(parser.parseSet(rule, pp));
|
||||
if (U_FAILURE(parser.status)) {
|
||||
return syntaxError(U_MALFORMED_SET, rule, start);
|
||||
}
|
||||
pos = pp.getIndex();
|
||||
continue;
|
||||
}
|
||||
// Handle escapes
|
||||
if (c == ESCAPE) {
|
||||
if (pos == limit) {
|
||||
@ -653,14 +667,6 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
||||
}
|
||||
post = buf.length();
|
||||
break;
|
||||
case SET_OPEN:
|
||||
pp.setIndex(pos-1); // Backup to opening '['
|
||||
buf.append(parser.parseSet(rule, pp));
|
||||
if (U_FAILURE(parser.status)) {
|
||||
return syntaxError(U_MALFORMED_SET, rule, start);
|
||||
}
|
||||
pos = pp.getIndex();
|
||||
break;
|
||||
case CURSOR_POS:
|
||||
if (cursor >= 0) {
|
||||
return syntaxError(U_MULTIPLE_CURSORS, rule, start);
|
||||
@ -689,6 +695,9 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
||||
}
|
||||
}
|
||||
break;
|
||||
case DOT:
|
||||
buf.append(parser.getDotStandIn());
|
||||
break;
|
||||
case KLEENE_STAR:
|
||||
case ONE_OR_MORE:
|
||||
case ZERO_OR_ONE:
|
||||
@ -749,7 +758,6 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
||||
buf.append(parser.generateStandInFor(m));
|
||||
}
|
||||
break;
|
||||
// case SET_CLOSE:
|
||||
default:
|
||||
// Disallow unquoted characters other than [0-9A-Za-z]
|
||||
// in the printable ASCII range. These characters are
|
||||
@ -892,6 +900,7 @@ void TransliteratorParser::parseRules(const UnicodeString& rules,
|
||||
}
|
||||
parseData->data = data;
|
||||
determineVariableRange(rules);
|
||||
dotStandIn = (UChar) -1;
|
||||
|
||||
UnicodeString str; // scratch
|
||||
idBlock.truncate(0);
|
||||
@ -1257,6 +1266,17 @@ UChar TransliteratorParser::generateStandInFor(UnicodeMatcher* adopted) {
|
||||
return variableNext++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the stand-in for the dot set. It is allocated the first
|
||||
* time and reused thereafter.
|
||||
*/
|
||||
UChar TransliteratorParser::getDotStandIn() {
|
||||
if (dotStandIn == (UChar) -1) {
|
||||
dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET, status));
|
||||
}
|
||||
return dotStandIn;
|
||||
}
|
||||
|
||||
/**
|
||||
* Append the value of the given variable name to the given
|
||||
* UnicodeString.
|
||||
|
@ -108,6 +108,13 @@ class TransliteratorParser {
|
||||
*/
|
||||
UnicodeString undefinedVariableName;
|
||||
|
||||
/**
|
||||
* The stand-in character for the 'dot' set, represented by '.' in
|
||||
* patterns. This is allocated the first time it is needed, and
|
||||
* reused thereafter.
|
||||
*/
|
||||
UChar dotStandIn;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
@ -190,6 +197,12 @@ private:
|
||||
*/
|
||||
UChar generateStandInFor(UnicodeMatcher* adopted);
|
||||
|
||||
/**
|
||||
* Return the stand-in for the dot set. It is allocated the first
|
||||
* time and reused thereafter.
|
||||
*/
|
||||
UChar getDotStandIn();
|
||||
|
||||
/**
|
||||
* Append the value of the given variable name to the given
|
||||
* UnicodeString.
|
||||
|
@ -17,6 +17,7 @@
|
||||
#include "rbt_rule.h"
|
||||
#include "umutex.h"
|
||||
#include "ucln_in.h"
|
||||
#include "upropset.h"
|
||||
|
||||
// HIGH_VALUE > all valid values. 110000 for codepoints
|
||||
#define UNICODESET_HIGH 0x0110000
|
||||
@ -42,49 +43,40 @@
|
||||
#define UPPER_U ((UChar)0x0055) /*U*/
|
||||
#define LOWER_U ((UChar)0x0075) /*u*/
|
||||
|
||||
// N.B.: This mapping is different in ICU and Java
|
||||
//const UnicodeString UnicodeSet::CATEGORY_NAMES(
|
||||
// "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf", "");
|
||||
static const UChar CATEGORY_NAMES[] = {
|
||||
0x43, 0x6E, /* "Cn" */
|
||||
0x4C, 0x75, /* "Lu" */
|
||||
0x4C, 0x6C, /* "Ll" */
|
||||
0x4C, 0x74, /* "Lt" */
|
||||
0x4C, 0x6D, /* "Lm" */
|
||||
0x4C, 0x6F, /* "Lo" */
|
||||
0x4D, 0x6E, /* "Mn" */
|
||||
0x4D, 0x65, /* "Me" */
|
||||
0x4D, 0x63, /* "Mc" */
|
||||
0x4E, 0x64, /* "Nd" */
|
||||
0x4E, 0x6C, /* "Nl" */
|
||||
0x4E, 0x6F, /* "No" */
|
||||
0x5A, 0x73, /* "Zs" */
|
||||
0x5A, 0x6C, /* "Zl" */
|
||||
0x5A, 0x70, /* "Zp" */
|
||||
0x43, 0x63, /* "Cc" */
|
||||
0x43, 0x66, /* "Cf" */
|
||||
0x43, 0x6F, /* "Co" */
|
||||
0x43, 0x73, /* "Cs" */
|
||||
0x50, 0x64, /* "Pd" */
|
||||
0x50, 0x73, /* "Ps" */
|
||||
0x50, 0x65, /* "Pe" */
|
||||
0x50, 0x63, /* "Pc" */
|
||||
0x50, 0x6F, /* "Po" */
|
||||
0x53, 0x6D, /* "Sm" */
|
||||
0x53, 0x63, /* "Sc" */
|
||||
0x53, 0x6B, /* "Sk" */
|
||||
0x53, 0x6F, /* "So" */
|
||||
0x50, 0x69, /* "Pi" */
|
||||
0x50, 0x66, /* "Pf" */
|
||||
0x00
|
||||
};
|
||||
|
||||
/**
|
||||
* A cache mapping character category integers, as returned by
|
||||
* Unicode::getType(), to pairs strings. Entries are initially
|
||||
* zero length and are filled in on demand.
|
||||
*/
|
||||
static UnicodeSet* CATEGORY_CACHE = NULL;
|
||||
//// TEMPORARY: Remove when deprecated category code constructor is removed.
|
||||
//static const UChar CATEGORY_NAMES[] = {
|
||||
// 0x43, 0x6E, /* "Cn" */
|
||||
// 0x4C, 0x75, /* "Lu" */
|
||||
// 0x4C, 0x6C, /* "Ll" */
|
||||
// 0x4C, 0x74, /* "Lt" */
|
||||
// 0x4C, 0x6D, /* "Lm" */
|
||||
// 0x4C, 0x6F, /* "Lo" */
|
||||
// 0x4D, 0x6E, /* "Mn" */
|
||||
// 0x4D, 0x65, /* "Me" */
|
||||
// 0x4D, 0x63, /* "Mc" */
|
||||
// 0x4E, 0x64, /* "Nd" */
|
||||
// 0x4E, 0x6C, /* "Nl" */
|
||||
// 0x4E, 0x6F, /* "No" */
|
||||
// 0x5A, 0x73, /* "Zs" */
|
||||
// 0x5A, 0x6C, /* "Zl" */
|
||||
// 0x5A, 0x70, /* "Zp" */
|
||||
// 0x43, 0x63, /* "Cc" */
|
||||
// 0x43, 0x66, /* "Cf" */
|
||||
// 0x43, 0x6F, /* "Co" */
|
||||
// 0x43, 0x73, /* "Cs" */
|
||||
// 0x50, 0x64, /* "Pd" */
|
||||
// 0x50, 0x73, /* "Ps" */
|
||||
// 0x50, 0x65, /* "Pe" */
|
||||
// 0x50, 0x63, /* "Pc" */
|
||||
// 0x50, 0x6F, /* "Po" */
|
||||
// 0x53, 0x6D, /* "Sm" */
|
||||
// 0x53, 0x63, /* "Sc" */
|
||||
// 0x53, 0x6B, /* "Sk" */
|
||||
// 0x53, 0x6F, /* "So" */
|
||||
// 0x50, 0x69, /* "Pi" */
|
||||
// 0x50, 0x66, /* "Pf" */
|
||||
// 0x00
|
||||
//};
|
||||
|
||||
/**
|
||||
* Delimiter string used in patterns to close a category reference:
|
||||
@ -92,16 +84,12 @@ static UnicodeSet* CATEGORY_CACHE = NULL;
|
||||
*/
|
||||
static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
|
||||
|
||||
|
||||
/**
|
||||
* Cleanup function for the UnicodeSet component; delegates to
|
||||
* UnicodePropertySet::cleanup().
|
||||
*/
|
||||
U_CFUNC UBool unicodeset_cleanup(void) {
|
||||
if (CATEGORY_CACHE) {
|
||||
delete []CATEGORY_CACHE;
|
||||
CATEGORY_CACHE = NULL;
|
||||
}
|
||||
UnicodePropertySet::cleanup();
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
@ -174,24 +162,24 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
|
||||
applyPattern(pattern, pos, &symbols, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a set from the given Unicode character category.
|
||||
* @param category an integer indicating the character category as
|
||||
* returned by <code>Unicode::getType()</code>.
|
||||
*/
|
||||
UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) :
|
||||
len(0), capacity(START_EXTRA), bufferCapacity(0), list(0),
|
||||
buffer(0)
|
||||
{
|
||||
if (U_SUCCESS(status)) {
|
||||
if (category < 0 || category >= Unicode::GENERAL_TYPES_COUNT) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
} else {
|
||||
list = new UChar32[capacity];
|
||||
*this = getCategorySet(category);
|
||||
}
|
||||
}
|
||||
}
|
||||
///**
|
||||
// * Constructs a set from the given Unicode character category.
|
||||
// * @param category an integer indicating the character category as
|
||||
// * returned by <code>Unicode::getType()</code>.
|
||||
// */
|
||||
//UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) :
|
||||
// len(0), capacity(START_EXTRA), bufferCapacity(0), list(0),
|
||||
// buffer(0)
|
||||
//{
|
||||
// if (U_SUCCESS(status)) {
|
||||
// if (category < 0 || category >= Unicode::GENERAL_TYPES_COUNT) {
|
||||
// status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
// } else {
|
||||
// list = new UChar32[capacity];
|
||||
// *this = getCategorySet(category);
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
|
||||
/**
|
||||
* Constructs a set that is identical to the given UnicodeSet.
|
||||
@ -319,6 +307,16 @@ void UnicodeSet::applyPattern(const UnicodeString& pattern,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the given position, in the given pattern, appears
|
||||
* to be the start of a UnicodeSet pattern.
|
||||
*/
|
||||
UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
|
||||
return ((pos+1) < pattern.length() &&
|
||||
pattern.charAt(pos) == (UChar)91/*[*/) ||
|
||||
UnicodePropertySet::resemblesPattern(pattern, pos);
|
||||
}
|
||||
|
||||
/**
|
||||
* Append the <code>toPattern()</code> representation of a
|
||||
* character to the given <code>StringBuffer</code>.
|
||||
@ -339,6 +337,8 @@ void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool useHexEscape)
|
||||
case COMPLEMENT:
|
||||
case INTERSECTION:
|
||||
case BACKSLASH:
|
||||
case 123/*{*/:
|
||||
case 125/*}*/:
|
||||
buf.append(BACKSLASH);
|
||||
break;
|
||||
default:
|
||||
@ -451,15 +451,15 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
|
||||
UBool escapeUnprintable) const {
|
||||
result.append(SET_OPEN);
|
||||
|
||||
// Check against the predefined categories. We implicitly build
|
||||
// up ALL category sets the first time toPattern() is called.
|
||||
for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
|
||||
if (*this == getCategorySet(cat)) {
|
||||
result.append(COLON);
|
||||
result.append(CATEGORY_NAMES, cat*2, 2);
|
||||
return result.append(CATEGORY_CLOSE);
|
||||
}
|
||||
}
|
||||
// // Check against the predefined categories. We implicitly build
|
||||
// // up ALL category sets the first time toPattern() is called.
|
||||
// for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
|
||||
// if (*this == getCategorySet(cat)) {
|
||||
// result.append(COLON);
|
||||
// result.append(CATEGORY_NAMES, cat*2, 2);
|
||||
// return result.append(CATEGORY_CLOSE);
|
||||
// }
|
||||
// }
|
||||
|
||||
int32_t count = getRangeCount();
|
||||
|
||||
@ -940,9 +940,9 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
||||
// mode 1: '[' seen; if next is '^' or ':' then special
|
||||
// mode 2: '[' '^'? seen; parse pattern and close with ']'
|
||||
// mode 3: '[:' seen; parse category and close with ':]'
|
||||
// mode 4: Pattern closed cleanly
|
||||
// mode 4: ']' seen; parse complete
|
||||
// mode 5: Top-level property pattern seen
|
||||
int8_t mode = 0;
|
||||
int32_t colonPos = 0; // Expected pos of ':' in '[:'
|
||||
int32_t i = pos.getIndex();
|
||||
int32_t limit = pattern.length();
|
||||
UnicodeSet nestedAux;
|
||||
@ -997,9 +997,11 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
||||
// Parse the opening '[' and optional following '^'
|
||||
switch (mode) {
|
||||
case 0:
|
||||
if (c == SET_OPEN) {
|
||||
if (UnicodePropertySet::resemblesPattern(pattern, i-1)) {
|
||||
mode = 3;
|
||||
break; // Fall through
|
||||
} else if (c == SET_OPEN) {
|
||||
mode = 1; // Next look for '^' or ':'
|
||||
colonPos = i; // Expect ':' at next offset
|
||||
continue;
|
||||
} else {
|
||||
// throw new IllegalArgumentException("Missing opening '['");
|
||||
@ -1013,18 +1015,6 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
||||
invert = TRUE;
|
||||
newPat.append(c);
|
||||
continue; // Back to top to fetch next character
|
||||
case COLON:
|
||||
// '[:' cannot have whitespace in it. 'i' has already
|
||||
// been advanced.
|
||||
if (i-1 == colonPos) {
|
||||
--i; // Backup to the '['
|
||||
c = SET_OPEN;
|
||||
mode = 3;
|
||||
// Fall through and parse category using the same
|
||||
// code used to parse a nested category. The mode
|
||||
// will indicate that this is actually top level.
|
||||
}
|
||||
break; // Fall through
|
||||
case HYPHEN:
|
||||
isLiteral = TRUE; // Treat leading '-' as a literal
|
||||
break; // Fall through
|
||||
@ -1041,12 +1031,59 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
||||
// buffer. Characters in the variable buffer have already
|
||||
// been through escape and variable reference processing.
|
||||
if (varValueBuffer == NULL) {
|
||||
/**
|
||||
* Handle property set patterns.
|
||||
*/
|
||||
if (UnicodePropertySet::resemblesPattern(pattern, i-1)) {
|
||||
ParsePosition pp(i-1);
|
||||
nestedSet = UnicodePropertySet::createFromPattern(pattern, pp);
|
||||
if (nestedSet == NULL) {
|
||||
// assert(pp.getIndex() == i-1);
|
||||
//throw new IllegalArgumentException("Invalid property pattern " +
|
||||
// pattern.substring(i-1));
|
||||
status = U_INVALID_PROPERTY_PATTERN;
|
||||
return;
|
||||
}
|
||||
// TODO This is very inefficient. We create a new UnicodeSet,
|
||||
// then do an assignment, then delete it. Clean this up in
|
||||
// the future so that either (1) we just use the new set
|
||||
// directly, and delete it when we're done, or (2) even better,
|
||||
// UnicodePropertySet takes an existing set.
|
||||
nestedAux = *nestedSet;
|
||||
delete nestedSet;
|
||||
nestedSet = &nestedAux;
|
||||
nestedPatStart = newPat.length();
|
||||
nestedPatDone = TRUE; // we're going to do it just below
|
||||
|
||||
// If we have a top-level property pattern, then trim
|
||||
// off the opening '[' and use the property pattern
|
||||
// as the entire pattern.
|
||||
if (mode == 3) {
|
||||
newPat.truncate(0);
|
||||
}
|
||||
UnicodeString str;
|
||||
pattern.extractBetween(i-1, pp.getIndex(), str);
|
||||
newPat.append(str);
|
||||
rebuildPattern = TRUE;
|
||||
|
||||
i = pp.getIndex(); // advance past property pattern
|
||||
|
||||
if (mode == 3) {
|
||||
// Entire pattern is a category; leave parse
|
||||
// loop. This is one of 2 ways we leave this
|
||||
// loop if the pattern is well-formed.
|
||||
*this = nestedAux;
|
||||
mode = 5;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Handle escapes. If a character is escaped, then it assumes its
|
||||
* literal value. This is true for all characters, both special
|
||||
* characters and characters with no special meaning. We also
|
||||
* interpret '\\uxxxx' Unicode escapes here (as literals).
|
||||
*/
|
||||
if (c == BACKSLASH) {
|
||||
else if (c == BACKSLASH) {
|
||||
UChar32 escaped = pattern.unescapeAt(i);
|
||||
if (escaped == (UChar32) -1) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
@ -1084,73 +1121,28 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
||||
}
|
||||
|
||||
/* An opening bracket indicates the first bracket of a nested
|
||||
* subpattern, either a normal pattern or a category pattern. We
|
||||
* recognize these here and set nestedSet accordingly.
|
||||
*
|
||||
* The other way we wind up here is with a top level category.
|
||||
* If that is the case, the mode will be set accordingly.
|
||||
* subpattern.
|
||||
*/
|
||||
else if (!isLiteral && c == SET_OPEN) {
|
||||
// Record position before nested pattern
|
||||
nestedPatStart = newPat.length();
|
||||
|
||||
// Handle "[:...:]", representing a character category
|
||||
if (i < pattern.length() && pattern.charAt(i) == COLON) {
|
||||
++i;
|
||||
int32_t j = pattern.indexOf(CATEGORY_CLOSE, i);
|
||||
if (j < 0) {
|
||||
// throw new IllegalArgumentException("Missing \":]\"");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
scratch.truncate(0);
|
||||
pattern.extractBetween(i, j, scratch);
|
||||
nestedAux.applyCategory(scratch, status);
|
||||
nestedSet = &nestedAux;
|
||||
nestedPatDone = TRUE; // We're going to do it just below
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
i = j+2; // Advance i past ":]"
|
||||
|
||||
// Use a rebuilt pattern. If we are top level,
|
||||
// then there is already a SET_OPEN in newPat, and
|
||||
// SET_CLOSE will be appended elsewhere.
|
||||
if (mode != 3) {
|
||||
newPat.append(SET_OPEN);
|
||||
}
|
||||
newPat.append(COLON).append(scratch).append(COLON);
|
||||
if (mode != 3) {
|
||||
newPat.append(SET_CLOSE);
|
||||
}
|
||||
rebuildPattern = TRUE;
|
||||
|
||||
if (mode == 3) {
|
||||
// Entire pattern is a category; leave parse
|
||||
// loop. This is one of 2 ways we leave this
|
||||
// loop if the pattern is well-formed.
|
||||
*this = *nestedSet;
|
||||
mode = 4;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Recurse to get the pairs for this nested set.
|
||||
// Backup i to '['.
|
||||
pos.setIndex(--i);
|
||||
switch (lastOp) {
|
||||
case HYPHEN:
|
||||
case INTERSECTION:
|
||||
newPat.append(lastOp);
|
||||
break;
|
||||
}
|
||||
nestedAux._applyPattern(pattern, pos, symbols, newPat, status);
|
||||
nestedSet = &nestedAux;
|
||||
nestedPatDone = TRUE;
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
i = pos.getIndex();
|
||||
// Recurse to get the pairs for this nested set.
|
||||
// Backup i to '['.
|
||||
pos.setIndex(--i);
|
||||
switch (lastOp) {
|
||||
case HYPHEN:
|
||||
case INTERSECTION:
|
||||
newPat.append(lastOp);
|
||||
break;
|
||||
}
|
||||
nestedAux._applyPattern(pattern, pos, symbols, newPat, status);
|
||||
nestedSet = &nestedAux;
|
||||
nestedPatDone = TRUE;
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
i = pos.getIndex();
|
||||
}
|
||||
}
|
||||
|
||||
@ -1255,7 +1247,22 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
||||
}
|
||||
}
|
||||
|
||||
if (lastChar != NONE) {
|
||||
if (mode < 4) {
|
||||
// throw new IllegalArgumentException("Missing ']'");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
// Treat a trailing '$' as indicating ETHER. This code is only
|
||||
// executed if symbols == NULL; otherwise other code parses the
|
||||
// anchor.
|
||||
if (lastChar == (UChar)SymbolTable::SYMBOL_REF) {
|
||||
rebuildPattern = TRUE;
|
||||
newPat.append(lastChar);
|
||||
add(TransliterationRule::ETHER);
|
||||
}
|
||||
|
||||
else if (lastChar != NONE) {
|
||||
add(lastChar, lastChar);
|
||||
_appendToPat(newPat, lastChar, FALSE);
|
||||
}
|
||||
@ -1271,7 +1278,9 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
||||
return;
|
||||
}
|
||||
|
||||
newPat.append(SET_CLOSE);
|
||||
if (mode == 4) {
|
||||
newPat.append(SET_CLOSE);
|
||||
}
|
||||
|
||||
/**
|
||||
* If we saw a '^' after the initial '[' of this pattern, then perform
|
||||
@ -1281,12 +1290,6 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
||||
complement();
|
||||
}
|
||||
|
||||
if (mode != 4) {
|
||||
// throw new IllegalArgumentException("Missing ']'");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
pos.setIndex(i);
|
||||
|
||||
// Use the rebuilt pattern (newPat) only if necessary. Prefer the
|
||||
@ -1298,157 +1301,6 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Implementation: Generation of pairs for Unicode categories
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Sets this object to the given category, given its name.
|
||||
* The category name must be either a two-letter name, such as
|
||||
* "Lu", or a one letter name, such as "L". One-letter names
|
||||
* indicate the logical union of all two-letter names that start
|
||||
* with that letter. Case is significant. If the name starts
|
||||
* with the character '^' then the complement of the given
|
||||
* character set is returned.
|
||||
*
|
||||
* Although individual categories such as "Lu" are cached, we do
|
||||
* not currently cache single-letter categories such as "L" or
|
||||
* complements such as "^Lu" or "^L". It would be easy to cache
|
||||
* these as well in a hashtable should the need arise.
|
||||
*/
|
||||
void UnicodeSet::applyCategory(const UnicodeString& catName,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
UnicodeString cat(catName);
|
||||
UBool invert = (catName.length() > 1 &&
|
||||
catName.charAt(0) == COMPLEMENT);
|
||||
if (invert) {
|
||||
cat.remove(0, 1);
|
||||
}
|
||||
|
||||
UBool match = FALSE;
|
||||
|
||||
// if we have two characters, search the category map for that
|
||||
// code and either construct and return a UnicodeSet from the
|
||||
// data in the category map or throw an exception
|
||||
if (cat.length() == 2) {
|
||||
int32_t i = 0;
|
||||
int32_t numCategories = Unicode::GENERAL_TYPES_COUNT * 2;
|
||||
|
||||
while (i < numCategories)
|
||||
{
|
||||
if (CATEGORY_NAMES[i] == cat.charAt(0)
|
||||
&& CATEGORY_NAMES[i+1] == cat.charAt(1))
|
||||
{
|
||||
*this = getCategorySet((int8_t)(i/2));
|
||||
match = TRUE;
|
||||
break;
|
||||
}
|
||||
i += 2;
|
||||
}
|
||||
} else if (cat.length() == 1) {
|
||||
// if we have one character, search the category map for
|
||||
// codes beginning with that letter, and union together
|
||||
// all of the matching sets that we find (or throw an
|
||||
// exception if there are no matches)
|
||||
clear();
|
||||
for (int32_t i=0; i<Unicode::GENERAL_TYPES_COUNT; ++i) {
|
||||
if (CATEGORY_NAMES[2*i] == cat.charAt(0)) {
|
||||
addAll(getCategorySet((int8_t)i));
|
||||
match = TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!match) {
|
||||
// TODO: Add caching of these, if desired
|
||||
char buf[128];
|
||||
catName.extract(buf, sizeof(buf), NULL, status);
|
||||
UScriptCode script = uscript_getCode(buf, &status);
|
||||
if (script != USCRIPT_INVALID_CODE) {
|
||||
match = TRUE;
|
||||
clear();
|
||||
int32_t start = -1;
|
||||
int32_t end = -2;
|
||||
for (UChar32 i=MIN_VALUE; i<=MAX_VALUE; ++i) {
|
||||
if (uscript_getScript(i, &status) == script) {
|
||||
if ((end+1) == (int32_t) i) {
|
||||
end = i;
|
||||
} else {
|
||||
if (start >= 0) {
|
||||
add((UChar32) start, (UChar32) end);
|
||||
}
|
||||
start = end = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (start >= 0) {
|
||||
add((UChar32) start, (UChar32) end);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!match) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
if (invert) {
|
||||
complement();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a pairs string for the given category. This string is
|
||||
* cached and returned again if this method is called again with
|
||||
* the same parameter.
|
||||
*/
|
||||
const UnicodeSet& UnicodeSet::getCategorySet(int8_t cat) {
|
||||
// In order to tell what cache entries are empty, we assume
|
||||
// every category specifies at least one character. Thus
|
||||
// sets in the cache that are empty are uninitialized.
|
||||
if (CATEGORY_CACHE == NULL) {
|
||||
umtx_lock(NULL);
|
||||
if (CATEGORY_CACHE == NULL) {
|
||||
CATEGORY_CACHE = new UnicodeSet[Unicode::GENERAL_TYPES_COUNT];
|
||||
ucln_i18n_registerCleanup();
|
||||
}
|
||||
umtx_unlock(NULL);
|
||||
}
|
||||
if (CATEGORY_CACHE[cat].isEmpty()) {
|
||||
// Walk through all Unicode characters, noting the start
|
||||
// and end of each range for which Character.getType(c)
|
||||
// returns the given category integer. Since we are
|
||||
// iterating in order, we can simply append the resulting
|
||||
// ranges to the pairs string.
|
||||
UnicodeSet& set = CATEGORY_CACHE[cat];
|
||||
int32_t start = -1;
|
||||
int32_t end = -2;
|
||||
// N.B.: There seems to be a bug that deadlocks if you
|
||||
// call getType() with a supplemental character right now.
|
||||
// TODO: Change 0xFFFF to MAX_VALUE later.
|
||||
for (int32_t i=MIN_VALUE; i<=0xFFFF/*TEMPORARY*/; ++i) {
|
||||
if (Unicode::getType((UChar)i) == cat) {
|
||||
if ((end+1) == i) {
|
||||
end = i;
|
||||
} else {
|
||||
if (start >= 0) {
|
||||
set.add((UChar32)start, (UChar32)end);
|
||||
}
|
||||
start = end = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (start >= 0) {
|
||||
set.add((UChar32)start, (UChar32)end);
|
||||
}
|
||||
}
|
||||
return CATEGORY_CACHE[cat];
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Implementation: Utility methods
|
||||
//----------------------------------------------------------------
|
||||
|
597
icu4c/source/i18n/upropset.cpp
Normal file
597
icu4c/source/i18n/upropset.cpp
Normal file
@ -0,0 +1,597 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu/source/i18n/Attic/upropset.cpp,v $
|
||||
* $Date: 2001/10/17 19:20:41 $
|
||||
* $Revision: 1.1 $
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "upropset.h"
|
||||
#include "ustrfmt.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/parsepos.h"
|
||||
#include "hash.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
static Hashtable* NAME_MAP = NULL;
|
||||
|
||||
static Hashtable* CATEGORY_MAP = NULL;
|
||||
|
||||
/**
|
||||
* A cache mapping character category integers, as returned by
|
||||
* UCharacter.getType(), to sets. Entries are initially
|
||||
* null and are created on demand.
|
||||
*/
|
||||
static UnicodeSet* CATEGORY_CACHE = NULL;
|
||||
|
||||
/**
|
||||
* A cache mapping script integers, as defined by
|
||||
* UScript, to sets. Entries are initially
|
||||
* null and are created on demand.
|
||||
*/
|
||||
static UnicodeSet* SCRIPT_CACHE = NULL;
|
||||
|
||||
// Special value codes
|
||||
static const int32_t ANY = -1; // general category: all code points
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Unicode string and character constants
|
||||
//----------------------------------------------------------------
|
||||
|
||||
static const UChar POSIX_OPEN[] = { 91,58,0 }; // "[:"
|
||||
static const UChar POSIX_CLOSE[] = { 58,93,0 }; // ":]"
|
||||
|
||||
static const UChar PERL_OPEN[] = { 92,112,0 }; // "\\p"
|
||||
static const UChar PERL_CLOSE[] = { 125,0 }; // "}"
|
||||
|
||||
static const UChar HAT = 0x005E; /*^*/
|
||||
static const UChar UPPER_P = 0x0050; /*P*/
|
||||
static const UChar LEFT_BRACE = 0x007B; /*{*/
|
||||
static const UChar EQUALS = 0x003D; /*=*/
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// class _CharString
|
||||
// An identical class named CharString can be found in transreg.cpp.
|
||||
// If we find ourselves needing another copy of this utility class we
|
||||
// should probably pull it out into putil or some such place.
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
class _CharString {
|
||||
public:
|
||||
_CharString(const UnicodeString& str);
|
||||
~_CharString();
|
||||
operator char*() { return ptr; }
|
||||
private:
|
||||
char buf[128];
|
||||
char* ptr;
|
||||
};
|
||||
|
||||
/**
 * Extracts the given string into char storage.  Strings shorter than
 * the inline buffer are stored there; anything longer gets a heap block
 * with a few bytes of slack.
 */
_CharString::_CharString(const UnicodeString& str) {
    const UBool fitsInline = str.length() < (int32_t)sizeof(buf);
    ptr = fitsInline ? buf : new char[str.length() + 8];
    // Extract the whole string (length is clamped by extract()) using the
    // "" codepage argument.
    str.extract(0, 0x7FFFFFFF, ptr, "");
}
|
||||
|
||||
/**
 * Releases the heap block, if the constructor had to allocate one.
 */
_CharString::~_CharString() {
    // When ptr aliases the inline buffer there is nothing to free.
    if (ptr == buf) {
        return;
    }
    delete[] ptr;
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Public API
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Return true if the given position, in the given pattern, appears
|
||||
* to be the start of a property set pattern [:foo:], \p{foo}, or
|
||||
* \P{foo}.
|
||||
*/
|
||||
UBool UnicodePropertySet::resemblesPattern(const UnicodeString& pattern,
|
||||
int32_t pos) {
|
||||
// Patterns are at least 5 characters long
|
||||
if ((pos+5) > pattern.length()) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
// Look for an opening [:, [:^, \p, or \P
|
||||
return (0 == pattern.compare(pos, 2, POSIX_OPEN)) ||
|
||||
(0 == pattern.caseCompare(pos, 2, PERL_OPEN, U_FOLD_CASE_DEFAULT));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a UnicodeSet by parsing the given pattern at the given
|
||||
* parse position.
|
||||
*
|
||||
* @param pattern the pattern string
|
||||
* @param ppos on entry, the position at which to begin parsing.
|
||||
* This should be one of the locations marked '^':
|
||||
*
|
||||
* [:blah:] \p{blah} \P{blah}
|
||||
* ^ % ^ % ^ %
|
||||
*
|
||||
* On return, the position after the last character parsed, that is,
|
||||
* the locations marked '%'. If the parse fails, ppos is returned
|
||||
* unchanged.
|
||||
* @return a newly-constructed UnicodeSet object, or null upon
|
||||
* failure.
|
||||
*/
|
||||
UnicodeSet* UnicodePropertySet::createFromPattern(const UnicodeString& pattern,
|
||||
ParsePosition& ppos) {
|
||||
init();
|
||||
|
||||
UnicodeSet* set = NULL;
|
||||
|
||||
int32_t pos = ppos.getIndex();
|
||||
|
||||
// On entry, ppos should point to one of the following locations:
|
||||
|
||||
// Minimum length is 5 characters, e.g. \p{L}
|
||||
if ((pos+5) > pattern.length()) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat}
|
||||
UBool invert = FALSE;
|
||||
|
||||
// Look for an opening [:, [:^, \p, or \P
|
||||
if (0 == pattern.compare(pos, 2, POSIX_OPEN)) {
|
||||
posix = TRUE;
|
||||
pos = skipWhitespace(pattern, pos+2);
|
||||
if (pos < pattern.length() && pattern.charAt(pos) == HAT) {
|
||||
++pos;
|
||||
invert = TRUE;
|
||||
}
|
||||
} else if (0 == pattern.caseCompare(pos, 2, PERL_OPEN, U_FOLD_CASE_DEFAULT)) {
|
||||
invert = (pattern.charAt(pos+1) == UPPER_P);
|
||||
pos = skipWhitespace(pattern, pos+2);
|
||||
if (pos == pattern.length() || pattern.charAt(pos++) != LEFT_BRACE) {
|
||||
// Syntax error; "\p" or "\P" not followed by "{"
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
// Open delimiter not seen
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Look for the matching close delimiter, either :] or }
|
||||
int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos);
|
||||
if (close < 0) {
|
||||
// Syntax error; close delimiter missing
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Look for an '=' sign. If this is present, we will parse a
|
||||
// medium \p{gc=Cf} or long \p{GeneralCategory=Format}
|
||||
// pattern.
|
||||
int32_t equals = pattern.indexOf(EQUALS, pos);
|
||||
if (equals >= 0 && equals < close) {
|
||||
// Equals seen; parse medium/long pattern
|
||||
UnicodeString typeName = munge(pattern, pos, equals);
|
||||
UnicodeString valueName = munge(pattern, equals+1, close);
|
||||
SetFactory factory;
|
||||
factory = voidPtrToSetFactory(NAME_MAP->get(typeName));
|
||||
if (factory == NULL) {
|
||||
// Syntax error; type name not recognized
|
||||
return NULL;
|
||||
}
|
||||
set = (*factory)(valueName);
|
||||
} else {
|
||||
// No equals seen; parse short format \p{Cf}
|
||||
UnicodeString shortName = munge(pattern, pos, close);
|
||||
|
||||
// First try general category
|
||||
set = createCategorySet(shortName);
|
||||
|
||||
// If this fails, try script
|
||||
if (set == NULL) {
|
||||
set = createScriptSet(shortName);
|
||||
}
|
||||
}
|
||||
|
||||
// Upon failure, return NULL with ppos unchanged
|
||||
if (set == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (invert) {
|
||||
set->complement();
|
||||
}
|
||||
|
||||
// Move to the limit position after the close delimiter
|
||||
ppos.setIndex(close + (posix ? 2 : 1));
|
||||
|
||||
return set;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Property set factory static methods
|
||||
// NOTE: This will change/go away when we implement UCharacter
|
||||
// based property retrieval.
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
 * Filter callback for initSetFromFilter().  Accepts c when
 * u_charDigitValue(c) equals the int32_t that context points to.
 */
static UBool _numericValueFilter(UChar32 c, void* context) {
    const int32_t* wanted = (const int32_t*) context;
    // TODO: Change this to a more generic function, like
    // u_charNumericValue (when one exists).
    return u_charDigitValue(c) == *wanted;
}
|
||||
|
||||
/**
 * Given a numeric value name, create the set of code points whose
 * digit value (as reported by u_charDigitValue) equals that number.
 *
 * The name is parsed with uprv_strtod.  If it has trailing characters
 * that are not part of the number, or the number is negative,
 * non-integral, or too large for an int32_t, the returned set is empty.
 *
 * @param valueName a pre-munged numeric value, e.g. "3"
 * @return a newly-constructed set owned by the caller; this function
 * never reports failure by returning NULL
 */
UnicodeSet* UnicodePropertySet::createNumericValueSet(const UnicodeString& valueName) {
    _CharString cvalueName(valueName);
    UnicodeSet* set = new UnicodeSet();
    char* end;
    double value = uprv_strtod(cvalueName, &end);

    // Check the range BEFORE converting to int32_t: converting a double
    // that is out of range (or NaN) to an integer type is undefined
    // behavior.  The comparison is written so that NaN fails it too.
    if (*end != 0 || !(value >= 0 && value <= 2147483647.0)) {
        // Trailing garbage, negative, NaN, or beyond int32_t
        return set;
    }

    int32_t ivalue = (int32_t) value;
    if (ivalue != value) {
        // UCharacter doesn't support non-integral values, so just
        // return an empty set
        return set;
    }

    initSetFromFilter(*set, _numericValueFilter, &ivalue);
    return set;
}
|
||||
|
||||
/**
|
||||
* Given a general category value name, create a corresponding
|
||||
* set and return it, or return null if the name is invalid.
|
||||
* @param valueName a pre-munged general category value name
|
||||
*/
|
||||
UnicodeSet* UnicodePropertySet::createCategorySet(const UnicodeString& valueName) {
|
||||
int32_t valueCode = CATEGORY_MAP->geti(valueName);
|
||||
if (valueCode == 0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
UnicodeSet* set = new UnicodeSet();
|
||||
if (valueCode == ANY) {
|
||||
set->complement();
|
||||
return set;
|
||||
}
|
||||
for (int32_t cat=0; cat<U_CHAR_CATEGORY_COUNT; ++cat) {
|
||||
if ((valueCode & (1 << cat)) != 0) {
|
||||
set->addAll(getCategorySet(cat));
|
||||
}
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a script value name, create a corresponding set and
|
||||
* return it, or return null if the name is invalid.
|
||||
* @param valueName a pre-munged script value name
|
||||
*/
|
||||
UnicodeSet* UnicodePropertySet::createScriptSet(const UnicodeString& valueName) {
|
||||
_CharString cvalueName(valueName);
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
UScriptCode script = uscript_getCode(cvalueName, &ec);
|
||||
if (script == USCRIPT_INVALID_CODE || U_FAILURE(ec)) {
|
||||
// Syntax error; unknown short name
|
||||
return NULL;
|
||||
}
|
||||
return new UnicodeSet(getScriptSet(script));
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Utility methods
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
 * Filter callback for initSetFromFilter().  Accepts c when
 * u_charType(c) equals the int32_t that context points to.
 */
static UBool _categoryFilter(UChar32 c, void* context) {
    const int32_t* wanted = (const int32_t*) context;
    return u_charType(c) == *wanted;
}
|
||||
|
||||
/**
|
||||
* Returns a UnicodeSet for the given category. This set is
|
||||
* cached and returned again if this method is called again with
|
||||
* the same parameter.
|
||||
*
|
||||
* Callers MUST NOT MODIFY the returned set.
|
||||
*/
|
||||
const UnicodeSet& UnicodePropertySet::getCategorySet(int32_t cat) {
|
||||
if (CATEGORY_CACHE[cat].isEmpty()) {
|
||||
initSetFromFilter(CATEGORY_CACHE[cat], _categoryFilter, &cat);
|
||||
}
|
||||
return CATEGORY_CACHE[cat];
|
||||
}
|
||||
|
||||
/**
 * Filter callback for initSetFromFilter().  Accepts c when
 * uscript_getScript(c) equals the UScriptCode that context points to.
 * The error code produced by uscript_getScript() is not examined.
 */
static UBool _scriptFilter(UChar32 c, void* context) {
    UErrorCode status = U_ZERO_ERROR;
    const UScriptCode actual = uscript_getScript(c, &status);
    const UScriptCode* wanted = (const UScriptCode*) context;
    return actual == *wanted;
}
|
||||
|
||||
/**
|
||||
* Returns a UnicodeSet for the given script. This set is
|
||||
* cached and returned again if this method is called again with
|
||||
* the same parameter.
|
||||
*
|
||||
* Callers MUST NOT MODIFY the returned set.
|
||||
*/
|
||||
const UnicodeSet& UnicodePropertySet::getScriptSet(UScriptCode script) {
|
||||
if (SCRIPT_CACHE[script].isEmpty()) {
|
||||
initSetFromFilter(CATEGORY_CACHE[script], _scriptFilter, &script);
|
||||
}
|
||||
return SCRIPT_CACHE[script];
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a string, munge it to lost the whitespace. So "General
|
||||
* Category " becomes "GeneralCategory". We munge all type and value
|
||||
* strings, and store all type and value keys pre-munged. NOTE:
|
||||
* Unlike the Java version, we do not modify the case, since we use a
|
||||
* case-insensitive compare function.
|
||||
*/
|
||||
UnicodeString UnicodePropertySet::munge(const UnicodeString& str,
|
||||
int32_t start, int32_t limit) {
|
||||
UnicodeString buf;
|
||||
for (int32_t i=start; i<limit; ) {
|
||||
UChar32 c = str.char32At(i);
|
||||
i += UTF_CHAR_LENGTH(c);
|
||||
if (!u_isspace(c)) {
|
||||
buf.append(c);
|
||||
}
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Skip over a sequence of zero or more white space characters
|
||||
* at pos. Return the index of the first non-white-space character
|
||||
* at or after pos, or str.length(), if there is none.
|
||||
*/
|
||||
int32_t UnicodePropertySet::skipWhitespace(const UnicodeString& str,
|
||||
int32_t pos) {
|
||||
while (pos < str.length()) {
|
||||
UChar32 c = str.char32At(pos);
|
||||
if (!u_isspace(c)) {
|
||||
break;
|
||||
}
|
||||
pos += UTF_CHAR_LENGTH(c);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Generic filter-based scanning code
|
||||
//
|
||||
// NOTE: In general, we don't want to do this! This is a temporary
|
||||
// implementation until we have time for something that examines
|
||||
// the underlying UCharacter data structures in an intelligent
|
||||
// way. Iterating over all code points is dumb. What we want to
|
||||
// do, for instance, is iterate over internally-stored ranges
|
||||
// of characters that have a given property.
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
 * Set the given UnicodeSet to contain exactly the scanned code points
 * for which filter returns true.  The set is cleared first.  The
 * context pointer is passed unchanged to every filter call.
 *
 * Consecutive accepted code points are collected into a run and added
 * to the set as a single range.
 */
void UnicodePropertySet::initSetFromFilter(UnicodeSet& set, Filter filter,
                                           void* context) {
    set.clear();

    // TODO Extend this up to UnicodeSet.MAX_VALUE when we have
    // better performance; i.e., when this code can get moved into
    // the UCharacter class and not have to iterate over code
    // points.  Right now it's way too slow to iterate to 10FFFF.
    const int32_t scanLimit = 0xFFFF; /*TEMPORARY*/

    // First code point of the run currently being collected, or -1
    // when no run is open.
    int32_t runStart = -1;

    for (int32_t cp=UnicodeSet::MIN_VALUE; cp<=scanLimit; ++cp) {
        if ((*filter)((UChar32) cp, context)) {
            if (runStart < 0) {
                runStart = cp;
            }
        } else if (runStart >= 0) {
            // The run ended at the previous code point.
            set.add((UChar32)runStart, (UChar32)(cp-1));
            runStart = -1;
        }
    }

    // Flush a run that reaches the end of the scanned range.
    if (runStart >= 0) {
        set.add((UChar32)runStart, (UChar32)scanLimit);
    }
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Type and value name maps
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Add a type mapping to the name map.
|
||||
*/
|
||||
void UnicodePropertySet::addType(const UnicodeString& shortName,
|
||||
const UnicodeString& longName,
|
||||
SetFactory factory) {
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
void* p = setFactoryToVoidPtr(factory);
|
||||
NAME_MAP->put(shortName, p, ec);
|
||||
NAME_MAP->put(longName, p, ec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a value mapping to the name map.
|
||||
*/
|
||||
void UnicodePropertySet::addValue(Hashtable* map,
|
||||
const UnicodeString& shortName,
|
||||
const UnicodeString& longName,
|
||||
int32_t value) {
|
||||
// assert(value != 0);
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
map->puti(shortName, value, ec);
|
||||
if (longName.length() != 0) {
|
||||
map->puti(longName, value, ec);
|
||||
}
|
||||
}
|
||||
|
||||
void UnicodePropertySet::init() {
|
||||
if (NAME_MAP != NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
NAME_MAP = new Hashtable(TRUE);
|
||||
CATEGORY_MAP = new Hashtable(TRUE);
|
||||
CATEGORY_CACHE = new UnicodeSet[U_CHAR_CATEGORY_COUNT];
|
||||
SCRIPT_CACHE = new UnicodeSet[USCRIPT_CODE_LIMIT];
|
||||
|
||||
// NOTE: We munge all search keys to have no whitespace
|
||||
// and upper case. As such, all stored keys should have
|
||||
// this format.
|
||||
|
||||
// Load the map with type data
|
||||
|
||||
addType("GC", "GENERALCATEGORY", createCategorySet);
|
||||
|
||||
//addType("CC", "COMBININGCLASS", COMBINING_CLASS);
|
||||
//addType("BC", "BIDICLASS", BIDI_CLASS);
|
||||
//addType("DT", "DECOMPOSITIONTYPE", DECOMPOSITION_TYPE);
|
||||
|
||||
addType("NV", "NUMERICVALUE", createNumericValueSet);
|
||||
|
||||
//addType("NT", "NUMERICTYPE", NUMERIC_TYPE);
|
||||
//addType("EA", "EASTASIANWIDTH", EAST_ASIAN_WIDTH);
|
||||
//addType("LB", "LINEBREAK", LINE_BREAK);
|
||||
//addType("JT", "JOININGTYPE", JOINING_TYPE);
|
||||
|
||||
addType("SC", "SCRIPT", createScriptSet);
|
||||
|
||||
// Load the map with value data
|
||||
|
||||
// General Category
|
||||
|
||||
addValue(CATEGORY_MAP, "ANY", "", ANY); // special case
|
||||
|
||||
addValue(CATEGORY_MAP, "C", "OTHER",
|
||||
(1 << U_CONTROL_CHAR) |
|
||||
(1 << U_FORMAT_CHAR) |
|
||||
(1 << U_GENERAL_OTHER_TYPES) |
|
||||
(1 << U_PRIVATE_USE_CHAR) |
|
||||
(1 << U_SURROGATE));
|
||||
|
||||
addValue(CATEGORY_MAP, "CC", "CONTROL",
|
||||
1 << U_CONTROL_CHAR);
|
||||
addValue(CATEGORY_MAP, "CF", "FORMAT",
|
||||
1 << U_FORMAT_CHAR);
|
||||
addValue(CATEGORY_MAP, "CN", "UNASSIGNED",
|
||||
1 << U_GENERAL_OTHER_TYPES);
|
||||
addValue(CATEGORY_MAP, "CO", "PRIVATEUSE",
|
||||
1 << U_PRIVATE_USE_CHAR);
|
||||
addValue(CATEGORY_MAP, "CS", "SURROGATE",
|
||||
1 << U_SURROGATE);
|
||||
|
||||
addValue(CATEGORY_MAP, "L", "LETTER",
|
||||
(1 << U_LOWERCASE_LETTER) |
|
||||
(1 << U_MODIFIER_LETTER) |
|
||||
(1 << U_OTHER_LETTER) |
|
||||
(1 << U_TITLECASE_LETTER) |
|
||||
(1 << U_UPPERCASE_LETTER));
|
||||
|
||||
addValue(CATEGORY_MAP, "LL", "LOWERCASELETTER",
|
||||
1 << U_LOWERCASE_LETTER);
|
||||
addValue(CATEGORY_MAP, "LM", "MODIFIERLETTER",
|
||||
1 << U_MODIFIER_LETTER);
|
||||
addValue(CATEGORY_MAP, "LO", "OTHERLETTER",
|
||||
1 << U_OTHER_LETTER);
|
||||
addValue(CATEGORY_MAP, "LT", "TITLECASELETTER",
|
||||
1 << U_TITLECASE_LETTER);
|
||||
addValue(CATEGORY_MAP, "LU", "UPPERCASELETTER",
|
||||
1 << U_UPPERCASE_LETTER);
|
||||
|
||||
addValue(CATEGORY_MAP, "M", "MARK",
|
||||
(1 << U_NON_SPACING_MARK) |
|
||||
(1 << U_COMBINING_SPACING_MARK) |
|
||||
(1 << U_ENCLOSING_MARK));
|
||||
|
||||
addValue(CATEGORY_MAP, "MN", "NONSPACINGMARK",
|
||||
1 << U_NON_SPACING_MARK);
|
||||
addValue(CATEGORY_MAP, "MC", "SPACINGMARK",
|
||||
1 << U_COMBINING_SPACING_MARK);
|
||||
addValue(CATEGORY_MAP, "ME", "ENCLOSINGMARK",
|
||||
1 << U_ENCLOSING_MARK);
|
||||
|
||||
addValue(CATEGORY_MAP, "N", "NUMBER",
|
||||
(1 << U_DECIMAL_DIGIT_NUMBER) |
|
||||
(1 << U_LETTER_NUMBER) |
|
||||
(1 << U_OTHER_NUMBER));
|
||||
|
||||
addValue(CATEGORY_MAP, "ND", "DECIMALNUMBER",
|
||||
1 << U_DECIMAL_DIGIT_NUMBER);
|
||||
addValue(CATEGORY_MAP, "NL", "LETTERNUMBER",
|
||||
1 << U_LETTER_NUMBER);
|
||||
addValue(CATEGORY_MAP, "NO", "OTHERNUMBER",
|
||||
1 << U_OTHER_NUMBER);
|
||||
|
||||
addValue(CATEGORY_MAP, "P", "PUNCTUATION",
|
||||
(1 << U_CONNECTOR_PUNCTUATION) |
|
||||
(1 << U_DASH_PUNCTUATION) |
|
||||
(1 << U_END_PUNCTUATION) |
|
||||
(1 << U_FINAL_PUNCTUATION) |
|
||||
(1 << U_INITIAL_PUNCTUATION) |
|
||||
(1 << U_OTHER_PUNCTUATION) |
|
||||
(1 << U_START_PUNCTUATION));
|
||||
|
||||
addValue(CATEGORY_MAP, "PC", "CONNECTORPUNCTUATION",
|
||||
1 << U_CONNECTOR_PUNCTUATION);
|
||||
addValue(CATEGORY_MAP, "PD", "DASHPUNCTUATION",
|
||||
1 << U_DASH_PUNCTUATION);
|
||||
addValue(CATEGORY_MAP, "PE", "ENDPUNCTUATION",
|
||||
1 << U_END_PUNCTUATION);
|
||||
addValue(CATEGORY_MAP, "PF", "FINALPUNCTUATION",
|
||||
1 << U_FINAL_PUNCTUATION);
|
||||
addValue(CATEGORY_MAP, "PI", "INITIALPUNCTUATION",
|
||||
1 << U_INITIAL_PUNCTUATION);
|
||||
addValue(CATEGORY_MAP, "PO", "OTHERPUNCTUATION",
|
||||
1 << U_OTHER_PUNCTUATION);
|
||||
addValue(CATEGORY_MAP, "PS", "STARTPUNCTUATION",
|
||||
1 << U_START_PUNCTUATION);
|
||||
|
||||
addValue(CATEGORY_MAP, "S", "SYMBOL",
|
||||
(1 << U_CURRENCY_SYMBOL) |
|
||||
(1 << U_MODIFIER_SYMBOL) |
|
||||
(1 << U_MATH_SYMBOL) |
|
||||
(1 << U_OTHER_SYMBOL));
|
||||
|
||||
addValue(CATEGORY_MAP, "SC", "CURRENCYSYMBOL",
|
||||
1 << U_CURRENCY_SYMBOL);
|
||||
addValue(CATEGORY_MAP, "SK", "MODIFIERSYMBOL",
|
||||
1 << U_MODIFIER_SYMBOL);
|
||||
addValue(CATEGORY_MAP, "SM", "MATHSYMBOL",
|
||||
1 << U_MATH_SYMBOL);
|
||||
addValue(CATEGORY_MAP, "SO", "OTHERSYMBOL",
|
||||
1 << U_OTHER_SYMBOL);
|
||||
|
||||
addValue(CATEGORY_MAP, "Z", "SEPARATOR",
|
||||
(1 << U_LINE_SEPARATOR) |
|
||||
(1 << U_PARAGRAPH_SEPARATOR) |
|
||||
(1 << U_SPACE_SEPARATOR));
|
||||
|
||||
addValue(CATEGORY_MAP, "ZL", "LINESEPARATOR",
|
||||
1 << U_LINE_SEPARATOR);
|
||||
addValue(CATEGORY_MAP, "ZP", "PARAGRAPHSEPARATOR",
|
||||
1 << U_PARAGRAPH_SEPARATOR);
|
||||
addValue(CATEGORY_MAP, "ZS", "SPACESEPARATOR",
|
||||
1 << U_SPACE_SEPARATOR);
|
||||
}
|
||||
|
||||
void UnicodePropertySet::cleanup() {
|
||||
if (NAME_MAP != NULL) {
|
||||
delete NAME_MAP; NAME_MAP = NULL;
|
||||
delete CATEGORY_MAP; CATEGORY_MAP = NULL;
|
||||
delete[] CATEGORY_CACHE; CATEGORY_CACHE = NULL;
|
||||
delete[] SCRIPT_CACHE; SCRIPT_CACHE = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
//eof
|
240
icu4c/source/i18n/upropset.h
Normal file
240
icu4c/source/i18n/upropset.h
Normal file
@ -0,0 +1,240 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu/source/i18n/Attic/upropset.h,v $
|
||||
* $Date: 2001/10/17 19:20:41 $
|
||||
* $Revision: 1.1 $
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef _UPROPSET_H_
|
||||
#define _UPROPSET_H_
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uscript.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class UnicodeString;
|
||||
class UnicodeSet;
|
||||
class ParsePosition;
|
||||
class Hashtable;
|
||||
|
||||
/**
|
||||
* INTERNAL CLASS implementing the UnicodeSet properties as outlined
|
||||
* at:
|
||||
*
|
||||
* http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html
|
||||
*
|
||||
* Recognized syntax:
|
||||
*
|
||||
* [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
|
||||
* \p{foo} \P{foo} - white space not allowed within "\p" or "\P"
|
||||
*
|
||||
* Other than the above restrictions, white space is ignored. Case
|
||||
* is ignored except in "\p" and "\P".
|
||||
*
|
||||
* This class cannot be instantiated. It has a public static method,
|
||||
* createFromPattern(), which takes a pattern to be parsed and returns
|
||||
* a new UnicodeSet. Another public static method,
|
||||
* resemblesPattern(), returns true if a given pattern string appears
|
||||
* to be a property set pattern, and therefore should be passed in to
|
||||
* createFromPattern().
|
||||
*
|
||||
* NOTE: Current implementation is incomplete. The following list
|
||||
* indicates which properties are supported.
|
||||
*
|
||||
* + GeneralCategory
|
||||
* CombiningClass
|
||||
* BidiClass
|
||||
* DecompositionType
|
||||
* + NumericValue
|
||||
* NumericType
|
||||
* EastAsianWidth
|
||||
* LineBreak
|
||||
* JoiningType
|
||||
* + Script
|
||||
*
|
||||
* '+' indicates a supported property.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: upropset.h,v $ $Revision: 1.1 $ $Date: 2001/10/17 19:20:41 $
|
||||
*/
|
||||
class UnicodePropertySet {
|
||||
|
||||
public:
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Public API
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Return true if the given position, in the given pattern, appears
|
||||
* to be the start of a property set pattern [:foo:], \p{foo}, or
|
||||
* \P{foo}.
|
||||
*/
|
||||
static UBool resemblesPattern(const UnicodeString& pattern, int32_t pos);
|
||||
|
||||
/**
|
||||
* Create a UnicodeSet by parsing the given pattern at the given
|
||||
* parse position.
|
||||
*
|
||||
* @param pattern the pattern string
|
||||
* @param ppos on entry, the position at which to begin parsing.
|
||||
* This shold be one of the locations marked '^':
|
||||
*
|
||||
* [:blah:] \p{blah} \P{blah}
|
||||
* ^ % ^ % ^ %
|
||||
*
|
||||
* On return, the position after the last character parsed, that is,
|
||||
* the locations marked '%'. If the parse fails, ppos is returned
|
||||
* unchanged.
|
||||
* @return a newly-constructed UnicodeSet object, or null upon
|
||||
* failure.
|
||||
*/
|
||||
static UnicodeSet* createFromPattern(const UnicodeString& pattern,
|
||||
ParsePosition& ppos);
|
||||
|
||||
private:
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Property set factory static methods
|
||||
// NOTE: This will change/go away when we implement UCharacter
|
||||
// based property retrieval.
|
||||
//----------------------------------------------------------------
|
||||
|
||||
typedef UnicodeSet* (*SetFactory)(const UnicodeString& valueName);
|
||||
|
||||
static UnicodeSet* createNumericValueSet(const UnicodeString& valueName);
|
||||
|
||||
/**
|
||||
* Given a general category value name, create a corresponding
|
||||
* set and return it, or return null if the name is invalid.
|
||||
* @param valueName a pre-munged general category value name
|
||||
*/
|
||||
static UnicodeSet* createCategorySet(const UnicodeString& valueName);
|
||||
|
||||
/**
|
||||
* Given a script value name, create a corresponding set and
|
||||
* return it, or return null if the name is invalid.
|
||||
* @param valueName a pre-munged script value name
|
||||
*/
|
||||
static UnicodeSet* createScriptSet(const UnicodeString& valueName);
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Utility methods
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns a UnicodeSet for the given category. This set is
|
||||
* cached and returned again if this method is called again with
|
||||
* the same parameter.
|
||||
*
|
||||
* Callers MUST NOT MODIFY the returned set.
|
||||
*/
|
||||
static const UnicodeSet& getCategorySet(int32_t cat);
|
||||
|
||||
/**
|
||||
* Returns a UnicodeSet for the given script. This set is
|
||||
* cached and returned again if this method is called again with
|
||||
* the same parameter.
|
||||
*
|
||||
* Callers MUST NOT MODIFY the returned set.
|
||||
*/
|
||||
static const UnicodeSet& getScriptSet(UScriptCode script);
|
||||
|
||||
/**
|
||||
* Given a string, munge it to upper case and lose the whitespace.
|
||||
* So "General Category " becomes "GENERALCATEGORY". We munge all
|
||||
* type and value strings, and store all type and value keys
|
||||
* pre-munged.
|
||||
*/
|
||||
static UnicodeString munge(const UnicodeString& str, int32_t start, int32_t limit);
|
||||
|
||||
/**
|
||||
* Skip over a sequence of zero or more white space characters
|
||||
* at pos. Return the index of the first non-white-space character
|
||||
* at or after pos, or str.length(), if there is none.
|
||||
*/
|
||||
static int32_t skipWhitespace(const UnicodeString& str, int32_t pos);
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Generic filter-based scanning code
|
||||
//
|
||||
// NOTE: In general, we don't want to do this! This is a temporary
|
||||
// implementation until we have time for something that examines
|
||||
// the underlying UCharacter data structures in an intelligent
|
||||
// way. Iterating over all code points is dumb. What we want to
|
||||
// do, for instance, is iterate over internally-stored ranges
|
||||
// of characters that have a given property.
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* A filter that returns TRUE if the given code point should be
|
||||
* included in the UnicodeSet being constructed.
|
||||
*/
|
||||
typedef UBool (*Filter)(UChar32 codePoint, void* context);
|
||||
|
||||
/**
|
||||
* Set the given UnicodeSet to contain all code points for which
|
||||
* filter returns TRUE. The context parameter is passed unchanged
|
||||
* to the filter function.
|
||||
*/
|
||||
static void initSetFromFilter(UnicodeSet& set, Filter filter,
|
||||
void* context);
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Type and value name maps
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Add a type mapping to the name map.
|
||||
*/
|
||||
static void addType(const UnicodeString& shortName,
|
||||
const UnicodeString& longName,
|
||||
SetFactory factory);
|
||||
|
||||
/**
|
||||
* Add a value mapping to the name map.
|
||||
*/
|
||||
static void addValue(Hashtable* map,
|
||||
const UnicodeString& shortName,
|
||||
const UnicodeString& longName,
|
||||
int32_t value);
|
||||
|
||||
static void init();
|
||||
|
||||
public:
|
||||
static void cleanup();
|
||||
|
||||
private:
|
||||
//----------------------------------------------------------------
|
||||
// SetFactory <=> void*
|
||||
// I don't know why the compiler won't cast between these types.
|
||||
// They should be interconvertible. Does C++ distinguish between
|
||||
// pointers into code and pointers into data? In any case, we
|
||||
// convert between these types in a safe way here.
|
||||
//----------------------------------------------------------------
|
||||
|
||||
union SetFactoryTok {
|
||||
void* voidPointer;
|
||||
SetFactory functionPointer;
|
||||
};
|
||||
|
||||
inline static void* setFactoryToVoidPtr(SetFactory f) {
|
||||
SetFactoryTok tok;
|
||||
tok.functionPointer = f;
|
||||
return tok.voidPointer;
|
||||
}
|
||||
|
||||
inline static SetFactory voidPtrToSetFactory(void* p) {
|
||||
SetFactoryTok tok;
|
||||
tok.voidPointer = p;
|
||||
return tok.functionPointer;
|
||||
}
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
@ -124,10 +124,11 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
|
||||
TESTCASE(42,TestUndefinedVariable);
|
||||
TESTCASE(43,TestEmptyContext);
|
||||
TESTCASE(44,TestCompoundFilterID);
|
||||
TESTCASE(45,TestDevanagariLatinRT);
|
||||
TESTCASE(46,TestTeluguLatinRT);
|
||||
TESTCASE(47,TestCompoundLatinRT);
|
||||
TESTCASE(48,TestSanskritLatinRT);
|
||||
TESTCASE(45,TestPropertySet);
|
||||
TESTCASE(46,TestDevanagariLatinRT);
|
||||
TESTCASE(47,TestTeluguLatinRT);
|
||||
TESTCASE(48,TestCompoundLatinRT);
|
||||
TESTCASE(49,TestSanskritLatinRT);
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
@ -2066,6 +2067,15 @@ void TransliteratorTest::TestCompoundFilterID(void) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test new property set syntax
|
||||
*/
|
||||
void TransliteratorTest::TestPropertySet() {
|
||||
expect("a>A; \\p{Lu}>x; \\p{ANY}>y;", "abcDEF", "Ayyxxx");
|
||||
expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9",
|
||||
"[ a stitch ]\n[ in time ]\r[ saves 9]");
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Ram's tests
|
||||
//======================================================================
|
||||
|
@ -210,6 +210,11 @@ class TransliteratorTest : public IntlTest {
|
||||
*/
|
||||
void TestCompoundFilterID(void);
|
||||
|
||||
/**
|
||||
* Test new property set syntax
|
||||
*/
|
||||
void TestPropertySet(void);
|
||||
|
||||
/* Devanagari-Latin rules Test */
|
||||
void TestDevanagariLatinRT(void);
|
||||
|
||||
|
@ -41,7 +41,8 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
|
||||
CASE(4,TestMinimalRep);
|
||||
CASE(5,TestAPI);
|
||||
CASE(6,TestScriptSet);
|
||||
CASE(7,TestExhaustive);
|
||||
CASE(7,TestPropertySet);
|
||||
CASE(8,TestExhaustive);
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
@ -103,16 +104,18 @@ UnicodeSetTest::TestCategories(void) {
|
||||
void
|
||||
UnicodeSetTest::TestCloneEqualHash(void) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int8_t category=Unicode::LOWERCASE_LETTER;
|
||||
UnicodeSet *set1=new UnicodeSet(category, status); // :Li: Letter, lowercase
|
||||
//int8_t category=Unicode::LOWERCASE_LETTER;
|
||||
//UnicodeSet *set1=new UnicodeSet(category, status); // :Li: Letter, lowercase
|
||||
UnicodeSet *set1=new UnicodeSet("[:Ll:]", status); // Letter, lowercase
|
||||
if (U_FAILURE(status)){
|
||||
errln((UnicodeString)"FAIL: Can't construst set with cateegory->Ll");
|
||||
errln((UnicodeString)"FAIL: Can't construst set with category->Ll");
|
||||
return;
|
||||
}
|
||||
category=Unicode::DECIMAL_DIGIT_NUMBER;
|
||||
UnicodeSet *set2=new UnicodeSet(category, status); //Number, Decimal digit
|
||||
//category=Unicode::DECIMAL_DIGIT_NUMBER;
|
||||
//UnicodeSet *set2=new UnicodeSet(category, status); //Number, Decimal digit
|
||||
UnicodeSet *set2=new UnicodeSet("[:Nd:]", status); //Number, Decimal digit
|
||||
if (U_FAILURE(status)){
|
||||
errln((UnicodeString)"FAIL: Can't construct set with cateegory->Nd");
|
||||
errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
|
||||
return;
|
||||
}
|
||||
|
||||
@ -407,6 +410,22 @@ void UnicodeSetTest::TestScriptSet() {
|
||||
expectContainment(set2, "[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the [:Latin:] syntax.
|
||||
*/
|
||||
void UnicodeSetTest::TestPropertySet() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeSet set("[:Latin:]", status);
|
||||
if (U_FAILURE(status)) { errln("FAIL"); return; }
|
||||
expectContainment(set, "aA", CharsToUnicodeString("\\u0391\\u03B1"));
|
||||
set.applyPattern("[\\p{Greek}]", status);
|
||||
if (U_FAILURE(status)) { errln("FAIL"); return; }
|
||||
expectContainment(set, CharsToUnicodeString("\\u0391\\u03B1"), "aA");
|
||||
set.applyPattern("\\P{ GENERAL Category = upper case letter }", status);
|
||||
if (U_FAILURE(status)) { errln("FAIL"); return; }
|
||||
expectContainment(set, "abc", "ABC");
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestExhaustive() {
|
||||
// exhaustive tests. Simulate UnicodeSets with integers.
|
||||
// That gives us very solid tests (except for large memory tests).
|
||||
@ -569,6 +588,15 @@ UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
|
||||
return pairs;
|
||||
}
|
||||
|
||||
void
|
||||
UnicodeSetTest::expectContainment(const UnicodeSet& set,
|
||||
const UnicodeString& charsIn,
|
||||
const UnicodeString& charsOut) {
|
||||
UnicodeString pat;
|
||||
set.toPattern(pat);
|
||||
expectContainment(set, pat, charsIn, charsOut);
|
||||
}
|
||||
|
||||
void
|
||||
UnicodeSetTest::expectContainment(const UnicodeSet& set,
|
||||
const UnicodeString& setName,
|
||||
|
@ -41,6 +41,11 @@ private:
|
||||
|
||||
void TestScriptSet(void);
|
||||
|
||||
/**
|
||||
* Test the [:Latin:] syntax.
|
||||
*/
|
||||
void TestPropertySet(void);
|
||||
|
||||
void TestExhaustive(void);
|
||||
|
||||
private:
|
||||
@ -79,6 +84,9 @@ private:
|
||||
*/
|
||||
static UnicodeString getPairs(const UnicodeSet& set);
|
||||
|
||||
void expectContainment(const UnicodeSet& set,
|
||||
const UnicodeString& charsIn,
|
||||
const UnicodeString& charsOut);
|
||||
void expectContainment(const UnicodeSet& set,
|
||||
const UnicodeString& setName,
|
||||
const UnicodeString& charsIn,
|
||||
|
Loading…
Reference in New Issue
Block a user