ICU-199 rewrite UnicodeSet pattern parser
X-SVN-Rev: 547
This commit is contained in:
parent
b06425886b
commit
13955b6ce2
@ -14,6 +14,7 @@
|
||||
#include "unicode/unistr.h"
|
||||
|
||||
class ParsePosition;
|
||||
class TransliterationRuleData;
|
||||
|
||||
/**
|
||||
* A mutable set of Unicode characters. Objects of this class
|
||||
@ -203,6 +204,24 @@ class U_I18N_API UnicodeSet {
|
||||
*/
|
||||
static UnicodeString* CATEGORY_PAIRS_CACHE;
|
||||
|
||||
/**
|
||||
* Delimiter string used in patterns to close a category reference:
|
||||
* ":]". Example: "[:Lu:]".
|
||||
*/
|
||||
static const UnicodeString CATEGORY_CLOSE;
|
||||
|
||||
/**
|
||||
* Delimiter char beginning a variable reference:
|
||||
* "{". Example: "{var}".
|
||||
*/
|
||||
static const UChar VARIABLE_REF_OPEN;
|
||||
|
||||
/**
|
||||
* Delimiter char ending a variable reference:
|
||||
* "}". Example: "{var}".
|
||||
*/
|
||||
static const UChar VARIABLE_REF_CLOSE;
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Debugging and testing
|
||||
//----------------------------------------------------------------
|
||||
@ -503,6 +522,7 @@ private:
|
||||
static UnicodeString& parse(UnicodeString& pairsBuf /*result*/,
|
||||
const UnicodeString& pattern,
|
||||
ParsePosition& pos,
|
||||
const TransliterationRuleData* data,
|
||||
UErrorCode& status);
|
||||
|
||||
//----------------------------------------------------------------
|
||||
|
@ -10,6 +10,7 @@
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/parsepos.h"
|
||||
#include "rbt_data.h"
|
||||
|
||||
// N.B.: This mapping is different in ICU and Java
|
||||
const UnicodeString UnicodeSet::CATEGORY_NAMES(
|
||||
@ -23,6 +24,24 @@ const UnicodeString UnicodeSet::CATEGORY_NAMES(
|
||||
UnicodeString* UnicodeSet::CATEGORY_PAIRS_CACHE =
|
||||
new UnicodeString[Unicode::GENERAL_TYPES_COUNT];
|
||||
|
||||
/**
|
||||
* Delimiter string used in patterns to close a category reference:
|
||||
* ":]". Example: "[:Lu:]".
|
||||
*/
|
||||
const UnicodeString UnicodeSet::CATEGORY_CLOSE(":]", "");
|
||||
|
||||
/**
|
||||
* Delimiter char beginning a variable reference:
|
||||
* "{". Example: "{var}".
|
||||
*/
|
||||
const UChar UnicodeSet::VARIABLE_REF_OPEN = '{';
|
||||
|
||||
/**
|
||||
* Delimiter char ending a variable reference:
|
||||
* "}". Example: "{var}".
|
||||
*/
|
||||
const UChar UnicodeSet::VARIABLE_REF_CLOSE = '}';
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Debugging and testing
|
||||
//----------------------------------------------------------------
|
||||
@ -175,7 +194,14 @@ void UnicodeSet::applyPattern(const UnicodeString& pattern,
|
||||
}
|
||||
}
|
||||
|
||||
parse(pairs, *pat, pos, status);
|
||||
parse(pairs, *pat, pos, NULL, status);
|
||||
|
||||
// Skip over trailing whitespace -- clean up later
|
||||
while (pos.getIndex() < pat->length() &&
|
||||
Unicode::isWhitespace(pat->charAt(pos.getIndex()))) {
|
||||
pos.setIndex(pos.getIndex() + 1);
|
||||
}
|
||||
|
||||
if (pos.getIndex() != pat->length()) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
@ -418,6 +444,7 @@ void UnicodeSet::clear(void) {
|
||||
UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
|
||||
const UnicodeString& pattern,
|
||||
ParsePosition& pos,
|
||||
const TransliterationRuleData* data,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return pairsBuf;
|
||||
@ -426,34 +453,96 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
|
||||
bool_t invert = FALSE;
|
||||
pairsBuf.remove();
|
||||
|
||||
/**
|
||||
* Nodes: 0 - idle, waiting for '['
|
||||
* 10 - like 11, but immediately after "[" or "[^"
|
||||
* 11 - awaiting x, "]", "[...]", or "[:...:]"
|
||||
* 21 - after x
|
||||
* 23 - after x-
|
||||
*
|
||||
* The parsing state machine moves from node 0 through zero or more
|
||||
* other nodes back to node 0, in a U_SUCCESSful parse.
|
||||
int32_t lastChar = -1; // This is either a char (0..FFFF) or -1
|
||||
UChar lastOp = 0;
|
||||
|
||||
/* This loop iterates over the characters in the pattern. We start at
|
||||
* the position specified by pos. We exit the loop when either a
|
||||
* matching closing ']' is seen, or we read all characters of the
|
||||
* pattern. In the latter case an error will be thrown.
|
||||
*/
|
||||
int32_t node = 0;
|
||||
UChar first = 0;
|
||||
int32_t i;
|
||||
/**
|
||||
* This loop iterates over the characters in the pattern. We
|
||||
* start at the position specified by pos. We exit the loop
|
||||
* when either a matching closing ']' is seen, or we read all
|
||||
* characters of the pattern.
|
||||
|
||||
/* Pattern syntax:
|
||||
* pat := '[' '^'? elem* ']'
|
||||
* elem := a | a '-' a | set | set op set
|
||||
* set := pat | (a set variable)
|
||||
* op := '&' | '-'
|
||||
* a := (a character, possibly defined by a var)
|
||||
*/
|
||||
for (i=pos.getIndex(); i<pattern.length(); ++i) {
|
||||
UChar c = pattern.charAt(i); /**
|
||||
* Handle escapes here. If a character is escaped, then
|
||||
* it assumes its literal value. This is true for all
|
||||
* characters, both special characters and characters with
|
||||
* no special meaning. We also interpret '\\uxxxx' Unicode
|
||||
* escapes here.
|
||||
|
||||
// mode 0: No chars parsed yet; next must be '['
|
||||
// mode 1: '[' seen; if next is '^' or ':' then special
|
||||
// mode 2: '[' '^'? seen; parse pattern and close with ']'
|
||||
// mode 3: '[:' seen; parse category and close with ':]'
|
||||
int8_t mode = 0;
|
||||
int32_t openPos = 0; // offset to opening '['
|
||||
int32_t i = pos.getIndex();
|
||||
int32_t limit = pattern.length();
|
||||
UnicodeString nestedAux;
|
||||
UnicodeString* nestedPairs;
|
||||
UnicodeString scratch;
|
||||
for (; i<limit; ++i) {
|
||||
/* If the next element is a single character, c will be set to it,
|
||||
* and nestedPairs will be null. In this case isLiteral indicates
|
||||
* whether the character should assume special meaning if it has
|
||||
* one. If the next element is a nested set, either via a variable
|
||||
* reference, or via an embedded "[..]" or "[:..:]" pattern, then
|
||||
* nestedPairs will be set to the pairs list for the nested set, and
|
||||
* c's value should be ignored.
|
||||
*/
|
||||
UChar c = pattern.charAt(i);
|
||||
nestedPairs = NULL;
|
||||
bool_t isLiteral = FALSE;
|
||||
|
||||
// Ignore whitespace. This is not Unicode whitespace, but Java
|
||||
// whitespace, a subset of Unicode whitespace.
|
||||
if (Unicode::isWhitespace(c)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Parse the opening '[' and optional following '^'
|
||||
switch (mode) {
|
||||
case 0:
|
||||
if (c == '[') {
|
||||
mode = 1; // Next look for '^'
|
||||
openPos = i;
|
||||
continue;
|
||||
} else {
|
||||
// throw new IllegalArgumentException("Missing opening '['");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
case 1:
|
||||
mode = 2;
|
||||
switch (c) {
|
||||
case '^':
|
||||
invert = TRUE;
|
||||
continue; // Back to top to fetch next character
|
||||
case ':':
|
||||
if (i == openPos+1) {
|
||||
// '[:' cannot have whitespace in it
|
||||
--i;
|
||||
c = '[';
|
||||
mode = 3;
|
||||
// Fall through and parse category normally
|
||||
}
|
||||
break; // Fall through
|
||||
case '-':
|
||||
isLiteral = TRUE; // Treat leading '-' as a literal
|
||||
break; // Fall through
|
||||
}
|
||||
// else fall through and parse this character normally
|
||||
}
|
||||
|
||||
// After opening matter is parsed ("[", "[^", or "[:"), the mode
|
||||
// will be 2 if we want a closing ']', or 3 if we should parse a
|
||||
// category and close with ":]".
|
||||
|
||||
/* Handle escapes. If a character is escaped, then it assumes its
|
||||
* literal value. This is true for all characters, both special
|
||||
* characters and characters with no special meaning. We also
|
||||
* interpret '\\uxxxx' Unicode escapes here (as literals).
|
||||
*/
|
||||
if (c == '\\') {
|
||||
++i;
|
||||
if (i < pattern.length()) {
|
||||
@ -480,216 +569,164 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
|
||||
return pairsBuf;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Within this loop, we handle each of the four
|
||||
* conditions: '[', ']', '-', other. The first three
|
||||
* characters must not be escaped.
|
||||
*/
|
||||
|
||||
/**
|
||||
* An opening bracket indicates either the first bracket
|
||||
* of the entire subpattern we are parsing, in which case
|
||||
* we are in node 0 and move into node 10. We also check
|
||||
* for an immediately following '^', indicating the
|
||||
* complement of the following pattern. ('^' is any other
|
||||
* position has no special meaning.) If we are not in
|
||||
* node 0, '[' represents a nested subpattern that must be
|
||||
* recursively parsed and checked for following operators
|
||||
* ('&' or '|'). If two nested subpatterns follow one
|
||||
* another with no operator, their union is formed, just
|
||||
* as with any other elements that follow one another
|
||||
* without intervening operator. The other thing we
|
||||
* handle here is the syntax "[:Xx:]" or "[:X:]" that
|
||||
* indicates a Unicode category or supercategory.
|
||||
/* Parse variable references. These are treated as literals. If a
|
||||
* variable refers to a UnicodeSet, nestedPairs is assigned here.
|
||||
* Variable names are only parsed if varNameToChar is not null.
|
||||
* Set variables are only looked up if varCharToSet is not null.
|
||||
*/
|
||||
if (!isLiteral && c == '[') {
|
||||
bool_t parseOp = FALSE;
|
||||
else if (data != NULL && !isLiteral && c == VARIABLE_REF_OPEN) {
|
||||
++i;
|
||||
int32_t j = pattern.indexOf(VARIABLE_REF_CLOSE, i);
|
||||
if (i == j || j < 0) { // empty or unterminated
|
||||
// throw new IllegalArgumentException("Illegal variable reference");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
} else {
|
||||
scratch.truncate(0);
|
||||
pattern.extractBetween(i, j, scratch);
|
||||
++j;
|
||||
c = data->lookupVariable(scratch, status);
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
// Either the reference was ill-formed (empty name, or no
|
||||
// closing '}', or the specified name is not defined.
|
||||
return pairsBuf;
|
||||
}
|
||||
isLiteral = TRUE;
|
||||
|
||||
UnicodeSet* set = data->lookupSet(c);
|
||||
if (set != NULL) {
|
||||
nestedPairs = &set->pairs;
|
||||
}
|
||||
}
|
||||
|
||||
/* An opening bracket indicates the first bracket of a nested
|
||||
* subpattern, either a normal pattern or a category pattern. We
|
||||
* recognize these here and set nestedPairs accordingly.
|
||||
*/
|
||||
else if (!isLiteral && c == '[') {
|
||||
// Handle "[:...:]", representing a character category
|
||||
UChar d = charAfter(pattern, i);
|
||||
// "[:...:]" represents a character category
|
||||
if (d == ':') {
|
||||
if (node == 23) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
if (node == 21) {
|
||||
addPair(pairsBuf, first, first);
|
||||
node = 11;
|
||||
}
|
||||
i += 2;
|
||||
int32_t j = pattern.indexOf(":]", i);
|
||||
int32_t j = pattern.indexOf(CATEGORY_CLOSE, i);
|
||||
if (j < 0) {
|
||||
// throw new IllegalArgumentException("Missing \":]\"");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
UnicodeString categoryName;
|
||||
pattern.extract(i, j-i, categoryName);
|
||||
UnicodeString temp;
|
||||
doUnion(pairsBuf,
|
||||
getCategoryPairs(temp, categoryName, status));
|
||||
scratch.truncate(0);
|
||||
pattern.extractBetween(i, j, scratch);
|
||||
nestedPairs = &getCategoryPairs(nestedAux, scratch, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return pairsBuf;
|
||||
}
|
||||
i = j+1;
|
||||
if (node == 10) {
|
||||
node = 11;
|
||||
parseOp = TRUE;
|
||||
} else if (node == 0) {
|
||||
i = j+1; // Make i point to ']'
|
||||
if (mode == 3) {
|
||||
// Entire pattern is a category; leave parse loop
|
||||
pairsBuf.append(*nestedPairs);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if (node == 0) {
|
||||
node = 10;
|
||||
if (d == '^') {
|
||||
invert = TRUE;
|
||||
++i;
|
||||
}
|
||||
} else {
|
||||
// Nested '['
|
||||
pos.setIndex(i);
|
||||
UnicodeString subPairs; // Pairs for the nested []
|
||||
doUnion(pairsBuf, parse(subPairs, pattern, pos, status));
|
||||
if (U_FAILURE(status)) {
|
||||
return pairsBuf;
|
||||
}
|
||||
i = pos.getIndex() - 1; // Subtract 1 to point at ']'
|
||||
parseOp = TRUE;
|
||||
// Recurse to get the pairs for this nested set.
|
||||
pos.setIndex(i);
|
||||
nestedPairs = &parse(nestedAux, pattern, pos, data, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return pairsBuf;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* parseOp is true after "[:...:]" or a nested
|
||||
* "[...]". It is false only after the final closing
|
||||
* ']'. If parseOp is true, we look past the closing
|
||||
* ']' to see if we have an operator character. If
|
||||
* so, we parse the subsequent "[...]" recursively,
|
||||
* then perform the operation. We do this in a loop
|
||||
* until there are no more operators. Note that this
|
||||
* means the operators have equal precedence and are
|
||||
* bound left-to-right.
|
||||
*/
|
||||
if (parseOp) {
|
||||
for (;;) {
|
||||
// Is the next character an operator?
|
||||
UChar op = charAfter(pattern, i);
|
||||
if (op == '-' || op == '&') {
|
||||
pos.setIndex(i+2); // Add 2 to point AFTER op
|
||||
UnicodeString rhs;
|
||||
parse(rhs, pattern, pos, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return pairsBuf;
|
||||
}
|
||||
if (op == '-') {
|
||||
doDifference(pairsBuf, rhs);
|
||||
} else if (op == '&') {
|
||||
doIntersection(pairsBuf, rhs);
|
||||
}
|
||||
i = pos.getIndex() - 1; // - 1 to point at ']'
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/**
|
||||
* A closing bracket can only be a closing bracket for
|
||||
* "[...]", since the closing bracket for "[:...:]" is
|
||||
* taken care of when the initial "[:" is seen. When we
|
||||
* see a closing bracket, we then know, if we were in node
|
||||
* 21 (after x) or 23 (after x-) that nothing more is
|
||||
* coming, and we add the last character(s) we saw to the
|
||||
* set. Note that a trailing '-' assumes its literal
|
||||
* meaning, just as a leading '-' after "[" or "[^".
|
||||
*/
|
||||
else if (!isLiteral && c == ']') {
|
||||
if (node == 0) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
if (node == 21 || node == 23) {
|
||||
addPair(pairsBuf, first, first);
|
||||
if (node == 23) {
|
||||
addPair(pairsBuf, '-', '-');
|
||||
}
|
||||
}
|
||||
node = 0;
|
||||
break;
|
||||
}
|
||||
/**
|
||||
* '-' has the following interpretations: 1. Within
|
||||
* "[...]", between two letters, it indicates a range.
|
||||
* 2. Between two nested bracket patterns, "[[...]-[...]",
|
||||
* it indicates asymmetric difference. 3. At the start of
|
||||
* a bracket pattern, "[-...]", "[^-...]", it indicates
|
||||
* the literal character '-'. 4. At the end of a bracket
|
||||
* pattern, "[...-]", it indicates the literal character
|
||||
* '-'.
|
||||
*
|
||||
* We handle cases 1 and 3 here. Cases 2 and 4 are
|
||||
* handled in the ']' parsing code.
|
||||
*/
|
||||
else if (!isLiteral && c == '-') {
|
||||
if (node == 10) {
|
||||
addPair(pairsBuf, c, c); // Handle "[-...]", "[^-...]"
|
||||
} else if (node == 21) {
|
||||
node = 23;
|
||||
} else {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
i = pos.getIndex() - 1; // - 1 to point at ']'
|
||||
}
|
||||
}
|
||||
/**
|
||||
* If we fall through to this point, we have a literal
|
||||
* character, either one that has been escaped with a
|
||||
* backslash, escaped with a backslash u, or that isn't
|
||||
* a special '[', ']', or '-'.
|
||||
*
|
||||
* Literals can either start a range "x-...", end a range,
|
||||
* "...-x", or indicate a single character "x".
|
||||
|
||||
/* At this point we have either a character c, or a nested set. If
|
||||
* we have encountered a nested set, either embedded in the pattern,
|
||||
* or as a variable, we have a non-null nestedPairs, and c should be
|
||||
* ignored. Otherwise c is the current character, and isLiteral
|
||||
* indicates whether it is an escaped literal (or variable) or a
|
||||
* normal unescaped character. Unescaped characters '-', '&', and
|
||||
* ']' have special meanings.
|
||||
*/
|
||||
else {
|
||||
if (node == 10 || node == 11) {
|
||||
first = c;
|
||||
node = 21;
|
||||
} else if (node == 21) {
|
||||
addPair(pairsBuf, first, first);
|
||||
first = c;
|
||||
node = 21;
|
||||
} else if (node == 23) {
|
||||
if (c < first) {
|
||||
if (nestedPairs != NULL) {
|
||||
if (lastChar >= 0) {
|
||||
if (lastOp != 0) {
|
||||
// throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
addPair(pairsBuf, first, c);
|
||||
node = 11;
|
||||
} else {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
addPair(pairsBuf, (UChar)lastChar, (UChar)lastChar);
|
||||
lastChar = -1;
|
||||
}
|
||||
switch (lastOp) {
|
||||
case '-':
|
||||
doDifference(pairsBuf, *nestedPairs);
|
||||
break;
|
||||
case '&':
|
||||
doIntersection(pairsBuf, *nestedPairs);
|
||||
break;
|
||||
case 0:
|
||||
doUnion(pairsBuf, *nestedPairs);
|
||||
break;
|
||||
}
|
||||
lastOp = 0;
|
||||
} else if (!isLiteral && c == ']') {
|
||||
// Final closing delimiter. This is the only way we leave this
|
||||
// loop if the pattern is well-formed.
|
||||
break;
|
||||
} else if (lastOp == 0 && !isLiteral && (c == '-' || c == '&')) {
|
||||
lastOp = c;
|
||||
} else if (lastOp == '-') {
|
||||
addPair(pairsBuf, (UChar)lastChar, c);
|
||||
lastOp = 0;
|
||||
lastChar = -1;
|
||||
} else if (lastOp != 0) {
|
||||
// We have <set>&<char> or <char>&<char>
|
||||
// throw new IllegalArgumentException("Unquoted " + lastOp);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
} else {
|
||||
if (lastChar >= 0) {
|
||||
// We have <char><char>
|
||||
addPair(pairsBuf, (UChar)lastChar, (UChar)lastChar);
|
||||
}
|
||||
lastChar = c;
|
||||
}
|
||||
}
|
||||
|
||||
if (node != 0) {
|
||||
// Handle unprocessed stuff preceding the closing ']'
|
||||
if (lastOp == '-') {
|
||||
// Trailing '-' is treated as literal
|
||||
addPair(pairsBuf, lastOp, lastOp);
|
||||
} else if (lastOp == '&') {
|
||||
// throw new IllegalArgumentException("Unquoted trailing " + lastOp);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
if (lastChar >= 0) {
|
||||
addPair(pairsBuf, (UChar)lastChar, (UChar)lastChar);
|
||||
}
|
||||
|
||||
/**
|
||||
* i indexes the last character we parsed or is
|
||||
* pattern.length(). In the latter case, the node will not be
|
||||
* zero, since we have run off the end without finding a
|
||||
* closing ']'. Therefore, the above statement will have
|
||||
* thrown an exception, and we'll never get here. If we get
|
||||
* here, we know i < pattern.length(), and we set the
|
||||
* ParsePosition to the next character to be parsed.
|
||||
*/
|
||||
pos.setIndex(i+1);
|
||||
/**
|
||||
* If we saw a '^' after the initial '[' of this pattern, then
|
||||
* perform the complement. (Inversion after '[:' is handled
|
||||
* elsewhere.)
|
||||
* If we saw a '^' after the initial '[' of this pattern, then perform
|
||||
* the complement. (Inversion after '[:' is handled elsewhere.)
|
||||
*/
|
||||
if (invert) {
|
||||
doComplement(pairsBuf);
|
||||
}
|
||||
|
||||
/**
|
||||
* i indexes the last character we parsed or is pattern.length(). In
|
||||
* the latter case, we have run off the end without finding a closing
|
||||
* ']'. Otherwise, we know i < pattern.length(), and we set the
|
||||
* ParsePosition to the next character to be parsed.
|
||||
*/
|
||||
if (i == limit) {
|
||||
// throw new IllegalArgumentException("Missing ']'");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
|
||||
pos.setIndex(i+1);
|
||||
|
||||
return pairsBuf;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user