ICU-199 rewrite UnicodeSet pattern parser

X-SVN-Rev: 547
This commit is contained in:
Alan Liu 2000-01-12 18:03:53 +00:00
parent b06425886b
commit 13955b6ce2
2 changed files with 257 additions and 200 deletions

View File

@ -14,6 +14,7 @@
#include "unicode/unistr.h"
class ParsePosition;
class TransliterationRuleData;
/**
* A mutable set of Unicode characters. Objects of this class
@ -203,6 +204,24 @@ class U_I18N_API UnicodeSet {
*/
static UnicodeString* CATEGORY_PAIRS_CACHE;
/**
* Delimiter string used in patterns to close a category reference:
* ":]". Example: "[:Lu:]".
*/
static const UnicodeString CATEGORY_CLOSE;
/**
* Delimiter char beginning a variable reference:
* "{". Example: "{var}".
*/
static const UChar VARIABLE_REF_OPEN;
/**
* Delimiter char ending a variable reference:
* "}". Example: "{var}".
*/
static const UChar VARIABLE_REF_CLOSE;
//----------------------------------------------------------------
// Debugging and testing
//----------------------------------------------------------------
@ -503,6 +522,7 @@ private:
static UnicodeString& parse(UnicodeString& pairsBuf /*result*/,
const UnicodeString& pattern,
ParsePosition& pos,
const TransliterationRuleData* data,
UErrorCode& status);
//----------------------------------------------------------------

View File

@ -10,6 +10,7 @@
#include "unicode/uniset.h"
#include "unicode/parsepos.h"
#include "rbt_data.h"
// N.B.: This mapping is different in ICU and Java
const UnicodeString UnicodeSet::CATEGORY_NAMES(
@ -23,6 +24,24 @@ const UnicodeString UnicodeSet::CATEGORY_NAMES(
UnicodeString* UnicodeSet::CATEGORY_PAIRS_CACHE =
new UnicodeString[Unicode::GENERAL_TYPES_COUNT];
/**
* Delimiter string used in patterns to close a category reference:
* ":]". Example: "[:Lu:]".
*/
const UnicodeString UnicodeSet::CATEGORY_CLOSE(":]", "");
/**
* Delimiter char beginning a variable reference:
* "{". Example: "{var}".
*/
const UChar UnicodeSet::VARIABLE_REF_OPEN = '{';
/**
* Delimiter char ending a variable reference:
* "}". Example: "{var}".
*/
const UChar UnicodeSet::VARIABLE_REF_CLOSE = '}';
//----------------------------------------------------------------
// Debugging and testing
//----------------------------------------------------------------
@ -175,7 +194,14 @@ void UnicodeSet::applyPattern(const UnicodeString& pattern,
}
}
parse(pairs, *pat, pos, status);
parse(pairs, *pat, pos, NULL, status);
// Skip over trailing whitespace -- clean up later
while (pos.getIndex() < pat->length() &&
Unicode::isWhitespace(pat->charAt(pos.getIndex()))) {
pos.setIndex(pos.getIndex() + 1);
}
if (pos.getIndex() != pat->length()) {
status = U_ILLEGAL_ARGUMENT_ERROR;
}
@ -418,6 +444,7 @@ void UnicodeSet::clear(void) {
UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
const UnicodeString& pattern,
ParsePosition& pos,
const TransliterationRuleData* data,
UErrorCode& status) {
if (U_FAILURE(status)) {
return pairsBuf;
@ -426,34 +453,96 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
bool_t invert = FALSE;
pairsBuf.remove();
/**
* Nodes: 0 - idle, waiting for '['
* 10 - like 11, but immediately after "[" or "[^"
* 11 - awaiting x, "]", "[...]", or "[:...:]"
* 21 - after x
* 23 - after x-
*
* The parsing state machine moves from node 0 through zero or more
* other nodes back to node 0, in a U_SUCCESSful parse.
int32_t lastChar = -1; // This is either a char (0..FFFF) or -1
UChar lastOp = 0;
/* This loop iterates over the characters in the pattern. We start at
* the position specified by pos. We exit the loop when either a
* matching closing ']' is seen, or we read all characters of the
* pattern. In the latter case an error will be thrown.
*/
int32_t node = 0;
UChar first = 0;
int32_t i;
/**
* This loop iterates over the characters in the pattern. We
* start at the position specified by pos. We exit the loop
* when either a matching closing ']' is seen, or we read all
* characters of the pattern.
/* Pattern syntax:
* pat := '[' '^'? elem* ']'
* elem := a | a '-' a | set | set op set
* set := pat | (a set variable)
* op := '&' | '-'
* a := (a character, possibly defined by a var)
*/
for (i=pos.getIndex(); i<pattern.length(); ++i) {
UChar c = pattern.charAt(i); /**
* Handle escapes here. If a character is escaped, then
* it assumes its literal value. This is true for all
* characters, both special characters and characters with
* no special meaning. We also interpret '\\uxxxx' Unicode
* escapes here.
// mode 0: No chars parsed yet; next must be '['
// mode 1: '[' seen; if next is '^' or ':' then special
// mode 2: '[' '^'? seen; parse pattern and close with ']'
// mode 3: '[:' seen; parse category and close with ':]'
int8_t mode = 0;
int32_t openPos = 0; // offset to opening '['
int32_t i = pos.getIndex();
int32_t limit = pattern.length();
UnicodeString nestedAux;
UnicodeString* nestedPairs;
UnicodeString scratch;
for (; i<limit; ++i) {
/* If the next element is a single character, c will be set to it,
* and nestedPairs will be null. In this case isLiteral indicates
* whether the character should assume special meaning if it has
* one. If the next element is a nested set, either via a variable
* reference, or via an embedded "[..]" or "[:..:]" pattern, then
* nestedPairs will be set to the pairs list for the nested set, and
* c's value should be ignored.
*/
UChar c = pattern.charAt(i);
nestedPairs = NULL;
bool_t isLiteral = FALSE;
// Ignore whitespace. This is not Unicode whitespace, but Java
// whitespace, a subset of Unicode whitespace.
if (Unicode::isWhitespace(c)) {
continue;
}
// Parse the opening '[' and optional following '^'
switch (mode) {
case 0:
if (c == '[') {
mode = 1; // Next look for '^'
openPos = i;
continue;
} else {
// throw new IllegalArgumentException("Missing opening '['");
status = U_ILLEGAL_ARGUMENT_ERROR;
return pairsBuf;
}
case 1:
mode = 2;
switch (c) {
case '^':
invert = TRUE;
continue; // Back to top to fetch next character
case ':':
if (i == openPos+1) {
// '[:' cannot have whitespace in it
--i;
c = '[';
mode = 3;
// Fall through and parse category normally
}
break; // Fall through
case '-':
isLiteral = TRUE; // Treat leading '-' as a literal
break; // Fall through
}
// else fall through and parse this character normally
}
// After opening matter is parsed ("[", "[^", or "[:"), the mode
// will be 2 if we want a closing ']', or 3 if we should parse a
// category and close with ":]".
/* Handle escapes. If a character is escaped, then it assumes its
* literal value. This is true for all characters, both special
* characters and characters with no special meaning. We also
* interpret '\\uxxxx' Unicode escapes here (as literals).
*/
if (c == '\\') {
++i;
if (i < pattern.length()) {
@ -480,216 +569,164 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
return pairsBuf;
}
}
/**
* Within this loop, we handle each of the four
* conditions: '[', ']', '-', other. The first three
* characters must not be escaped.
*/
/**
* An opening bracket indicates either the first bracket
* of the entire subpattern we are parsing, in which case
* we are in node 0 and move into node 10. We also check
* for an immediately following '^', indicating the
* complement of the following pattern. ('^' is any other
* position has no special meaning.) If we are not in
* node 0, '[' represents a nested subpattern that must be
* recursively parsed and checked for following operators
* ('&' or '|'). If two nested subpatterns follow one
* another with no operator, their union is formed, just
* as with any other elements that follow one another
* without intervening operator. The other thing we
* handle here is the syntax "[:Xx:]" or "[:X:]" that
* indicates a Unicode category or supercategory.
/* Parse variable references. These are treated as literals. If a
* variable refers to a UnicodeSet, nestedPairs is assigned here.
* Variable names are only parsed if varNameToChar is not null.
* Set variables are only looked up if varCharToSet is not null.
*/
if (!isLiteral && c == '[') {
bool_t parseOp = FALSE;
else if (data != NULL && !isLiteral && c == VARIABLE_REF_OPEN) {
++i;
int32_t j = pattern.indexOf(VARIABLE_REF_CLOSE, i);
if (i == j || j < 0) { // empty or unterminated
// throw new IllegalArgumentException("Illegal variable reference");
status = U_ILLEGAL_ARGUMENT_ERROR;
} else {
scratch.truncate(0);
pattern.extractBetween(i, j, scratch);
++j;
c = data->lookupVariable(scratch, status);
}
if (U_FAILURE(status)) {
// Either the reference was ill-formed (empty name, or no
// closing '}', or the specified name is not defined.
return pairsBuf;
}
isLiteral = TRUE;
UnicodeSet* set = data->lookupSet(c);
if (set != NULL) {
nestedPairs = &set->pairs;
}
}
/* An opening bracket indicates the first bracket of a nested
* subpattern, either a normal pattern or a category pattern. We
* recognize these here and set nestedPairs accordingly.
*/
else if (!isLiteral && c == '[') {
// Handle "[:...:]", representing a character category
UChar d = charAfter(pattern, i);
// "[:...:]" represents a character category
if (d == ':') {
if (node == 23) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return pairsBuf;
}
if (node == 21) {
addPair(pairsBuf, first, first);
node = 11;
}
i += 2;
int32_t j = pattern.indexOf(":]", i);
int32_t j = pattern.indexOf(CATEGORY_CLOSE, i);
if (j < 0) {
// throw new IllegalArgumentException("Missing \":]\"");
status = U_ILLEGAL_ARGUMENT_ERROR;
return pairsBuf;
}
UnicodeString categoryName;
pattern.extract(i, j-i, categoryName);
UnicodeString temp;
doUnion(pairsBuf,
getCategoryPairs(temp, categoryName, status));
scratch.truncate(0);
pattern.extractBetween(i, j, scratch);
nestedPairs = &getCategoryPairs(nestedAux, scratch, status);
if (U_FAILURE(status)) {
return pairsBuf;
}
i = j+1;
if (node == 10) {
node = 11;
parseOp = TRUE;
} else if (node == 0) {
i = j+1; // Make i point to ']'
if (mode == 3) {
// Entire pattern is a category; leave parse loop
pairsBuf.append(*nestedPairs);
break;
}
} else {
if (node == 0) {
node = 10;
if (d == '^') {
invert = TRUE;
++i;
}
} else {
// Nested '['
pos.setIndex(i);
UnicodeString subPairs; // Pairs for the nested []
doUnion(pairsBuf, parse(subPairs, pattern, pos, status));
if (U_FAILURE(status)) {
return pairsBuf;
}
i = pos.getIndex() - 1; // Subtract 1 to point at ']'
parseOp = TRUE;
// Recurse to get the pairs for this nested set.
pos.setIndex(i);
nestedPairs = &parse(nestedAux, pattern, pos, data, status);
if (U_FAILURE(status)) {
return pairsBuf;
}
}
/**
* parseOp is true after "[:...:]" or a nested
* "[...]". It is false only after the final closing
* ']'. If parseOp is true, we look past the closing
* ']' to see if we have an operator character. If
* so, we parse the subsequent "[...]" recursively,
* then perform the operation. We do this in a loop
* until there are no more operators. Note that this
* means the operators have equal precedence and are
* bound left-to-right.
*/
if (parseOp) {
for (;;) {
// Is the next character an operator?
UChar op = charAfter(pattern, i);
if (op == '-' || op == '&') {
pos.setIndex(i+2); // Add 2 to point AFTER op
UnicodeString rhs;
parse(rhs, pattern, pos, status);
if (U_FAILURE(status)) {
return pairsBuf;
}
if (op == '-') {
doDifference(pairsBuf, rhs);
} else if (op == '&') {
doIntersection(pairsBuf, rhs);
}
i = pos.getIndex() - 1; // - 1 to point at ']'
} else {
break;
}
}
}
}
/**
* A closing bracket can only be a closing bracket for
* "[...]", since the closing bracket for "[:...:]" is
* taken care of when the initial "[:" is seen. When we
* see a closing bracket, we then know, if we were in node
* 21 (after x) or 23 (after x-) that nothing more is
* coming, and we add the last character(s) we saw to the
* set. Note that a trailing '-' assumes its literal
* meaning, just as a leading '-' after "[" or "[^".
*/
else if (!isLiteral && c == ']') {
if (node == 0) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return pairsBuf;
}
if (node == 21 || node == 23) {
addPair(pairsBuf, first, first);
if (node == 23) {
addPair(pairsBuf, '-', '-');
}
}
node = 0;
break;
}
/**
* '-' has the following interpretations: 1. Within
* "[...]", between two letters, it indicates a range.
* 2. Between two nested bracket patterns, "[[...]-[...]",
* it indicates asymmetric difference. 3. At the start of
* a bracket pattern, "[-...]", "[^-...]", it indicates
* the literal character '-'. 4. At the end of a bracket
* pattern, "[...-]", it indicates the literal character
* '-'.
*
* We handle cases 1 and 3 here. Cases 2 and 4 are
* handled in the ']' parsing code.
*/
else if (!isLiteral && c == '-') {
if (node == 10) {
addPair(pairsBuf, c, c); // Handle "[-...]", "[^-...]"
} else if (node == 21) {
node = 23;
} else {
status = U_ILLEGAL_ARGUMENT_ERROR;
return pairsBuf;
i = pos.getIndex() - 1; // - 1 to point at ']'
}
}
/**
* If we fall through to this point, we have a literal
* character, either one that has been escaped with a
* backslash, escaped with a backslash u, or that isn't
* a special '[', ']', or '-'.
*
* Literals can either start a range "x-...", end a range,
* "...-x", or indicate a single character "x".
/* At this point we have either a character c, or a nested set. If
* we have encountered a nested set, either embedded in the pattern,
* or as a variable, we have a non-null nestedPairs, and c should be
* ignored. Otherwise c is the current character, and isLiteral
* indicates whether it is an escaped literal (or variable) or a
* normal unescaped character. Unescaped characters '-', '&', and
* ']' have special meanings.
*/
else {
if (node == 10 || node == 11) {
first = c;
node = 21;
} else if (node == 21) {
addPair(pairsBuf, first, first);
first = c;
node = 21;
} else if (node == 23) {
if (c < first) {
if (nestedPairs != NULL) {
if (lastChar >= 0) {
if (lastOp != 0) {
// throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
status = U_ILLEGAL_ARGUMENT_ERROR;
return pairsBuf;
}
addPair(pairsBuf, first, c);
node = 11;
} else {
status = U_ILLEGAL_ARGUMENT_ERROR;
return pairsBuf;
addPair(pairsBuf, (UChar)lastChar, (UChar)lastChar);
lastChar = -1;
}
switch (lastOp) {
case '-':
doDifference(pairsBuf, *nestedPairs);
break;
case '&':
doIntersection(pairsBuf, *nestedPairs);
break;
case 0:
doUnion(pairsBuf, *nestedPairs);
break;
}
lastOp = 0;
} else if (!isLiteral && c == ']') {
// Final closing delimiter. This is the only way we leave this
// loop if the pattern is well-formed.
break;
} else if (lastOp == 0 && !isLiteral && (c == '-' || c == '&')) {
lastOp = c;
} else if (lastOp == '-') {
addPair(pairsBuf, (UChar)lastChar, c);
lastOp = 0;
lastChar = -1;
} else if (lastOp != 0) {
// We have <set>&<char> or <char>&<char>
// throw new IllegalArgumentException("Unquoted " + lastOp);
status = U_ILLEGAL_ARGUMENT_ERROR;
return pairsBuf;
} else {
if (lastChar >= 0) {
// We have <char><char>
addPair(pairsBuf, (UChar)lastChar, (UChar)lastChar);
}
lastChar = c;
}
}
if (node != 0) {
// Handle unprocessed stuff preceding the closing ']'
if (lastOp == '-') {
// Trailing '-' is treated as literal
addPair(pairsBuf, lastOp, lastOp);
} else if (lastOp == '&') {
// throw new IllegalArgumentException("Unquoted trailing " + lastOp);
status = U_ILLEGAL_ARGUMENT_ERROR;
return pairsBuf;
}
if (lastChar >= 0) {
addPair(pairsBuf, (UChar)lastChar, (UChar)lastChar);
}
/**
* i indexes the last character we parsed or is
* pattern.length(). In the latter case, the node will not be
* zero, since we have run off the end without finding a
* closing ']'. Therefore, the above statement will have
* thrown an exception, and we'll never get here. If we get
* here, we know i < pattern.length(), and we set the
* ParsePosition to the next character to be parsed.
*/
pos.setIndex(i+1);
/**
* If we saw a '^' after the initial '[' of this pattern, then
* perform the complement. (Inversion after '[:' is handled
* elsewhere.)
* If we saw a '^' after the initial '[' of this pattern, then perform
* the complement. (Inversion after '[:' is handled elsewhere.)
*/
if (invert) {
doComplement(pairsBuf);
}
/**
* i indexes the last character we parsed or is pattern.length(). In
* the latter case, we have run off the end without finding a closing
* ']'. Otherwise, we know i < pattern.length(), and we set the
* ParsePosition to the next character to be parsed.
*/
if (i == limit) {
// throw new IllegalArgumentException("Missing ']'");
status = U_ILLEGAL_ARGUMENT_ERROR;
return pairsBuf;
}
pos.setIndex(i+1);
return pairsBuf;
}