ICU-1406 make quantified segments behave like perl counterparts

X-SVN-Rev: 6493
This commit is contained in:
Alan Liu 2001-10-30 18:08:53 +00:00
parent 0d08aaadcc
commit 2c2b11dfe8
13 changed files with 1073 additions and 1463 deletions

View File

@ -63,6 +63,10 @@ static const UChar gOPERATORS[] = {
0x3D, 0x3E, 0x3C, 0 // "=><" 0x3D, 0x3E, 0x3C, 0 // "=><"
}; };
static const UChar HALF_ENDERS[] = {
0x3D, 0x3E, 0x3C, 59, 0 // "=><;"
};
// These are also used in Transliterator::toRules() // These are also used in Transliterator::toRules()
static const int32_t ID_TOKEN_LEN = 2; static const int32_t ID_TOKEN_LEN = 2;
static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':' static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':'
@ -147,256 +151,6 @@ UnicodeString ParseData::parseReference(const UnicodeString& text,
return result; return result;
} }
//----------------------------------------------------------------------
// Segments
//----------------------------------------------------------------------
/**
* Segments are parentheses-enclosed regions of the input string.
* These are referenced in the output string using the notation $1,
* $2, etc. Numbering is in order of appearance of the left
* parenthesis. Number is one-based. Segments are defined as start,
* limit pairs. Segments may nest.
*
* During parsing, segment data is encoded in an object of class
* Segments. At runtime, the same data is encoded in compact form as
* an array of integers in a TransliterationRule. The runtime encoding
* must satisfy three goals:
*
* 1. Iterate over the offsets in a pattern, from left to right,
* and indicate all segment boundaries, in order. This is done
* during matching.
*
* 2. Given a reference $n, produce the start and limit offsets
* for that segment. This is done during replacement.
*
* 3. Similar to goal 1, but in addition, indicate whether each
* segment boundary is a start or a limit, in other words, whether
* each is an open paren or a close paren. This is required by
* the toRule() method.
*
* Goal 1 must be satisfied at high speed since this is done during
* matching. Goal 2 is next most important. Goal 3 is not performance
* critical since it is only needed by toRule().
*
* The array of integers is actually two arrays concatenated. The
* first gives the index values of the open and close parentheses in
* the order they appear. The second maps segment numbers to the
* indices of the first array. The two arrays have the same length.
* Iterating over the first array satisfies goal 1. Indexing into the
* second array satisfies goal 2. Goal 3 is satisfied by iterating
* over the second array and constructing the required data when
* needed. This is what toRule() does.
*
* Example: (a b(c d)e f)
* 0 1 2 3 4 5 6
*
* First array: Indices are 0, 2, 4, and 6.
* Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
* second array is 0, 3, 1, 2 -- these give the indices in the
* first array at which $1:open, $1:close, $2:open, and $2:close
* occur.
*
* The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
*
* Each subarray is terminated with a -1, and two leading entries
* give the number of segments and the offset to the first entry
* of the second array. In addition, the second array values are
* all offset by 2 so they index directly into the final array.
* The total array size is 4*segments[0] + 4. The second index is
* 2*segments[0] + 3.
*
* In the output string, a segment reference is indicated by a
* character in a special range, as defined by
* RuleBasedTransliterator.Data.
*
* Most rules have no segments, in which case segments is null, and the
* output string need not be checked for segment reference characters.
*
* See also rbt_rule.h/cpp.
*/
// Parse-time record of the parentheses seen on one side of a rule.
// Two parallel vectors are kept: element i of 'offsets' is the position
// passed to addParenthesisAt() for the i-th parenthesis, and element i of
// 'isOpenParen' is 1 for an open paren or 0 for a close paren.
class Segments {
// Position recorded for each parenthesis, in order of appearance.
UVector offsets;
// Parallel to 'offsets': 1 if the parenthesis is '(', 0 if it is ')'.
UVector isOpenParen;
public:
// Build an empty list; 'status' is handed to both vector constructors.
Segments(UErrorCode &status);
~Segments();
// Append one parenthesis (open or close) at the given offset.
void addParenthesisAt(int32_t offset, UBool isOpenParen, UErrorCode &status);
// Return the offset of the most recently added parenthesis and report its
// kind through isOpenParen; returns -1 (leaving isOpenParen untouched)
// when no parenthesis has been added.
int32_t getLastParenOffset(UBool& isOpenParen) const;
// Locate the segment closed by the last parenthesis, report its bounds,
// and collapse the stored offsets for it. Returns FALSE on failure.
UBool extractLastParenSubstring(int32_t& start, int32_t& limit);
// Build the compact runtime int32_t array described in the comment above
// this class. The array is allocated with new[].
int32_t* createArray(UErrorCode &status) const;
// TRUE if the recorded parentheses are non-empty, start with '(' and
// are properly balanced.
UBool validate() const;
int32_t count() const; // number of segments
private:
// Element accessors for the two parallel vectors.
int32_t offset(int32_t i) const;
UBool isOpen(int32_t i) const;
int32_t size() const; // size of the UVectors
};
// Fetch the recorded position of the i-th parenthesis.
int32_t Segments::offset(int32_t i) const {
    const int32_t position = offsets.elementAti(i);
    return position;
}
// Report whether the i-th parenthesis was recorded as an open paren
// (stored as a nonzero integer flag).
UBool Segments::isOpen(int32_t i) const {
    const int32_t flag = isOpenParen.elementAti(i);
    return flag != 0;
}
// Number of parentheses recorded so far. The two vectors are appended to
// in lock step by addParenthesisAt(), so either one could be consulted.
int32_t Segments::size() const {
    // assert(offsets.size() == isOpenParen.size());
    const int32_t parenCount = offsets.size();
    return parenCount;
}
// Construct an empty parenthesis list. The caller's status is passed to
// the constructors of both backing vectors; nothing else is done here.
Segments::Segments(UErrorCode &status)
: offsets(status),
isOpenParen(status)
{}
// Nothing to do explicitly: the two by-value vector members are destroyed
// automatically.
Segments::~Segments() {}
// Record one parenthesis: its offset goes into 'offsets' and a 1/0 flag
// (open/close) goes into the parallel 'isOpenParen' vector.
void Segments::addParenthesisAt(int32_t offset, UBool isOpen, UErrorCode &status) {
    int32_t openFlag = 0;
    if (isOpen) {
        openFlag = 1;
    }
    offsets.addElement(offset, status);
    isOpenParen.addElement(openFlag, status);
}
// Return the offset of the most recently recorded parenthesis and store
// its kind (open/close) in isOpenParenReturn. If nothing has been
// recorded, return -1 and leave isOpenParenReturn unmodified.
int32_t Segments::getLastParenOffset(UBool& isOpenParenReturn) const {
    const int32_t last = size() - 1;
    if (last < 0) {
        return -1;
    }
    isOpenParenReturn = isOpen(last);
    return offset(last);
}
// Collapse the last (rightmost) segment. Its bounds are stored in start
// and limit, and then every recorded parenthesis from the segment's open
// paren through the end of the list is rewritten so that open parens sit
// at start and close parens at start+1, i.e. the segment (and anything
// nested inside it) now occupies [start, start+1).
//
// Returns FALSE, without modifying the list, if the final close paren has
// no matching open paren or if no parentheses have been recorded. The
// caller is expected to have called getLastParenOffset() and verified
// that the last parenthesis is a close paren.
UBool Segments::extractLastParenSubstring(int32_t& start, int32_t& limit) {
    // assert(isOpenParen.elementAt(isOpenParen.size()-1) == 0);
    int32_t i = size() - 1;
    if (i < 0) {
        // Nothing recorded; there is no segment to extract.
        return FALSE;
    }

    // Record position of the last close paren
    limit = offset(i);

    // Walk backward looking for the open paren that balances the final
    // close paren. n counts close parens still waiting for a match. The
    // index must move on every iteration (otherwise a nested close paren
    // would be re-tested forever), and the loop must stop *on* the
    // matching open paren so that i identifies it afterwards.
    int32_t n = 1;
    for (--i; i >= 0; --i) {
        n += isOpen(i) ? -1 : 1;
        if (n == 0) {
            break;
        }
    }
    if (n != 0) {
        return FALSE;
    }
    // assert(i>=0);
    start = offset(i);

    // Reset all segment pairs from i to size() - 1 to [start, start+1).
    for (; i < size(); ++i) {
        int32_t o = isOpen(i) ? start : (start+1);
        offsets.setElementAt(o, i);
    }
    return TRUE;
}
// Build the compact runtime array for the recorded segments.
//
// Layout (c = number of segments):
//   array[0]            c
//   array[1]            offset of the second sub-array (2*c + 3)
//   array[2 .. 2c+1]    offsets of the parentheses, in order of appearance
//   array[2c+2]         -1 terminator
//   array[2c+3 .. 4c+2] for segment s (0-based): index into this array of
//                       its open paren at 2s, of its close paren at 2s+1
//   array[4c+3]         -1 terminator
//
// The result is allocated with new[]. On any failure (allocation failure,
// or a failing status from the helper stack) NULL is returned, status
// holds the error, and no memory is leaked.
//
// Assume caller has already gotten a TRUE validate().
int32_t* Segments::createArray(UErrorCode &status) const {
    int32_t c = count(); // number of segments
    int32_t arrayLen = 4*c + 4;
    int32_t *array = new int32_t[arrayLen];
    int32_t a2offset = 2*c + 3; // offset to array 2

    if (array == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    array[0] = c;
    array[1] = a2offset;
    int32_t i;
    for (i=0; i<2*c; ++i) {
        array[2+i] = offset(i);
    }
    array[a2offset-1] = -1;
    array[arrayLen-1] = -1;

    // Now walk through and match up segment numbers with parentheses.
    // Number segments from 0. We're going to offset all entries by 2
    // to skip the first two elements, array[0] and array[1].
    UStack stack(status);
    int32_t nextOpen = 0; // seg # of next open, 0-based
    if (U_FAILURE(status)) {
        // The array is ours until we return it; release it on failure.
        delete[] array;
        return NULL;
    }
    for (i=0; i<2*c; ++i) {
        UBool open = isOpen(i);
        // Let seg be the zero-based segment number.
        // Open parens are at 2*seg in array 2.
        // Close parens are at 2*seg+1 in array 2.
        if (open) {
            array[a2offset + 2*nextOpen] = 2+i;
            stack.push(nextOpen, status);
            ++nextOpen;
        } else {
            int32_t nextClose = stack.popi();
            array[a2offset + 2*nextClose+1] = 2+i;
        }
    }
    if (U_FAILURE(status)) {
        // A push failed, so the open/close pairing above is unreliable.
        // Do not hand back a half-filled table.
        delete[] array;
        return NULL;
    }
    // assert(stack.empty());

    // Perform a series of checks on the array. DO NOT COMPILE INTO
    // PRODUCTION CODE. Use to debug array building problems.
    //
    //::if (!stack.empty()) {
    //:: __asm int 03;
    //::}
    //::// check the array
    //::if (array[0] < 1) {
    //:: __asm int 03;
    //::}
    //::if (array[1] < 5) {
    //:: __asm int 03;
    //::}
    //::for (i=2; i<2+array[0]*2; ++i) {
    //:: if (array[i] < 0) { // array[i] is an offset into the rule
    //:: __asm int 03;
    //:: }
    //::}
    //::if (array[2+array[0]*2] != -1) {
    //:: __asm int 03;
    //::}
    //::for (i=array[1]; i<array[1]+array[0]*2; ++i) {
    //:: if (array[i] < 2 || array[i] >= (2+2*array[0])) {
    //:: __asm int 03;
    //:: }
    //::}
    //::if (array[array[1]+array[0]*2] != -1) {
    //:: __asm int 03;
    //::}

    return array;
}
// Check that the recorded parentheses form well-nested pairs:
//   - there are at least two of them, and an even number;
//   - the first one is an open paren;
//   - scanning left to right, closes never outnumber opens, and the
//     two are equal in number at the end.
UBool Segments::validate() const {
    const int32_t parenCount = size();
    if (parenCount < 2) {
        return FALSE;
    }
    if (parenCount % 2 != 0) {
        return FALSE;
    }
    if (!isOpen(0)) {
        return FALSE;
    }

    int32_t depth = 0;
    int32_t idx = 0;
    while (idx < parenCount) {
        if (isOpen(idx)) {
            ++depth;
        } else {
            --depth;
            if (depth < 0) {
                // A close paren with no open paren to match.
                return FALSE;
            }
        }
        ++idx;
    }
    return depth == 0;
}
// Number of segments, i.e. half the number of recorded parentheses.
// Only meaningful once validate() has returned TRUE.
int32_t Segments::count() const {
    // assert(validate());
    const int32_t parenCount = size();
    return parenCount / 2;
}
//---------------------------------------------------------------------- //----------------------------------------------------------------------
// BEGIN RuleHalf // BEGIN RuleHalf
//---------------------------------------------------------------------- //----------------------------------------------------------------------
@ -416,11 +170,7 @@ public:
int32_t ante; // position of ante context marker '{' in text int32_t ante; // position of ante context marker '{' in text
int32_t post; // position of post context marker '}' in text int32_t post; // position of post context marker '}' in text
// Record the position of the segment substrings and references. A int32_t maxRef; // n where maximum segment ref is $n; 1-based
// given side should have segments or segment references, but not
// both.
Segments* segments;
int32_t maxRef; // index of largest ref ($n) on the right
// Record the offset to the cursor either to the left or to the // Record the offset to the cursor either to the left or to the
// right of the key. This is indicated by characters on the output // right of the key. This is indicated by characters on the output
@ -432,9 +182,26 @@ public:
// output text. // output text.
int32_t cursorOffset; // only nonzero on output side int32_t cursorOffset; // only nonzero on output side
// Position of first CURSOR_OFFSET on _right_. This will be -1
// for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
int32_t cursorOffsetPos;
UBool anchorStart; UBool anchorStart;
UBool anchorEnd; UBool anchorEnd;
UErrorCode ec;
/**
* UnicodeMatcher objects corresponding to each segment.
*/
UVector segments;
/**
* The segment number from 0..n-1 of the next '(' we see
* during parsing; 0-based.
*/
int32_t nextSegmentNumber;
TransliteratorParser& parser; TransliteratorParser& parser;
//-------------------------------------------------- //--------------------------------------------------
@ -443,22 +210,22 @@ public:
RuleHalf(TransliteratorParser& parser); RuleHalf(TransliteratorParser& parser);
~RuleHalf(); ~RuleHalf();
/**
* Parse one side of a rule, stopping at either the limit,
* the END_OF_RULE character, or an operator. Return
* the pos of the terminating character (or limit).
*/
int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit); int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit);
int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
UnicodeString& buf,
UBool isSegment);
/** /**
* Remove context. * Remove context.
*/ */
void removeContext(); void removeContext();
/** /**
* Create and return an int[] array of segments. * Create and return a UnicodeMatcher*[] array of segments,
* or NULL if there are no segments.
*/ */
int32_t* createSegments(UErrorCode& status) const; UnicodeMatcher** createSegments(UErrorCode& status) const;
int syntaxError(UErrorCode code, int syntaxError(UErrorCode code,
const UnicodeString& rule, const UnicodeString& rule,
@ -472,30 +239,69 @@ private:
RuleHalf& operator=(const RuleHalf&); RuleHalf& operator=(const RuleHalf&);
}; };
RuleHalf::RuleHalf(TransliteratorParser& p) : parser(p) { RuleHalf::RuleHalf(TransliteratorParser& p) :
ec(U_ZERO_ERROR),
segments(ec),
parser(p)
{
cursor = -1; cursor = -1;
ante = -1; ante = -1;
post = -1; post = -1;
segments = NULL;
maxRef = -1; maxRef = -1;
cursorOffset = 0; cursorOffset = 0;
cursorOffsetPos = 0;
anchorStart = anchorEnd = FALSE; anchorStart = anchorEnd = FALSE;
segments.removeAllElements();
nextSegmentNumber = 0;
} }
RuleHalf::~RuleHalf() { RuleHalf::~RuleHalf() {
delete segments;
} }
/** /**
* Parse one side of a rule, stopping at either the limit, * Parse one side of a rule, stopping at either the limit,
* the END_OF_RULE character, or an operator. Return * the END_OF_RULE character, or an operator.
* the pos of the terminating character (or limit). * @return the index after the terminating character, or
* if limit was reached, limit
*/ */
int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) { int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
int32_t start = pos; int32_t start = pos;
UnicodeString& buf = text; text.truncate(0);
pos = parseSection(rule, pos, limit, text, FALSE);
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
}
return pos;
}
/**
* Parse a section of one side of a rule, stopping at either
* the limit, the END_OF_RULE character, an operator, or a
* segment close character. This method parses both a
* top-level rule half and a segment within such a rule half.
* It calls itself recursively to parse segments and nested
* segments.
* @param buf buffer into which to accumulate the rule pattern
* characters, either literal characters from the rule or
* standins for UnicodeMatcher objects including segments.
* @param isSegment if true, then we've already seen a '(' and
* pos on entry points right after it. Accumulate everything
* up to the closing ')', put it in a segment matcher object,
* generate a standin for it, and add the standin to buf. As
* a side effect, update the segments vector with a reference
* to the segment matcher. This works recursively for nested
* segments. If isSegment is false, just accumulate
* characters into buf.
* @return the index after the terminating character, or
* if limit was reached, limit
*/
int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
UnicodeString& buf,
UBool isSegment) {
int32_t start = pos;
ParsePosition pp; ParsePosition pp;
int32_t cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
UnicodeString scratch; UnicodeString scratch;
UBool done = FALSE; UBool done = FALSE;
int32_t quoteStart = -1; // Most recent 'single quoted string' int32_t quoteStart = -1; // Most recent 'single quoted string'
@ -503,6 +309,15 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
int32_t varStart = -1; // Most recent $variableReference int32_t varStart = -1; // Most recent $variableReference
int32_t varLimit = -1; int32_t varLimit = -1;
// If isSegment, then bufSegStart is the offset in buf to
// the first character of the segment we are parsing.
int32_t bufSegStart = 0;
int32_t segmentNumber = 0;
if (isSegment) {
bufSegStart = buf.length();
segmentNumber = nextSegmentNumber++;
}
while (pos < limit && !done) { while (pos < limit && !done) {
UChar c = rule.charAt(pos++); UChar c = rule.charAt(pos++);
if (u_isWhitespace(c)) { if (u_isWhitespace(c)) {
@ -511,8 +326,11 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
// whitespace likely to be seen in code. // whitespace likely to be seen in code.
continue; continue;
} }
if (u_strchr(gOPERATORS, c) != NULL) { if (u_strchr(HALF_ENDERS, c) != NULL) {
--pos; // Backup to point to operator if (isSegment) {
// Unclosed segment
return syntaxError(U_UNCLOSED_SEGMENT, rule, start);
}
break; break;
} }
if (anchorEnd) { if (anchorEnd) {
@ -575,6 +393,10 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
continue; continue;
} }
switch (c) { switch (c) {
//------------------------------------------------------
// Elements allowed within and out of segments
//------------------------------------------------------
case ANCHOR_START: case ANCHOR_START:
if (buf.length() == 0 && !anchorStart) { if (buf.length() == 0 && !anchorStart) {
anchorStart = TRUE; anchorStart = TRUE;
@ -584,17 +406,7 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
} }
break; break;
case SEGMENT_OPEN: case SEGMENT_OPEN:
case SEGMENT_CLOSE: pos = parseSection(rule, pos, limit, buf, TRUE);
// Handle segment definitions "(" and ")"
// Parse "(", ")"
if (segments == NULL) {
segments = new Segments(parser.status);
}
segments->addParenthesisAt(buf.length(), c == SEGMENT_OPEN, parser.status);
break;
case END_OF_RULE:
--pos; // Backup to point to END_OF_RULE
done = TRUE;
break; break;
case SymbolTable::SYMBOL_REF: case SymbolTable::SYMBOL_REF:
// Handle variable references and segment references "$1" .. "$9" // Handle variable references and segment references "$1" .. "$9"
@ -655,25 +467,128 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
} }
} }
break; break;
case DOT:
buf.append(parser.getDotStandIn());
break;
case KLEENE_STAR:
case ONE_OR_MORE:
case ZERO_OR_ONE:
// Quantifiers. We handle single characters, quoted strings,
// variable references, and segments.
// a+ matches aaa
// 'foo'+ matches foofoofoo
// $v+ matches xyxyxy if $v == xy
// (seg)+ matches segsegseg
{
if (isSegment && buf.length() == bufSegStart) {
// The */+ immediately follows '('
return syntaxError(U_MISPLACED_QUANTIFIER, rule, start);
}
int32_t qstart, qlimit;
// The */+ follows an isolated character or quote
// or variable reference
if (buf.length() == quoteLimit) {
// The */+ follows a 'quoted string'
qstart = quoteStart;
qlimit = quoteLimit;
} else if (buf.length() == varLimit) {
// The */+ follows a $variableReference
qstart = varStart;
qlimit = varLimit;
} else {
// The */+ follows a single character, possibly
// a segment standin
qstart = buf.length() - 1;
qlimit = qstart + 1;
}
UnicodeMatcher *m =
new StringMatcher(buf, qstart, qlimit, FALSE, *parser.data);
int32_t min = 0;
int32_t max = Quantifier::MAX;
switch (c) {
case ONE_OR_MORE:
min = 1;
break;
case ZERO_OR_ONE:
min = 0;
max = 1;
break;
// case KLEENE_STAR:
// do nothing -- min, max already set
}
m = new Quantifier(m, min, max);
buf.truncate(qstart);
buf.append(parser.generateStandInFor(m));
}
break;
//------------------------------------------------------
// Elements allowed ONLY WITHIN segments
//------------------------------------------------------
case SEGMENT_CLOSE:
if (isSegment) {
// We're done parsing a segment. The relevant
// characters are in buf, starting at offset
// bufSegStart. Extract them into a string
// matcher, and replace them with a standin
// for that matcher.
StringMatcher *m =
new StringMatcher(buf, bufSegStart, buf.length(),
TRUE, *parser.data);
// Since we call parseSection() recursively,
// nested segments will result in segment i+1
// getting parsed and stored before segment i;
// be careful with the vector handling here.
if ((segmentNumber+1) > segments.size()) {
segments.setSize(segmentNumber+1);
}
segments.setElementAt(m, segmentNumber);
buf.truncate(bufSegStart);
buf.append(parser.generateStandInFor(m));
done = TRUE;
break;
}
// If we aren't in a segment, then a segment close
// character is a syntax error.
return syntaxError(U_UNQUOTED_SPECIAL, rule, start);
//------------------------------------------------------
// Elements allowed ONLY OUTSIDE segments
//------------------------------------------------------
case CONTEXT_ANTE: case CONTEXT_ANTE:
if (isSegment) {
return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
}
if (ante >= 0) { if (ante >= 0) {
return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start); return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start);
} }
ante = buf.length(); ante = buf.length();
break; break;
case CONTEXT_POST: case CONTEXT_POST:
if (isSegment) {
return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
}
if (post >= 0) { if (post >= 0) {
return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start); return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start);
} }
post = buf.length(); post = buf.length();
break; break;
case CURSOR_POS: case CURSOR_POS:
if (isSegment) {
return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
}
if (cursor >= 0) { if (cursor >= 0) {
return syntaxError(U_MULTIPLE_CURSORS, rule, start); return syntaxError(U_MULTIPLE_CURSORS, rule, start);
} }
cursor = buf.length(); cursor = buf.length();
break; break;
case CURSOR_OFFSET: case CURSOR_OFFSET:
if (isSegment) {
return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
}
if (cursorOffset < 0) { if (cursorOffset < 0) {
if (buf.length() > 0) { if (buf.length() > 0) {
return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start); return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
@ -695,69 +610,11 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
} }
} }
break; break;
case DOT:
buf.append(parser.getDotStandIn());
break; //------------------------------------------------------
case KLEENE_STAR: // Non-special characters
case ONE_OR_MORE: //------------------------------------------------------
case ZERO_OR_ONE:
// Quantifiers. We handle single characters, quoted strings,
// variable references, and segments.
// a+ matches aaa
// 'foo'+ matches foofoofoo
// $v+ matches xyxyxy if $v == xy
// (seg)+ matches segsegseg
{
int32_t start, limit;
UBool isOpenParen;
UBool isSegment = FALSE;
if (segments != 0 &&
segments->getLastParenOffset(isOpenParen) == buf.length()) {
// The */+ immediately follows a segment
if (isOpenParen) {
return syntaxError(U_MISPLACED_QUANTIFIER, rule, start);
}
if (!segments->extractLastParenSubstring(start, limit)) {
return syntaxError(U_MISMATCHED_SEGMENT_DELIMITERS, rule, start);
}
isSegment = TRUE;
} else {
// The */+ follows an isolated character or quote
// or variable reference
if (buf.length() == quoteLimit) {
// The */+ follows a 'quoted string'
start = quoteStart;
limit = quoteLimit;
} else if (buf.length() == varLimit) {
// The */+ follows a $variableReference
start = varStart;
limit = varLimit;
} else {
// The */+ follows a single character
start = buf.length() - 1;
limit = start + 1;
}
}
UnicodeMatcher *m =
new StringMatcher(buf, start, limit, isSegment, *parser.data);
int32_t min = 0;
int32_t max = Quantifier::MAX;
switch (c) {
case ONE_OR_MORE:
min = 1;
break;
case ZERO_OR_ONE:
min = 0;
max = 1;
break;
// case KLEENE_STAR:
// do nothing -- min, max already set
}
m = new Quantifier(m, min, max);
buf.truncate(start);
buf.append(parser.generateStandInFor(m));
}
break;
default: default:
// Disallow unquoted characters other than [0-9A-Za-z] // Disallow unquoted characters other than [0-9A-Za-z]
// in the printable ASCII range. These characters are // in the printable ASCII range. These characters are
@ -773,10 +630,6 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
} }
} }
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
}
// text = buf.toString();
return pos; return pos;
} }
@ -797,10 +650,15 @@ void RuleHalf::removeContext() {
} }
/** /**
* Create and return an int32_t[] array of segments. * Create and return a UnicodeMatcher*[] array of segments,
* or NULL if there are no segments.
*/ */
int32_t* RuleHalf::createSegments(UErrorCode& status) const { UnicodeMatcher** RuleHalf::createSegments(UErrorCode& status) const {
return (segments == 0) ? 0 : segments->createArray(status); if (segments.size() == 0) {
return NULL;
}
UnicodeMatcher** result = new UnicodeMatcher*[segments.size()];
return (UnicodeMatcher**) segments.toArray((void**) result);
} }
//---------------------------------------------------------------------- //----------------------------------------------------------------------
@ -1172,9 +1030,10 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
return start; return start;
} }
if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(pos++))) == NULL) { if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == NULL) {
return syntaxError(U_MISSING_OPERATOR, rule, start); return syntaxError(U_MISSING_OPERATOR, rule, start);
} }
++pos;
// Found an operator char. Check for forward-reverse operator. // Found an operator char. Check for forward-reverse operator.
if (op == REVERSE_RULE_OP && if (op == REVERSE_RULE_OP &&
@ -1189,7 +1048,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
} }
if (pos < limit) { if (pos < limit) {
if (rule.charAt(pos) == END_OF_RULE) { if (rule.charAt(--pos) == END_OF_RULE) {
++pos; ++pos;
} else { } else {
// RuleHalf parser must have terminated at an operator // RuleHalf parser must have terminated at an operator
@ -1251,8 +1110,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
// apply. // apply.
if (op == FWDREV_RULE_OP) { if (op == FWDREV_RULE_OP) {
right->removeContext(); right->removeContext();
delete right->segments; right->segments.removeAllElements();
right->segments = NULL;
left->cursor = left->maxRef = -1; left->cursor = left->maxRef = -1;
left->cursorOffset = 0; left->cursorOffset = 0;
} }
@ -1272,7 +1130,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
// cannot place the cursor outside the limits of the context. // cannot place the cursor outside the limits of the context.
// Anchors are only allowed on the input side. // Anchors are only allowed on the input side.
if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 || if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 ||
right->segments != NULL || left->maxRef >= 0 || right->segments.size() > 0 || left->maxRef >= 0 ||
(right->cursorOffset != 0 && right->cursor < 0) || (right->cursorOffset != 0 && right->cursor < 0) ||
// - The following two checks were used to ensure that the // - The following two checks were used to ensure that the
// - the cursor offset stayed within the ante- or postcontext. // - the cursor offset stayed within the ante- or postcontext.
@ -1288,20 +1146,15 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
// Check integrity of segments and segment references. Each // Check integrity of segments and segment references. Each
// segment's start must have a corresponding limit, and the // segment's start must have a corresponding limit, and the
// references must not refer to segments that do not exist. // references must not refer to segments that do not exist.
if (left->segments != NULL) { if (right->maxRef > left->segments.size()) {
if (!left->segments->validate()) {
return syntaxError(U_MISSING_SEGMENT_CLOSE, rule, start);
}
int32_t n = left->segments->count();
if (right->maxRef > n) {
return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start); return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start);
} }
}
data->ruleSet.addRule(new TransliterationRule( data->ruleSet.addRule(new TransliterationRule(
left->text, left->ante, left->post, left->text, left->ante, left->post,
right->text, right->cursor, right->cursorOffset, right->text, right->cursor, right->cursorOffset,
left->createSegments(status), left->createSegments(status),
left->segments.size(),
left->anchorStart, left->anchorEnd, left->anchorStart, left->anchorEnd,
data, data,
status), status); status), status);
@ -1366,7 +1219,7 @@ UChar TransliteratorParser::generateStandInFor(UnicodeMatcher* adopted) {
if (variableNext >= variableLimit) { if (variableNext >= variableLimit) {
// throw new RuntimeException("Private use variables exhausted"); // throw new RuntimeException("Private use variables exhausted");
delete adopted; delete adopted;
status = U_ILLEGAL_ARGUMENT_ERROR; status = U_VARIABLE_RANGE_EXHAUSTED;
return 0; return 0;
} }
variablesVector->addElement(adopted, status); variablesVector->addElement(adopted, status);

View File

@ -14,28 +14,11 @@
#include "unicode/uniset.h" #include "unicode/uniset.h"
#include "unicode/unicode.h" #include "unicode/unicode.h"
#include "cmemory.h" #include "cmemory.h"
#include "strmatch.h"
static const UChar APOSTROPHE = 0x0027; // '\'' static const UChar APOSTROPHE = 0x0027; // '\''
static const UChar BACKSLASH = 0x005C; // '\' static const UChar BACKSLASH = 0x005C; // '\'
// To process segments we need to allocate arrays of integers. We use
// stack storage as long as the segment count is <= MAX_STATIC_SEGS.
// Otherwise, we allocate heap space.
#define MAX_STATIC_SEGS 20
// Macros for accessing the array of integers encoding the position of
// SEGMENTS_COUNT number of segments, n (half the number of parens)
// SEGMENTS_LEN length of the segments array (number of elements)
// SEGMENTS_POS position in 'pattern' of parenthesis i, where i=0..2n-1
// SEGMENTS_NUM index into segments to access POS of $1.open,
// $1.close, $2.open, $2.close,.., $n.open, $n.close
// Relative to FIRST_SEG_POS_INDEX. Ranges from 0..2n-1.
#define FIRST_SEG_POS_INDEX 2
#define SEGMENTS_COUNT(x) x[0]
#define SEGMENTS_LEN(x) (SEGMENTS_COUNT(x)*4+4)
#define SEGMENTS_POS(x,i) x[FIRST_SEG_POS_INDEX+i]
#define SEGMENTS_NUM(x,i) (x[x[1]+i]-FIRST_SEG_POS_INDEX)
U_NAMESPACE_BEGIN U_NAMESPACE_BEGIN
const UChar TransliterationRule::ETHER = 0xFFFF; const UChar TransliterationRule::ETHER = 0xFFFF;
@ -56,11 +39,10 @@ const UChar TransliterationRule::ETHER = 0xFFFF;
* <code>output</code>; that is, -1 is equivalent to * <code>output</code>; that is, -1 is equivalent to
* <code>output.length()</code>. If greater than * <code>output.length()</code>. If greater than
* <code>output.length()</code> then an exception is thrown. * <code>output.length()</code> then an exception is thrown.
* @param adoptedSegs array of 2n integers. Each of n pairs consists of offset, * @param segs array of UnicodeMatcher corresponding to input pattern
* limit for a segment of the input string. Characters in the output string * segments, or null if there are none. The array itself is adopted,
* refer to these segments if they are in a special range determined by the * but the pointers within it are not.
* associated RuleBasedTransliterator.Data object. May be null if there are * @param segsCount number of elements in segs[]
* no segments.
* @param anchorStart TRUE if the the rule is anchored on the left to * @param anchorStart TRUE if the the rule is anchored on the left to
* the context start * the context start
* @param anchorEnd TRUE if the rule is anchored on the right to the * @param anchorEnd TRUE if the rule is anchored on the right to the
@ -70,7 +52,8 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
int32_t anteContextPos, int32_t postContextPos, int32_t anteContextPos, int32_t postContextPos,
const UnicodeString& outputStr, const UnicodeString& outputStr,
int32_t cursorPosition, int32_t cursorOffset, int32_t cursorPosition, int32_t cursorOffset,
int32_t* adoptedSegs, UnicodeMatcher** segs,
int32_t segsCount,
UBool anchorStart, UBool anchorEnd, UBool anchorStart, UBool anchorEnd,
const TransliterationRuleData* theData, const TransliterationRuleData* theData,
UErrorCode& status) : UErrorCode& status) :
@ -113,23 +96,11 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
this->cursorPos = cursorPosition + cursorOffset; this->cursorPos = cursorPosition + cursorOffset;
this->output = outputStr; this->output = outputStr;
// We don't validate the segments array. The caller must // We don't validate the segments array. The caller must
// guarantee that the segments are well-formed. // guarantee that the segments are well-formed (that is, that
this->segments = adoptedSegs; // all $n references in the output refer to indices of this
// Find the position of the first segment index that is after the // array, and that no array elements are null).
// anteContext (in the key). Note that this may be a start or a this->segments = segs;
// limit index. If all segments are in the ante context, this->segmentsCount = segsCount;
// firstKeySeg should point past the last segment -- that is, it
// should point at the end marker, which is -1. This allows the
// code to back up by one to obtain the last ante context segment.
firstKeySeg = -1;
if (segments != 0) {
firstKeySeg = FIRST_SEG_POS_INDEX;
while (segments[firstKeySeg] >= 0 &&
segments[firstKeySeg] < anteContextLength) {
++firstKeySeg;
}
firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
}
pattern = input; pattern = input;
flags = 0; flags = 0;
@ -149,18 +120,17 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
/**
 * Copy constructor.  Copies the pattern, output, context/key lengths,
 * cursor position, flags, and data pointer from <code>other</code>.
 *
 * The segments array is duplicated so that this rule owns its own
 * array, but the matcher pointers stored in it are copied as-is (the
 * array is owned; the pointers within it are not).
 *
 * @param other the rule to copy
 */
TransliterationRule::TransliterationRule(TransliterationRule& other) :
    pattern(other.pattern),
    output(other.output),
    anteContextLength(other.anteContextLength),
    keyLength(other.keyLength),
    cursorPos(other.cursorPos),
    flags(other.flags),
    data(other.data) {

    segments = NULL;
    segmentsCount = 0;
    if (other.segmentsCount > 0) {
        segments = new UnicodeMatcher*[other.segmentsCount];
        uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0]));
        // Keep the count in step with the array.  Without this the copy
        // would report zero segments while holding a non-NULL array, and
        // the per-match reset loop in matchAndReplace() would never
        // clear stale match positions on the segment matchers.
        segmentsCount = other.segmentsCount;
    }
}
@ -341,26 +311,12 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
// ============================ MATCH =========================== // ============================ MATCH ===========================
// Record the actual positions, in the text, of the segments. // Reset segment match data
// These are recorded in the order that they occur in the pattern. if (segments != NULL) {
for (int32_t i=0; i<segmentsCount; ++i) {
// segPos[] is an array of 2*SEGMENTS_COUNT elements. It ((StringMatcher*) segments[i])->resetMatch();
// records the position in 'text' of each segment boundary, in }
// the order that they occur in 'pattern'.
int32_t _segPos[2*MAX_STATIC_SEGS];
int32_t *segPos = _segPos;
if (segments != 0 && SEGMENTS_COUNT(segments) > MAX_STATIC_SEGS) {
segPos = new int32_t[2*SEGMENTS_COUNT(segments)];
} }
// iSeg is an index into segments[] that accesses the first
// array. As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
// When indexing into segments[] FIRST_SEG_POS_INDEX must be
// added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
int32_t iSeg = firstKeySeg - 1;
// nextSegPos is an offset in 'pattern'. When the cursor is
// equal to nextSegPos, we are at a segment boundary, and we
// record the position in the real text in segPos[].
int32_t nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
UMatchDegree m; UMatchDegree m;
int32_t lenDelta, keyLimit; int32_t lenDelta, keyLimit;
@ -386,26 +342,15 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
keyChar == text.charAt(oText)) { keyChar == text.charAt(oText)) {
--oText; --oText;
} else { } else {
m = U_MISMATCH; return U_MISMATCH;
goto exit;
} }
} else { } else {
// Subtract 1 from contextStart to make it a reverse limit // Subtract 1 from contextStart to make it a reverse limit
if (matcher->matches(text, oText, pos.contextStart-1, FALSE) if (matcher->matches(text, oText, pos.contextStart-1, FALSE)
!= U_MATCH) { != U_MATCH) {
m = U_MISMATCH; return U_MISMATCH;
goto exit;
} }
} }
while (nextSegPos == oPattern) {
segPos[iSeg] = oText;
if (oText >= 0) {
segPos[iSeg] += UTF_CHAR_LENGTH(text.char32At(oText));
} else {
++segPos[iSeg];
}
nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
}
} }
minOText = posAfter(text, oText); minOText = posAfter(text, oText);
@ -413,15 +358,11 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
// ------------------------ Start Anchor ------------------------ // ------------------------ Start Anchor ------------------------
if ((flags & ANCHOR_START) && oText != posBefore(text, pos.contextStart)) { if ((flags & ANCHOR_START) && oText != posBefore(text, pos.contextStart)) {
m = U_MISMATCH; return U_MISMATCH;
goto exit;
} }
// -------------------- Key and Post Context -------------------- // -------------------- Key and Post Context --------------------
iSeg = firstKeySeg;
nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
oPattern = 0; oPattern = 0;
oText = pos.start; oText = pos.start;
keyLimit = 0; keyLimit = 0;
@ -429,8 +370,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
if (incremental && oText == pos.limit) { if (incremental && oText == pos.limit) {
// We've reached the limit without a mismatch and // We've reached the limit without a mismatch and
// without completing our match. // without completing our match.
m = U_PARTIAL_MATCH; return U_PARTIAL_MATCH;
goto exit;
} }
// It might seem that we could do a check like this here: // It might seem that we could do a check like this here:
@ -445,10 +385,6 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
// depending on whether we're in the key or in the post // depending on whether we're in the key or in the post
// context. // context.
while (oPattern == nextSegPos) {
segPos[iSeg] = oText;
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
}
if (oPattern == keyLength) { if (oPattern == keyLength) {
keyLimit = oText; keyLimit = oText;
} }
@ -467,13 +403,12 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
keyChar == text.charAt(oText)) { keyChar == text.charAt(oText)) {
++oText; ++oText;
} else { } else {
m = U_MISMATCH; return U_MISMATCH;
goto exit;
} }
} else { } else {
m = matcher->matches(text, oText, matchLimit, incremental); m = matcher->matches(text, oText, matchLimit, incremental);
if (m != U_MATCH) { if (m != U_MATCH) {
goto exit; return m;
} }
} }
@ -486,10 +421,6 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
//! // at the end of the key. //! // at the end of the key.
//! return UnicodeMatcher.U_MISMATCH; //! return UnicodeMatcher.U_MISMATCH;
//!} //!}
}
while (oPattern == nextSegPos) {
segPos[iSeg] = oText;
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
} }
if (oPattern == keyLength) { if (oPattern == keyLength) {
keyLimit = oText; keyLimit = oText;
@ -509,8 +440,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
// =========================== REPLACE ========================== // =========================== REPLACE ==========================
// We have a full match. The key is between pos.start and // We have a full match. The key is between pos.start and
// keyLimit. Segment indices have been recorded in segPos[]. // keyLimit.
// Perform a replacement.
if (segments == NULL) { if (segments == NULL) {
text.handleReplaceBetween(pos.start, keyLimit, output); text.handleReplaceBetween(pos.start, keyLimit, output);
@ -562,12 +492,23 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
buf.remove(); buf.remove();
} }
// Copy segment with out-of-band data // Copy segment with out-of-band data
b *= 2; StringMatcher* m = (StringMatcher*) segments[b];
int32_t start = segPos[SEGMENTS_NUM(segments,b)]; int32_t start = m->getMatchStart();
int32_t limit = segPos[SEGMENTS_NUM(segments,b+1)]; int32_t limit = m->getMatchLimit();
// If there was no match, that means that a quantifier
// matched zero-length. E.g., x (a)* y matched "xy".
if (start >= 0) {
// Adjust indices for segments in post context
// for any inserted text between the key and
// the post context.
if (start >= keyLimit) {
start += dest - keyLimit;
limit += dest - keyLimit;
}
text.copy(start, limit, dest); text.copy(start, limit, dest);
dest += limit - start; dest += limit - start;
} }
}
oOutput += UTF_CHAR_LENGTH(c); oOutput += UTF_CHAR_LENGTH(c);
} }
// Insert any accumulated straight text. // Insert any accumulated straight text.
@ -600,13 +541,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
pos.contextLimit += lenDelta; pos.contextLimit += lenDelta;
// Restrict new value of start to [minOText, min(oText, pos.limit)]. // Restrict new value of start to [minOText, min(oText, pos.limit)].
pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart)); pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart));
m = U_MATCH; return U_MATCH;
exit:
if (segPos != _segPos) {
delete[] segPos;
}
return m;
} }
/** /**
@ -727,23 +662,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
UBool escapeUnprintable) const { UBool escapeUnprintable) const {
int32_t i; int32_t i;
// iseg indexes into segments[] directly (not offset from FSPI)
int32_t iseg = FIRST_SEG_POS_INDEX-1;
int32_t nextSeg = -1;
// Build an array of booleans specifying open vs. close paren
UBool _isOpen[2*MAX_STATIC_SEGS];
UBool *isOpen = _isOpen;
if (segments != 0) {
if (SEGMENTS_COUNT(segments) > MAX_STATIC_SEGS) {
isOpen = new UBool[2*SEGMENTS_COUNT(segments)];
}
for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
isOpen[SEGMENTS_NUM(segments,i) ] = TRUE;
isOpen[SEGMENTS_NUM(segments,i+1)] = FALSE;
}
nextSeg = segments[++iseg];
}
// Accumulate special characters (and non-specials following them) // Accumulate special characters (and non-specials following them)
// into quoteBuf. Append quoteBuf, within single quotes, when // into quoteBuf. Append quoteBuf, within single quotes, when
// a non-quoted element must be inserted. // a non-quoted element must be inserted.
@ -765,14 +683,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf); appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);
} }
// Append either '(' or ')' if we are at a segment index
if (i == nextSeg) {
appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
(UChar)0x0028 : (UChar)0x0029,
TRUE, escapeUnprintable, quoteBuf);
nextSeg = segments[++iseg];
}
if (emitBraces && i == (anteContextLength + keyLength)) { if (emitBraces && i == (anteContextLength + keyLength)) {
appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
} }
@ -787,11 +697,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
} }
} }
if (i == nextSeg) {
// assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
appendToRule(rule, (UChar)0x0029 /*)*/, TRUE, escapeUnprintable, quoteBuf);
}
if (emitBraces && i == (anteContextLength + keyLength)) { if (emitBraces && i == (anteContextLength + keyLength)) {
appendToRule(rule, (UChar)0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); appendToRule(rule, (UChar)0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
} }
@ -854,9 +759,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);
if (isOpen != _isOpen) {
delete[] isOpen;
}
return rule; return rule;
} }

View File

@ -33,6 +33,16 @@ class TransliterationRuleData;
* Variables are detected by looking up each character in a supplied * Variables are detected by looking up each character in a supplied
* variable list to see if it has been so defined. * variable list to see if it has been so defined.
* *
* <p>A rule may contain segments in its input string and segment
* references in its output string. A segment is a substring of the
* input pattern, indicated by an offset and limit. The segment may
* be in the preceding or following context. It may not span a
* context boundary. A segment reference is a special character in
* the output string that causes a segment of the input string (not
* the input pattern) to be copied to the output string. The range of
* special characters that represent segment references is defined by
* RuleBasedTransliterator.Data.
*
* @author Alan Liu * @author Alan Liu
*/ */
class TransliterationRule { class TransliterationRule {
@ -65,20 +75,20 @@ private:
UnicodeString output; UnicodeString output;
/** /**
* An array of integers encoding the position of the segments. * An array of matcher objects corresponding to the input pattern
* See rbt_pars.cpp::Segments for more details. * segments. If there are no segments this is null. N.B. This is
* a UnicodeMatcher for generality, but in practice it is always a
* StringMatcher. In the future we may generalize this, but for
* now we sometimes cast down to StringMatcher.
*
* The array is owned, but the pointers within it are not.
*/ */
int32_t* segments; UnicodeMatcher** segments;
/** /**
* A value we compute from segments. The first index into segments[] * The number of elements in segments[] or zero if segments is NULL.
* that is >= anteContextLength. That is, the first one that is within
* the forward scanned part of the pattern -- the key or the postContext.
* If there are no segments, this has the value -1. This index is relative
* to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
* segments[FIRST_SEG_POS_INDEX + firstKeySeg].
*/ */
int32_t firstKeySeg; int32_t segmentsCount;
/** /**
* The length of the string that must match before the key. If * The length of the string that must match before the key. If
@ -143,11 +153,10 @@ public:
* 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to * 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
* "xyz" and moves the cursor to before "a". It would have a cursorOffset * "xyz" and moves the cursor to before "a". It would have a cursorOffset
* of -3. * of -3.
* @param adoptedSegs array of 2n integers. Each of n pairs consists of offset, * @param segs array of UnicodeMatcher corresponding to input pattern
* limit for a segment of the input string. Characters in the output string * segments, or null if there are none. The array itself is adopted,
* refer to these segments if they are in a special range determined by the * but the pointers within it are not.
* associated RuleBasedTransliterator.Data object. May be null if there are * @param segsCount number of elements in segs[]
* no segments.
* @param anchorStart TRUE if the rule is anchored on the left to * @param anchorStart TRUE if the rule is anchored on the left to
* the context start * the context start
* @param anchorEnd TRUE if the rule is anchored on the right to the * @param anchorEnd TRUE if the rule is anchored on the right to the
@ -157,7 +166,8 @@ public:
int32_t anteContextPos, int32_t postContextPos, int32_t anteContextPos, int32_t postContextPos,
const UnicodeString& outputStr, const UnicodeString& outputStr,
int32_t cursorPosition, int32_t cursorOffset, int32_t cursorPosition, int32_t cursorOffset,
int32_t* adoptedSegs, UnicodeMatcher** segs,
int32_t segsCount,
UBool anchorStart, UBool anchorEnd, UBool anchorStart, UBool anchorEnd,
const TransliterationRuleData* data, const TransliterationRuleData* data,
UErrorCode& status); UErrorCode& status);

View File

@ -18,7 +18,9 @@ StringMatcher::StringMatcher(const UnicodeString& theString,
UBool isSeg, UBool isSeg,
const TransliterationRuleData& theData) : const TransliterationRuleData& theData) :
data(theData), data(theData),
isSegment(isSeg) isSegment(isSeg),
matchStart(-1),
matchLimit(-1)
{ {
theString.extractBetween(start, limit, pattern); theString.extractBetween(start, limit, pattern);
} }
@ -27,7 +29,9 @@ StringMatcher::StringMatcher(const StringMatcher& o) :
UnicodeMatcher(o), UnicodeMatcher(o),
pattern(o.pattern), pattern(o.pattern),
data(o.data), data(o.data),
isSegment(o.isSegment) isSegment(o.isSegment),
matchStart(o.matchStart),
matchLimit(o.matchStart)
{ {
} }
@ -54,6 +58,7 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
int32_t i; int32_t i;
int32_t cursor = offset; int32_t cursor = offset;
if (limit < cursor) { if (limit < cursor) {
// Match in the reverse direction
for (i=pattern.length()-1; i>=0; --i) { for (i=pattern.length()-1; i>=0; --i) {
UChar keyChar = pattern.charAt(i); UChar keyChar = pattern.charAt(i);
const UnicodeMatcher* subm = data.lookup(keyChar); const UnicodeMatcher* subm = data.lookup(keyChar);
@ -72,6 +77,14 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
} }
} }
} }
// Record the match position, but adjust for a normal
// forward start, limit, and only if a prior match does not
// exist -- we want the rightmost match.
if (matchStart < 0) {
// cast away const -- should modify method to be non-const
((StringMatcher*)this)->matchStart = cursor+1;
((StringMatcher*)this)->matchLimit = offset+1;
}
} else { } else {
for (i=0; i<pattern.length(); ++i) { for (i=0; i<pattern.length(); ++i) {
if (incremental && cursor == limit) { if (incremental && cursor == limit) {
@ -99,6 +112,10 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
} }
} }
} }
// Record the match position
// cast away const -- should modify method to be non-const
((StringMatcher*)this)->matchStart = offset;
((StringMatcher*)this)->matchLimit = cursor;
} }
offset = cursor; offset = cursor;
@ -128,7 +145,7 @@ UnicodeString& StringMatcher::toPattern(UnicodeString& result,
result.append((UChar)41); /*)*/ result.append((UChar)41); /*)*/
} }
// Flush quoteBuf out to result // Flush quoteBuf out to result
TransliterationRule::appendToRule(result, (UChar32)(isSegment?41/*)*/:-1), TransliterationRule::appendToRule(result, -1,
TRUE, escapeUnprintable, quoteBuf); TRUE, escapeUnprintable, quoteBuf);
return result; return result;
} }
@ -145,6 +162,32 @@ UBool StringMatcher::matchesIndexValue(uint8_t v) const {
return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v); return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
} }
/**
 * Discard any recorded match range, putting both offsets back to -1
 * ("no match").  Callers must invoke this before starting a new set
 * of matches with this segment.
 */
void StringMatcher::resetMatch() {
    matchStart = -1;
    matchLimit = -1;
}
/**
 * Start offset, in the matched text, of the <em>rightmost</em> match
 * recorded by this matcher; negative when no match is recorded.
 * This accessor may migrate up into UnicodeMatcher if generalizing
 * it proves useful.
 */
int32_t StringMatcher::getMatchStart() const {
    return this->matchStart;
}
/**
 * Limit offset, in the matched text, of the <em>rightmost</em> match
 * recorded by this matcher; negative when no match is recorded.
 * This accessor may migrate up into UnicodeMatcher if generalizing
 * it proves useful.
 */
int32_t StringMatcher::getMatchLimit() const {
    return this->matchLimit;
}
U_NAMESPACE_END U_NAMESPACE_END
//eof //eof

View File

@ -59,6 +59,26 @@ class StringMatcher : public UnicodeMatcher {
*/ */
virtual UBool matchesIndexValue(uint8_t v) const; virtual UBool matchesIndexValue(uint8_t v) const;
/**
 * Remove any match data.  This must be called before performing a
 * set of matches with this segment.  Afterwards getMatchStart() and
 * getMatchLimit() both return -1 until a new match is recorded.
 */
void resetMatch();
/**
 * Return the start offset, in the match text, of the <em>rightmost</em>
 * match, or -1 if no match is currently recorded.  This method may get
 * moved up into the UnicodeMatcher if it turns out to be useful to
 * generalize this.
 */
int32_t getMatchStart() const;
/**
 * Return the limit offset, in the match text, of the <em>rightmost</em>
 * match, or -1 if no match is currently recorded.  This method may get
 * moved up into the UnicodeMatcher if it turns out to be useful to
 * generalize this.
 */
int32_t getMatchLimit() const;
private: private:
UnicodeString pattern; UnicodeString pattern;
@ -66,6 +86,10 @@ class StringMatcher : public UnicodeMatcher {
const TransliterationRuleData& data; const TransliterationRuleData& data;
UBool isSegment; UBool isSegment;
int32_t matchStart;
int32_t matchLimit;
}; };
U_NAMESPACE_END U_NAMESPACE_END

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
* $Date: 2001/10/26 22:59:26 $ * $Date: 2001/10/30 18:08:19 $
* $Revision: 1.57 $ * $Revision: 1.58 $
* *
***************************************************************************************** *****************************************************************************************
*/ */
@ -1268,9 +1268,11 @@ public class TransliteratorTest extends TestFmwk {
"c abc ababc", "c abc ababc",
"d d abd"); "d d abd");
// NOTE: The (ab)+ when referenced just yields a single "ab",
// not the full sequence of them. This accords with perl behavior.
expect("(ab)+ {x} > '(' $1 ')';", expect("(ab)+ {x} > '(' $1 ')';",
"x abx ababxy", "x abx ababxy",
"x ab(ab) abab(abab)y"); "x ab(ab) abab(ab)y");
expect("b+ > x;", expect("b+ > x;",
"ac abc abbc abbbc", "ac abc abbc abbbc",
@ -1288,12 +1290,11 @@ public class TransliteratorTest extends TestFmwk {
"qa qab qaba qababc", "qa qab qaba qababc",
"xa x xa xc"); "xa x xa xc");
// Oddity -- "(foo)* > $1" causes $1 to match the run of "foo"s // NOTE: The (ab)+ when referenced just yields a single "ab",
// In perl, it only matches the first occurrence, so the output // not the full sequence of them. This accords with perl behavior.
// is "()a (ab) (ab)a (ab)c".
expect("q(ab)* > '(' $1 ')';", expect("q(ab)* > '(' $1 ')';",
"qa qab qaba qababc", "qa qab qaba qababc",
"()a (ab) (ab)a (abab)c"); "()a (ab) (ab)a (ab)c");
// 'foo'+ and 'foo'* -- the quantifier should apply to the entire // 'foo'+ and 'foo'* -- the quantifier should apply to the entire
// quoted string // quoted string
@ -1574,6 +1575,46 @@ public class TransliteratorTest extends TestFmwk {
expect(gr, "\u03B1\u0314", "ha"); expect(gr, "\u03B1\u0314", "ha");
} }
/**
 * Exercise segments that carry a quantifier.  When the quantifier is
 * applied to the segment itself, a reference to that segment yields a
 * single repetition rather than the whole run, e.g.
 * ([abc])+ > x $1 x; applied to "cba" produces "xax".
 */
public void TestQuantifiedSegment() {
    // Quantifier inside the segment: the whole run is captured
    expect("([abc]+) > x $1 x;", "cba", "xcbax");

    // Quantifier wrapped around the segment: only one repetition is captured
    expect("([abc])+ > x $1 x;", "cba", "xax");

    // Same, but with the segment in the (reverse-matched) ante context
    expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");

    // Segment located in the post context
    expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");

    // Round-trip through toRules(), first for a plain segment and then
    // for a quantified one.  The spacing in these rules is significant.
    String[] rules = {
        "([a-c]){q} > x $1 x;",
        "([a-c])+{q} > x $1 x;"
    };
    for (int k = 0; k < rules.length; ++k) {
        String rule = rules[k];
        Transliterator trans =
            Transliterator.createFromRules("ID", rule, Transliterator.FORWARD);
        String regenerated = trans.toRules(true);
        if (rule.equals(regenerated)) {
            logln("Ok: \"" + rule + "\" x toRules() => \"" + regenerated + "\"");
        } else {
            errln("FAIL: \"" + rule + "\" x toRules() => \"" + regenerated + "\"");
        }
    }
}
//====================================================================== //======================================================================
// icu4j ONLY // icu4j ONLY
// These tests are not mirrored (yet) in icu4c at // These tests are not mirrored (yet) in icu4c at

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/StringMatcher.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/StringMatcher.java,v $
* $Date: 2001/10/25 22:32:02 $ * $Date: 2001/10/30 18:04:08 $
* $Revision: 1.2 $ * $Revision: 1.3 $
* *
***************************************************************************************** *****************************************************************************************
*/ */
@ -18,16 +18,27 @@ class StringMatcher implements UnicodeMatcher {
private boolean isSegment; private boolean isSegment;
private int matchStart;
private int matchLimit;
private final RuleBasedTransliterator.Data data; private final RuleBasedTransliterator.Data data;
/**
 * Construct a matcher whose pattern is the whole of the given string.
 * The matcher starts out with no recorded match (both offsets -1).
 * @param theString the pattern text
 * @param isSeg true if this matcher represents a segment
 * @param theData the rule data consulted when matching
 */
public StringMatcher(String theString,
                     boolean isSeg,
                     RuleBasedTransliterator.Data theData) {
    pattern = theString;
    isSegment = isSeg;
    data = theData;
    matchStart = -1;
    matchLimit = -1;
}
public StringMatcher(String theString, public StringMatcher(String theString,
int start, int start,
int limit, int limit,
boolean isSeg, boolean isSeg,
RuleBasedTransliterator.Data theData) { RuleBasedTransliterator.Data theData) {
data = theData; this(theString.substring(start, limit), isSeg, theData);
isSegment = isSeg;
pattern = theString.substring(start, limit);
} }
/** /**
@ -40,6 +51,7 @@ class StringMatcher implements UnicodeMatcher {
int i; int i;
int[] cursor = new int[] { offset[0] }; int[] cursor = new int[] { offset[0] };
if (limit < cursor[0]) { if (limit < cursor[0]) {
// Match in the reverse direction
for (i=pattern.length()-1; i>=0; --i) { for (i=pattern.length()-1; i>=0; --i) {
char keyChar = pattern.charAt(i); char keyChar = pattern.charAt(i);
UnicodeMatcher subm = data.lookup(keyChar); UnicodeMatcher subm = data.lookup(keyChar);
@ -58,6 +70,13 @@ class StringMatcher implements UnicodeMatcher {
} }
} }
} }
// Record the match position, but adjust for a normal
// forward start, limit, and only if a prior match does not
// exist -- we want the rightmost match.
if (matchStart < 0) {
matchStart = cursor[0]+1;
matchLimit = offset[0]+1;
}
} else { } else {
for (i=0; i<pattern.length(); ++i) { for (i=0; i<pattern.length(); ++i) {
if (incremental && cursor[0] == limit) { if (incremental && cursor[0] == limit) {
@ -85,6 +104,9 @@ class StringMatcher implements UnicodeMatcher {
} }
} }
} }
// Record the match position
matchStart = offset[0];
matchLimit = cursor[0];
} }
offset[0] = cursor[0]; offset[0] = cursor[0];
@ -114,7 +136,7 @@ class StringMatcher implements UnicodeMatcher {
result.append(')'); result.append(')');
} }
// Flush quoteBuf out to result // Flush quoteBuf out to result
TransliterationRule.appendToRule(result, (isSegment?')':-1), TransliterationRule.appendToRule(result, -1,
true, escapeUnprintable, quoteBuf); true, escapeUnprintable, quoteBuf);
return result.toString(); return result.toString();
} }
@ -130,6 +152,32 @@ class StringMatcher implements UnicodeMatcher {
UnicodeMatcher m = data.lookup(c); UnicodeMatcher m = data.lookup(c);
return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v); return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
} }
/**
 * Discard any recorded match range, putting both offsets back to -1
 * ("no match").  Callers must invoke this before starting a new set
 * of matches with this segment.
 */
public void resetMatch() {
    matchStart = -1;
    matchLimit = -1;
}
/**
 * Start offset, in the matched text, of the <em>rightmost</em> match
 * recorded by this matcher; negative when no match is recorded.
 * This accessor may migrate up into UnicodeMatcher if generalizing
 * it proves useful.
 */
public int getMatchStart() {
    return this.matchStart;
}
/**
 * Limit offset, in the matched text, of the <em>rightmost</em> match
 * recorded by this matcher; negative when no match is recorded.
 * This accessor may migrate up into UnicodeMatcher if generalizing
 * it proves useful.
 */
public int getMatchLimit() {
    return this.matchLimit;
}
} }
//eof //eof

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
* $Date: 2001/10/25 23:22:15 $ * $Date: 2001/10/30 18:04:08 $
* $Revision: 1.33 $ * $Revision: 1.34 $
* *
***************************************************************************************** *****************************************************************************************
*/ */
@ -30,13 +30,15 @@ import com.ibm.util.Utility;
* Variables are detected by looking up each character in a supplied * Variables are detected by looking up each character in a supplied
* variable list to see if it has been so defined. * variable list to see if it has been so defined.
* *
* <p>A rule may contain segments in its input string and segment references in * <p>A rule may contain segments in its input string and segment
* its output string. A segment is a substring of the input pattern, indicated * references in its output string. A segment is a substring of the
* by an offset and limit. The segment may span the preceding or following * input pattern, indicated by an offset and limit. The segment may
* context. A segment reference is a special character in the output string * be in the preceding or following context. It may not span a
* that causes a segment of the input string (not the input pattern) to be * context boundary. A segment reference is a special character in
* copied to the output string. The range of special characters that represent * the output string that causes a segment of the input string (not
* segment references is defined by RuleBasedTransliterator.Data. * the input pattern) to be copied to the output string. The range of
* special characters that represent segment references is defined by
* RuleBasedTransliterator.Data.
* *
* <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input * <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
* string "abc.123" to "ab1.c23". * string "abc.123" to "ab1.c23".
@ -44,7 +46,7 @@ import com.ibm.util.Utility;
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved. * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
* *
* @author Alan Liu * @author Alan Liu
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.33 $ $Date: 2001/10/25 23:22:15 $ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.34 $ $Date: 2001/10/30 18:04:08 $
*/ */
class TransliterationRule { class TransliterationRule {
@ -64,20 +66,13 @@ class TransliterationRule {
private String output; private String output;
/** /**
* An array of integers encoding the position of the segments. * An array of matcher objects corresponding to the input pattern
* See RuleBasedTransliterator.Segments for more details. * segments. If there are no segments this is null. N.B. This is
* a UnicodeMatcher for generality, but in practice it is always a
* StringMatcher. In the future we may generalize this, but for
* now we sometimes cast down to StringMatcher.
*/ */
int[] segments; UnicodeMatcher[] segments;
/**
* A value we compute from segments. The first index into segments[]
* that is >= anteContextLength. That is, the first one that is within
* the forward scanned part of the pattern -- the key or the postContext.
* If there are no segments, this has the value -1. This index is relative
* to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
* segments[FIRST_SEG_POS_INDEX + firstKeySeg].
*/
int firstKeySeg;
/** /**
* The length of the string that must match before the key. If * The length of the string that must match before the key. If
@ -127,20 +122,6 @@ class TransliterationRule {
private static final char APOSTROPHE = '\''; private static final char APOSTROPHE = '\'';
private static final char BACKSLASH = '\\'; private static final char BACKSLASH = '\\';
// Macros for accessing the array of integers encoding the position of
// the segments. See RuleBasedTransliterator.Segments for more details.
// SEGMENTS_COUNT number of segments, n (half the number of parens)
// SEGMENTS_LEN length of the segments array (number of elements)
// SEGMENTS_POS position in 'pattern' of parenthesis i, where i=0..2n-1
// SEGMENTS_NUM index into segments to access POS of $1.open,
// $1.close, $2.open, $2.close,.., $n.open, $n.close
// Relative to FIRST_SEG_POS_INDEX. Ranges from 0..2n-1.
static final int FIRST_SEG_POS_INDEX = 2;
static final int SEGMENTS_COUNT(int[] x) { return x[0]; }
static final int SEGMENTS_LEN(int[] x) { return (SEGMENTS_COUNT(x)*4+4); }
static final int SEGMENTS_POS(int[] x,int i) { return x[FIRST_SEG_POS_INDEX+i]; }
static final int SEGMENTS_NUM(int[] x,int i) { return x[x[1]+i]-FIRST_SEG_POS_INDEX; }
private static final String COPYRIGHT = private static final String COPYRIGHT =
"\u00A9 IBM Corporation 1999-2001. All rights reserved."; "\u00A9 IBM Corporation 1999-2001. All rights reserved.";
@ -165,12 +146,8 @@ class TransliterationRule {
* 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to * 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
* "xyz" and moves the cursor to before "a". It would have a cursorOffset * "xyz" and moves the cursor to before "a". It would have a cursorOffset
* of -3. * of -3.
* @param segs array of 2n integers. Each of n pairs consists of offset, * @param segs array of UnicodeMatcher corresponding to input pattern
* limit for a segment of the input string. Characters in the output string * segments, or null if there are none
* refer to these segments if they are in a special range determined by the
* associated RuleBasedTransliterator.Data object. May be null if there are
* no segments. The caller is responsible for validating that segments
* are well-formed.
* @param anchorStart true if the rule is anchored on the left to * @param anchorStart true if the rule is anchored on the left to
* the context start * the context start
* @param anchorEnd true if the rule is anchored on the right to the * @param anchorEnd true if the rule is anchored on the right to the
@ -180,7 +157,7 @@ class TransliterationRule {
int anteContextPos, int postContextPos, int anteContextPos, int postContextPos,
String output, String output,
int cursorPos, int cursorOffset, int cursorPos, int cursorOffset,
int[] segs, UnicodeMatcher[] segs,
boolean anchorStart, boolean anchorEnd, boolean anchorStart, boolean anchorEnd,
RuleBasedTransliterator.Data theData) { RuleBasedTransliterator.Data theData) {
data = theData; data = theData;
@ -212,25 +189,11 @@ class TransliterationRule {
this.cursorPos = cursorPos + cursorOffset; this.cursorPos = cursorPos + cursorOffset;
this.output = output; this.output = output;
// We don't validate the segments array. The caller must // We don't validate the segments array. The caller must
// guarantee that the segments are well-formed. // guarantee that the segments are well-formed (that is, that
// all $n references in the output refer to indices of this
// array, and that no array elements are null).
this.segments = segs; this.segments = segs;
// Find the position of the first segment index that is after the
// anteContext (in the key). Note that this may be a start or a
// limit index. If all segments are in the ante context,
// firstKeySeg should point past the last segment -- that is, it
// should point at the end marker, which is -1. This allows the
// code to back up by one to obtain the last ante context segment.
firstKeySeg = -1;
if (segments != null) {
firstKeySeg = FIRST_SEG_POS_INDEX;
while (segments[firstKeySeg] >= 0 &&
segments[firstKeySeg] < anteContextLength) {
++firstKeySeg;
}
firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
}
pattern = input; pattern = input;
flags = 0; flags = 0;
if (anchorStart) { if (anchorStart) {
@ -410,25 +373,12 @@ class TransliterationRule {
// ============================ MATCH =========================== // ============================ MATCH ===========================
// Record the actual positions, in the text, of the segments. // Reset segment match data
// These are recorded in the order that they occur in the pattern.
// segPos[] is an array of 2*SEGMENTS_COUNT elements. It
// records the position in 'text' of each segment boundary, in
// the order that they occur in 'pattern'.
int[] segPos = null;
if (segments != null) { if (segments != null) {
segPos = new int[2*SEGMENTS_COUNT(segments)]; for (int i=0; i<segments.length; ++i) {
((StringMatcher) segments[i]).resetMatch();
}
} }
// iSeg is an index into segments[] that accesses the first
// array. As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
// When indexing into segments[] FIRST_SEG_POS_INDEX must be
// added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
int iSeg = firstKeySeg - 1;
// nextSegPos is an offset in 'pattern'. When the cursor is
// equal to nextSegPos, we are at a segment boundary, and we
// record the position in the real text in segPos[].
int nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
int lenDelta, keyLimit; int lenDelta, keyLimit;
int[] intRef = new int[1]; int[] intRef = new int[1];
@ -465,15 +415,6 @@ class TransliterationRule {
} }
oText = intRef[0]; oText = intRef[0];
} }
while (nextSegPos == oPattern) {
segPos[iSeg] = oText;
if (oText >= 0) {
segPos[iSeg] += UTF16.getCharCount(UTF16.charAt(text, oText));
} else {
++segPos[iSeg];
}
nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
}
} }
minOText = posAfter(text, oText); minOText = posAfter(text, oText);
@ -486,9 +427,6 @@ class TransliterationRule {
// -------------------- Key and Post Context -------------------- // -------------------- Key and Post Context --------------------
iSeg = firstKeySeg;
nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
oPattern = 0; oPattern = 0;
oText = pos.start; oText = pos.start;
keyLimit = 0; keyLimit = 0;
@ -511,10 +449,6 @@ class TransliterationRule {
// depending on whether we're in the key or in the post // depending on whether we're in the key or in the post
// context. // context.
while (oPattern == nextSegPos) {
segPos[iSeg] = oText;
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
}
if (oPattern == keyLength) { if (oPattern == keyLength) {
keyLimit = oText; keyLimit = oText;
} }
@ -554,10 +488,6 @@ class TransliterationRule {
//! return UnicodeMatcher.U_MISMATCH; //! return UnicodeMatcher.U_MISMATCH;
//!} //!}
} }
while (oPattern == nextSegPos) {
segPos[iSeg] = oText;
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
}
if (oPattern == keyLength) { if (oPattern == keyLength) {
keyLimit = oText; keyLimit = oText;
} }
@ -576,8 +506,7 @@ class TransliterationRule {
// =========================== REPLACE ========================== // =========================== REPLACE ==========================
// We have a full match. The key is between pos.start and // We have a full match. The key is between pos.start and
// keyLimit. Segment indices have been recorded in segPos[]. // keyLimit.
// Perform a replacement.
if (segments == null) { if (segments == null) {
text.replace(pos.start, keyLimit, output); text.replace(pos.start, keyLimit, output);
@ -629,12 +558,23 @@ class TransliterationRule {
buf.setLength(0); buf.setLength(0);
} }
// Copy segment with out-of-band data // Copy segment with out-of-band data
b *= 2; StringMatcher m = (StringMatcher) segments[b];
int start = segPos[SEGMENTS_NUM(segments,b)]; int start = m.getMatchStart();
int limit = segPos[SEGMENTS_NUM(segments,b+1)]; int limit = m.getMatchLimit();
// If there was no match, that means that a quantifier
// matched zero-length. E.g., x (a)* y matched "xy".
if (start >= 0) {
// Adjust indices for segments in post context
// for any inserted text between the key and
// the post context.
if (start >= keyLimit) {
start += dest - keyLimit;
limit += dest - keyLimit;
}
text.copy(start, limit, dest); text.copy(start, limit, dest);
dest += limit - start; dest += limit - start;
} }
}
oOutput += UTF16.getCharCount(c); oOutput += UTF16.getCharCount(c);
} }
// Insert any accumulated straight text. // Insert any accumulated straight text.
@ -790,20 +730,6 @@ class TransliterationRule {
StringBuffer rule = new StringBuffer(); StringBuffer rule = new StringBuffer();
// iseg indexes into segments[] directly (not offset from FSPI)
int iseg = FIRST_SEG_POS_INDEX-1;
int nextSeg = -1;
// Build an array of booleans specifying open vs. close paren
boolean[] isOpen = null;
if (segments != null) {
isOpen = new boolean[2*SEGMENTS_COUNT(segments)];
for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
isOpen[SEGMENTS_NUM(segments,i) ] = true;
isOpen[SEGMENTS_NUM(segments,i+1)] = false;
}
nextSeg = segments[++iseg];
}
// Accumulate special characters (and non-specials following them) // Accumulate special characters (and non-specials following them)
// into quoteBuf. Append quoteBuf, within single quotes, when // into quoteBuf. Append quoteBuf, within single quotes, when
// a non-quoted element must be inserted. // a non-quoted element must be inserted.
@ -825,14 +751,6 @@ class TransliterationRule {
appendToRule(rule, '{', true, escapeUnprintable, quoteBuf); appendToRule(rule, '{', true, escapeUnprintable, quoteBuf);
} }
// Append either '(' or ')' if we are at a segment index
if (i == nextSeg) {
appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
'(' : ')',
true, escapeUnprintable, quoteBuf);
nextSeg = segments[++iseg];
}
if (emitBraces && i == (anteContextLength + keyLength)) { if (emitBraces && i == (anteContextLength + keyLength)) {
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf); appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
} }
@ -847,11 +765,6 @@ class TransliterationRule {
} }
} }
if (i == nextSeg) {
// assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
appendToRule(rule, ')', true, escapeUnprintable, quoteBuf);
}
if (emitBraces && i == (anteContextLength + keyLength)) { if (emitBraces && i == (anteContextLength + keyLength)) {
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf); appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
} }
@ -885,7 +798,7 @@ class TransliterationRule {
} else { } else {
++seg; // make 1-based ++seg; // make 1-based
appendToRule(rule, 0x20, true, escapeUnprintable, quoteBuf); appendToRule(rule, 0x20, true, escapeUnprintable, quoteBuf);
rule.append(0x24 /*$*/); rule.append('$');
boolean show = false; // true if we should display digits boolean show = false; // true if we should display digits
for (int p=9; p>=0; --p) { for (int p=9; p>=0; --p) {
int d = seg / POW10[p]; int d = seg / POW10[p];
@ -938,6 +851,9 @@ class TransliterationRule {
/** /**
* $Log: TransliterationRule.java,v $ * $Log: TransliterationRule.java,v $
* Revision 1.34 2001/10/30 18:04:08 alan
* jitterbug 1406: make quantified segments behave like perl counterparts
*
* Revision 1.33 2001/10/25 23:22:15 alan * Revision 1.33 2001/10/25 23:22:15 alan
* jitterbug 73: changes to support zero-length matchers at end of key * jitterbug 73: changes to support zero-length matchers at end of key
* *

View File

@ -4,8 +4,8 @@
* Corporation and others. All Rights Reserved. * Corporation and others. All Rights Reserved.
********************************************************************** **********************************************************************
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliteratorParser.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliteratorParser.java,v $
* $Date: 2001/10/24 00:03:38 $ * $Date: 2001/10/30 18:04:09 $
* $Revision: 1.7 $ * $Revision: 1.8 $
********************************************************************** **********************************************************************
*/ */
package com.ibm.text; package com.ibm.text;
@ -117,6 +117,7 @@ class TransliteratorParser {
private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
private static final String OPERATORS = "=><"; private static final String OPERATORS = "=><";
private static final String HALF_ENDERS = "=><;";
// Other special characters // Other special characters
private static final char QUOTE = '\''; private static final char QUOTE = '\'';
@ -142,7 +143,7 @@ class TransliteratorParser {
// private static final char ANCHOR_END = '$'; // private static final char ANCHOR_END = '$';
// Segments of the input string are delimited by "(" and ")". In the // Segments of the input string are delimited by "(" and ")". In the
// output string these segments are referenced as "$1" through "$9". // output string these segments are referenced as "$1", "$2", etc.
private static final char SEGMENT_OPEN = '('; private static final char SEGMENT_OPEN = '(';
private static final char SEGMENT_CLOSE = ')'; private static final char SEGMENT_CLOSE = ')';
@ -285,209 +286,6 @@ class TransliteratorParser {
} }
}; };
//----------------------------------------------------------------------
// class Segments
//----------------------------------------------------------------------
/**
* Segments are parentheses-enclosed regions of the input string.
* These are referenced in the output string using the notation $1,
* $2, etc. Numbering is in order of appearance of the left
* parenthesis. Number is one-based. Segments are defined as start,
* limit pairs. Segments may nest.
*
* During parsing, segment data is encoded in an object of class
* Segments. At runtime, the same data is encoded in compact form as
* an array of integers in a TransliterationRule. The runtime encoding
* must satisfy three goals:
*
* 1. Iterate over the offsets in a pattern, from left to right,
* and indicate all segment boundaries, in order. This is done
* during matching.
*
* 2. Given a reference $n, produce the start and limit offsets
* for that segment. This is done during replacement.
*
* 3. Similar to goal 1, but in addition, indicate whether each
* segment boundary is a start or a limit, in other words, whether
* each is an open paren or a close paren. This is required by
* the toRule() method.
*
* Goal 1 must be satisfied at high speed since this is done during
* matching. Goal 2 is next most important. Goal 3 is not performance
* critical since it is only needed by toRule().
*
* The array of integers is actually two arrays concatenated. The
* first gives the index values of the open and close parentheses in
* the order they appear. The second maps segment numbers to the
* indices of the first array. The two arrays have the same length.
* Iterating over the first array satisfies goal 1. Indexing into the
* second array satisfies goal 2. Goal 3 is satisfied by iterating
* over the second array and constructing the required data when
* needed. This is what toRule() does.
*
* Example: (a b(c d)e f)
* 0 1 2 3 4 5 6
*
* First array: Indices are 0, 2, 4, and 6.
* Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
* second array is 0, 3, 1, 2 -- these give the indices in the
* first array at which $1:open, $1:close, $2:open, and $2:close
* occur.
*
* The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
*
* Each subarray is terminated with a -1, and two leading entries
* give the number of segments and the offset to the first entry
* of the second array. In addition, the second array values are
* all offset by 2 so they index directly into the final array.
* The total array size is 4*segments[0] + 4. The second index is
* 2*segments[0] + 3.
*
* In the output string, a segment reference is indicated by a
* character in a special range, as defined by
* RuleBasedTransliterator.Data.
*
* Most rules have no segments, in which case segments is null, and the
* output string need not be checked for segment reference characters.
*
* See also rbt_rule.h/cpp.
*/
private static class Segments {
private Vector offsets; // holds Integer objects
private Vector isOpenParen; // holds Boolean objects
private int offset(int i) {
return ((Integer) offsets.elementAt(i)).intValue();
}
private boolean isOpen(int i) {
return ((Boolean) isOpenParen.elementAt(i)).booleanValue();
}
// size of the Vectors
private int size() {
// assert(offset.size() == isOpenParen.size());
return offsets.size();
}
public Segments() {
offsets = new Vector();
isOpenParen = new Vector();
}
public void addParenthesisAt(int offset, boolean isOpen) {
offsets.addElement(new Integer(offset));
isOpenParen.addElement(new Boolean(isOpen));
}
public int getLastParenOffset(boolean[] isOpenParen) {
if (size() == 0) {
return -1;
}
isOpenParen[0] = isOpen(size()-1);
return offset(size()-1);
}
// Remove the last (rightmost) segment. Store its offsets in start
// and limit, and then convert all offsets at or after start to be
// equal to start. Upon failure, return FALSE. Assume that the
// caller has already called getLastParenOffset() and validated that
// there is at least one parenthesis and that the last one is a close
// paren.
public boolean extractLastParenSubstring(int[] start, int[] limit) {
// assert(offsets.size() > 0);
// assert(isOpenParen.elementAt(isOpenParen.size()-1) == 0);
int i = size() - 1;
int n = 1; // count of close parens we need to match
// Record position of the last close paren
limit[0] = offset(i);
--i; // back up to the one before the last one
while (i >= 0 && n != 0) {
n += isOpen(i) ? -1 : 1;
}
if (n != 0) {
return false;
}
// assert(i>=0);
start[0] = offset(i);
// Reset all segment pairs from i to size() - 1 to [start, start+1).
while (i<size()) {
int o = isOpen(i) ? start[0] : (start[0]+1);
offsets.setElementAt(new Integer(o), i);
++i;
}
return true;
}
// Assume caller has already gotten a TRUE validate().
public int[] createArray() {
int c = count(); // number of segments
int arrayLen = 4*c + 4;
int[] array = new int[arrayLen];
int a2offset = 2*c + 3; // offset to array 2
array[0] = c;
array[1] = a2offset;
int i;
for (i=0; i<2*c; ++i) {
array[2+i] = offset(i);
}
array[a2offset-1] = -1;
array[arrayLen-1] = -1;
// Now walk through and match up segment numbers with parentheses.
// Number segments from 0. We're going to offset all entries by 2
// to skip the first two elements, array[0] and array[1].
Stack stack = new Stack();
int nextOpen = 0; // seg # of next open, 0-based
for (i=0; i<2*c; ++i) {
boolean open = isOpen(i);
// Let seg be the zero-based segment number.
// Open parens are at 2*seg in array 2.
// Close parens are at 2*seg+1 in array 2.
if (open) {
array[a2offset + 2*nextOpen] = 2+i;
stack.push(new Integer(nextOpen));
++nextOpen;
} else {
int nextClose = ((Integer) stack.pop()).intValue();
array[a2offset + 2*nextClose+1] = 2+i;
}
}
// assert(stack.empty());
return array;
}
public boolean validate() {
// want number of parens >= 2
// want number of parens to be even
// want first paren '('
// want parens to match up in the end
if ((size() < 2) || (size() % 2 != 0) || !isOpen(0)) {
return false;
}
int n = 0;
for (int i=0; i<size(); ++i) {
n += isOpen(i) ? 1 : -1;
if (n < 0) {
return false;
}
}
return n == 0;
}
// Number of segments
// Assume caller has already gotten a TRUE validate().
public int count() {
// assert(validate());
return size() / 2;
}
}
//---------------------------------------------------------------------- //----------------------------------------------------------------------
// class RuleHalf // class RuleHalf
//---------------------------------------------------------------------- //----------------------------------------------------------------------
@ -505,11 +303,7 @@ class TransliteratorParser {
public int ante = -1; // position of ante context marker '{' in text public int ante = -1; // position of ante context marker '{' in text
public int post = -1; // position of post context marker '}' in text public int post = -1; // position of post context marker '}' in text
// Record the position of the segment substrings and references. A public int maxRef = -1; // n where maximum segment ref is $n; 1-based
// given side should have segments or segment references, but not
// both.
public Segments segments = null;
public int maxRef = -1; // index of largest ref (1..9)
// Record the offset to the cursor either to the left or to the // Record the offset to the cursor either to the left or to the
// right of the key. This is indicated by characters on the output // right of the key. This is indicated by characters on the output
@ -521,29 +315,88 @@ class TransliteratorParser {
// output text. // output text.
public int cursorOffset = 0; // only nonzero on output side public int cursorOffset = 0; // only nonzero on output side
// Position of first CURSOR_OFFSET on _right_. This will be -1
// for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
private int cursorOffsetPos = 0;
public boolean anchorStart = false; public boolean anchorStart = false;
public boolean anchorEnd = false; public boolean anchorEnd = false;
/**
* UnicodeMatcher objects corresponding to each segment.
*/
public Vector segments = new Vector();
/**
* The segment number from 0..n-1 of the next '(' we see
* during parsing; 0-based.
*/
private int nextSegmentNumber = 0;
/** /**
* Parse one side of a rule, stopping at either the limit, * Parse one side of a rule, stopping at either the limit,
* the END_OF_RULE character, or an operator. Return * the END_OF_RULE character, or an operator.
* the pos of the terminating character (or limit). * @return the index after the terminating character, or
* if limit was reached, limit
*/ */
public int parse(String rule, int pos, int limit, public int parse(String rule, int pos, int limit,
TransliteratorParser parser) { TransliteratorParser parser) {
int start = pos; int start = pos;
StringBuffer buf = new StringBuffer(); StringBuffer buf = new StringBuffer();
pos = parseSection(rule, pos, limit, parser, buf, false);
text = buf.toString();
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
syntaxError("Misplaced " + CURSOR_POS, rule, start);
}
return pos;
}
/**
* Parse a section of one side of a rule, stopping at either
* the limit, the END_OF_RULE character, an operator, or a
* segment close character. This method parses both a
* top-level rule half and a segment within such a rule half.
* It calls itself recursively to parse segments and nested
* segments.
* @param buf buffer into which to accumulate the rule pattern
* characters, either literal characters from the rule or
* standins for UnicodeMatcher objects including segments.
* @param isSegment if true, then we've already seen a '(' and
* pos on entry points right after it. Accumulate everything
* up to the closing ')', put it in a segment matcher object,
* generate a standin for it, and add the standin to buf. As
* a side effect, update the segments vector with a reference
* to the segment matcher. This works recursively for nested
* segments. If isSegment is false, just accumulate
* characters into buf.
* @return the index after the terminating character, or
* if limit was reached, limit
*/
private int parseSection(String rule, int pos, int limit,
TransliteratorParser parser,
StringBuffer buf,
boolean isSegment) {
int start = pos;
ParsePosition pp = null; ParsePosition pp = null;
int cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
boolean done = false;
int quoteStart = -1; // Most recent 'single quoted string' int quoteStart = -1; // Most recent 'single quoted string'
int quoteLimit = -1; int quoteLimit = -1;
int varStart = -1; // Most recent $variableReference int varStart = -1; // Most recent $variableReference
int varLimit = -1; int varLimit = -1;
int[] iref = new int[1]; int[] iref = new int[1];
// If isSegment, then bufSegStart is the offset in buf to
// the first character of the segment we are parsing.
int bufSegStart = 0;
int segmentNumber = 0;
if (isSegment) {
bufSegStart = buf.length();
segmentNumber = nextSegmentNumber++;
}
main: main:
while (pos < limit && !done) { while (pos < limit) {
char c = rule.charAt(pos++); char c = rule.charAt(pos++);
if (Character.isWhitespace(c)) { if (Character.isWhitespace(c)) {
// Ignore whitespace. Note that this is not Unicode // Ignore whitespace. Note that this is not Unicode
@ -551,8 +404,11 @@ class TransliteratorParser {
// whitespace likely to be seen in code. // whitespace likely to be seen in code.
continue; continue;
} }
if (OPERATORS.indexOf(c) >= 0) { // HALF_ENDERS is all chars that end a rule half: "<>=;"
--pos; // Backup to point to operator if (HALF_ENDERS.indexOf(c) >= 0) {
if (isSegment) {
syntaxError("Unclosed segment", rule, start);
}
break main; break main;
} }
if (anchorEnd) { if (anchorEnd) {
@ -614,7 +470,12 @@ class TransliteratorParser {
} }
continue; continue;
} }
switch (c) { switch (c) {
//------------------------------------------------------
// Elements allowed within and out of segments
//------------------------------------------------------
case ANCHOR_START: case ANCHOR_START:
if (buf.length() == 0 && !anchorStart) { if (buf.length() == 0 && !anchorStart) {
anchorStart = true; anchorStart = true;
@ -624,17 +485,8 @@ class TransliteratorParser {
} }
break; break;
case SEGMENT_OPEN: case SEGMENT_OPEN:
case SEGMENT_CLOSE: pos = parseSection(rule, pos, limit, parser, buf, true);
// Handle segment definitions "(" and ")"
// Parse "(", ")"
if (segments == null) {
segments = new Segments();
}
segments.addParenthesisAt(buf.length(), c == SEGMENT_OPEN);
break; break;
case END_OF_RULE:
--pos; // Backup to point to END_OF_RULE
break main;
case SymbolTable.SYMBOL_REF: case SymbolTable.SYMBOL_REF:
// Handle variable references and segment references "$1" .. "$9" // Handle variable references and segment references "$1" .. "$9"
{ {
@ -697,25 +549,129 @@ class TransliteratorParser {
} }
} }
break; break;
case DOT:
buf.append(parser.getDotStandIn());
break;
case KLEENE_STAR:
case ONE_OR_MORE:
case ZERO_OR_ONE:
// Quantifiers. We handle single characters, quoted strings,
// variable references, and segments.
// a+ matches aaa
// 'foo'+ matches foofoofoo
// $v+ matches xyxyxy if $v == xy
// (seg)+ matches segsegseg
{
if (isSegment && buf.length() == bufSegStart) {
// The */+ immediately follows '('
syntaxError("Misplaced quantifier", rule, start);
break;
}
int qstart, qlimit;
// The */+ follows an isolated character or quote
// or variable reference
if (buf.length() == quoteLimit) {
// The */+ follows a 'quoted string'
qstart = quoteStart;
qlimit = quoteLimit;
} else if (buf.length() == varLimit) {
// The */+ follows a $variableReference
qstart = varStart;
qlimit = varLimit;
} else {
// The */+ follows a single character, possibly
// a segment standin
qstart = buf.length() - 1;
qlimit = qstart + 1;
}
UnicodeMatcher m =
new StringMatcher(buf.toString(), qstart, qlimit,
false, parser.data);
int min = 0;
int max = Quantifier.MAX;
switch (c) {
case ONE_OR_MORE:
min = 1;
break;
case ZERO_OR_ONE:
min = 0;
max = 1;
break;
// case KLEENE_STAR:
// do nothing -- min, max already set
}
m = new Quantifier(m, min, max);
buf.setLength(qstart);
buf.append(parser.generateStandInFor(m));
}
break;
//------------------------------------------------------
// Elements allowed ONLY WITHIN segments
//------------------------------------------------------
case SEGMENT_CLOSE:
if (isSegment) {
// We're done parsing a segment. The relevant
// characters are in buf, starting at offset
// bufSegStart. Extract them into a string
// matcher, and replace them with a standin
// for that matcher.
StringMatcher m =
new StringMatcher(buf.substring(bufSegStart),
true, parser.data);
// Since we call parseSection() recursively,
// nested segments will result in segment i+1
// getting parsed and stored before segment i;
// be careful with the vector handling here.
if ((segmentNumber+1) > segments.size()) {
segments.setSize(segmentNumber+1);
}
segments.setElementAt(m, segmentNumber);
buf.setLength(bufSegStart);
buf.append(parser.generateStandInFor(m));
break main;
}
// If we aren't in a segment, then a segment close
// character is a syntax error.
syntaxError("Unquoted special", rule, start);
break;
//------------------------------------------------------
// Elements allowed ONLY OUTSIDE segments
//------------------------------------------------------
case CONTEXT_ANTE: case CONTEXT_ANTE:
if (isSegment) {
syntaxError("Illegal character '" + c + "' in segment", rule, start);
}
if (ante >= 0) { if (ante >= 0) {
syntaxError("Multiple ante contexts", rule, start); syntaxError("Multiple ante contexts", rule, start);
} }
ante = buf.length(); ante = buf.length();
break; break;
case CONTEXT_POST: case CONTEXT_POST:
if (isSegment) {
syntaxError("Illegal character '" + c + "' in segment", rule, start);
}
if (post >= 0) { if (post >= 0) {
syntaxError("Multiple post contexts", rule, start); syntaxError("Multiple post contexts", rule, start);
} }
post = buf.length(); post = buf.length();
break; break;
case CURSOR_POS: case CURSOR_POS:
if (isSegment) {
syntaxError("Illegal character '" + c + "' in segment", rule, start);
}
if (cursor >= 0) { if (cursor >= 0) {
syntaxError("Multiple cursors", rule, start); syntaxError("Multiple cursors", rule, start);
} }
cursor = buf.length(); cursor = buf.length();
break; break;
case CURSOR_OFFSET: case CURSOR_OFFSET:
if (isSegment) {
syntaxError("Illegal character '" + c + "' in segment", rule, start);
}
if (cursorOffset < 0) { if (cursorOffset < 0) {
if (buf.length() > 0) { if (buf.length() > 0) {
syntaxError("Misplaced " + c, rule, start); syntaxError("Misplaced " + c, rule, start);
@ -737,74 +693,10 @@ class TransliteratorParser {
} }
} }
break; break;
case DOT:
buf.append(parser.getDotStandIn()); //------------------------------------------------------
break; // Non-special characters
case KLEENE_STAR: //------------------------------------------------------
case ONE_OR_MORE:
case ZERO_OR_ONE:
// Quantifiers. We handle single characters, quoted strings,
// variable references, and segments.
// a+ matches aaa
// 'foo'+ matches foofoofoo
// $v+ matches xyxyxy if $v == xy
// (seg)+ matches segsegseg
{
int qstart, qlimit;
boolean[] isOpenParen = new boolean[1];
boolean isSegment = false;
if (segments != null &&
segments.getLastParenOffset(isOpenParen) == buf.length()) {
// The */+ immediately follows a segment
if (isOpenParen[0]) {
syntaxError("Misplaced quantifier", rule, start);
}
int[] startparam = new int[1];
int[] limitparam = new int[1];
if (!segments.extractLastParenSubstring(startparam, limitparam)) {
syntaxError("Mismatched segment delimiters", rule, start);
}
qstart = startparam[0];
qlimit = limitparam[0];
isSegment = true;
} else {
// The */+ follows an isolated character or quote
// or variable reference
if (buf.length() == quoteLimit) {
// The */+ follows a 'quoted string'
qstart = quoteStart;
qlimit = quoteLimit;
} else if (buf.length() == varLimit) {
// The */+ follows a $variableReference
qstart = varStart;
qlimit = varLimit;
} else {
// The */+ follows a single character
qstart = buf.length() - 1;
qlimit = qstart + 1;
}
}
UnicodeMatcher m =
new StringMatcher(buf.toString(), qstart, qlimit,
isSegment, parser.data);
int min = 0;
int max = Quantifier.MAX;
switch (c) {
case ONE_OR_MORE:
min = 1;
break;
case ZERO_OR_ONE:
min = 0;
max = 1;
break;
// case KLEENE_STAR:
// do nothing -- min, max already set
}
m = new Quantifier(m, min, max);
buf.setLength(qstart);
buf.append(parser.generateStandInFor(m));
}
break;
default: default:
// Disallow unquoted characters other than [0-9A-Za-z] // Disallow unquoted characters other than [0-9A-Za-z]
// in the printable ASCII range. These characters are // in the printable ASCII range. These characters are
@ -819,11 +711,6 @@ class TransliteratorParser {
break; break;
} }
} }
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
syntaxError("Misplaced " + CURSOR_POS, rule, start);
}
text = buf.toString();
return pos; return pos;
} }
@ -838,10 +725,12 @@ class TransliteratorParser {
} }
/** /**
* Create and return an int[] array of segments. * Create and return a UnicodeMatcher[] array of segments,
* or null if there are no segments.
*/ */
int[] createSegments() { UnicodeMatcher[] createSegments() {
return (segments == null) ? null : segments.createArray(); return (segments.size() == 0) ? null :
(UnicodeMatcher[]) segments.toArray(new UnicodeMatcher[segments.size()]);
} }
} }
@ -1096,9 +985,10 @@ class TransliteratorParser {
pos = left.parse(rule, pos, limit, this); pos = left.parse(rule, pos, limit, this);
if (pos == limit || if (pos == limit ||
OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) { OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
syntaxError("No operator", rule, start); syntaxError("No operator pos=" + pos, rule, start);
} }
++pos;
// Found an operator char. Check for forward-reverse operator. // Found an operator char. Check for forward-reverse operator.
if (operator == REVERSE_RULE_OP && if (operator == REVERSE_RULE_OP &&
@ -1110,7 +1000,7 @@ class TransliteratorParser {
pos = right.parse(rule, pos, limit, this); pos = right.parse(rule, pos, limit, this);
if (pos < limit) { if (pos < limit) {
if (rule.charAt(pos) == END_OF_RULE) { if (rule.charAt(--pos) == END_OF_RULE) {
++pos; ++pos;
} else { } else {
// RuleHalf parser must have terminated at an operator // RuleHalf parser must have terminated at an operator
@ -1173,7 +1063,7 @@ class TransliteratorParser {
// apply. // apply.
if (operator == FWDREV_RULE_OP) { if (operator == FWDREV_RULE_OP) {
right.removeContext(); right.removeContext();
right.segments = null; right.segments.removeAllElements();
left.cursor = left.maxRef = -1; left.cursor = left.maxRef = -1;
left.cursorOffset = 0; left.cursorOffset = 0;
} }
@ -1193,7 +1083,7 @@ class TransliteratorParser {
// cannot place the cursor outside the limits of the context. // cannot place the cursor outside the limits of the context.
// Anchors are only allowed on the input side. // Anchors are only allowed on the input side.
if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 || if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
right.segments != null || left.maxRef >= 0 || right.segments.size() > 0 || left.maxRef >= 0 ||
(right.cursorOffset != 0 && right.cursor < 0) || (right.cursorOffset != 0 && right.cursor < 0) ||
// - The following two checks were used to ensure that the // - The following two checks were used to ensure that the
// - the cursor offset stayed within the ante- or postcontext. // - the cursor offset stayed within the ante- or postcontext.
@ -1208,14 +1098,8 @@ class TransliteratorParser {
// Check integrity of segments and segment references. Each // Check integrity of segments and segment references. Each
// segment's start must have a corresponding limit, and the // segment's start must have a corresponding limit, and the
// references must not refer to segments that do not exist. // references must not refer to segments that do not exist.
if (left.segments != null) { if (right.maxRef > left.segments.size()) {
if (!left.segments.validate()) { syntaxError("Undefined segment reference $" + right.maxRef, rule, start);
syntaxError("Missing segment close", rule, start);
}
int n = left.segments.count();
if (right.maxRef > n) {
syntaxError("Undefined segment reference", rule, start);
}
} }
data.ruleSet.addRule(new TransliterationRule( data.ruleSet.addRule(new TransliterationRule(
@ -1363,7 +1247,7 @@ class TransliteratorParser {
char generateStandInFor(UnicodeMatcher matcher) { char generateStandInFor(UnicodeMatcher matcher) {
// assert(matcher != null); // assert(matcher != null);
if (variableNext >= variableLimit) { if (variableNext >= variableLimit) {
throw new RuntimeException("Private use variables exhausted"); throw new RuntimeException("Variable range exhausted");
} }
variablesVector.addElement(matcher); variablesVector.addElement(matcher);
return variableNext++; return variableNext++;

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
* $Date: 2001/10/26 22:59:26 $ * $Date: 2001/10/30 18:08:19 $
* $Revision: 1.57 $ * $Revision: 1.58 $
* *
***************************************************************************************** *****************************************************************************************
*/ */
@ -1268,9 +1268,11 @@ public class TransliteratorTest extends TestFmwk {
"c abc ababc", "c abc ababc",
"d d abd"); "d d abd");
// NOTE: The (ab)+ when referenced just yields a single "ab",
// not the full sequence of them. This accords with perl behavior.
expect("(ab)+ {x} > '(' $1 ')';", expect("(ab)+ {x} > '(' $1 ')';",
"x abx ababxy", "x abx ababxy",
"x ab(ab) abab(abab)y"); "x ab(ab) abab(ab)y");
expect("b+ > x;", expect("b+ > x;",
"ac abc abbc abbbc", "ac abc abbc abbbc",
@ -1288,12 +1290,11 @@ public class TransliteratorTest extends TestFmwk {
"qa qab qaba qababc", "qa qab qaba qababc",
"xa x xa xc"); "xa x xa xc");
// Oddity -- "(foo)* > $1" causes $1 to match the run of "foo"s // NOTE: The (ab)+ when referenced just yields a single "ab",
// In perl, it only matches the first occurrence, so the output // not the full sequence of them. This accords with perl behavior.
// is "()a (ab) (ab)a (ab)c".
expect("q(ab)* > '(' $1 ')';", expect("q(ab)* > '(' $1 ')';",
"qa qab qaba qababc", "qa qab qaba qababc",
"()a (ab) (ab)a (abab)c"); "()a (ab) (ab)a (ab)c");
// 'foo'+ and 'foo'* -- the quantifier should apply to the entire // 'foo'+ and 'foo'* -- the quantifier should apply to the entire
// quoted string // quoted string
@ -1574,6 +1575,46 @@ public class TransliteratorTest extends TestFmwk {
expect(gr, "\u03B1\u0314", "ha"); expect(gr, "\u03B1\u0314", "ha");
} }
/**
 * Exercise segments that carry a quantifier.  When the quantifier sits
 * outside the parentheses, a reference to the segment yields a single
 * repetition rather than the whole run; for example
 *   ([abc])+ > x $1 x;   applied to "cba" produces "xax"
 */
public void TestQuantifiedSegment() {
    // Quantifier inside the segment: $1 is the whole run
    expect("([abc]+) > x $1 x;", "cba", "xcbax");

    // Quantifier wrapped around the segment: $1 is one repetition only
    expect("([abc])+ > x $1 x;", "cba", "xax");

    // The same tricky case, matched in the reverse direction
    expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");

    // Quantified segment located in the post context
    expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");

    // toRules() round trip, first for a plain segment and then for a
    // quantified one.  The comparison is exact, so the spacing inside
    // these rule strings matters.
    String[] roundTripRules = {
        "([a-c]){q} > x $1 x;",
        "([a-c])+{q} > x $1 x;"
    };
    for (int k = 0; k < roundTripRules.length; ++k) {
        String source = roundTripRules[k];
        Transliterator t =
            Transliterator.createFromRules("ID", source, Transliterator.FORWARD);
        String regenerated = t.toRules(true);
        String detail = "\"" + source + "\" x toRules() => \"" + regenerated + "\"";
        if (source.equals(regenerated)) {
            logln("Ok: " + detail);
        } else {
            errln("FAIL: " + detail);
        }
    }
}
//====================================================================== //======================================================================
// icu4j ONLY // icu4j ONLY
// These tests are not mirrored (yet) in icu4c at // These tests are not mirrored (yet) in icu4c at

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/StringMatcher.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/StringMatcher.java,v $
* $Date: 2001/10/25 22:32:02 $ * $Date: 2001/10/30 18:04:08 $
* $Revision: 1.2 $ * $Revision: 1.3 $
* *
***************************************************************************************** *****************************************************************************************
*/ */
@ -18,16 +18,27 @@ class StringMatcher implements UnicodeMatcher {
private boolean isSegment; private boolean isSegment;
private int matchStart;
private int matchLimit;
private final RuleBasedTransliterator.Data data; private final RuleBasedTransliterator.Data data;
/**
 * Construct a matcher whose pattern is the whole of the given string.
 * The recorded match range starts out empty (both offsets are -1).
 * @param theString the pattern text, stored as-is
 * @param isSeg value stored in the isSegment flag
 * @param theData the rule data object kept by this matcher
 */
public StringMatcher(String theString,
                     boolean isSeg,
                     RuleBasedTransliterator.Data theData) {
    // -1 in both fields means "no match recorded yet"
    this.matchStart = -1;
    this.matchLimit = -1;
    this.pattern = theString;
    this.isSegment = isSeg;
    this.data = theData;
}
public StringMatcher(String theString, public StringMatcher(String theString,
int start, int start,
int limit, int limit,
boolean isSeg, boolean isSeg,
RuleBasedTransliterator.Data theData) { RuleBasedTransliterator.Data theData) {
data = theData; this(theString.substring(start, limit), isSeg, theData);
isSegment = isSeg;
pattern = theString.substring(start, limit);
} }
/** /**
@ -40,6 +51,7 @@ class StringMatcher implements UnicodeMatcher {
int i; int i;
int[] cursor = new int[] { offset[0] }; int[] cursor = new int[] { offset[0] };
if (limit < cursor[0]) { if (limit < cursor[0]) {
// Match in the reverse direction
for (i=pattern.length()-1; i>=0; --i) { for (i=pattern.length()-1; i>=0; --i) {
char keyChar = pattern.charAt(i); char keyChar = pattern.charAt(i);
UnicodeMatcher subm = data.lookup(keyChar); UnicodeMatcher subm = data.lookup(keyChar);
@ -58,6 +70,13 @@ class StringMatcher implements UnicodeMatcher {
} }
} }
} }
// Record the match position, but adjust for a normal
// forward start, limit, and only if a prior match does not
// exist -- we want the rightmost match.
if (matchStart < 0) {
matchStart = cursor[0]+1;
matchLimit = offset[0]+1;
}
} else { } else {
for (i=0; i<pattern.length(); ++i) { for (i=0; i<pattern.length(); ++i) {
if (incremental && cursor[0] == limit) { if (incremental && cursor[0] == limit) {
@ -85,6 +104,9 @@ class StringMatcher implements UnicodeMatcher {
} }
} }
} }
// Record the match position
matchStart = offset[0];
matchLimit = cursor[0];
} }
offset[0] = cursor[0]; offset[0] = cursor[0];
@ -114,7 +136,7 @@ class StringMatcher implements UnicodeMatcher {
result.append(')'); result.append(')');
} }
// Flush quoteBuf out to result // Flush quoteBuf out to result
TransliterationRule.appendToRule(result, (isSegment?')':-1), TransliterationRule.appendToRule(result, -1,
true, escapeUnprintable, quoteBuf); true, escapeUnprintable, quoteBuf);
return result.toString(); return result.toString();
} }
@ -130,6 +152,32 @@ class StringMatcher implements UnicodeMatcher {
UnicodeMatcher m = data.lookup(c); UnicodeMatcher m = data.lookup(c);
return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v); return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
} }
/**
 * Discard any recorded match range by setting both the start and the
 * limit back to -1.  Call this before starting a new set of matches
 * with this segment, so that offsets from an earlier attempt are not
 * mistaken for a current match.
 */
public void resetMatch() {
    this.matchStart = -1;
    this.matchLimit = -1;
}
/**
 * Start offset, in the text being matched, of the <em>rightmost</em>
 * match recorded by this matcher.  The value is -1 when no match has
 * been recorded since construction or the last resetMatch().  This
 * accessor may move up into UnicodeMatcher if it proves useful to
 * generalize it.
 * @return the recorded start offset, or -1 if there is none
 */
public int getMatchStart() {
    return this.matchStart;
}
/**
 * Limit offset, in the text being matched, of the <em>rightmost</em>
 * match recorded by this matcher.  The value is -1 when no match has
 * been recorded since construction or the last resetMatch().  This
 * accessor may move up into UnicodeMatcher if it proves useful to
 * generalize it.
 * @return the recorded limit offset, or -1 if there is none
 */
public int getMatchLimit() {
    return this.matchLimit;
}
} }
//eof //eof

View File

@ -5,8 +5,8 @@
******************************************************************************* *******************************************************************************
* *
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
* $Date: 2001/10/25 23:22:15 $ * $Date: 2001/10/30 18:04:08 $
* $Revision: 1.33 $ * $Revision: 1.34 $
* *
***************************************************************************************** *****************************************************************************************
*/ */
@ -30,13 +30,15 @@ import com.ibm.util.Utility;
* Variables are detected by looking up each character in a supplied * Variables are detected by looking up each character in a supplied
* variable list to see if it has been so defined. * variable list to see if it has been so defined.
* *
* <p>A rule may contain segments in its input string and segment references in * <p>A rule may contain segments in its input string and segment
* its output string. A segment is a substring of the input pattern, indicated * references in its output string. A segment is a substring of the
* by an offset and limit. The segment may span the preceding or following * input pattern, indicated by an offset and limit. The segment may
* context. A segment reference is a special character in the output string * be in the preceding or following context. It may not span a
* that causes a segment of the input string (not the input pattern) to be * context boundary. A segment reference is a special character in
* copied to the output string. The range of special characters that represent * the output string that causes a segment of the input string (not
* segment references is defined by RuleBasedTransliterator.Data. * the input pattern) to be copied to the output string. The range of
* special characters that represent segment references is defined by
* RuleBasedTransliterator.Data.
* *
* <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input * <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
* string "abc.123" to "ab1.c23". * string "abc.123" to "ab1.c23".
@ -44,7 +46,7 @@ import com.ibm.util.Utility;
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved. * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
* *
* @author Alan Liu * @author Alan Liu
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.33 $ $Date: 2001/10/25 23:22:15 $ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.34 $ $Date: 2001/10/30 18:04:08 $
*/ */
class TransliterationRule { class TransliterationRule {
@ -64,20 +66,13 @@ class TransliterationRule {
private String output; private String output;
/** /**
* An array of integers encoding the position of the segments. * An array of matcher objects corresponding to the input pattern
* See RuleBasedTransliterator.Segments for more details. * segments. If there are no segments this is null. N.B. This is
* a UnicodeMatcher for generality, but in practice it is always a
* StringMatcher. In the future we may generalize this, but for
* now we sometimes cast down to StringMatcher.
*/ */
int[] segments; UnicodeMatcher[] segments;
/**
* A value we compute from segments. The first index into segments[]
* that is >= anteContextLength. That is, the first one that is within
* the forward scanned part of the pattern -- the key or the postContext.
* If there are no segments, this has the value -1. This index is relative
* to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
* segments[FIRST_SEG_POS_INDEX + firstKeySeg].
*/
int firstKeySeg;
/** /**
* The length of the string that must match before the key. If * The length of the string that must match before the key. If
@ -127,20 +122,6 @@ class TransliterationRule {
private static final char APOSTROPHE = '\''; private static final char APOSTROPHE = '\'';
private static final char BACKSLASH = '\\'; private static final char BACKSLASH = '\\';
// Accessor "macros" for the int[] that encodes segment positions.  See
// RuleBasedTransliterator.Segments for the full description.  For an
// array x describing n segments (2n parentheses):
//   x[0]                          n
//   x[1]                          index in x where the second sub-array starts
//   x[FIRST_SEG_POS_INDEX + i]    position in 'pattern' of parenthesis i, i=0..2n-1
//   x[x[1] + i]                   index in x of the entry for $1.open, $1.close,
//                                 $2.open, $2.close, .., $n.open, $n.close

// Index in the array of the first parenthesis position.
static final int FIRST_SEG_POS_INDEX = 2;

// Number of segments, n (half the number of parentheses).
static final int SEGMENTS_COUNT(int[] x) {
    return x[0];
}

// Total number of elements in the segments array.
static final int SEGMENTS_LEN(int[] x) {
    return 4 * (SEGMENTS_COUNT(x) + 1);
}

// Position in 'pattern' of parenthesis i, where i=0..2n-1.
static final int SEGMENTS_POS(int[] x,int i) {
    int slot = FIRST_SEG_POS_INDEX + i;
    return x[slot];
}

// For i = 0..2n-1 (that is $1.open, $1.close, .., $n.close), the index of
// the matching position entry, expressed relative to FIRST_SEG_POS_INDEX,
// so the result also ranges over 0..2n-1.
static final int SEGMENTS_NUM(int[] x,int i) {
    int secondArrayStart = x[1];
    return x[secondArrayStart + i] - FIRST_SEG_POS_INDEX;
}
private static final String COPYRIGHT = private static final String COPYRIGHT =
"\u00A9 IBM Corporation 1999-2001. All rights reserved."; "\u00A9 IBM Corporation 1999-2001. All rights reserved.";
@ -165,12 +146,8 @@ class TransliterationRule {
* 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to * 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
* "xyz" and moves the cursor to before "a". It would have a cursorOffset * "xyz" and moves the cursor to before "a". It would have a cursorOffset
* of -3. * of -3.
* @param segs array of 2n integers. Each of n pairs consists of offset, * @param segs array of UnicodeMatcher corresponding to input pattern
* limit for a segment of the input string. Characters in the output string * segments, or null if there are none
* refer to these segments if they are in a special range determined by the
* associated RuleBasedTransliterator.Data object. May be null if there are
* no segments. The caller is responsible for validating that segments
* are well-formed.
* @param anchorStart true if the the rule is anchored on the left to * @param anchorStart true if the the rule is anchored on the left to
* the context start * the context start
* @param anchorEnd true if the rule is anchored on the right to the * @param anchorEnd true if the rule is anchored on the right to the
@ -180,7 +157,7 @@ class TransliterationRule {
int anteContextPos, int postContextPos, int anteContextPos, int postContextPos,
String output, String output,
int cursorPos, int cursorOffset, int cursorPos, int cursorOffset,
int[] segs, UnicodeMatcher[] segs,
boolean anchorStart, boolean anchorEnd, boolean anchorStart, boolean anchorEnd,
RuleBasedTransliterator.Data theData) { RuleBasedTransliterator.Data theData) {
data = theData; data = theData;
@ -212,25 +189,11 @@ class TransliterationRule {
this.cursorPos = cursorPos + cursorOffset; this.cursorPos = cursorPos + cursorOffset;
this.output = output; this.output = output;
// We don't validate the segments array. The caller must // We don't validate the segments array. The caller must
// guarantee that the segments are well-formed. // guarantee that the segments are well-formed (that is, that
// all $n references in the output refer to indices of this
// array, and that no array elements are null).
this.segments = segs; this.segments = segs;
// Find the position of the first segment index that is after the
// anteContext (in the key). Note that this may be a start or a
// limit index. If all segments are in the ante context,
// firstKeySeg should point past the last segment -- that is, it
// should point at the end marker, which is -1. This allows the
// code to back up by one to obtain the last ante context segment.
firstKeySeg = -1;
if (segments != null) {
firstKeySeg = FIRST_SEG_POS_INDEX;
while (segments[firstKeySeg] >= 0 &&
segments[firstKeySeg] < anteContextLength) {
++firstKeySeg;
}
firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
}
pattern = input; pattern = input;
flags = 0; flags = 0;
if (anchorStart) { if (anchorStart) {
@ -410,25 +373,12 @@ class TransliterationRule {
// ============================ MATCH =========================== // ============================ MATCH ===========================
// Record the actual positions, in the text, of the segments. // Reset segment match data
// These are recorded in the order that they occur in the pattern.
// segPos[] is an array of 2*SEGMENTS_COUNT elements. It
// records the position in 'text' of each segment boundary, in
// the order that they occur in 'pattern'.
int[] segPos = null;
if (segments != null) { if (segments != null) {
segPos = new int[2*SEGMENTS_COUNT(segments)]; for (int i=0; i<segments.length; ++i) {
((StringMatcher) segments[i]).resetMatch();
}
} }
// iSeg is an index into segments[] that accesses the first
// array. As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
// When indexing into segments[] FIRST_SEG_POS_INDEX must be
// added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
int iSeg = firstKeySeg - 1;
// nextSegPos is an offset in 'pattern'. When the cursor is
// equal to nextSegPos, we are at a segment boundary, and we
// record the position in the real text in segPos[].
int nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
int lenDelta, keyLimit; int lenDelta, keyLimit;
int[] intRef = new int[1]; int[] intRef = new int[1];
@ -465,15 +415,6 @@ class TransliterationRule {
} }
oText = intRef[0]; oText = intRef[0];
} }
while (nextSegPos == oPattern) {
segPos[iSeg] = oText;
if (oText >= 0) {
segPos[iSeg] += UTF16.getCharCount(UTF16.charAt(text, oText));
} else {
++segPos[iSeg];
}
nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
}
} }
minOText = posAfter(text, oText); minOText = posAfter(text, oText);
@ -486,9 +427,6 @@ class TransliterationRule {
// -------------------- Key and Post Context -------------------- // -------------------- Key and Post Context --------------------
iSeg = firstKeySeg;
nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
oPattern = 0; oPattern = 0;
oText = pos.start; oText = pos.start;
keyLimit = 0; keyLimit = 0;
@ -511,10 +449,6 @@ class TransliterationRule {
// depending on whether we're in the key or in the post // depending on whether we're in the key or in the post
// context. // context.
while (oPattern == nextSegPos) {
segPos[iSeg] = oText;
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
}
if (oPattern == keyLength) { if (oPattern == keyLength) {
keyLimit = oText; keyLimit = oText;
} }
@ -554,10 +488,6 @@ class TransliterationRule {
//! return UnicodeMatcher.U_MISMATCH; //! return UnicodeMatcher.U_MISMATCH;
//!} //!}
} }
while (oPattern == nextSegPos) {
segPos[iSeg] = oText;
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
}
if (oPattern == keyLength) { if (oPattern == keyLength) {
keyLimit = oText; keyLimit = oText;
} }
@ -576,8 +506,7 @@ class TransliterationRule {
// =========================== REPLACE ========================== // =========================== REPLACE ==========================
// We have a full match. The key is between pos.start and // We have a full match. The key is between pos.start and
// keyLimit. Segment indices have been recorded in segPos[]. // keyLimit.
// Perform a replacement.
if (segments == null) { if (segments == null) {
text.replace(pos.start, keyLimit, output); text.replace(pos.start, keyLimit, output);
@ -629,12 +558,23 @@ class TransliterationRule {
buf.setLength(0); buf.setLength(0);
} }
// Copy segment with out-of-band data // Copy segment with out-of-band data
b *= 2; StringMatcher m = (StringMatcher) segments[b];
int start = segPos[SEGMENTS_NUM(segments,b)]; int start = m.getMatchStart();
int limit = segPos[SEGMENTS_NUM(segments,b+1)]; int limit = m.getMatchLimit();
// If there was no match, that means that a quantifier
// matched zero-length. E.g., x (a)* y matched "xy".
if (start >= 0) {
// Adjust indices for segments in post context
// for any inserted text between the key and
// the post context.
if (start >= keyLimit) {
start += dest - keyLimit;
limit += dest - keyLimit;
}
text.copy(start, limit, dest); text.copy(start, limit, dest);
dest += limit - start; dest += limit - start;
} }
}
oOutput += UTF16.getCharCount(c); oOutput += UTF16.getCharCount(c);
} }
// Insert any accumulated straight text. // Insert any accumulated straight text.
@ -790,20 +730,6 @@ class TransliterationRule {
StringBuffer rule = new StringBuffer(); StringBuffer rule = new StringBuffer();
// iseg indexes into segments[] directly (not offset from FSPI)
int iseg = FIRST_SEG_POS_INDEX-1;
int nextSeg = -1;
// Build an array of booleans specifying open vs. close paren
boolean[] isOpen = null;
if (segments != null) {
isOpen = new boolean[2*SEGMENTS_COUNT(segments)];
for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
isOpen[SEGMENTS_NUM(segments,i) ] = true;
isOpen[SEGMENTS_NUM(segments,i+1)] = false;
}
nextSeg = segments[++iseg];
}
// Accumulate special characters (and non-specials following them) // Accumulate special characters (and non-specials following them)
// into quoteBuf. Append quoteBuf, within single quotes, when // into quoteBuf. Append quoteBuf, within single quotes, when
// a non-quoted element must be inserted. // a non-quoted element must be inserted.
@ -825,14 +751,6 @@ class TransliterationRule {
appendToRule(rule, '{', true, escapeUnprintable, quoteBuf); appendToRule(rule, '{', true, escapeUnprintable, quoteBuf);
} }
// Append either '(' or ')' if we are at a segment index
if (i == nextSeg) {
appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
'(' : ')',
true, escapeUnprintable, quoteBuf);
nextSeg = segments[++iseg];
}
if (emitBraces && i == (anteContextLength + keyLength)) { if (emitBraces && i == (anteContextLength + keyLength)) {
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf); appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
} }
@ -847,11 +765,6 @@ class TransliterationRule {
} }
} }
if (i == nextSeg) {
// assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
appendToRule(rule, ')', true, escapeUnprintable, quoteBuf);
}
if (emitBraces && i == (anteContextLength + keyLength)) { if (emitBraces && i == (anteContextLength + keyLength)) {
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf); appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
} }
@ -885,7 +798,7 @@ class TransliterationRule {
} else { } else {
++seg; // make 1-based ++seg; // make 1-based
appendToRule(rule, 0x20, true, escapeUnprintable, quoteBuf); appendToRule(rule, 0x20, true, escapeUnprintable, quoteBuf);
rule.append(0x24 /*$*/); rule.append('$');
boolean show = false; // true if we should display digits boolean show = false; // true if we should display digits
for (int p=9; p>=0; --p) { for (int p=9; p>=0; --p) {
int d = seg / POW10[p]; int d = seg / POW10[p];
@ -938,6 +851,9 @@ class TransliterationRule {
/** /**
* $Log: TransliterationRule.java,v $ * $Log: TransliterationRule.java,v $
* Revision 1.34 2001/10/30 18:04:08 alan
* jitterbug 1406: make quantified segments behave like perl counterparts
*
* Revision 1.33 2001/10/25 23:22:15 alan * Revision 1.33 2001/10/25 23:22:15 alan
* jitterbug 73: changes to support zero-length matchers at end of key * jitterbug 73: changes to support zero-length matchers at end of key
* *

View File

@ -4,8 +4,8 @@
* Corporation and others. All Rights Reserved. * Corporation and others. All Rights Reserved.
********************************************************************** **********************************************************************
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliteratorParser.java,v $ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliteratorParser.java,v $
* $Date: 2001/10/24 00:03:38 $ * $Date: 2001/10/30 18:04:09 $
* $Revision: 1.7 $ * $Revision: 1.8 $
********************************************************************** **********************************************************************
*/ */
package com.ibm.text; package com.ibm.text;
@ -117,6 +117,7 @@ class TransliteratorParser {
private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
private static final String OPERATORS = "=><"; private static final String OPERATORS = "=><";
private static final String HALF_ENDERS = "=><;";
// Other special characters // Other special characters
private static final char QUOTE = '\''; private static final char QUOTE = '\'';
@ -142,7 +143,7 @@ class TransliteratorParser {
// private static final char ANCHOR_END = '$'; // private static final char ANCHOR_END = '$';
// Segments of the input string are delimited by "(" and ")". In the // Segments of the input string are delimited by "(" and ")". In the
// output string these segments are referenced as "$1" through "$9". // output string these segments are referenced as "$1", "$2", etc.
private static final char SEGMENT_OPEN = '('; private static final char SEGMENT_OPEN = '(';
private static final char SEGMENT_CLOSE = ')'; private static final char SEGMENT_CLOSE = ')';
@ -285,209 +286,6 @@ class TransliteratorParser {
} }
}; };
//----------------------------------------------------------------------
// class Segments
//----------------------------------------------------------------------
/**
* Segments are parentheses-enclosed regions of the input string.
* These are referenced in the output string using the notation $1,
* $2, etc. Numbering is in order of appearance of the left
* parenthesis. Number is one-based. Segments are defined as start,
* limit pairs. Segments may nest.
*
* During parsing, segment data is encoded in an object of class
* Segments. At runtime, the same data is encoded in compact form as
* an array of integers in a TransliterationRule. The runtime encoding
* must satisfy three goals:
*
* 1. Iterate over the offsets in a pattern, from left to right,
* and indicate all segment boundaries, in order. This is done
* during matching.
*
* 2. Given a reference $n, produce the start and limit offsets
* for that segment. This is done during replacement.
*
* 3. Similar to goal 1, but in addition, indicate whether each
* segment boundary is a start or a limit, in other words, whether
* each is an open paren or a close paren. This is required by
* the toRule() method.
*
* Goal 1 must be satisfied at high speed since this is done during
* matching. Goal 2 is next most important. Goal 3 is not performance
* critical since it is only needed by toRule().
*
* The array of integers is actually two arrays concatenated. The
* first gives the index values of the open and close parentheses in
* the order they appear. The second maps segment numbers to the
* indices of the first array. The two arrays have the same length.
* Iterating over the first array satisfies goal 1. Indexing into the
* second array satisfies goal 2. Goal 3 is satisfied by iterating
* over the second array and constructing the required data when
* needed. This is what toRule() does.
*
* Example: (a b(c d)e f)
* 0 1 2 3 4 5 6
*
* First array: Indices are 0, 2, 4, and 6.
* Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
* second array is 0, 3, 1, 2 -- these give the indices in the
* first array at which $1:open, $1:close, $2:open, and $2:close
* occur.
*
* The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
*
* Each subarray is terminated with a -1, and two leading entries
* give the number of segments and the offset to the first entry
* of the second array. In addition, the second array values are
* all offset by 2 so they index directly into the final array.
* The total array size is 4*segments[0] + 4. The second index is
* 2*segments[0] + 3.
*
* In the output string, a segment reference is indicated by a
* character in a special range, as defined by
* RuleBasedTransliterator.Data.
*
* Most rules have no segments, in which case segments is null, and the
* output string need not be checked for segment reference characters.
*
* See also rbt_rule.h/cpp.
*/
private static class Segments {
private Vector offsets; // holds Integer objects
private Vector isOpenParen; // holds Boolean objects
// Unbox and return the pattern offset stored for the i-th parenthesis.
private int offset(int i) {
    Integer boxed = (Integer) offsets.elementAt(i);
    return boxed.intValue();
}
// Unbox and return the open/close flag stored for the i-th parenthesis:
// true for an open paren, false for a close paren.
private boolean isOpen(int i) {
    Boolean boxed = (Boolean) isOpenParen.elementAt(i);
    return boxed.booleanValue();
}
// Number of parentheses recorded so far.  The two Vectors are filled in
// lock step by addParenthesisAt(), so the length of 'offsets' is reported
// for both.
private int size() {
    // assert(offsets.size() == isOpenParen.size());
    int recorded = offsets.size();
    return recorded;
}
// Create an empty record: no parentheses have been seen yet.
public Segments() {
    this.isOpenParen = new Vector();
    this.offsets = new Vector();
}
// Record one parenthesis: its offset and whether it is an open paren
// (true) or a close paren (false).  One entry is appended to each
// Vector, so the two stay the same length.
public void addParenthesisAt(int offset, boolean isOpen) {
    Integer boxedOffset = new Integer(offset);
    offsets.addElement(boxedOffset);
    Boolean boxedKind = new Boolean(isOpen);
    isOpenParen.addElement(boxedKind);
}
// Report the most recently recorded parenthesis.  Returns its offset and
// stores in isOpenParen[0] whether it is an open paren.  If nothing has
// been recorded, returns -1 and leaves isOpenParen[0] untouched.
public int getLastParenOffset(boolean[] isOpenParen) {
    int last = size() - 1;
    if (last < 0) {
        return -1;
    }
    isOpenParen[0] = isOpen(last);
    return offset(last);
}
// Collapse the last (rightmost) segment.  Its offsets are stored in
// start[0] and limit[0], and then every parenthesis from the segment's
// open paren through the end of the list is rewritten so that open
// parens sit at start and close parens at start+1.
//
// Returns false if the final close paren has no matching open paren, or
// if no parentheses have been recorded at all.  Note that limit[0] is
// written as soon as the final paren is read, even when the method then
// fails.
//
// The caller is expected to have called getLastParenOffset() already and
// checked that there is at least one parenthesis and that the last one
// is a close paren.
public boolean extractLastParenSubstring(int[] start, int[] limit) {
    int i = size() - 1;
    if (i < 0) {
        // Nothing recorded; there is no segment to extract.
        return false;
    }
    // assert(!isOpen(i));
    int n = 1; // count of close parens we still need to match
    // Record position of the last close paren
    limit[0] = offset(i);
    // Walk left until the paren that balances the final close paren is
    // found.  The index must move on every step: re-reading the same
    // entry never terminates when that entry is a close paren (nested
    // segments).  On success, i is left pointing at the matching open
    // paren.
    while (n != 0 && --i >= 0) {
        n += isOpen(i) ? -1 : 1;
    }
    if (n != 0) {
        return false;
    }
    // assert(i>=0);
    start[0] = offset(i);
    // Reset all segment pairs from i to size() - 1 to [start, start+1).
    while (i<size()) {
        int o = isOpen(i) ? start[0] : (start[0]+1);
        offsets.setElementAt(new Integer(o), i);
        ++i;
    }
    return true;
}
// Assume caller has already gotten a TRUE validate().
public int[] createArray() {
int c = count(); // number of segments
int arrayLen = 4*c + 4;
int[] array = new int[arrayLen];
int a2offset = 2*c + 3; // offset to array 2
array[0] = c;
array[1] = a2offset;
int i;
for (i=0; i<2*c; ++i) {
array[2+i] = offset(i);
}
array[a2offset-1] = -1;
array[arrayLen-1] = -1;
// Now walk through and match up segment numbers with parentheses.
// Number segments from 0. We're going to offset all entries by 2
// to skip the first two elements, array[0] and array[1].
Stack stack = new Stack();
int nextOpen = 0; // seg # of next open, 0-based
for (i=0; i<2*c; ++i) {
boolean open = isOpen(i);
// Let seg be the zero-based segment number.
// Open parens are at 2*seg in array 2.
// Close parens are at 2*seg+1 in array 2.
if (open) {
array[a2offset + 2*nextOpen] = 2+i;
stack.push(new Integer(nextOpen));
++nextOpen;
} else {
int nextClose = ((Integer) stack.pop()).intValue();
array[a2offset + 2*nextClose+1] = 2+i;
}
}
// assert(stack.empty());
return array;
}
public boolean validate() {
// want number of parens >= 2
// want number of parens to be even
// want first paren '('
// want parens to match up in the end
if ((size() < 2) || (size() % 2 != 0) || !isOpen(0)) {
return false;
}
int n = 0;
for (int i=0; i<size(); ++i) {
n += isOpen(i) ? 1 : -1;
if (n < 0) {
return false;
}
}
return n == 0;
}
// Number of segments
// Assume caller has already gotten a TRUE validate().
public int count() {
// assert(validate());
return size() / 2;
}
}
//---------------------------------------------------------------------- //----------------------------------------------------------------------
// class RuleHalf // class RuleHalf
//---------------------------------------------------------------------- //----------------------------------------------------------------------
@ -505,11 +303,7 @@ class TransliteratorParser {
public int ante = -1; // position of ante context marker '{' in text public int ante = -1; // position of ante context marker '{' in text
public int post = -1; // position of post context marker '}' in text public int post = -1; // position of post context marker '}' in text
// Record the position of the segment substrings and references. A public int maxRef = -1; // n where maximum segment ref is $n; 1-based
// given side should have segments or segment references, but not
// both.
public Segments segments = null;
public int maxRef = -1; // index of largest ref (1..9)
// Record the offset to the cursor either to the left or to the // Record the offset to the cursor either to the left or to the
// right of the key. This is indicated by characters on the output // right of the key. This is indicated by characters on the output
@ -521,29 +315,88 @@ class TransliteratorParser {
// output text. // output text.
public int cursorOffset = 0; // only nonzero on output side public int cursorOffset = 0; // only nonzero on output side
// Position of first CURSOR_OFFSET on _right_. This will be -1
// for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
private int cursorOffsetPos = 0;
public boolean anchorStart = false; public boolean anchorStart = false;
public boolean anchorEnd = false; public boolean anchorEnd = false;
/**
* UnicodeMatcher objects corresponding to each segment.
*/
public Vector segments = new Vector();
/**
* The segment number from 0..n-1 of the next '(' we see
* during parsing; 0-based.
*/
private int nextSegmentNumber = 0;
/** /**
* Parse one side of a rule, stopping at either the limit, * Parse one side of a rule, stopping at either the limit,
* the END_OF_RULE character, or an operator. Return * the END_OF_RULE character, or an operator.
* the pos of the terminating character (or limit). * @return the index after the terminating character, or
* if limit was reached, limit
*/ */
public int parse(String rule, int pos, int limit, public int parse(String rule, int pos, int limit,
TransliteratorParser parser) { TransliteratorParser parser) {
int start = pos; int start = pos;
StringBuffer buf = new StringBuffer(); StringBuffer buf = new StringBuffer();
pos = parseSection(rule, pos, limit, parser, buf, false);
text = buf.toString();
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
syntaxError("Misplaced " + CURSOR_POS, rule, start);
}
return pos;
}
/**
* Parse a section of one side of a rule, stopping at either
* the limit, the END_OF_RULE character, an operator, or a
* segment close character. This method parses both a
* top-level rule half and a segment within such a rule half.
* It calls itself recursively to parse segments and nested
* segments.
* @param buf buffer into which to accumulate the rule pattern
* characters, either literal characters from the rule or
* standins for UnicodeMatcher objects including segments.
* @param isSegment if true, then we've already seen a '(' and
* pos on entry points right after it. Accumulate everything
* up to the closing ')', put it in a segment matcher object,
* generate a standin for it, and add the standin to buf. As
* a side effect, update the segments vector with a reference
* to the segment matcher. This works recursively for nested
* segments. If isSegment is false, just accumulate
* characters into buf.
* @return the index after the terminating character, or
* if limit was reached, limit
*/
private int parseSection(String rule, int pos, int limit,
TransliteratorParser parser,
StringBuffer buf,
boolean isSegment) {
int start = pos;
ParsePosition pp = null; ParsePosition pp = null;
int cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
boolean done = false;
int quoteStart = -1; // Most recent 'single quoted string' int quoteStart = -1; // Most recent 'single quoted string'
int quoteLimit = -1; int quoteLimit = -1;
int varStart = -1; // Most recent $variableReference int varStart = -1; // Most recent $variableReference
int varLimit = -1; int varLimit = -1;
int[] iref = new int[1]; int[] iref = new int[1];
// If isSegment, then bufSegStart is the offset in buf to
// the first character of the segment we are parsing.
int bufSegStart = 0;
int segmentNumber = 0;
if (isSegment) {
bufSegStart = buf.length();
segmentNumber = nextSegmentNumber++;
}
main: main:
while (pos < limit && !done) { while (pos < limit) {
char c = rule.charAt(pos++); char c = rule.charAt(pos++);
if (Character.isWhitespace(c)) { if (Character.isWhitespace(c)) {
// Ignore whitespace. Note that this is not Unicode // Ignore whitespace. Note that this is not Unicode
@ -551,8 +404,11 @@ class TransliteratorParser {
// whitespace likely to be seen in code. // whitespace likely to be seen in code.
continue; continue;
} }
if (OPERATORS.indexOf(c) >= 0) { // HALF_ENDERS is all chars that end a rule half: "<>=;"
--pos; // Backup to point to operator if (HALF_ENDERS.indexOf(c) >= 0) {
if (isSegment) {
syntaxError("Unclosed segment", rule, start);
}
break main; break main;
} }
if (anchorEnd) { if (anchorEnd) {
@ -614,7 +470,12 @@ class TransliteratorParser {
} }
continue; continue;
} }
switch (c) { switch (c) {
//------------------------------------------------------
// Elements allowed within and out of segments
//------------------------------------------------------
case ANCHOR_START: case ANCHOR_START:
if (buf.length() == 0 && !anchorStart) { if (buf.length() == 0 && !anchorStart) {
anchorStart = true; anchorStart = true;
@ -624,17 +485,8 @@ class TransliteratorParser {
} }
break; break;
case SEGMENT_OPEN: case SEGMENT_OPEN:
case SEGMENT_CLOSE: pos = parseSection(rule, pos, limit, parser, buf, true);
// Handle segment definitions "(" and ")"
// Parse "(", ")"
if (segments == null) {
segments = new Segments();
}
segments.addParenthesisAt(buf.length(), c == SEGMENT_OPEN);
break; break;
case END_OF_RULE:
--pos; // Backup to point to END_OF_RULE
break main;
case SymbolTable.SYMBOL_REF: case SymbolTable.SYMBOL_REF:
// Handle variable references and segment references "$1" .. "$9" // Handle variable references and segment references "$1" .. "$9"
{ {
@ -697,25 +549,129 @@ class TransliteratorParser {
} }
} }
break; break;
case DOT:
buf.append(parser.getDotStandIn());
break;
case KLEENE_STAR:
case ONE_OR_MORE:
case ZERO_OR_ONE:
// Quantifiers. We handle single characters, quoted strings,
// variable references, and segments.
// a+ matches aaa
// 'foo'+ matches foofoofoo
// $v+ matches xyxyxy if $v == xy
// (seg)+ matches segsegseg
{
if (isSegment && buf.length() == bufSegStart) {
// The */+ immediately follows '('
syntaxError("Misplaced quantifier", rule, start);
break;
}
int qstart, qlimit;
// The */+ follows an isolated character or quote
// or variable reference
if (buf.length() == quoteLimit) {
// The */+ follows a 'quoted string'
qstart = quoteStart;
qlimit = quoteLimit;
} else if (buf.length() == varLimit) {
// The */+ follows a $variableReference
qstart = varStart;
qlimit = varLimit;
} else {
// The */+ follows a single character, possibly
// a segment standin
qstart = buf.length() - 1;
qlimit = qstart + 1;
}
UnicodeMatcher m =
new StringMatcher(buf.toString(), qstart, qlimit,
false, parser.data);
int min = 0;
int max = Quantifier.MAX;
switch (c) {
case ONE_OR_MORE:
min = 1;
break;
case ZERO_OR_ONE:
min = 0;
max = 1;
break;
// case KLEENE_STAR:
// do nothing -- min, max already set
}
m = new Quantifier(m, min, max);
buf.setLength(qstart);
buf.append(parser.generateStandInFor(m));
}
break;
//------------------------------------------------------
// Elements allowed ONLY WITHIN segments
//------------------------------------------------------
case SEGMENT_CLOSE:
if (isSegment) {
// We're done parsing a segment. The relevant
// characters are in buf, starting at offset
// bufSegStart. Extract them into a string
// matcher, and replace them with a standin
// for that matcher.
StringMatcher m =
new StringMatcher(buf.substring(bufSegStart),
true, parser.data);
// Since we call parseSection() recursively,
// nested segments will result in segment i+1
// getting parsed and stored before segment i;
// be careful with the vector handling here.
if ((segmentNumber+1) > segments.size()) {
segments.setSize(segmentNumber+1);
}
segments.setElementAt(m, segmentNumber);
buf.setLength(bufSegStart);
buf.append(parser.generateStandInFor(m));
break main;
}
// If we aren't in a segment, then a segment close
// character is a syntax error.
syntaxError("Unquoted special", rule, start);
break;
//------------------------------------------------------
// Elements allowed ONLY OUTSIDE segments
//------------------------------------------------------
case CONTEXT_ANTE: case CONTEXT_ANTE:
if (isSegment) {
syntaxError("Illegal character '" + c + "' in segment", rule, start);
}
if (ante >= 0) { if (ante >= 0) {
syntaxError("Multiple ante contexts", rule, start); syntaxError("Multiple ante contexts", rule, start);
} }
ante = buf.length(); ante = buf.length();
break; break;
case CONTEXT_POST: case CONTEXT_POST:
if (isSegment) {
syntaxError("Illegal character '" + c + "' in segment", rule, start);
}
if (post >= 0) { if (post >= 0) {
syntaxError("Multiple post contexts", rule, start); syntaxError("Multiple post contexts", rule, start);
} }
post = buf.length(); post = buf.length();
break; break;
case CURSOR_POS: case CURSOR_POS:
if (isSegment) {
syntaxError("Illegal character '" + c + "' in segment", rule, start);
}
if (cursor >= 0) { if (cursor >= 0) {
syntaxError("Multiple cursors", rule, start); syntaxError("Multiple cursors", rule, start);
} }
cursor = buf.length(); cursor = buf.length();
break; break;
case CURSOR_OFFSET: case CURSOR_OFFSET:
if (isSegment) {
syntaxError("Illegal character '" + c + "' in segment", rule, start);
}
if (cursorOffset < 0) { if (cursorOffset < 0) {
if (buf.length() > 0) { if (buf.length() > 0) {
syntaxError("Misplaced " + c, rule, start); syntaxError("Misplaced " + c, rule, start);
@ -737,74 +693,10 @@ class TransliteratorParser {
} }
} }
break; break;
case DOT:
buf.append(parser.getDotStandIn()); //------------------------------------------------------
break; // Non-special characters
case KLEENE_STAR: //------------------------------------------------------
case ONE_OR_MORE:
case ZERO_OR_ONE:
// Quantifiers. We handle single characters, quoted strings,
// variable references, and segments.
// a+ matches aaa
// 'foo'+ matches foofoofoo
// $v+ matches xyxyxy if $v == xy
// (seg)+ matches segsegseg
{
int qstart, qlimit;
boolean[] isOpenParen = new boolean[1];
boolean isSegment = false;
if (segments != null &&
segments.getLastParenOffset(isOpenParen) == buf.length()) {
// The */+ immediately follows a segment
if (isOpenParen[0]) {
syntaxError("Misplaced quantifier", rule, start);
}
int[] startparam = new int[1];
int[] limitparam = new int[1];
if (!segments.extractLastParenSubstring(startparam, limitparam)) {
syntaxError("Mismatched segment delimiters", rule, start);
}
qstart = startparam[0];
qlimit = limitparam[0];
isSegment = true;
} else {
// The */+ follows an isolated character or quote
// or variable reference
if (buf.length() == quoteLimit) {
// The */+ follows a 'quoted string'
qstart = quoteStart;
qlimit = quoteLimit;
} else if (buf.length() == varLimit) {
// The */+ follows a $variableReference
qstart = varStart;
qlimit = varLimit;
} else {
// The */+ follows a single character
qstart = buf.length() - 1;
qlimit = qstart + 1;
}
}
UnicodeMatcher m =
new StringMatcher(buf.toString(), qstart, qlimit,
isSegment, parser.data);
int min = 0;
int max = Quantifier.MAX;
switch (c) {
case ONE_OR_MORE:
min = 1;
break;
case ZERO_OR_ONE:
min = 0;
max = 1;
break;
// case KLEENE_STAR:
// do nothing -- min, max already set
}
m = new Quantifier(m, min, max);
buf.setLength(qstart);
buf.append(parser.generateStandInFor(m));
}
break;
default: default:
// Disallow unquoted characters other than [0-9A-Za-z] // Disallow unquoted characters other than [0-9A-Za-z]
// in the printable ASCII range. These characters are // in the printable ASCII range. These characters are
@ -819,11 +711,6 @@ class TransliteratorParser {
break; break;
} }
} }
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
syntaxError("Misplaced " + CURSOR_POS, rule, start);
}
text = buf.toString();
return pos; return pos;
} }
@ -838,10 +725,12 @@ class TransliteratorParser {
} }
/** /**
* Create and return an int[] array of segments. * Create and return a UnicodeMatcher[] array of segments,
* or null if there are no segments.
*/ */
int[] createSegments() { UnicodeMatcher[] createSegments() {
return (segments == null) ? null : segments.createArray(); return (segments.size() == 0) ? null :
(UnicodeMatcher[]) segments.toArray(new UnicodeMatcher[segments.size()]);
} }
} }
@ -1096,9 +985,10 @@ class TransliteratorParser {
pos = left.parse(rule, pos, limit, this); pos = left.parse(rule, pos, limit, this);
if (pos == limit || if (pos == limit ||
OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) { OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
syntaxError("No operator", rule, start); syntaxError("No operator pos=" + pos, rule, start);
} }
++pos;
// Found an operator char. Check for forward-reverse operator. // Found an operator char. Check for forward-reverse operator.
if (operator == REVERSE_RULE_OP && if (operator == REVERSE_RULE_OP &&
@ -1110,7 +1000,7 @@ class TransliteratorParser {
pos = right.parse(rule, pos, limit, this); pos = right.parse(rule, pos, limit, this);
if (pos < limit) { if (pos < limit) {
if (rule.charAt(pos) == END_OF_RULE) { if (rule.charAt(--pos) == END_OF_RULE) {
++pos; ++pos;
} else { } else {
// RuleHalf parser must have terminated at an operator // RuleHalf parser must have terminated at an operator
@ -1173,7 +1063,7 @@ class TransliteratorParser {
// apply. // apply.
if (operator == FWDREV_RULE_OP) { if (operator == FWDREV_RULE_OP) {
right.removeContext(); right.removeContext();
right.segments = null; right.segments.removeAllElements();
left.cursor = left.maxRef = -1; left.cursor = left.maxRef = -1;
left.cursorOffset = 0; left.cursorOffset = 0;
} }
@ -1193,7 +1083,7 @@ class TransliteratorParser {
// cannot place the cursor outside the limits of the context. // cannot place the cursor outside the limits of the context.
// Anchors are only allowed on the input side. // Anchors are only allowed on the input side.
if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 || if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
right.segments != null || left.maxRef >= 0 || right.segments.size() > 0 || left.maxRef >= 0 ||
(right.cursorOffset != 0 && right.cursor < 0) || (right.cursorOffset != 0 && right.cursor < 0) ||
// - The following two checks were used to ensure that the // - The following two checks were used to ensure that the
// - the cursor offset stayed within the ante- or postcontext. // - the cursor offset stayed within the ante- or postcontext.
@ -1208,14 +1098,8 @@ class TransliteratorParser {
// Check integrity of segments and segment references. Each // Check integrity of segments and segment references. Each
// segment's start must have a corresponding limit, and the // segment's start must have a corresponding limit, and the
// references must not refer to segments that do not exist. // references must not refer to segments that do not exist.
if (left.segments != null) { if (right.maxRef > left.segments.size()) {
if (!left.segments.validate()) { syntaxError("Undefined segment reference $" + right.maxRef, rule, start);
syntaxError("Missing segment close", rule, start);
}
int n = left.segments.count();
if (right.maxRef > n) {
syntaxError("Undefined segment reference", rule, start);
}
} }
data.ruleSet.addRule(new TransliterationRule( data.ruleSet.addRule(new TransliterationRule(
@ -1363,7 +1247,7 @@ class TransliteratorParser {
char generateStandInFor(UnicodeMatcher matcher) { char generateStandInFor(UnicodeMatcher matcher) {
// assert(matcher != null); // assert(matcher != null);
if (variableNext >= variableLimit) { if (variableNext >= variableLimit) {
throw new RuntimeException("Private use variables exhausted"); throw new RuntimeException("Variable range exhausted");
} }
variablesVector.addElement(matcher); variablesVector.addElement(matcher);
return variableNext++; return variableNext++;