ICU-1406 make quantified segments behave like perl counterparts
X-SVN-Rev: 6493
This commit is contained in:
parent
0d08aaadcc
commit
2c2b11dfe8
@ -63,6 +63,10 @@ static const UChar gOPERATORS[] = {
|
||||
0x3D, 0x3E, 0x3C, 0 // "=><"
|
||||
};
|
||||
|
||||
static const UChar HALF_ENDERS[] = {
|
||||
0x3D, 0x3E, 0x3C, 59, 0 // "=><;"
|
||||
};
|
||||
|
||||
// These are also used in Transliterator::toRules()
|
||||
static const int32_t ID_TOKEN_LEN = 2;
|
||||
static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':'
|
||||
@ -147,256 +151,6 @@ UnicodeString ParseData::parseReference(const UnicodeString& text,
|
||||
return result;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Segments
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Segments are parentheses-enclosed regions of the input string.
|
||||
* These are referenced in the output string using the notation $1,
|
||||
* $2, etc. Numbering is in order of appearance of the left
|
||||
* parenthesis. Number is one-based. Segments are defined as start,
|
||||
* limit pairs. Segments may nest.
|
||||
*
|
||||
* During parsing, segment data is encoded in an object of class
|
||||
* Segments. At runtime, the same data is encoded in compact form as
|
||||
* an array of integers in a TransliterationRule. The runtime encoding
|
||||
* must satisfy three goals:
|
||||
*
|
||||
* 1. Iterate over the offsets in a pattern, from left to right,
|
||||
* and indicate all segment boundaries, in order. This is done
|
||||
* during matching.
|
||||
*
|
||||
* 2. Given a reference $n, produce the start and limit offsets
|
||||
* for that segment. This is done during replacement.
|
||||
*
|
||||
* 3. Similar to goal 1, but in addition, indicate whether each
|
||||
* segment boundary is a start or a limit, in other words, whether
|
||||
* each is an open paren or a close paren. This is required by
|
||||
* the toRule() method.
|
||||
*
|
||||
* Goal 1 must be satisfied at high speed since this is done during
|
||||
* matching. Goal 2 is next most important. Goal 3 is not performance
|
||||
* critical since it is only needed by toRule().
|
||||
*
|
||||
* The array of integers is actually two arrays concatenated. The
|
||||
* first gives the index values of the open and close parentheses in
|
||||
* the order they appear. The second maps segment numbers to the
|
||||
* indices of the first array. The two arrays have the same length.
|
||||
* Iterating over the first array satisfies goal 1. Indexing into the
|
||||
* second array satisfies goal 2. Goal 3 is satisfied by iterating
|
||||
* over the second array and constructing the required data when
|
||||
* needed. This is what toRule() does.
|
||||
*
|
||||
* Example: (a b(c d)e f)
|
||||
* 0 1 2 3 4 5 6
|
||||
*
|
||||
* First array: Indices are 0, 2, 4, and 6.
|
||||
|
||||
* Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
|
||||
* second array is 0, 3, 1, 2 -- these give the indices in the
|
||||
* first array at which $1:open, $1:close, $2:open, and $2:close
|
||||
* occur.
|
||||
*
|
||||
* The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
|
||||
*
|
||||
* Each subarray is terminated with a -1, and two leading entries
|
||||
* give the number of segments and the offset to the first entry
|
||||
* of the second array. In addition, the second array values are
|
||||
* all offset by 2 so they index directly into the final array.
|
||||
* The total array size is 4*segments[0] + 4. The second index is
|
||||
* 2*segments[0] + 3.
|
||||
*
|
||||
* In the output string, a segment reference is indicated by a
|
||||
* character in a special range, as defined by
|
||||
* RuleBasedTransliterator.Data.
|
||||
*
|
||||
* Most rules have no segments, in which case segments is null, and the
|
||||
* output string need not be checked for segment reference characters.
|
||||
*
|
||||
* See also rbt_rule.h/cpp.
|
||||
*/
|
||||
class Segments {
|
||||
UVector offsets;
|
||||
UVector isOpenParen;
|
||||
public:
|
||||
Segments(UErrorCode &status);
|
||||
~Segments();
|
||||
void addParenthesisAt(int32_t offset, UBool isOpenParen, UErrorCode &status);
|
||||
int32_t getLastParenOffset(UBool& isOpenParen) const;
|
||||
UBool extractLastParenSubstring(int32_t& start, int32_t& limit);
|
||||
int32_t* createArray(UErrorCode &status) const;
|
||||
UBool validate() const;
|
||||
int32_t count() const; // number of segments
|
||||
private:
|
||||
int32_t offset(int32_t i) const;
|
||||
UBool isOpen(int32_t i) const;
|
||||
int32_t size() const; // size of the UVectors
|
||||
};
|
||||
|
||||
/**
 * Return the offset recorded for the i-th parenthesis.
 */
int32_t Segments::offset(int32_t i) const {
    const int32_t recorded = offsets.elementAti(i);
    return recorded;
}
|
||||
|
||||
/**
 * Return TRUE if the i-th recorded parenthesis is an open paren, that
 * is, if its stored flag is nonzero.
 */
UBool Segments::isOpen(int32_t i) const {
    const int32_t flag = isOpenParen.elementAti(i);
    return flag != 0;
}
|
||||
|
||||
/**
 * Return the number of recorded parentheses.  The two vectors are
 * appended to together, so the length of 'offsets' is reported.
 */
int32_t Segments::size() const {
    // assert(offsets.size() == isOpenParen.size());
    const int32_t parenCount = offsets.size();
    return parenCount;
}
|
||||
|
||||
/**
 * Construct an empty record.  Both vectors are constructed with the
 * caller's status object.
 */
Segments::Segments(UErrorCode &status) :
    offsets(status),
    isOpenParen(status)
{
}
|
||||
/**
 * Destructor.  The body is intentionally empty; the two vector
 * members are destroyed automatically.
 */
Segments::~Segments() {
}
|
||||
|
||||
/**
 * Record one parenthesis.
 * @param offset value appended to the offsets vector
 * @param isOpen TRUE for an open paren (stored as 1), FALSE for a
 * close paren (stored as 0)
 * @param status passed through to both vector appends
 */
void Segments::addParenthesisAt(int32_t offset, UBool isOpen, UErrorCode &status) {
    const int32_t openFlag = isOpen ? 1 : 0;
    offsets.addElement(offset, status);
    isOpenParen.addElement(openFlag, status);
}
|
||||
|
||||
/**
 * Report the most recently added parenthesis.
 * @param isOpenParenReturn set to TRUE/FALSE according to whether the
 * last parenthesis is an open paren; left untouched when no
 * parenthesis has been recorded
 * @return the offset of the last parenthesis, or -1 if there is none
 */
int32_t Segments::getLastParenOffset(UBool& isOpenParenReturn) const {
    const int32_t last = size() - 1;
    if (last < 0) {
        return -1;
    }
    isOpenParenReturn = isOpen(last);
    return offset(last);
}
|
||||
|
||||
// Remove the last (rightmost) segment. Store its offsets in start
|
||||
// and limit, and then convert all offsets at or after start to be
|
||||
// equal to start. Upon failure, return FALSE. Assume that the
|
||||
// caller has already called getLastParenOffset() and validated that
|
||||
// there is at least one parenthesis and that the last one is a close
|
||||
// paren.
|
||||
UBool Segments::extractLastParenSubstring(int32_t& start, int32_t& limit) {
|
||||
// assert(offsets.size() > 0);
|
||||
// assert(isOpenParen.elementAt(isOpenParen.size()-1) == 0);
|
||||
int32_t i = size() - 1;
|
||||
int32_t n = 1; // count of close parens we need to match
|
||||
// Record position of the last close paren
|
||||
limit = offset(i);
|
||||
--i; // back up to the one before the last one
|
||||
while (i >= 0 && n != 0) {
|
||||
n += isOpen(i) ? -1 : 1;
|
||||
}
|
||||
if (n != 0) {
|
||||
return FALSE;
|
||||
}
|
||||
// assert(i>=0);
|
||||
start = offset(i);
|
||||
// Reset all segment pairs from i to size() - 1 to [start, start+1).
|
||||
while (i<size()) {
|
||||
int32_t o = isOpen(i) ? start : (start+1);
|
||||
offsets.setElementAt(o, i);
|
||||
++i;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
// Assume caller has already gotten a TRUE validate().
|
||||
int32_t* Segments::createArray(UErrorCode &status) const {
|
||||
int32_t c = count(); // number of segments
|
||||
int32_t arrayLen = 4*c + 4;
|
||||
int32_t *array = new int32_t[arrayLen];
|
||||
int32_t a2offset = 2*c + 3; // offset to array 2
|
||||
|
||||
if (array == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
array[0] = c;
|
||||
array[1] = a2offset;
|
||||
int32_t i;
|
||||
for (i=0; i<2*c; ++i) {
|
||||
array[2+i] = offset(i);
|
||||
}
|
||||
array[a2offset-1] = -1;
|
||||
array[arrayLen-1] = -1;
|
||||
// Now walk through and match up segment numbers with parentheses.
|
||||
// Number segments from 0. We're going to offset all entries by 2
|
||||
// to skip the first two elements, array[0] and array[1].
|
||||
UStack stack(status);
|
||||
int32_t nextOpen = 0; // seg # of next open, 0-based
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
for (i=0; i<2*c; ++i) {
|
||||
UBool open = isOpen(i);
|
||||
// Let seg be the zero-based segment number.
|
||||
// Open parens are at 2*seg in array 2.
|
||||
// Close parens are at 2*seg+1 in array 2.
|
||||
if (open) {
|
||||
array[a2offset + 2*nextOpen] = 2+i;
|
||||
stack.push(nextOpen, status);
|
||||
++nextOpen;
|
||||
} else {
|
||||
int32_t nextClose = stack.popi();
|
||||
array[a2offset + 2*nextClose+1] = 2+i;
|
||||
}
|
||||
}
|
||||
// assert(stack.empty());
|
||||
|
||||
// Perform a series of checks on the array. DO NOT COMPILE INTO
|
||||
// PRODUCTION CODE. Use to debug array building problems.
|
||||
//
|
||||
//::if (!stack.empty()) {
|
||||
//:: __asm int 03;
|
||||
//::}
|
||||
//::// check the array
|
||||
//::if (array[0] < 1) {
|
||||
//:: __asm int 03;
|
||||
//::}
|
||||
//::if (array[1] < 5) {
|
||||
//:: __asm int 03;
|
||||
//::}
|
||||
//::for (i=2; i<2+array[0]*2; ++i) {
|
||||
//:: if (array[i] < 0) { // array[i] is an offset into the rule
|
||||
//:: __asm int 03;
|
||||
//:: }
|
||||
//::}
|
||||
//::if (array[2+array[0]*2] != -1) {
|
||||
//:: __asm int 03;
|
||||
//::}
|
||||
//::for (i=array[1]; i<array[1]+array[0]*2; ++i) {
|
||||
//:: if (array[i] < 2 || array[i] >= (2+2*array[0])) {
|
||||
//:: __asm int 03;
|
||||
//:: }
|
||||
//::}
|
||||
//::if (array[array[1]+array[0]*2] != -1) {
|
||||
//:: __asm int 03;
|
||||
//::}
|
||||
|
||||
return array;
|
||||
}
|
||||
|
||||
/**
 * Check that the recorded parentheses form well-balanced segments:
 * there are at least two of them, their number is even, the first is
 * an open paren, no close paren appears before its matching open
 * paren, and every open paren is eventually closed.
 * @return TRUE if all of these conditions hold
 */
UBool Segments::validate() const {
    const int32_t parenCount = size();
    if (parenCount < 2) {
        return FALSE;
    }
    if (parenCount % 2 != 0) {
        return FALSE;
    }
    if (!isOpen(0)) {
        return FALSE;
    }
    // Track nesting depth from left to right; it may never go negative
    // and must return to zero at the end.
    int32_t depth = 0;
    for (int32_t k = 0; k < parenCount; ++k) {
        if (isOpen(k)) {
            ++depth;
        } else {
            --depth;
            if (depth < 0) {
                return FALSE;
            }
        }
    }
    return depth == 0;
}
|
||||
|
||||
// Assume caller has already gotten a TRUE validate().
|
||||
int32_t Segments::count() const {
|
||||
// assert(validate());
|
||||
return size() / 2;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// BEGIN RuleHalf
|
||||
//----------------------------------------------------------------------
|
||||
@ -416,11 +170,7 @@ public:
|
||||
int32_t ante; // position of ante context marker '{' in text
|
||||
int32_t post; // position of post context marker '}' in text
|
||||
|
||||
// Record the position of the segment substrings and references. A
|
||||
// given side should have segments or segment references, but not
|
||||
// both.
|
||||
Segments* segments;
|
||||
int32_t maxRef; // index of largest ref ($n) on the right
|
||||
int32_t maxRef; // n where maximum segment ref is $n; 1-based
|
||||
|
||||
// Record the offset to the cursor either to the left or to the
|
||||
// right of the key. This is indicated by characters on the output
|
||||
@ -432,9 +182,26 @@ public:
|
||||
// output text.
|
||||
int32_t cursorOffset; // only nonzero on output side
|
||||
|
||||
// Position of first CURSOR_OFFSET on _right_. This will be -1
|
||||
// for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
|
||||
int32_t cursorOffsetPos;
|
||||
|
||||
UBool anchorStart;
|
||||
UBool anchorEnd;
|
||||
|
||||
UErrorCode ec;
|
||||
|
||||
/**
|
||||
* UnicodeMatcher objects corresponding to each segment.
|
||||
*/
|
||||
UVector segments;
|
||||
|
||||
/**
|
||||
* The segment number from 0..n-1 of the next '(' we see
|
||||
* during parsing; 0-based.
|
||||
*/
|
||||
int32_t nextSegmentNumber;
|
||||
|
||||
TransliteratorParser& parser;
|
||||
|
||||
//--------------------------------------------------
|
||||
@ -443,22 +210,22 @@ public:
|
||||
RuleHalf(TransliteratorParser& parser);
|
||||
~RuleHalf();
|
||||
|
||||
/**
|
||||
* Parse one side of a rule, stopping at either the limit,
|
||||
* the END_OF_RULE character, or an operator. Return
|
||||
* the pos of the terminating character (or limit).
|
||||
*/
|
||||
int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit);
|
||||
|
||||
int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
|
||||
UnicodeString& buf,
|
||||
UBool isSegment);
|
||||
|
||||
/**
|
||||
* Remove context.
|
||||
*/
|
||||
void removeContext();
|
||||
|
||||
/**
|
||||
* Create and return an int[] array of segments.
|
||||
* Create and return a UnicodeMatcher*[] array of segments,
|
||||
* or NULL if there are no segments.
|
||||
*/
|
||||
int32_t* createSegments(UErrorCode& status) const;
|
||||
UnicodeMatcher** createSegments(UErrorCode& status) const;
|
||||
|
||||
int syntaxError(UErrorCode code,
|
||||
const UnicodeString& rule,
|
||||
@ -472,30 +239,69 @@ private:
|
||||
RuleHalf& operator=(const RuleHalf&);
|
||||
};
|
||||
|
||||
RuleHalf::RuleHalf(TransliteratorParser& p) : parser(p) {
|
||||
RuleHalf::RuleHalf(TransliteratorParser& p) :
|
||||
ec(U_ZERO_ERROR),
|
||||
segments(ec),
|
||||
parser(p)
|
||||
{
|
||||
cursor = -1;
|
||||
ante = -1;
|
||||
post = -1;
|
||||
segments = NULL;
|
||||
maxRef = -1;
|
||||
cursorOffset = 0;
|
||||
cursorOffsetPos = 0;
|
||||
anchorStart = anchorEnd = FALSE;
|
||||
segments.removeAllElements();
|
||||
nextSegmentNumber = 0;
|
||||
}
|
||||
|
||||
RuleHalf::~RuleHalf() {
|
||||
delete segments;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse one side of a rule, stopping at either the limit,
|
||||
* the END_OF_RULE character, or an operator. Return
|
||||
* the pos of the terminating character (or limit).
|
||||
* the END_OF_RULE character, or an operator.
|
||||
* @return the index after the terminating character, or
|
||||
* if limit was reached, limit
|
||||
*/
|
||||
int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
||||
int32_t start = pos;
|
||||
UnicodeString& buf = text;
|
||||
text.truncate(0);
|
||||
pos = parseSection(rule, pos, limit, text, FALSE);
|
||||
|
||||
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
|
||||
return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a section of one side of a rule, stopping at either
|
||||
* the limit, the END_OF_RULE character, an operator, or a
|
||||
* segment close character. This method parses both a
|
||||
* top-level rule half and a segment within such a rule half.
|
||||
* It calls itself recursively to parse segments and nested
|
||||
* segments.
|
||||
* @param buf buffer into which to accumulate the rule pattern
|
||||
* characters, either literal characters from the rule or
|
||||
* standins for UnicodeMatcher objects including segments.
|
||||
* @param isSegment if true, then we've already seen a '(' and
|
||||
* pos on entry points right after it. Accumulate everything
|
||||
* up to the closing ')', put it in a segment matcher object,
|
||||
* generate a standin for it, and add the standin to buf. As
|
||||
* a side effect, update the segments vector with a reference
|
||||
* to the segment matcher. This works recursively for nested
|
||||
* segments. If isSegment is false, just accumulate
|
||||
* characters into buf.
|
||||
* @return the index after the terminating character, or
|
||||
* if limit was reached, limit
|
||||
*/
|
||||
int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
|
||||
UnicodeString& buf,
|
||||
UBool isSegment) {
|
||||
int32_t start = pos;
|
||||
ParsePosition pp;
|
||||
int32_t cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
|
||||
UnicodeString scratch;
|
||||
UBool done = FALSE;
|
||||
int32_t quoteStart = -1; // Most recent 'single quoted string'
|
||||
@ -503,6 +309,15 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
||||
int32_t varStart = -1; // Most recent $variableReference
|
||||
int32_t varLimit = -1;
|
||||
|
||||
// If isSegment, then bufSegStart is the offset in buf to
|
||||
// the first character of the segment we are parsing.
|
||||
int32_t bufSegStart = 0;
|
||||
int32_t segmentNumber = 0;
|
||||
if (isSegment) {
|
||||
bufSegStart = buf.length();
|
||||
segmentNumber = nextSegmentNumber++;
|
||||
}
|
||||
|
||||
while (pos < limit && !done) {
|
||||
UChar c = rule.charAt(pos++);
|
||||
if (u_isWhitespace(c)) {
|
||||
@ -511,8 +326,11 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
||||
// whitespace likely to be seen in code.
|
||||
continue;
|
||||
}
|
||||
if (u_strchr(gOPERATORS, c) != NULL) {
|
||||
--pos; // Backup to point to operator
|
||||
if (u_strchr(HALF_ENDERS, c) != NULL) {
|
||||
if (isSegment) {
|
||||
// Unclosed segment
|
||||
return syntaxError(U_UNCLOSED_SEGMENT, rule, start);
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (anchorEnd) {
|
||||
@ -575,6 +393,10 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
||||
continue;
|
||||
}
|
||||
switch (c) {
|
||||
|
||||
//------------------------------------------------------
|
||||
// Elements allowed within and out of segments
|
||||
//------------------------------------------------------
|
||||
case ANCHOR_START:
|
||||
if (buf.length() == 0 && !anchorStart) {
|
||||
anchorStart = TRUE;
|
||||
@ -584,17 +406,7 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
||||
}
|
||||
break;
|
||||
case SEGMENT_OPEN:
|
||||
case SEGMENT_CLOSE:
|
||||
// Handle segment definitions "(" and ")"
|
||||
// Parse "(", ")"
|
||||
if (segments == NULL) {
|
||||
segments = new Segments(parser.status);
|
||||
}
|
||||
segments->addParenthesisAt(buf.length(), c == SEGMENT_OPEN, parser.status);
|
||||
break;
|
||||
case END_OF_RULE:
|
||||
--pos; // Backup to point to END_OF_RULE
|
||||
done = TRUE;
|
||||
pos = parseSection(rule, pos, limit, buf, TRUE);
|
||||
break;
|
||||
case SymbolTable::SYMBOL_REF:
|
||||
// Handle variable references and segment references "$1" .. "$9"
|
||||
@ -655,25 +467,128 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
||||
}
|
||||
}
|
||||
break;
|
||||
case DOT:
|
||||
buf.append(parser.getDotStandIn());
|
||||
break;
|
||||
case KLEENE_STAR:
|
||||
case ONE_OR_MORE:
|
||||
case ZERO_OR_ONE:
|
||||
// Quantifiers. We handle single characters, quoted strings,
|
||||
// variable references, and segments.
|
||||
// a+ matches aaa
|
||||
// 'foo'+ matches foofoofoo
|
||||
// $v+ matches xyxyxy if $v == xy
|
||||
// (seg)+ matches segsegseg
|
||||
{
|
||||
if (isSegment && buf.length() == bufSegStart) {
|
||||
// The */+ immediately follows '('
|
||||
return syntaxError(U_MISPLACED_QUANTIFIER, rule, start);
|
||||
}
|
||||
|
||||
int32_t qstart, qlimit;
|
||||
// The */+ follows an isolated character or quote
|
||||
// or variable reference
|
||||
if (buf.length() == quoteLimit) {
|
||||
// The */+ follows a 'quoted string'
|
||||
qstart = quoteStart;
|
||||
qlimit = quoteLimit;
|
||||
} else if (buf.length() == varLimit) {
|
||||
// The */+ follows a $variableReference
|
||||
qstart = varStart;
|
||||
qlimit = varLimit;
|
||||
} else {
|
||||
// The */+ follows a single character, possibly
|
||||
// a segment standin
|
||||
qstart = buf.length() - 1;
|
||||
qlimit = qstart + 1;
|
||||
}
|
||||
|
||||
UnicodeMatcher *m =
|
||||
new StringMatcher(buf, qstart, qlimit, FALSE, *parser.data);
|
||||
int32_t min = 0;
|
||||
int32_t max = Quantifier::MAX;
|
||||
switch (c) {
|
||||
case ONE_OR_MORE:
|
||||
min = 1;
|
||||
break;
|
||||
case ZERO_OR_ONE:
|
||||
min = 0;
|
||||
max = 1;
|
||||
break;
|
||||
// case KLEENE_STAR:
|
||||
// do nothing -- min, max already set
|
||||
}
|
||||
m = new Quantifier(m, min, max);
|
||||
buf.truncate(qstart);
|
||||
buf.append(parser.generateStandInFor(m));
|
||||
}
|
||||
break;
|
||||
|
||||
//------------------------------------------------------
|
||||
// Elements allowed ONLY WITHIN segments
|
||||
//------------------------------------------------------
|
||||
case SEGMENT_CLOSE:
|
||||
if (isSegment) {
|
||||
// We're done parsing a segment. The relevant
|
||||
// characters are in buf, starting at offset
|
||||
// bufSegStart. Extract them into a string
|
||||
// matcher, and replace them with a standin
|
||||
// for that matcher.
|
||||
StringMatcher *m =
|
||||
new StringMatcher(buf, bufSegStart, buf.length(),
|
||||
TRUE, *parser.data);
|
||||
// Since we call parseSection() recursively,
|
||||
// nested segments will result in segment i+1
|
||||
// getting parsed and stored before segment i;
|
||||
// be careful with the vector handling here.
|
||||
if ((segmentNumber+1) > segments.size()) {
|
||||
segments.setSize(segmentNumber+1);
|
||||
}
|
||||
segments.setElementAt(m, segmentNumber);
|
||||
buf.truncate(bufSegStart);
|
||||
buf.append(parser.generateStandInFor(m));
|
||||
done = TRUE;
|
||||
break;
|
||||
}
|
||||
|
||||
// If we aren't in a segment, then a segment close
|
||||
// character is a syntax error.
|
||||
return syntaxError(U_UNQUOTED_SPECIAL, rule, start);
|
||||
|
||||
//------------------------------------------------------
|
||||
// Elements allowed ONLY OUTSIDE segments
|
||||
//------------------------------------------------------
|
||||
case CONTEXT_ANTE:
|
||||
if (isSegment) {
|
||||
return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
|
||||
}
|
||||
if (ante >= 0) {
|
||||
return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start);
|
||||
}
|
||||
ante = buf.length();
|
||||
break;
|
||||
case CONTEXT_POST:
|
||||
if (isSegment) {
|
||||
return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
|
||||
}
|
||||
if (post >= 0) {
|
||||
return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start);
|
||||
}
|
||||
post = buf.length();
|
||||
break;
|
||||
case CURSOR_POS:
|
||||
if (isSegment) {
|
||||
return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
|
||||
}
|
||||
if (cursor >= 0) {
|
||||
return syntaxError(U_MULTIPLE_CURSORS, rule, start);
|
||||
}
|
||||
cursor = buf.length();
|
||||
break;
|
||||
case CURSOR_OFFSET:
|
||||
if (isSegment) {
|
||||
return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
|
||||
}
|
||||
if (cursorOffset < 0) {
|
||||
if (buf.length() > 0) {
|
||||
return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
|
||||
@ -695,69 +610,11 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
||||
}
|
||||
}
|
||||
break;
|
||||
case DOT:
|
||||
buf.append(parser.getDotStandIn());
|
||||
break;
|
||||
case KLEENE_STAR:
|
||||
case ONE_OR_MORE:
|
||||
case ZERO_OR_ONE:
|
||||
// Quantifiers. We handle single characters, quoted strings,
|
||||
// variable references, and segments.
|
||||
// a+ matches aaa
|
||||
// 'foo'+ matches foofoofoo
|
||||
// $v+ matches xyxyxy if $v == xy
|
||||
// (seg)+ matches segsegseg
|
||||
{
|
||||
int32_t start, limit;
|
||||
UBool isOpenParen;
|
||||
UBool isSegment = FALSE;
|
||||
if (segments != 0 &&
|
||||
segments->getLastParenOffset(isOpenParen) == buf.length()) {
|
||||
// The */+ immediately follows a segment
|
||||
if (isOpenParen) {
|
||||
return syntaxError(U_MISPLACED_QUANTIFIER, rule, start);
|
||||
}
|
||||
if (!segments->extractLastParenSubstring(start, limit)) {
|
||||
return syntaxError(U_MISMATCHED_SEGMENT_DELIMITERS, rule, start);
|
||||
}
|
||||
isSegment = TRUE;
|
||||
} else {
|
||||
// The */+ follows an isolated character or quote
|
||||
// or variable reference
|
||||
if (buf.length() == quoteLimit) {
|
||||
// The */+ follows a 'quoted string'
|
||||
start = quoteStart;
|
||||
limit = quoteLimit;
|
||||
} else if (buf.length() == varLimit) {
|
||||
// The */+ follows a $variableReference
|
||||
start = varStart;
|
||||
limit = varLimit;
|
||||
} else {
|
||||
// The */+ follows a single character
|
||||
start = buf.length() - 1;
|
||||
limit = start + 1;
|
||||
}
|
||||
}
|
||||
UnicodeMatcher *m =
|
||||
new StringMatcher(buf, start, limit, isSegment, *parser.data);
|
||||
int32_t min = 0;
|
||||
int32_t max = Quantifier::MAX;
|
||||
switch (c) {
|
||||
case ONE_OR_MORE:
|
||||
min = 1;
|
||||
break;
|
||||
case ZERO_OR_ONE:
|
||||
min = 0;
|
||||
max = 1;
|
||||
break;
|
||||
// case KLEENE_STAR:
|
||||
// do nothing -- min, max already set
|
||||
}
|
||||
m = new Quantifier(m, min, max);
|
||||
buf.truncate(start);
|
||||
buf.append(parser.generateStandInFor(m));
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
//------------------------------------------------------
|
||||
// Non-special characters
|
||||
//------------------------------------------------------
|
||||
default:
|
||||
// Disallow unquoted characters other than [0-9A-Za-z]
|
||||
// in the printable ASCII range. These characters are
|
||||
@ -773,10 +630,6 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
||||
}
|
||||
}
|
||||
|
||||
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
|
||||
return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
|
||||
}
|
||||
// text = buf.toString();
|
||||
return pos;
|
||||
}
|
||||
|
||||
@ -797,10 +650,15 @@ void RuleHalf::removeContext() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and return an int32_t[] array of segments.
|
||||
* Create and return a UnicodeMatcher*[] array of segments,
|
||||
* or NULL if there are no segments.
|
||||
*/
|
||||
int32_t* RuleHalf::createSegments(UErrorCode& status) const {
|
||||
return (segments == 0) ? 0 : segments->createArray(status);
|
||||
UnicodeMatcher** RuleHalf::createSegments(UErrorCode& status) const {
|
||||
if (segments.size() == 0) {
|
||||
return NULL;
|
||||
}
|
||||
UnicodeMatcher** result = new UnicodeMatcher*[segments.size()];
|
||||
return (UnicodeMatcher**) segments.toArray((void**) result);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
@ -1172,9 +1030,10 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
|
||||
return start;
|
||||
}
|
||||
|
||||
if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(pos++))) == NULL) {
|
||||
if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == NULL) {
|
||||
return syntaxError(U_MISSING_OPERATOR, rule, start);
|
||||
}
|
||||
++pos;
|
||||
|
||||
// Found an operator char. Check for forward-reverse operator.
|
||||
if (op == REVERSE_RULE_OP &&
|
||||
@ -1189,7 +1048,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
|
||||
}
|
||||
|
||||
if (pos < limit) {
|
||||
if (rule.charAt(pos) == END_OF_RULE) {
|
||||
if (rule.charAt(--pos) == END_OF_RULE) {
|
||||
++pos;
|
||||
} else {
|
||||
// RuleHalf parser must have terminated at an operator
|
||||
@ -1251,8 +1110,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
|
||||
// apply.
|
||||
if (op == FWDREV_RULE_OP) {
|
||||
right->removeContext();
|
||||
delete right->segments;
|
||||
right->segments = NULL;
|
||||
right->segments.removeAllElements();
|
||||
left->cursor = left->maxRef = -1;
|
||||
left->cursorOffset = 0;
|
||||
}
|
||||
@ -1272,7 +1130,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
|
||||
// cannot place the cursor outside the limits of the context.
|
||||
// Anchors are only allowed on the input side.
|
||||
if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 ||
|
||||
right->segments != NULL || left->maxRef >= 0 ||
|
||||
right->segments.size() > 0 || left->maxRef >= 0 ||
|
||||
(right->cursorOffset != 0 && right->cursor < 0) ||
|
||||
// - The following two checks were used to ensure that the
|
||||
// - the cursor offset stayed within the ante- or postcontext.
|
||||
@ -1288,20 +1146,15 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
|
||||
// Check integrity of segments and segment references. Each
|
||||
// segment's start must have a corresponding limit, and the
|
||||
// references must not refer to segments that do not exist.
|
||||
if (left->segments != NULL) {
|
||||
if (!left->segments->validate()) {
|
||||
return syntaxError(U_MISSING_SEGMENT_CLOSE, rule, start);
|
||||
}
|
||||
int32_t n = left->segments->count();
|
||||
if (right->maxRef > n) {
|
||||
return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start);
|
||||
}
|
||||
if (right->maxRef > left->segments.size()) {
|
||||
return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start);
|
||||
}
|
||||
|
||||
data->ruleSet.addRule(new TransliterationRule(
|
||||
left->text, left->ante, left->post,
|
||||
right->text, right->cursor, right->cursorOffset,
|
||||
left->createSegments(status),
|
||||
left->segments.size(),
|
||||
left->anchorStart, left->anchorEnd,
|
||||
data,
|
||||
status), status);
|
||||
@ -1366,7 +1219,7 @@ UChar TransliteratorParser::generateStandInFor(UnicodeMatcher* adopted) {
|
||||
if (variableNext >= variableLimit) {
|
||||
// throw new RuntimeException("Private use variables exhausted");
|
||||
delete adopted;
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
status = U_VARIABLE_RANGE_EXHAUSTED;
|
||||
return 0;
|
||||
}
|
||||
variablesVector->addElement(adopted, status);
|
||||
|
@ -14,28 +14,11 @@
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unicode.h"
|
||||
#include "cmemory.h"
|
||||
#include "strmatch.h"
|
||||
|
||||
static const UChar APOSTROPHE = 0x0027; // '\''
|
||||
static const UChar BACKSLASH = 0x005C; // '\'
|
||||
|
||||
// To process segments we need to allocate arrays of integers. We use
|
||||
// stack storage as long as the segment count is <= MAX_STATIC_SEGS.
|
||||
// Otherwise, we allocate heap space.
|
||||
#define MAX_STATIC_SEGS 20
|
||||
|
||||
// Macros for accessing the array of integers that encodes the
// segment parentheses.  In each macro x is the array and i an index.
// SEGMENTS_COUNT number of segments, n (half the number of parens)
// SEGMENTS_LEN   length of the segments array (number of elements)
// SEGMENTS_POS   position in 'pattern' of parenthesis i, where i=0..2n-1
// SEGMENTS_NUM   index into segments to access POS of $1.open,
//                $1.close, $2.open, $2.close,.., $n.open, $n.close
//                Relative to FIRST_SEG_POS_INDEX.  Ranges from 0..2n-1.
//
// Every macro argument and every expansion is parenthesized so that
// an argument such as "p + 1" or "a | b" is evaluated as a unit and
// the result can be used safely inside a larger expression.
// SEGMENTS_COUNT and SEGMENTS_POS still expand to lvalues.
#define FIRST_SEG_POS_INDEX 2
#define SEGMENTS_COUNT(x) ((x)[0])
#define SEGMENTS_LEN(x) (SEGMENTS_COUNT(x)*4+4)
#define SEGMENTS_POS(x,i) ((x)[FIRST_SEG_POS_INDEX+(i)])
#define SEGMENTS_NUM(x,i) ((x)[(x)[1]+(i)]-FIRST_SEG_POS_INDEX)
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
const UChar TransliterationRule::ETHER = 0xFFFF;
|
||||
@ -56,11 +39,10 @@ const UChar TransliterationRule::ETHER = 0xFFFF;
|
||||
* <code>output</code>; that is, -1 is equivalent to
|
||||
* <code>output.length()</code>. If greater than
|
||||
* <code>output.length()</code> then an exception is thrown.
|
||||
* @param adoptedSegs array of 2n integers. Each of n pairs consists of offset,
|
||||
* limit for a segment of the input string. Characters in the output string
|
||||
* refer to these segments if they are in a special range determined by the
|
||||
* associated RuleBasedTransliterator.Data object. May be null if there are
|
||||
* no segments.
|
||||
* @param segs array of UnicodeMatcher corresponding to input pattern
|
||||
* segments, or null if there are none. The array itself is adopted,
|
||||
* but the pointers within it are not.
|
||||
* @param segsCount number of elements in segs[]
|
||||
* @param anchorStart TRUE if the rule is anchored on the left to
|
||||
* the context start
|
||||
* @param anchorEnd TRUE if the rule is anchored on the right to the
|
||||
@ -70,7 +52,8 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
|
||||
int32_t anteContextPos, int32_t postContextPos,
|
||||
const UnicodeString& outputStr,
|
||||
int32_t cursorPosition, int32_t cursorOffset,
|
||||
int32_t* adoptedSegs,
|
||||
UnicodeMatcher** segs,
|
||||
int32_t segsCount,
|
||||
UBool anchorStart, UBool anchorEnd,
|
||||
const TransliterationRuleData* theData,
|
||||
UErrorCode& status) :
|
||||
@ -113,23 +96,11 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
|
||||
this->cursorPos = cursorPosition + cursorOffset;
|
||||
this->output = outputStr;
|
||||
// We don't validate the segments array. The caller must
|
||||
// guarantee that the segments are well-formed.
|
||||
this->segments = adoptedSegs;
|
||||
// Find the position of the first segment index that is after the
|
||||
// anteContext (in the key). Note that this may be a start or a
|
||||
// limit index. If all segments are in the ante context,
|
||||
// firstKeySeg should point past the last segment -- that is, it
|
||||
// should point at the end marker, which is -1. This allows the
|
||||
// code to back up by one to obtain the last ante context segment.
|
||||
firstKeySeg = -1;
|
||||
if (segments != 0) {
|
||||
firstKeySeg = FIRST_SEG_POS_INDEX;
|
||||
while (segments[firstKeySeg] >= 0 &&
|
||||
segments[firstKeySeg] < anteContextLength) {
|
||||
++firstKeySeg;
|
||||
}
|
||||
firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
|
||||
}
|
||||
// guarantee that the segments are well-formed (that is, that
|
||||
// all $n references in the output refer to indices of this
|
||||
// array, and that no array elements are null).
|
||||
this->segments = segs;
|
||||
this->segmentsCount = segsCount;
|
||||
|
||||
pattern = input;
|
||||
flags = 0;
|
||||
@ -149,18 +120,17 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
|
||||
TransliterationRule::TransliterationRule(TransliterationRule& other) :
|
||||
pattern(other.pattern),
|
||||
output(other.output),
|
||||
firstKeySeg(other.firstKeySeg),
|
||||
anteContextLength(other.anteContextLength),
|
||||
keyLength(other.keyLength),
|
||||
cursorPos(other.cursorPos),
|
||||
flags(other.flags),
|
||||
data(other.data) {
|
||||
|
||||
segments = 0;
|
||||
if (other.segments != 0) {
|
||||
int32_t len = SEGMENTS_LEN(other.segments);
|
||||
segments = new int32_t[len];
|
||||
uprv_memcpy(segments, other.segments, len*sizeof(segments[0]));
|
||||
segments = NULL;
|
||||
segmentsCount = 0;
|
||||
if (other.segmentsCount > 0) {
|
||||
segments = new UnicodeMatcher*[other.segmentsCount];
|
||||
uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0]));
|
||||
}
|
||||
}
|
||||
|
||||
@ -341,26 +311,12 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
||||
|
||||
// ============================ MATCH ===========================
|
||||
|
||||
// Record the actual positions, in the text, of the segments.
|
||||
// These are recorded in the order that they occur in the pattern.
|
||||
|
||||
// segPos[] is an array of 2*SEGMENTS_COUNT elements. It
|
||||
// records the position in 'text' of each segment boundary, in
|
||||
// the order that they occur in 'pattern'.
|
||||
int32_t _segPos[2*MAX_STATIC_SEGS];
|
||||
int32_t *segPos = _segPos;
|
||||
if (segments != 0 && SEGMENTS_COUNT(segments) > MAX_STATIC_SEGS) {
|
||||
segPos = new int32_t[2*SEGMENTS_COUNT(segments)];
|
||||
// Reset segment match data
|
||||
if (segments != NULL) {
|
||||
for (int32_t i=0; i<segmentsCount; ++i) {
|
||||
((StringMatcher*) segments[i])->resetMatch();
|
||||
}
|
||||
}
|
||||
// iSeg is an index into segments[] that accesses the first
|
||||
// array. As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
|
||||
// When indexing into segments[] FIRST_SEG_POS_INDEX must be
|
||||
// added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
|
||||
int32_t iSeg = firstKeySeg - 1;
|
||||
// nextSegPos is an offset in 'pattern'. When the cursor is
|
||||
// equal to nextSegPos, we are at a segment boundary, and we
|
||||
// record the position in the real text in segPos[].
|
||||
int32_t nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
|
||||
|
||||
UMatchDegree m;
|
||||
int32_t lenDelta, keyLimit;
|
||||
@ -386,26 +342,15 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
||||
keyChar == text.charAt(oText)) {
|
||||
--oText;
|
||||
} else {
|
||||
m = U_MISMATCH;
|
||||
goto exit;
|
||||
return U_MISMATCH;
|
||||
}
|
||||
} else {
|
||||
// Subtract 1 from contextStart to make it a reverse limit
|
||||
if (matcher->matches(text, oText, pos.contextStart-1, FALSE)
|
||||
!= U_MATCH) {
|
||||
m = U_MISMATCH;
|
||||
goto exit;
|
||||
return U_MISMATCH;
|
||||
}
|
||||
}
|
||||
while (nextSegPos == oPattern) {
|
||||
segPos[iSeg] = oText;
|
||||
if (oText >= 0) {
|
||||
segPos[iSeg] += UTF_CHAR_LENGTH(text.char32At(oText));
|
||||
} else {
|
||||
++segPos[iSeg];
|
||||
}
|
||||
nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
|
||||
}
|
||||
}
|
||||
|
||||
minOText = posAfter(text, oText);
|
||||
@ -413,15 +358,11 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
||||
// ------------------------ Start Anchor ------------------------
|
||||
|
||||
if ((flags & ANCHOR_START) && oText != posBefore(text, pos.contextStart)) {
|
||||
m = U_MISMATCH;
|
||||
goto exit;
|
||||
return U_MISMATCH;
|
||||
}
|
||||
|
||||
// -------------------- Key and Post Context --------------------
|
||||
|
||||
iSeg = firstKeySeg;
|
||||
nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
|
||||
|
||||
oPattern = 0;
|
||||
oText = pos.start;
|
||||
keyLimit = 0;
|
||||
@ -429,8 +370,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
||||
if (incremental && oText == pos.limit) {
|
||||
// We've reached the limit without a mismatch and
|
||||
// without completing our match.
|
||||
m = U_PARTIAL_MATCH;
|
||||
goto exit;
|
||||
return U_PARTIAL_MATCH;
|
||||
}
|
||||
|
||||
// It might seem that we could do a check like this here:
|
||||
@ -445,10 +385,6 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
||||
// depending on whether we're in the key or in the post
|
||||
// context.
|
||||
|
||||
while (oPattern == nextSegPos) {
|
||||
segPos[iSeg] = oText;
|
||||
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
|
||||
}
|
||||
if (oPattern == keyLength) {
|
||||
keyLimit = oText;
|
||||
}
|
||||
@ -467,13 +403,12 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
||||
keyChar == text.charAt(oText)) {
|
||||
++oText;
|
||||
} else {
|
||||
m = U_MISMATCH;
|
||||
goto exit;
|
||||
return U_MISMATCH;
|
||||
}
|
||||
} else {
|
||||
m = matcher->matches(text, oText, matchLimit, incremental);
|
||||
if (m != U_MATCH) {
|
||||
goto exit;
|
||||
return m;
|
||||
}
|
||||
}
|
||||
|
||||
@ -486,10 +421,6 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
||||
//! // at the end of the key.
|
||||
//! return UnicodeMatcher.U_MISMATCH;
|
||||
//!}
|
||||
}
|
||||
while (oPattern == nextSegPos) {
|
||||
segPos[iSeg] = oText;
|
||||
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
|
||||
}
|
||||
if (oPattern == keyLength) {
|
||||
keyLimit = oText;
|
||||
@ -509,8 +440,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
||||
// =========================== REPLACE ==========================
|
||||
|
||||
// We have a full match. The key is between pos.start and
|
||||
// keyLimit. Segment indices have been recorded in segPos[].
|
||||
// Perform a replacement.
|
||||
// keyLimit.
|
||||
|
||||
if (segments == NULL) {
|
||||
text.handleReplaceBetween(pos.start, keyLimit, output);
|
||||
@ -562,11 +492,22 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
||||
buf.remove();
|
||||
}
|
||||
// Copy segment with out-of-band data
|
||||
b *= 2;
|
||||
int32_t start = segPos[SEGMENTS_NUM(segments,b)];
|
||||
int32_t limit = segPos[SEGMENTS_NUM(segments,b+1)];
|
||||
text.copy(start, limit, dest);
|
||||
dest += limit - start;
|
||||
StringMatcher* m = (StringMatcher*) segments[b];
|
||||
int32_t start = m->getMatchStart();
|
||||
int32_t limit = m->getMatchLimit();
|
||||
// If there was no match, that means that a quantifier
|
||||
// matched zero-length. E.g., x (a)* y matched "xy".
|
||||
if (start >= 0) {
|
||||
// Adjust indices for segments in post context
|
||||
// for any inserted text between the key and
|
||||
// the post context.
|
||||
if (start >= keyLimit) {
|
||||
start += dest - keyLimit;
|
||||
limit += dest - keyLimit;
|
||||
}
|
||||
text.copy(start, limit, dest);
|
||||
dest += limit - start;
|
||||
}
|
||||
}
|
||||
oOutput += UTF_CHAR_LENGTH(c);
|
||||
}
|
||||
@ -600,13 +541,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
||||
pos.contextLimit += lenDelta;
|
||||
// Restrict new value of start to [minOText, min(oText, pos.limit)].
|
||||
pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart));
|
||||
m = U_MATCH;
|
||||
|
||||
exit:
|
||||
if (segPos != _segPos) {
|
||||
delete[] segPos;
|
||||
}
|
||||
return m;
|
||||
return U_MATCH;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -727,23 +662,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
|
||||
UBool escapeUnprintable) const {
|
||||
int32_t i;
|
||||
|
||||
// iseg indexes into segments[] directly (not offset from FSPI)
|
||||
int32_t iseg = FIRST_SEG_POS_INDEX-1;
|
||||
int32_t nextSeg = -1;
|
||||
// Build an array of booleans specifying open vs. close paren
|
||||
UBool _isOpen[2*MAX_STATIC_SEGS];
|
||||
UBool *isOpen = _isOpen;
|
||||
if (segments != 0) {
|
||||
if (SEGMENTS_COUNT(segments) > MAX_STATIC_SEGS) {
|
||||
isOpen = new UBool[2*SEGMENTS_COUNT(segments)];
|
||||
}
|
||||
for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
|
||||
isOpen[SEGMENTS_NUM(segments,i) ] = TRUE;
|
||||
isOpen[SEGMENTS_NUM(segments,i+1)] = FALSE;
|
||||
}
|
||||
nextSeg = segments[++iseg];
|
||||
}
|
||||
|
||||
// Accumulate special characters (and non-specials following them)
|
||||
// into quoteBuf. Append quoteBuf, within single quotes, when
|
||||
// a non-quoted element must be inserted.
|
||||
@ -765,14 +683,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
|
||||
appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
|
||||
// Append either '(' or ')' if we are at a segment index
|
||||
if (i == nextSeg) {
|
||||
appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
|
||||
(UChar)0x0028 : (UChar)0x0029,
|
||||
TRUE, escapeUnprintable, quoteBuf);
|
||||
nextSeg = segments[++iseg];
|
||||
}
|
||||
|
||||
if (emitBraces && i == (anteContextLength + keyLength)) {
|
||||
appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
@ -787,11 +697,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
|
||||
}
|
||||
}
|
||||
|
||||
if (i == nextSeg) {
|
||||
// assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
|
||||
appendToRule(rule, (UChar)0x0029 /*)*/, TRUE, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
|
||||
if (emitBraces && i == (anteContextLength + keyLength)) {
|
||||
appendToRule(rule, (UChar)0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
@ -854,9 +759,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
|
||||
|
||||
appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);
|
||||
|
||||
if (isOpen != _isOpen) {
|
||||
delete[] isOpen;
|
||||
}
|
||||
return rule;
|
||||
}
|
||||
|
||||
|
@ -33,6 +33,16 @@ class TransliterationRuleData;
|
||||
* Variables are detected by looking up each character in a supplied
|
||||
* variable list to see if it has been so defined.
|
||||
*
|
||||
* <p>A rule may contain segments in its input string and segment
|
||||
* references in its output string. A segment is a substring of the
|
||||
* input pattern, indicated by an offset and limit. The segment may
|
||||
* be in the preceding or following context. It may not span a
|
||||
* context boundary. A segment reference is a special character in
|
||||
* the output string that causes a segment of the input string (not
|
||||
* the input pattern) to be copied to the output string. The range of
|
||||
* special characters that represent segment references is defined by
|
||||
* RuleBasedTransliterator.Data.
|
||||
*
|
||||
* @author Alan Liu
|
||||
*/
|
||||
class TransliterationRule {
|
||||
@ -65,20 +75,20 @@ private:
|
||||
UnicodeString output;
|
||||
|
||||
/**
|
||||
* An array of integers encoding the position of the segments.
|
||||
* See rbt_pars.cpp::Segments for more details.
|
||||
* An array of matcher objects corresponding to the input pattern
|
||||
* segments. If there are no segments this is null. N.B. This is
|
||||
* a UnicodeMatcher for generality, but in practice it is always a
|
||||
* StringMatcher. In the future we may generalize this, but for
|
||||
* now we sometimes cast down to StringMatcher.
|
||||
*
|
||||
* The array is owned, but the pointers within it are not.
|
||||
*/
|
||||
int32_t* segments;
|
||||
UnicodeMatcher** segments;
|
||||
|
||||
/**
|
||||
* A value we compute from segments. The first index into segments[]
|
||||
* that is >= anteContextLength. That is, the first one that is within
|
||||
* the forward scanned part of the pattern -- the key or the postContext.
|
||||
* If there are no segments, this has the value -1. This index is relative
|
||||
* to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
|
||||
* segments[FIRST_SEG_POS_INDEX + firstKeySeg].
|
||||
* The number of elements in segments[] or zero if segments is NULL.
|
||||
*/
|
||||
int32_t firstKeySeg;
|
||||
int32_t segmentsCount;
|
||||
|
||||
/**
|
||||
* The length of the string that must match before the key. If
|
||||
@ -143,11 +153,10 @@ public:
|
||||
* 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
|
||||
* "xyz" and moves the cursor to before "a". It would have a cursorOffset
|
||||
* of -3.
|
||||
* @param adoptedSegs array of 2n integers. Each of n pairs consists of offset,
|
||||
* limit for a segment of the input string. Characters in the output string
|
||||
* refer to these segments if they are in a special range determined by the
|
||||
* associated RuleBasedTransliterator.Data object. May be null if there are
|
||||
* no segments.
|
||||
* @param segs array of UnicodeMatcher corresponding to input pattern
|
||||
* segments, or null if there are none. The array itself is adopted,
|
||||
* but the pointers within it are not.
|
||||
* @param segsCount number of elements in segs[]
|
||||
* @param anchorStart TRUE if the rule is anchored on the left to
|
||||
* the context start
|
||||
* @param anchorEnd TRUE if the rule is anchored on the right to the
|
||||
@ -157,7 +166,8 @@ public:
|
||||
int32_t anteContextPos, int32_t postContextPos,
|
||||
const UnicodeString& outputStr,
|
||||
int32_t cursorPosition, int32_t cursorOffset,
|
||||
int32_t* adoptedSegs,
|
||||
UnicodeMatcher** segs,
|
||||
int32_t segsCount,
|
||||
UBool anchorStart, UBool anchorEnd,
|
||||
const TransliterationRuleData* data,
|
||||
UErrorCode& status);
|
||||
|
@ -18,7 +18,9 @@ StringMatcher::StringMatcher(const UnicodeString& theString,
|
||||
UBool isSeg,
|
||||
const TransliterationRuleData& theData) :
|
||||
data(theData),
|
||||
isSegment(isSeg)
|
||||
isSegment(isSeg),
|
||||
matchStart(-1),
|
||||
matchLimit(-1)
|
||||
{
|
||||
theString.extractBetween(start, limit, pattern);
|
||||
}
|
||||
@ -27,7 +29,9 @@ StringMatcher::StringMatcher(const StringMatcher& o) :
|
||||
UnicodeMatcher(o),
|
||||
pattern(o.pattern),
|
||||
data(o.data),
|
||||
isSegment(o.isSegment)
|
||||
isSegment(o.isSegment),
|
||||
matchStart(o.matchStart),
|
||||
matchLimit(o.matchLimit) // copy the source's limit; it was mistakenly taken from o.matchStart
|
||||
{
|
||||
}
|
||||
|
||||
@ -54,6 +58,7 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
|
||||
int32_t i;
|
||||
int32_t cursor = offset;
|
||||
if (limit < cursor) {
|
||||
// Match in the reverse direction
|
||||
for (i=pattern.length()-1; i>=0; --i) {
|
||||
UChar keyChar = pattern.charAt(i);
|
||||
const UnicodeMatcher* subm = data.lookup(keyChar);
|
||||
@ -72,6 +77,14 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
|
||||
}
|
||||
}
|
||||
}
|
||||
// Record the match position, but adjust for a normal
|
||||
// forward start, limit, and only if a prior match does not
|
||||
// exist -- we want the rightmost match.
|
||||
if (matchStart < 0) {
|
||||
// cast away const -- should modify method to be non-const
|
||||
((StringMatcher*)this)->matchStart = cursor+1;
|
||||
((StringMatcher*)this)->matchLimit = offset+1;
|
||||
}
|
||||
} else {
|
||||
for (i=0; i<pattern.length(); ++i) {
|
||||
if (incremental && cursor == limit) {
|
||||
@ -99,6 +112,10 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
|
||||
}
|
||||
}
|
||||
}
|
||||
// Record the match position
|
||||
// cast away const -- should modify method to be non-const
|
||||
((StringMatcher*)this)->matchStart = offset;
|
||||
((StringMatcher*)this)->matchLimit = cursor;
|
||||
}
|
||||
|
||||
offset = cursor;
|
||||
@ -128,8 +145,8 @@ UnicodeString& StringMatcher::toPattern(UnicodeString& result,
|
||||
result.append((UChar)41); /*)*/
|
||||
}
|
||||
// Flush quoteBuf out to result
|
||||
TransliterationRule::appendToRule(result, (UChar32)(isSegment?41/*)*/:-1),
|
||||
TRUE, escapeUnprintable, quoteBuf);
|
||||
TransliterationRule::appendToRule(result, -1,
|
||||
TRUE, escapeUnprintable, quoteBuf);
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -145,6 +162,32 @@ UBool StringMatcher::matchesIndexValue(uint8_t v) const {
|
||||
return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove any match data. This must be called before performing a
|
||||
* set of matches with this segment.
|
||||
*/
|
||||
void StringMatcher::resetMatch() {
    // -1 in both fields marks "no match recorded"; the replacement code
    // tests for a non-negative start before copying a segment.
    matchLimit = -1;
    matchStart = -1;
}
|
||||
|
||||
/**
|
||||
* Return the start offset, in the match text, of the <em>rightmost</em>
|
||||
* match. This method may get moved up into the UnicodeMatcher if
|
||||
* it turns out to be useful to generalize this.
|
||||
*/
|
||||
int32_t StringMatcher::getMatchStart() const {
|
||||
// -1 is the value assigned by resetMatch(); it means no match is recorded.
return matchStart;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the limit offset, in the match text, of the <em>rightmost</em>
|
||||
* match. This method may get moved up into the UnicodeMatcher if
|
||||
* it turns out to be useful to generalize this.
|
||||
*/
|
||||
int32_t StringMatcher::getMatchLimit() const {
|
||||
// -1 is the value assigned by resetMatch(); it means no match is recorded.
return matchLimit;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
//eof
|
||||
|
@ -59,6 +59,26 @@ class StringMatcher : public UnicodeMatcher {
|
||||
*/
|
||||
virtual UBool matchesIndexValue(uint8_t v) const;
|
||||
|
||||
/**
|
||||
* Remove any match data. This must be called before performing a
|
||||
* set of matches with this segment.
|
||||
*/
|
||||
void resetMatch();
|
||||
|
||||
/**
|
||||
* Return the start offset, in the match text, of the <em>rightmost</em>
|
||||
* match. This method may get moved up into the UnicodeMatcher if
|
||||
* it turns out to be useful to generalize this.
|
||||
*/
|
||||
int32_t getMatchStart() const;
|
||||
|
||||
/**
|
||||
* Return the limit offset, in the match text, of the <em>rightmost</em>
|
||||
* match. This method may get moved up into the UnicodeMatcher if
|
||||
* it turns out to be useful to generalize this.
|
||||
*/
|
||||
int32_t getMatchLimit() const;
|
||||
|
||||
private:
|
||||
|
||||
UnicodeString pattern;
|
||||
@ -66,6 +86,10 @@ class StringMatcher : public UnicodeMatcher {
|
||||
const TransliterationRuleData& data;
|
||||
|
||||
UBool isSegment;
|
||||
|
||||
int32_t matchStart;
|
||||
|
||||
int32_t matchLimit;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
|
||||
* $Date: 2001/10/26 22:59:26 $
|
||||
* $Revision: 1.57 $
|
||||
* $Date: 2001/10/30 18:08:19 $
|
||||
* $Revision: 1.58 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -1268,9 +1268,11 @@ public class TransliteratorTest extends TestFmwk {
|
||||
"c abc ababc",
|
||||
"d d abd");
|
||||
|
||||
// NOTE: The (ab)+ when referenced just yields a single "ab",
|
||||
// not the full sequence of them. This accords with perl behavior.
|
||||
expect("(ab)+ {x} > '(' $1 ')';",
|
||||
"x abx ababxy",
|
||||
"x ab(ab) abab(abab)y");
|
||||
"x ab(ab) abab(ab)y");
|
||||
|
||||
expect("b+ > x;",
|
||||
"ac abc abbc abbbc",
|
||||
@ -1288,12 +1290,11 @@ public class TransliteratorTest extends TestFmwk {
|
||||
"qa qab qaba qababc",
|
||||
"xa x xa xc");
|
||||
|
||||
// Oddity -- "(foo)* > $1" causes $1 to match the run of "foo"s
|
||||
// In perl, it only matches the first occurrence, so the output
|
||||
// is "()a (ab) (ab)a (ab)c".
|
||||
// NOTE: The (ab)* when referenced just yields a single "ab",
|
||||
// not the full sequence of them. This accords with perl behavior.
|
||||
expect("q(ab)* > '(' $1 ')';",
|
||||
"qa qab qaba qababc",
|
||||
"()a (ab) (ab)a (abab)c");
|
||||
"()a (ab) (ab)a (ab)c");
|
||||
|
||||
// 'foo'+ and 'foo'* -- the quantifier should apply to the entire
|
||||
// quoted string
|
||||
@ -1574,6 +1575,46 @@ public class TransliteratorTest extends TestFmwk {
|
||||
expect(gr, "\u03B1\u0314", "ha");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test quantified segment behavior. We want:
|
||||
* ([abc])+ > x $1 x; applied to "cba" produces "xax"
|
||||
*/
|
||||
public void TestQuantifiedSegment() {
|
||||
// The normal case
|
||||
expect("([abc]+) > x $1 x;", "cba", "xcbax");
|
||||
|
||||
// The tricky case; the quantifier is around the segment
|
||||
expect("([abc])+ > x $1 x;", "cba", "xax");
|
||||
|
||||
// Tricky case in reverse direction
|
||||
expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");
|
||||
|
||||
// Check post-context segment
|
||||
expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");
|
||||
|
||||
// Test toRule/toPattern for non-quantified segment.
|
||||
// Careful with spacing here.
|
||||
String r = "([a-c]){q} > x $1 x;";
|
||||
Transliterator t = Transliterator.createFromRules("ID", r, Transliterator.FORWARD);
|
||||
String rr = t.toRules(true);
|
||||
if (!r.equals(rr)) {
|
||||
errln("FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||
} else {
|
||||
logln("Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||
}
|
||||
|
||||
// Test toRule/toPattern for quantified segment.
|
||||
// Careful with spacing here.
|
||||
r = "([a-c])+{q} > x $1 x;";
|
||||
t = Transliterator.createFromRules("ID", r, Transliterator.FORWARD);
|
||||
rr = t.toRules(true);
|
||||
if (!r.equals(rr)) {
|
||||
errln("FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||
} else {
|
||||
logln("Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||
}
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// icu4j ONLY
|
||||
// These tests are not mirrored (yet) in icu4c at
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/StringMatcher.java,v $
|
||||
* $Date: 2001/10/25 22:32:02 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2001/10/30 18:04:08 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -18,16 +18,27 @@ class StringMatcher implements UnicodeMatcher {
|
||||
|
||||
private boolean isSegment;
|
||||
|
||||
private int matchStart;
|
||||
|
||||
private int matchLimit;
|
||||
|
||||
private final RuleBasedTransliterator.Data data;
|
||||
|
||||
/**
 * Construct a matcher over an entire pattern string.
 * @param theString stored as the pattern to match
 * @param isSeg stored as the isSegment flag
 * @param theData stored as the rule data consulted during matching
 */
public StringMatcher(String theString, boolean isSeg,
                     RuleBasedTransliterator.Data theData) {
    this.pattern = theString;
    this.isSegment = isSeg;
    this.data = theData;
    // Start with no match recorded.
    this.matchLimit = -1;
    this.matchStart = -1;
}
|
||||
|
||||
public StringMatcher(String theString,
|
||||
int start,
|
||||
int limit,
|
||||
boolean isSeg,
|
||||
RuleBasedTransliterator.Data theData) {
|
||||
data = theData;
|
||||
isSegment = isSeg;
|
||||
pattern = theString.substring(start, limit);
|
||||
this(theString.substring(start, limit), isSeg, theData);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -40,6 +51,7 @@ class StringMatcher implements UnicodeMatcher {
|
||||
int i;
|
||||
int[] cursor = new int[] { offset[0] };
|
||||
if (limit < cursor[0]) {
|
||||
// Match in the reverse direction
|
||||
for (i=pattern.length()-1; i>=0; --i) {
|
||||
char keyChar = pattern.charAt(i);
|
||||
UnicodeMatcher subm = data.lookup(keyChar);
|
||||
@ -58,6 +70,13 @@ class StringMatcher implements UnicodeMatcher {
|
||||
}
|
||||
}
|
||||
}
|
||||
// Record the match position, but adjust for a normal
|
||||
// forward start, limit, and only if a prior match does not
|
||||
// exist -- we want the rightmost match.
|
||||
if (matchStart < 0) {
|
||||
matchStart = cursor[0]+1;
|
||||
matchLimit = offset[0]+1;
|
||||
}
|
||||
} else {
|
||||
for (i=0; i<pattern.length(); ++i) {
|
||||
if (incremental && cursor[0] == limit) {
|
||||
@ -85,6 +104,9 @@ class StringMatcher implements UnicodeMatcher {
|
||||
}
|
||||
}
|
||||
}
|
||||
// Record the match position
|
||||
matchStart = offset[0];
|
||||
matchLimit = cursor[0];
|
||||
}
|
||||
|
||||
offset[0] = cursor[0];
|
||||
@ -114,7 +136,7 @@ class StringMatcher implements UnicodeMatcher {
|
||||
result.append(')');
|
||||
}
|
||||
// Flush quoteBuf out to result
|
||||
TransliterationRule.appendToRule(result, (isSegment?')':-1),
|
||||
TransliterationRule.appendToRule(result, -1,
|
||||
true, escapeUnprintable, quoteBuf);
|
||||
return result.toString();
|
||||
}
|
||||
@ -130,6 +152,32 @@ class StringMatcher implements UnicodeMatcher {
|
||||
UnicodeMatcher m = data.lookup(c);
|
||||
return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove any match data. This must be called before performing a
|
||||
* set of matches with this segment.
|
||||
*/
|
||||
public void resetMatch() {
|
||||
matchStart = matchLimit = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the start offset, in the match text, of the <em>rightmost</em>
|
||||
* match. This method may get moved up into the UnicodeMatcher if
|
||||
* it turns out to be useful to generalize this.
|
||||
*/
|
||||
public int getMatchStart() {
|
||||
// -1 is the value assigned by resetMatch(); it means no match is recorded.
return matchStart;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the limit offset, in the match text, of the <em>rightmost</em>
|
||||
* match. This method may get moved up into the UnicodeMatcher if
|
||||
* it turns out to be useful to generalize this.
|
||||
*/
|
||||
public int getMatchLimit() {
|
||||
// -1 is the value assigned by resetMatch(); it means no match is recorded.
return matchLimit;
|
||||
}
|
||||
}
|
||||
|
||||
//eof
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
|
||||
* $Date: 2001/10/25 23:22:15 $
|
||||
* $Revision: 1.33 $
|
||||
* $Date: 2001/10/30 18:04:08 $
|
||||
* $Revision: 1.34 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -30,13 +30,15 @@ import com.ibm.util.Utility;
|
||||
* Variables are detected by looking up each character in a supplied
|
||||
* variable list to see if it has been so defined.
|
||||
*
|
||||
* <p>A rule may contain segments in its input string and segment references in
|
||||
* its output string. A segment is a substring of the input pattern, indicated
|
||||
* by an offset and limit. The segment may span the preceding or following
|
||||
* context. A segment reference is a special character in the output string
|
||||
* that causes a segment of the input string (not the input pattern) to be
|
||||
* copied to the output string. The range of special characters that represent
|
||||
* segment references is defined by RuleBasedTransliterator.Data.
|
||||
* <p>A rule may contain segments in its input string and segment
|
||||
* references in its output string. A segment is a substring of the
|
||||
* input pattern, indicated by an offset and limit. The segment may
|
||||
* be in the preceding or following context. It may not span a
|
||||
* context boundary. A segment reference is a special character in
|
||||
* the output string that causes a segment of the input string (not
|
||||
* the input pattern) to be copied to the output string. The range of
|
||||
* special characters that represent segment references is defined by
|
||||
* RuleBasedTransliterator.Data.
|
||||
*
|
||||
* <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
|
||||
* string "abc.123" to "ab1.c23".
|
||||
@ -44,7 +46,7 @@ import com.ibm.util.Utility;
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.33 $ $Date: 2001/10/25 23:22:15 $
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.34 $ $Date: 2001/10/30 18:04:08 $
|
||||
*/
|
||||
class TransliterationRule {
|
||||
|
||||
@ -64,20 +66,13 @@ class TransliterationRule {
|
||||
private String output;
|
||||
|
||||
/**
|
||||
* An array of integers encoding the position of the segments.
|
||||
* See RuleBasedTransliterator.Segments for more details.
|
||||
* An array of matcher objects corresponding to the input pattern
|
||||
* segments. If there are no segments this is null. N.B. This is
|
||||
* a UnicodeMatcher for generality, but in practice it is always a
|
||||
* StringMatcher. In the future we may generalize this, but for
|
||||
* now we sometimes cast down to StringMatcher.
|
||||
*/
|
||||
int[] segments;
|
||||
|
||||
/**
|
||||
* A value we compute from segments. The first index into segments[]
|
||||
* that is >= anteContextLength. That is, the first one that is within
|
||||
* the forward scanned part of the pattern -- the key or the postContext.
|
||||
* If there are no segments, this has the value -1. This index is relative
|
||||
* to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
|
||||
* segments[FIRST_SEG_POS_INDEX + firstKeySeg].
|
||||
*/
|
||||
int firstKeySeg;
|
||||
UnicodeMatcher[] segments;
|
||||
|
||||
/**
|
||||
* The length of the string that must match before the key. If
|
||||
@ -127,20 +122,6 @@ class TransliterationRule {
|
||||
private static final char APOSTROPHE = '\'';
|
||||
private static final char BACKSLASH = '\\';
|
||||
|
||||
// Macros for accessing the array of integers encoding the position of
|
||||
// the segments. See RuleBasedTransliterator.Segments for more details.
|
||||
// SEGMENTS_COUNT number of segments, n (half the number of parens)
|
||||
// SEGMENTS_LEN length of the segments array (number of elements)
|
||||
// SEGMENTS_POS position in 'pattern' of parenthesis i, where i=0..2n-1
|
||||
// SEGMENTS_NUM index into segments to access POS of $1.open,
|
||||
// $1.close, $2.open, $2.close,.., $n.open, $n.close
|
||||
// Relative to FIRST_SEG_POS_INDEX. Ranges from 0..2n-1.
|
||||
static final int FIRST_SEG_POS_INDEX = 2;
|
||||
static final int SEGMENTS_COUNT(int[] x) { return x[0]; }
|
||||
static final int SEGMENTS_LEN(int[] x) { return (SEGMENTS_COUNT(x)*4+4); }
|
||||
static final int SEGMENTS_POS(int[] x,int i) { return x[FIRST_SEG_POS_INDEX+i]; }
|
||||
static final int SEGMENTS_NUM(int[] x,int i) { return x[x[1]+i]-FIRST_SEG_POS_INDEX; }
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999-2001. All rights reserved.";
|
||||
|
||||
@ -165,12 +146,8 @@ class TransliterationRule {
|
||||
* 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
|
||||
* "xyz" and moves the cursor to before "a". It would have a cursorOffset
|
||||
* of -3.
|
||||
* @param segs array of 2n integers. Each of n pairs consists of offset,
|
||||
* limit for a segment of the input string. Characters in the output string
|
||||
* refer to these segments if they are in a special range determined by the
|
||||
* associated RuleBasedTransliterator.Data object. May be null if there are
|
||||
* no segments. The caller is responsible for validating that segments
|
||||
* are well-formed.
|
||||
* @param segs array of UnicodeMatcher corresponding to input pattern
|
||||
* segments, or null if there are none
|
||||
* @param anchorStart true if the rule is anchored on the left to
|
||||
* the context start
|
||||
* @param anchorEnd true if the rule is anchored on the right to the
|
||||
@ -180,7 +157,7 @@ class TransliterationRule {
|
||||
int anteContextPos, int postContextPos,
|
||||
String output,
|
||||
int cursorPos, int cursorOffset,
|
||||
int[] segs,
|
||||
UnicodeMatcher[] segs,
|
||||
boolean anchorStart, boolean anchorEnd,
|
||||
RuleBasedTransliterator.Data theData) {
|
||||
data = theData;
|
||||
@ -212,25 +189,11 @@ class TransliterationRule {
|
||||
this.cursorPos = cursorPos + cursorOffset;
|
||||
this.output = output;
|
||||
// We don't validate the segments array. The caller must
|
||||
// guarantee that the segments are well-formed.
|
||||
// guarantee that the segments are well-formed (that is, that
|
||||
// all $n references in the output refer to indices of this
|
||||
// array, and that no array elements are null).
|
||||
this.segments = segs;
|
||||
|
||||
// Find the position of the first segment index that is after the
|
||||
// anteContext (in the key). Note that this may be a start or a
|
||||
// limit index. If all segments are in the ante context,
|
||||
// firstKeySeg should point past the last segment -- that is, it
|
||||
// should point at the end marker, which is -1. This allows the
|
||||
// code to back up by one to obtain the last ante context segment.
|
||||
firstKeySeg = -1;
|
||||
if (segments != null) {
|
||||
firstKeySeg = FIRST_SEG_POS_INDEX;
|
||||
while (segments[firstKeySeg] >= 0 &&
|
||||
segments[firstKeySeg] < anteContextLength) {
|
||||
++firstKeySeg;
|
||||
}
|
||||
firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
|
||||
}
|
||||
|
||||
pattern = input;
|
||||
flags = 0;
|
||||
if (anchorStart) {
|
||||
@ -410,25 +373,12 @@ class TransliterationRule {
|
||||
|
||||
// ============================ MATCH ===========================
|
||||
|
||||
// Record the actual positions, in the text, of the segments.
|
||||
// These are recorded in the order that they occur in the pattern.
|
||||
|
||||
// segPos[] is an array of 2*SEGMENTS_COUNT elements. It
|
||||
// records the position in 'text' of each segment boundary, in
|
||||
// the order that they occur in 'pattern'.
|
||||
int[] segPos = null;
|
||||
// Reset segment match data
|
||||
if (segments != null) {
|
||||
segPos = new int[2*SEGMENTS_COUNT(segments)];
|
||||
for (int i=0; i<segments.length; ++i) {
|
||||
((StringMatcher) segments[i]).resetMatch();
|
||||
}
|
||||
}
|
||||
// iSeg is an index into segments[] that accesses the first
|
||||
// array. As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
|
||||
// When indexing into segments[] FIRST_SEG_POS_INDEX must be
|
||||
// added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
|
||||
int iSeg = firstKeySeg - 1;
|
||||
// nextSegPos is an offset in 'pattern'. When the cursor is
|
||||
// equal to nextSegPos, we are at a segment boundary, and we
|
||||
// record the position in the real text in segPos[].
|
||||
int nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
|
||||
|
||||
int lenDelta, keyLimit;
|
||||
int[] intRef = new int[1];
|
||||
@ -465,15 +415,6 @@ class TransliterationRule {
|
||||
}
|
||||
oText = intRef[0];
|
||||
}
|
||||
while (nextSegPos == oPattern) {
|
||||
segPos[iSeg] = oText;
|
||||
if (oText >= 0) {
|
||||
segPos[iSeg] += UTF16.getCharCount(UTF16.charAt(text, oText));
|
||||
} else {
|
||||
++segPos[iSeg];
|
||||
}
|
||||
nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
|
||||
}
|
||||
}
|
||||
|
||||
minOText = posAfter(text, oText);
|
||||
@ -486,9 +427,6 @@ class TransliterationRule {
|
||||
|
||||
// -------------------- Key and Post Context --------------------
|
||||
|
||||
iSeg = firstKeySeg;
|
||||
nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
|
||||
|
||||
oPattern = 0;
|
||||
oText = pos.start;
|
||||
keyLimit = 0;
|
||||
@ -511,10 +449,6 @@ class TransliterationRule {
|
||||
// depending on whether we're in the key or in the post
|
||||
// context.
|
||||
|
||||
while (oPattern == nextSegPos) {
|
||||
segPos[iSeg] = oText;
|
||||
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
|
||||
}
|
||||
if (oPattern == keyLength) {
|
||||
keyLimit = oText;
|
||||
}
|
||||
@ -554,10 +488,6 @@ class TransliterationRule {
|
||||
//! return UnicodeMatcher.U_MISMATCH;
|
||||
//!}
|
||||
}
|
||||
while (oPattern == nextSegPos) {
|
||||
segPos[iSeg] = oText;
|
||||
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
|
||||
}
|
||||
if (oPattern == keyLength) {
|
||||
keyLimit = oText;
|
||||
}
|
||||
@ -576,8 +506,7 @@ class TransliterationRule {
|
||||
// =========================== REPLACE ==========================
|
||||
|
||||
// We have a full match. The key is between pos.start and
|
||||
// keyLimit. Segment indices have been recorded in segPos[].
|
||||
// Perform a replacement.
|
||||
// keyLimit.
|
||||
|
||||
if (segments == null) {
|
||||
text.replace(pos.start, keyLimit, output);
|
||||
@ -629,11 +558,22 @@ class TransliterationRule {
|
||||
buf.setLength(0);
|
||||
}
|
||||
// Copy segment with out-of-band data
|
||||
b *= 2;
|
||||
int start = segPos[SEGMENTS_NUM(segments,b)];
|
||||
int limit = segPos[SEGMENTS_NUM(segments,b+1)];
|
||||
text.copy(start, limit, dest);
|
||||
dest += limit - start;
|
||||
StringMatcher m = (StringMatcher) segments[b];
|
||||
int start = m.getMatchStart();
|
||||
int limit = m.getMatchLimit();
|
||||
// If there was no match, that means that a quantifier
|
||||
// matched zero-length. E.g., x (a)* y matched "xy".
|
||||
if (start >= 0) {
|
||||
// Adjust indices for segments in post context
|
||||
// for any inserted text between the key and
|
||||
// the post context.
|
||||
if (start >= keyLimit) {
|
||||
start += dest - keyLimit;
|
||||
limit += dest - keyLimit;
|
||||
}
|
||||
text.copy(start, limit, dest);
|
||||
dest += limit - start;
|
||||
}
|
||||
}
|
||||
oOutput += UTF16.getCharCount(c);
|
||||
}
|
||||
@ -790,20 +730,6 @@ class TransliterationRule {
|
||||
|
||||
StringBuffer rule = new StringBuffer();
|
||||
|
||||
// iseg indexes into segments[] directly (not offset from FSPI)
|
||||
int iseg = FIRST_SEG_POS_INDEX-1;
|
||||
int nextSeg = -1;
|
||||
// Build an array of booleans specifying open vs. close paren
|
||||
boolean[] isOpen = null;
|
||||
if (segments != null) {
|
||||
isOpen = new boolean[2*SEGMENTS_COUNT(segments)];
|
||||
for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
|
||||
isOpen[SEGMENTS_NUM(segments,i) ] = true;
|
||||
isOpen[SEGMENTS_NUM(segments,i+1)] = false;
|
||||
}
|
||||
nextSeg = segments[++iseg];
|
||||
}
|
||||
|
||||
// Accumulate special characters (and non-specials following them)
|
||||
// into quoteBuf. Append quoteBuf, within single quotes, when
|
||||
// a non-quoted element must be inserted.
|
||||
@ -825,14 +751,6 @@ class TransliterationRule {
|
||||
appendToRule(rule, '{', true, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
|
||||
// Append either '(' or ')' if we are at a segment index
|
||||
if (i == nextSeg) {
|
||||
appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
|
||||
'(' : ')',
|
||||
true, escapeUnprintable, quoteBuf);
|
||||
nextSeg = segments[++iseg];
|
||||
}
|
||||
|
||||
if (emitBraces && i == (anteContextLength + keyLength)) {
|
||||
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
@ -847,11 +765,6 @@ class TransliterationRule {
|
||||
}
|
||||
}
|
||||
|
||||
if (i == nextSeg) {
|
||||
// assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
|
||||
appendToRule(rule, ')', true, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
|
||||
if (emitBraces && i == (anteContextLength + keyLength)) {
|
||||
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
@ -885,7 +798,7 @@ class TransliterationRule {
|
||||
} else {
|
||||
++seg; // make 1-based
|
||||
appendToRule(rule, 0x20, true, escapeUnprintable, quoteBuf);
|
||||
rule.append(0x24 /*$*/);
|
||||
rule.append('$');
|
||||
boolean show = false; // true if we should display digits
|
||||
for (int p=9; p>=0; --p) {
|
||||
int d = seg / POW10[p];
|
||||
@ -938,6 +851,9 @@ class TransliterationRule {
|
||||
|
||||
/**
|
||||
* $Log: TransliterationRule.java,v $
|
||||
* Revision 1.34 2001/10/30 18:04:08 alan
|
||||
* jitterbug 1406: make quantified segments behave like perl counterparts
|
||||
*
|
||||
* Revision 1.33 2001/10/25 23:22:15 alan
|
||||
* jitterbug 73: changes to support zero-length matchers at end of key
|
||||
*
|
||||
|
@ -4,8 +4,8 @@
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliteratorParser.java,v $
|
||||
* $Date: 2001/10/24 00:03:38 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2001/10/30 18:04:09 $
|
||||
* $Revision: 1.8 $
|
||||
**********************************************************************
|
||||
*/
|
||||
package com.ibm.text;
|
||||
@ -117,6 +117,7 @@ class TransliteratorParser {
|
||||
private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
|
||||
|
||||
private static final String OPERATORS = "=><";
|
||||
private static final String HALF_ENDERS = "=><;";
|
||||
|
||||
// Other special characters
|
||||
private static final char QUOTE = '\'';
|
||||
@ -142,7 +143,7 @@ class TransliteratorParser {
|
||||
// private static final char ANCHOR_END = '$';
|
||||
|
||||
// Segments of the input string are delimited by "(" and ")". In the
|
||||
// output string these segments are referenced as "$1" through "$9".
|
||||
// output string these segments are referenced as "$1", "$2", etc.
|
||||
private static final char SEGMENT_OPEN = '(';
|
||||
private static final char SEGMENT_CLOSE = ')';
|
||||
|
||||
@ -285,209 +286,6 @@ class TransliteratorParser {
|
||||
}
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// class Segments
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Segments are parentheses-enclosed regions of the input string.
|
||||
* These are referenced in the output string using the notation $1,
|
||||
* $2, etc. Numbering is in order of appearance of the left
|
||||
* parenthesis. Number is one-based. Segments are defined as start,
|
||||
* limit pairs. Segments may nest.
|
||||
*
|
||||
* During parsing, segment data is encoded in an object of class
|
||||
* Segments. At runtime, the same data is encoded in compact form as
|
||||
* an array of integers in a TransliterationRule. The runtime encoding
|
||||
* must satisfy three goals:
|
||||
*
|
||||
* 1. Iterate over the offsets in a pattern, from left to right,
|
||||
* and indicate all segment boundaries, in order. This is done
|
||||
* during matching.
|
||||
*
|
||||
* 2. Given a reference $n, produce the start and limit offsets
|
||||
* for that segment. This is done during replacement.
|
||||
*
|
||||
* 3. Similar to goal 1, but in addition, indicate whether each
|
||||
* segment boundary is a start or a limit, in other words, whether
|
||||
* each is an open paren or a close paren. This is required by
|
||||
* the toRule() method.
|
||||
*
|
||||
* Goal 1 must be satisfied at high speed since this is done during
|
||||
* matching. Goal 2 is next most important. Goal 3 is not performance
|
||||
* critical since it is only needed by toRule().
|
||||
*
|
||||
* The array of integers is actually two arrays concatenated. The
|
||||
* first gives the index values of the open and close parentheses in
|
||||
* the order they appear. The second maps segment numbers to the
|
||||
* indices of the first array. The two arrays have the same length.
|
||||
* Iterating over the first array satisfies goal 1. Indexing into the
|
||||
* second array satisfies goal 2. Goal 3 is satisfied by iterating
|
||||
* over the second array and constructing the required data when
|
||||
* needed. This is what toRule() does.
|
||||
*
|
||||
* Example: (a b(c d)e f)
|
||||
* 0 1 2 3 4 5 6
|
||||
*
|
||||
* First array: Indices are 0, 2, 4, and 6.
|
||||
|
||||
* Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
|
||||
* second array is 0, 3, 1, 2 -- these give the indices in the
|
||||
* first array at which $1:open, $1:close, $2:open, and $2:close
|
||||
* occur.
|
||||
*
|
||||
* The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
|
||||
*
|
||||
* Each subarray is terminated with a -1, and two leading entries
|
||||
* give the number of segments and the offset to the first entry
|
||||
* of the second array. In addition, the second array values are
|
||||
* all offset by 2 so they index directly into the final array.
|
||||
* The total array size is 4*segments[0] + 4. The second array starts at index
|
||||
* 2*segments[0] + 3.
|
||||
*
|
||||
* In the output string, a segment reference is indicated by a
|
||||
* character in a special range, as defined by
|
||||
* RuleBasedTransliterator.Data.
|
||||
*
|
||||
* Most rules have no segments, in which case segments is null, and the
|
||||
* output string need not be checked for segment reference characters.
|
||||
*
|
||||
* See also rbt_rule.h/cpp.
|
||||
*/
|
||||
private static class Segments {
|
||||
|
||||
private Vector offsets; // holds Integer objects
|
||||
|
||||
private Vector isOpenParen; // holds Boolean objects
|
||||
|
||||
private int offset(int i) {
|
||||
return ((Integer) offsets.elementAt(i)).intValue();
|
||||
}
|
||||
|
||||
private boolean isOpen(int i) {
|
||||
return ((Boolean) isOpenParen.elementAt(i)).booleanValue();
|
||||
}
|
||||
|
||||
// size of the Vectors
|
||||
private int size() {
|
||||
// assert(offset.size() == isOpenParen.size());
|
||||
return offsets.size();
|
||||
}
|
||||
|
||||
public Segments() {
|
||||
offsets = new Vector();
|
||||
isOpenParen = new Vector();
|
||||
}
|
||||
|
||||
public void addParenthesisAt(int offset, boolean isOpen) {
|
||||
offsets.addElement(new Integer(offset));
|
||||
isOpenParen.addElement(new Boolean(isOpen));
|
||||
}
|
||||
|
||||
public int getLastParenOffset(boolean[] isOpenParen) {
|
||||
if (size() == 0) {
|
||||
return -1;
|
||||
}
|
||||
isOpenParen[0] = isOpen(size()-1);
|
||||
return offset(size()-1);
|
||||
}
|
||||
|
||||
// Remove the last (rightmost) segment. Store its offsets in start
|
||||
// and limit, and then convert all offsets at or after start to be
|
||||
// equal to start. Upon failure, return FALSE. Assume that the
|
||||
// caller has already called getLastParenOffset() and validated that
|
||||
// there is at least one parenthesis and that the last one is a close
|
||||
// paren.
|
||||
public boolean extractLastParenSubstring(int[] start, int[] limit) {
|
||||
// assert(offsets.size() > 0);
|
||||
// assert(isOpenParen.elementAt(isOpenParen.size()-1) == 0);
|
||||
int i = size() - 1;
|
||||
int n = 1; // count of close parens we need to match
|
||||
// Record position of the last close paren
|
||||
limit[0] = offset(i);
|
||||
--i; // back up to the one before the last one
|
||||
while (i >= 0 && n != 0) {
|
||||
n += isOpen(i) ? -1 : 1;
|
||||
}
|
||||
if (n != 0) {
|
||||
return false;
|
||||
}
|
||||
// assert(i>=0);
|
||||
start[0] = offset(i);
|
||||
// Reset all segment pairs from i to size() - 1 to [start, start+1).
|
||||
while (i<size()) {
|
||||
int o = isOpen(i) ? start[0] : (start[0]+1);
|
||||
offsets.setElementAt(new Integer(o), i);
|
||||
++i;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Assume caller has already gotten a TRUE validate().
|
||||
public int[] createArray() {
|
||||
int c = count(); // number of segments
|
||||
int arrayLen = 4*c + 4;
|
||||
int[] array = new int[arrayLen];
|
||||
int a2offset = 2*c + 3; // offset to array 2
|
||||
|
||||
array[0] = c;
|
||||
array[1] = a2offset;
|
||||
int i;
|
||||
for (i=0; i<2*c; ++i) {
|
||||
array[2+i] = offset(i);
|
||||
}
|
||||
array[a2offset-1] = -1;
|
||||
array[arrayLen-1] = -1;
|
||||
// Now walk through and match up segment numbers with parentheses.
|
||||
// Number segments from 0. We're going to offset all entries by 2
|
||||
// to skip the first two elements, array[0] and array[1].
|
||||
Stack stack = new Stack();
|
||||
int nextOpen = 0; // seg # of next open, 0-based
|
||||
for (i=0; i<2*c; ++i) {
|
||||
boolean open = isOpen(i);
|
||||
// Let seg be the zero-based segment number.
|
||||
// Open parens are at 2*seg in array 2.
|
||||
// Close parens are at 2*seg+1 in array 2.
|
||||
if (open) {
|
||||
array[a2offset + 2*nextOpen] = 2+i;
|
||||
stack.push(new Integer(nextOpen));
|
||||
++nextOpen;
|
||||
} else {
|
||||
int nextClose = ((Integer) stack.pop()).intValue();
|
||||
array[a2offset + 2*nextClose+1] = 2+i;
|
||||
}
|
||||
}
|
||||
// assert(stack.empty());
|
||||
|
||||
return array;
|
||||
}
|
||||
|
||||
public boolean validate() {
|
||||
// want number of parens >= 2
|
||||
// want number of parens to be even
|
||||
// want first paren '('
|
||||
// want parens to match up in the end
|
||||
if ((size() < 2) || (size() % 2 != 0) || !isOpen(0)) {
|
||||
return false;
|
||||
}
|
||||
int n = 0;
|
||||
for (int i=0; i<size(); ++i) {
|
||||
n += isOpen(i) ? 1 : -1;
|
||||
if (n < 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return n == 0;
|
||||
}
|
||||
|
||||
// Number of segments
|
||||
// Assume caller has already gotten a TRUE validate().
|
||||
public int count() {
|
||||
// assert(validate());
|
||||
return size() / 2;
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// class RuleHalf
|
||||
//----------------------------------------------------------------------
|
||||
@ -505,11 +303,7 @@ class TransliteratorParser {
|
||||
public int ante = -1; // position of ante context marker '{' in text
|
||||
public int post = -1; // position of post context marker '}' in text
|
||||
|
||||
// Record the position of the segment substrings and references. A
|
||||
// given side should have segments or segment references, but not
|
||||
// both.
|
||||
public Segments segments = null;
|
||||
public int maxRef = -1; // index of largest ref (1..9)
|
||||
public int maxRef = -1; // n where maximum segment ref is $n; 1-based
|
||||
|
||||
// Record the offset to the cursor either to the left or to the
|
||||
// right of the key. This is indicated by characters on the output
|
||||
@ -521,29 +315,88 @@ class TransliteratorParser {
|
||||
// output text.
|
||||
public int cursorOffset = 0; // only nonzero on output side
|
||||
|
||||
// Position of first CURSOR_OFFSET on _right_. This will be -1
|
||||
// for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
|
||||
private int cursorOffsetPos = 0;
|
||||
|
||||
public boolean anchorStart = false;
|
||||
public boolean anchorEnd = false;
|
||||
|
||||
/**
|
||||
* UnicodeMatcher objects corresponding to each segment.
|
||||
*/
|
||||
public Vector segments = new Vector();
|
||||
|
||||
/**
|
||||
* The segment number from 0..n-1 of the next '(' we see
|
||||
* during parsing; 0-based.
|
||||
*/
|
||||
private int nextSegmentNumber = 0;
|
||||
|
||||
/**
|
||||
* Parse one side of a rule, stopping at either the limit,
|
||||
* the END_OF_RULE character, or an operator. Return
|
||||
* the pos of the terminating character (or limit).
|
||||
* the END_OF_RULE character, or an operator.
|
||||
* @return the index after the terminating character, or
|
||||
* if limit was reached, limit
|
||||
*/
|
||||
public int parse(String rule, int pos, int limit,
|
||||
TransliteratorParser parser) {
|
||||
int start = pos;
|
||||
StringBuffer buf = new StringBuffer();
|
||||
pos = parseSection(rule, pos, limit, parser, buf, false);
|
||||
text = buf.toString();
|
||||
|
||||
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
|
||||
syntaxError("Misplaced " + CURSOR_POS, rule, start);
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a section of one side of a rule, stopping at either
|
||||
* the limit, the END_OF_RULE character, an operator, or a
|
||||
* segment close character. This method parses both a
|
||||
* top-level rule half and a segment within such a rule half.
|
||||
* It calls itself recursively to parse segments and nested
|
||||
* segments.
|
||||
* @param buf buffer into which to accumulate the rule pattern
|
||||
* characters, either literal characters from the rule or
|
||||
* standins for UnicodeMatcher objects including segments.
|
||||
* @param isSegment if true, then we've already seen a '(' and
|
||||
* pos on entry points right after it. Accumulate everything
|
||||
* up to the closing ')', put it in a segment matcher object,
|
||||
* generate a standin for it, and add the standin to buf. As
|
||||
* a side effect, update the segments vector with a reference
|
||||
* to the segment matcher. This works recursively for nested
|
||||
* segments. If isSegment is false, just accumulate
|
||||
* characters into buf.
|
||||
* @return the index after the terminating character, or
|
||||
* if limit was reached, limit
|
||||
*/
|
||||
private int parseSection(String rule, int pos, int limit,
|
||||
TransliteratorParser parser,
|
||||
StringBuffer buf,
|
||||
boolean isSegment) {
|
||||
int start = pos;
|
||||
ParsePosition pp = null;
|
||||
int cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
|
||||
boolean done = false;
|
||||
int quoteStart = -1; // Most recent 'single quoted string'
|
||||
int quoteLimit = -1;
|
||||
int varStart = -1; // Most recent $variableReference
|
||||
int varLimit = -1;
|
||||
int[] iref = new int[1];
|
||||
|
||||
// If isSegment, then bufSegStart is the offset in buf to
|
||||
// the first character of the segment we are parsing.
|
||||
int bufSegStart = 0;
|
||||
int segmentNumber = 0;
|
||||
if (isSegment) {
|
||||
bufSegStart = buf.length();
|
||||
segmentNumber = nextSegmentNumber++;
|
||||
}
|
||||
|
||||
main:
|
||||
while (pos < limit && !done) {
|
||||
while (pos < limit) {
|
||||
char c = rule.charAt(pos++);
|
||||
if (Character.isWhitespace(c)) {
|
||||
// Ignore whitespace. Note that this is not Unicode
|
||||
@ -551,8 +404,11 @@ class TransliteratorParser {
|
||||
// whitespace likely to be seen in code.
|
||||
continue;
|
||||
}
|
||||
if (OPERATORS.indexOf(c) >= 0) {
|
||||
--pos; // Backup to point to operator
|
||||
// HALF_ENDERS is all chars that end a rule half: "<>=;"
|
||||
if (HALF_ENDERS.indexOf(c) >= 0) {
|
||||
if (isSegment) {
|
||||
syntaxError("Unclosed segment", rule, start);
|
||||
}
|
||||
break main;
|
||||
}
|
||||
if (anchorEnd) {
|
||||
@ -614,7 +470,12 @@ class TransliteratorParser {
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (c) {
|
||||
|
||||
//------------------------------------------------------
|
||||
// Elements allowed within and out of segments
|
||||
//------------------------------------------------------
|
||||
case ANCHOR_START:
|
||||
if (buf.length() == 0 && !anchorStart) {
|
||||
anchorStart = true;
|
||||
@ -624,17 +485,8 @@ class TransliteratorParser {
|
||||
}
|
||||
break;
|
||||
case SEGMENT_OPEN:
|
||||
case SEGMENT_CLOSE:
|
||||
// Handle segment definitions "(" and ")"
|
||||
// Parse "(", ")"
|
||||
if (segments == null) {
|
||||
segments = new Segments();
|
||||
}
|
||||
segments.addParenthesisAt(buf.length(), c == SEGMENT_OPEN);
|
||||
pos = parseSection(rule, pos, limit, parser, buf, true);
|
||||
break;
|
||||
case END_OF_RULE:
|
||||
--pos; // Backup to point to END_OF_RULE
|
||||
break main;
|
||||
case SymbolTable.SYMBOL_REF:
|
||||
// Handle variable references and segment references "$1" .. "$9"
|
||||
{
|
||||
@ -676,7 +528,7 @@ class TransliteratorParser {
|
||||
}
|
||||
pp.setIndex(pos);
|
||||
String name = parser.parseData.
|
||||
parseReference(rule, pp, limit);
|
||||
parseReference(rule, pp, limit);
|
||||
if (name == null) {
|
||||
// This means the '$' was not followed by a
|
||||
// valid name. Try to interpret it as an
|
||||
@ -697,25 +549,129 @@ class TransliteratorParser {
|
||||
}
|
||||
}
|
||||
break;
|
||||
case DOT:
|
||||
buf.append(parser.getDotStandIn());
|
||||
break;
|
||||
case KLEENE_STAR:
|
||||
case ONE_OR_MORE:
|
||||
case ZERO_OR_ONE:
|
||||
// Quantifiers. We handle single characters, quoted strings,
|
||||
// variable references, and segments.
|
||||
// a+ matches aaa
|
||||
// 'foo'+ matches foofoofoo
|
||||
// $v+ matches xyxyxy if $v == xy
|
||||
// (seg)+ matches segsegseg
|
||||
{
|
||||
if (isSegment && buf.length() == bufSegStart) {
|
||||
// The */+ immediately follows '('
|
||||
syntaxError("Misplaced quantifier", rule, start);
|
||||
break;
|
||||
}
|
||||
|
||||
int qstart, qlimit;
|
||||
// The */+ follows an isolated character or quote
|
||||
// or variable reference
|
||||
if (buf.length() == quoteLimit) {
|
||||
// The */+ follows a 'quoted string'
|
||||
qstart = quoteStart;
|
||||
qlimit = quoteLimit;
|
||||
} else if (buf.length() == varLimit) {
|
||||
// The */+ follows a $variableReference
|
||||
qstart = varStart;
|
||||
qlimit = varLimit;
|
||||
} else {
|
||||
// The */+ follows a single character, possibly
|
||||
// a segment standin
|
||||
qstart = buf.length() - 1;
|
||||
qlimit = qstart + 1;
|
||||
}
|
||||
|
||||
UnicodeMatcher m =
|
||||
new StringMatcher(buf.toString(), qstart, qlimit,
|
||||
false, parser.data);
|
||||
int min = 0;
|
||||
int max = Quantifier.MAX;
|
||||
switch (c) {
|
||||
case ONE_OR_MORE:
|
||||
min = 1;
|
||||
break;
|
||||
case ZERO_OR_ONE:
|
||||
min = 0;
|
||||
max = 1;
|
||||
break;
|
||||
// case KLEENE_STAR:
|
||||
// do nothing -- min, max already set
|
||||
}
|
||||
m = new Quantifier(m, min, max);
|
||||
buf.setLength(qstart);
|
||||
buf.append(parser.generateStandInFor(m));
|
||||
}
|
||||
break;
|
||||
|
||||
//------------------------------------------------------
|
||||
// Elements allowed ONLY WITHIN segments
|
||||
//------------------------------------------------------
|
||||
case SEGMENT_CLOSE:
|
||||
if (isSegment) {
|
||||
// We're done parsing a segment. The relevant
|
||||
// characters are in buf, starting at offset
|
||||
// bufSegStart. Extract them into a string
|
||||
// matcher, and replace them with a standin
|
||||
// for that matcher.
|
||||
StringMatcher m =
|
||||
new StringMatcher(buf.substring(bufSegStart),
|
||||
true, parser.data);
|
||||
// Since we call parseSection() recursively,
|
||||
// nested segments will result in segment i+1
|
||||
// getting parsed and stored before segment i;
|
||||
// be careful with the vector handling here.
|
||||
if ((segmentNumber+1) > segments.size()) {
|
||||
segments.setSize(segmentNumber+1);
|
||||
}
|
||||
segments.setElementAt(m, segmentNumber);
|
||||
buf.setLength(bufSegStart);
|
||||
buf.append(parser.generateStandInFor(m));
|
||||
break main;
|
||||
}
|
||||
// If we aren't in a segment, then a segment close
|
||||
// character is a syntax error.
|
||||
syntaxError("Unquoted special", rule, start);
|
||||
break;
|
||||
|
||||
//------------------------------------------------------
|
||||
// Elements allowed ONLY OUTSIDE segments
|
||||
//------------------------------------------------------
|
||||
case CONTEXT_ANTE:
|
||||
if (isSegment) {
|
||||
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||
}
|
||||
if (ante >= 0) {
|
||||
syntaxError("Multiple ante contexts", rule, start);
|
||||
}
|
||||
ante = buf.length();
|
||||
break;
|
||||
case CONTEXT_POST:
|
||||
if (isSegment) {
|
||||
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||
}
|
||||
if (post >= 0) {
|
||||
syntaxError("Multiple post contexts", rule, start);
|
||||
}
|
||||
post = buf.length();
|
||||
break;
|
||||
case CURSOR_POS:
|
||||
if (isSegment) {
|
||||
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||
}
|
||||
if (cursor >= 0) {
|
||||
syntaxError("Multiple cursors", rule, start);
|
||||
}
|
||||
cursor = buf.length();
|
||||
break;
|
||||
case CURSOR_OFFSET:
|
||||
if (isSegment) {
|
||||
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||
}
|
||||
if (cursorOffset < 0) {
|
||||
if (buf.length() > 0) {
|
||||
syntaxError("Misplaced " + c, rule, start);
|
||||
@ -737,74 +693,10 @@ class TransliteratorParser {
|
||||
}
|
||||
}
|
||||
break;
|
||||
case DOT:
|
||||
buf.append(parser.getDotStandIn());
|
||||
break;
|
||||
case KLEENE_STAR:
|
||||
case ONE_OR_MORE:
|
||||
case ZERO_OR_ONE:
|
||||
// Quantifiers. We handle single characters, quoted strings,
|
||||
// variable references, and segments.
|
||||
// a+ matches aaa
|
||||
// 'foo'+ matches foofoofoo
|
||||
// $v+ matches xyxyxy if $v == xy
|
||||
// (seg)+ matches segsegseg
|
||||
{
|
||||
int qstart, qlimit;
|
||||
boolean[] isOpenParen = new boolean[1];
|
||||
boolean isSegment = false;
|
||||
if (segments != null &&
|
||||
segments.getLastParenOffset(isOpenParen) == buf.length()) {
|
||||
// The */+ immediately follows a segment
|
||||
if (isOpenParen[0]) {
|
||||
syntaxError("Misplaced quantifier", rule, start);
|
||||
}
|
||||
int[] startparam = new int[1];
|
||||
int[] limitparam = new int[1];
|
||||
if (!segments.extractLastParenSubstring(startparam, limitparam)) {
|
||||
syntaxError("Mismatched segment delimiters", rule, start);
|
||||
}
|
||||
qstart = startparam[0];
|
||||
qlimit = limitparam[0];
|
||||
isSegment = true;
|
||||
} else {
|
||||
// The */+ follows an isolated character or quote
|
||||
// or variable reference
|
||||
if (buf.length() == quoteLimit) {
|
||||
// The */+ follows a 'quoted string'
|
||||
qstart = quoteStart;
|
||||
qlimit = quoteLimit;
|
||||
} else if (buf.length() == varLimit) {
|
||||
// The */+ follows a $variableReference
|
||||
qstart = varStart;
|
||||
qlimit = varLimit;
|
||||
} else {
|
||||
// The */+ follows a single character
|
||||
qstart = buf.length() - 1;
|
||||
qlimit = qstart + 1;
|
||||
}
|
||||
}
|
||||
UnicodeMatcher m =
|
||||
new StringMatcher(buf.toString(), qstart, qlimit,
|
||||
isSegment, parser.data);
|
||||
int min = 0;
|
||||
int max = Quantifier.MAX;
|
||||
switch (c) {
|
||||
case ONE_OR_MORE:
|
||||
min = 1;
|
||||
break;
|
||||
case ZERO_OR_ONE:
|
||||
min = 0;
|
||||
max = 1;
|
||||
break;
|
||||
// case KLEENE_STAR:
|
||||
// do nothing -- min, max already set
|
||||
}
|
||||
m = new Quantifier(m, min, max);
|
||||
buf.setLength(qstart);
|
||||
buf.append(parser.generateStandInFor(m));
|
||||
}
|
||||
break;
|
||||
|
||||
//------------------------------------------------------
|
||||
// Non-special characters
|
||||
//------------------------------------------------------
|
||||
default:
|
||||
// Disallow unquoted characters other than [0-9A-Za-z]
|
||||
// in the printable ASCII range. These characters are
|
||||
@ -819,11 +711,6 @@ class TransliteratorParser {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
|
||||
syntaxError("Misplaced " + CURSOR_POS, rule, start);
|
||||
}
|
||||
text = buf.toString();
|
||||
return pos;
|
||||
}
|
||||
|
||||
@ -838,10 +725,12 @@ class TransliteratorParser {
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and return an int[] array of segments.
|
||||
* Create and return a UnicodeMatcher[] array of segments,
|
||||
* or null if there are no segments.
|
||||
*/
|
||||
int[] createSegments() {
|
||||
return (segments == null) ? null : segments.createArray();
|
||||
UnicodeMatcher[] createSegments() {
|
||||
return (segments.size() == 0) ? null :
|
||||
(UnicodeMatcher[]) segments.toArray(new UnicodeMatcher[segments.size()]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1096,9 +985,10 @@ class TransliteratorParser {
|
||||
pos = left.parse(rule, pos, limit, this);
|
||||
|
||||
if (pos == limit ||
|
||||
OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
|
||||
syntaxError("No operator", rule, start);
|
||||
OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
|
||||
syntaxError("No operator pos=" + pos, rule, start);
|
||||
}
|
||||
++pos;
|
||||
|
||||
// Found an operator char. Check for forward-reverse operator.
|
||||
if (operator == REVERSE_RULE_OP &&
|
||||
@ -1110,7 +1000,7 @@ class TransliteratorParser {
|
||||
pos = right.parse(rule, pos, limit, this);
|
||||
|
||||
if (pos < limit) {
|
||||
if (rule.charAt(pos) == END_OF_RULE) {
|
||||
if (rule.charAt(--pos) == END_OF_RULE) {
|
||||
++pos;
|
||||
} else {
|
||||
// RuleHalf parser must have terminated at an operator
|
||||
@ -1173,7 +1063,7 @@ class TransliteratorParser {
|
||||
// apply.
|
||||
if (operator == FWDREV_RULE_OP) {
|
||||
right.removeContext();
|
||||
right.segments = null;
|
||||
right.segments.removeAllElements();
|
||||
left.cursor = left.maxRef = -1;
|
||||
left.cursorOffset = 0;
|
||||
}
|
||||
@ -1193,7 +1083,7 @@ class TransliteratorParser {
|
||||
// cannot place the cursor outside the limits of the context.
|
||||
// Anchors are only allowed on the input side.
|
||||
if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
|
||||
right.segments != null || left.maxRef >= 0 ||
|
||||
right.segments.size() > 0 || left.maxRef >= 0 ||
|
||||
(right.cursorOffset != 0 && right.cursor < 0) ||
|
||||
// - The following two checks were used to ensure that the
|
||||
// - the cursor offset stayed within the ante- or postcontext.
|
||||
@ -1208,14 +1098,8 @@ class TransliteratorParser {
|
||||
// Check integrity of segments and segment references. Each
|
||||
// segment's start must have a corresponding limit, and the
|
||||
// references must not refer to segments that do not exist.
|
||||
if (left.segments != null) {
|
||||
if (!left.segments.validate()) {
|
||||
syntaxError("Missing segment close", rule, start);
|
||||
}
|
||||
int n = left.segments.count();
|
||||
if (right.maxRef > n) {
|
||||
syntaxError("Undefined segment reference", rule, start);
|
||||
}
|
||||
if (right.maxRef > left.segments.size()) {
|
||||
syntaxError("Undefined segment reference $" + right.maxRef, rule, start);
|
||||
}
|
||||
|
||||
data.ruleSet.addRule(new TransliterationRule(
|
||||
@ -1363,7 +1247,7 @@ class TransliteratorParser {
|
||||
char generateStandInFor(UnicodeMatcher matcher) {
|
||||
// assert(matcher != null);
|
||||
if (variableNext >= variableLimit) {
|
||||
throw new RuntimeException("Private use variables exhausted");
|
||||
throw new RuntimeException("Variable range exhausted");
|
||||
}
|
||||
variablesVector.addElement(matcher);
|
||||
return variableNext++;
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
|
||||
* $Date: 2001/10/26 22:59:26 $
|
||||
* $Revision: 1.57 $
|
||||
* $Date: 2001/10/30 18:08:19 $
|
||||
* $Revision: 1.58 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -1268,9 +1268,11 @@ public class TransliteratorTest extends TestFmwk {
|
||||
"c abc ababc",
|
||||
"d d abd");
|
||||
|
||||
// NOTE: The (ab)+ when referenced just yields a single "ab",
|
||||
// not the full sequence of them. This accords with perl behavior.
|
||||
expect("(ab)+ {x} > '(' $1 ')';",
|
||||
"x abx ababxy",
|
||||
"x ab(ab) abab(abab)y");
|
||||
"x ab(ab) abab(ab)y");
|
||||
|
||||
expect("b+ > x;",
|
||||
"ac abc abbc abbbc",
|
||||
@ -1288,12 +1290,11 @@ public class TransliteratorTest extends TestFmwk {
|
||||
"qa qab qaba qababc",
|
||||
"xa x xa xc");
|
||||
|
||||
// Oddity -- "(foo)* > $1" causes $1 to match the run of "foo"s
|
||||
// In perl, it only matches the first occurrence, so the output
|
||||
// is "()a (ab) (ab)a (ab)c".
|
||||
// NOTE: The (ab)* when referenced just yields a single "ab",
|
||||
// not the full sequence of them. This accords with perl behavior.
|
||||
expect("q(ab)* > '(' $1 ')';",
|
||||
"qa qab qaba qababc",
|
||||
"()a (ab) (ab)a (abab)c");
|
||||
"()a (ab) (ab)a (ab)c");
|
||||
|
||||
// 'foo'+ and 'foo'* -- the quantifier should apply to the entire
|
||||
// quoted string
|
||||
@ -1574,6 +1575,46 @@ public class TransliteratorTest extends TestFmwk {
|
||||
expect(gr, "\u03B1\u0314", "ha");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test quantified segment behavior. We want:
|
||||
* ([abc])+ > x $1 x; applied to "cba" produces "xax"
|
||||
*/
|
||||
public void TestQuantifiedSegment() {
|
||||
// The normal case
|
||||
expect("([abc]+) > x $1 x;", "cba", "xcbax");
|
||||
|
||||
// The tricky case; the quantifier is around the segment
|
||||
expect("([abc])+ > x $1 x;", "cba", "xax");
|
||||
|
||||
// Tricky case in reverse direction
|
||||
expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");
|
||||
|
||||
// Check post-context segment
|
||||
expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");
|
||||
|
||||
// Test toRule/toPattern for non-quantified segment.
|
||||
// Careful with spacing here.
|
||||
String r = "([a-c]){q} > x $1 x;";
|
||||
Transliterator t = Transliterator.createFromRules("ID", r, Transliterator.FORWARD);
|
||||
String rr = t.toRules(true);
|
||||
if (!r.equals(rr)) {
|
||||
errln("FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||
} else {
|
||||
logln("Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||
}
|
||||
|
||||
// Test toRule/toPattern for quantified segment.
|
||||
// Careful with spacing here.
|
||||
r = "([a-c])+{q} > x $1 x;";
|
||||
t = Transliterator.createFromRules("ID", r, Transliterator.FORWARD);
|
||||
rr = t.toRules(true);
|
||||
if (!r.equals(rr)) {
|
||||
errln("FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||
} else {
|
||||
logln("Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||
}
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// icu4j ONLY
|
||||
// These tests are not mirrored (yet) in icu4c at
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/StringMatcher.java,v $
|
||||
* $Date: 2001/10/25 22:32:02 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2001/10/30 18:04:08 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -18,16 +18,27 @@ class StringMatcher implements UnicodeMatcher {
|
||||
|
||||
private boolean isSegment;
|
||||
|
||||
private int matchStart;
|
||||
|
||||
private int matchLimit;
|
||||
|
||||
private final RuleBasedTransliterator.Data data;
|
||||
|
||||
/**
 * Construct a matcher for the given pattern string.
 * @param theString the pattern; stored as-is
 * @param isSeg stored in the isSegment flag
 * @param theData the data object later used to look up pattern
 * characters
 */
public StringMatcher(String theString,
                     boolean isSeg,
                     RuleBasedTransliterator.Data theData) {
    this.pattern = theString;
    this.isSegment = isSeg;
    this.data = theData;
    // Start out with no match recorded
    this.matchStart = -1;
    this.matchLimit = -1;
}
|
||||
|
||||
public StringMatcher(String theString,
|
||||
int start,
|
||||
int limit,
|
||||
boolean isSeg,
|
||||
RuleBasedTransliterator.Data theData) {
|
||||
data = theData;
|
||||
isSegment = isSeg;
|
||||
pattern = theString.substring(start, limit);
|
||||
this(theString.substring(start, limit), isSeg, theData);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -40,6 +51,7 @@ class StringMatcher implements UnicodeMatcher {
|
||||
int i;
|
||||
int[] cursor = new int[] { offset[0] };
|
||||
if (limit < cursor[0]) {
|
||||
// Match in the reverse direction
|
||||
for (i=pattern.length()-1; i>=0; --i) {
|
||||
char keyChar = pattern.charAt(i);
|
||||
UnicodeMatcher subm = data.lookup(keyChar);
|
||||
@ -58,6 +70,13 @@ class StringMatcher implements UnicodeMatcher {
|
||||
}
|
||||
}
|
||||
}
|
||||
// Record the match position, but adjust for a normal
|
||||
// forward start, limit, and only if a prior match does not
|
||||
// exist -- we want the rightmost match.
|
||||
if (matchStart < 0) {
|
||||
matchStart = cursor[0]+1;
|
||||
matchLimit = offset[0]+1;
|
||||
}
|
||||
} else {
|
||||
for (i=0; i<pattern.length(); ++i) {
|
||||
if (incremental && cursor[0] == limit) {
|
||||
@ -85,6 +104,9 @@ class StringMatcher implements UnicodeMatcher {
|
||||
}
|
||||
}
|
||||
}
|
||||
// Record the match position
|
||||
matchStart = offset[0];
|
||||
matchLimit = cursor[0];
|
||||
}
|
||||
|
||||
offset[0] = cursor[0];
|
||||
@ -114,7 +136,7 @@ class StringMatcher implements UnicodeMatcher {
|
||||
result.append(')');
|
||||
}
|
||||
// Flush quoteBuf out to result
|
||||
TransliterationRule.appendToRule(result, (isSegment?')':-1),
|
||||
TransliterationRule.appendToRule(result, -1,
|
||||
true, escapeUnprintable, quoteBuf);
|
||||
return result.toString();
|
||||
}
|
||||
@ -130,6 +152,32 @@ class StringMatcher implements UnicodeMatcher {
|
||||
UnicodeMatcher m = data.lookup(c);
|
||||
return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove any match data. This must be called before performing a
|
||||
* set of matches with this segment.
|
||||
*/
|
||||
public void resetMatch() {
|
||||
matchStart = matchLimit = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the start offset, in the match text, of the <em>rightmost</em>
|
||||
* match. This method may get moved up into the UnicodeMatcher if
|
||||
* it turns out to be useful to generalize this.
|
||||
*/
|
||||
public int getMatchStart() {
|
||||
return matchStart;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the limit offset, in the match text, of the <em>rightmost</em>
|
||||
* match. This method may get moved up into the UnicodeMatcher if
|
||||
* it turns out to be useful to generalize this.
|
||||
*/
|
||||
public int getMatchLimit() {
|
||||
return matchLimit;
|
||||
}
|
||||
}
|
||||
|
||||
//eof
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
|
||||
* $Date: 2001/10/25 23:22:15 $
|
||||
* $Revision: 1.33 $
|
||||
* $Date: 2001/10/30 18:04:08 $
|
||||
* $Revision: 1.34 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -30,13 +30,15 @@ import com.ibm.util.Utility;
|
||||
* Variables are detected by looking up each character in a supplied
|
||||
* variable list to see if it has been so defined.
|
||||
*
|
||||
* <p>A rule may contain segments in its input string and segment references in
|
||||
* its output string. A segment is a substring of the input pattern, indicated
|
||||
* by an offset and limit. The segment may span the preceding or following
|
||||
* context. A segment reference is a special character in the output string
|
||||
* that causes a segment of the input string (not the input pattern) to be
|
||||
* copied to the output string. The range of special characters that represent
|
||||
* segment references is defined by RuleBasedTransliterator.Data.
|
||||
* <p>A rule may contain segments in its input string and segment
|
||||
* references in its output string. A segment is a substring of the
|
||||
* input pattern, indicated by an offset and limit. The segment may
|
||||
* be in the preceding or following context. It may not span a
|
||||
* context boundary. A segment reference is a special character in
|
||||
* the output string that causes a segment of the input string (not
|
||||
* the input pattern) to be copied to the output string. The range of
|
||||
* special characters that represent segment references is defined by
|
||||
* RuleBasedTransliterator.Data.
|
||||
*
|
||||
* <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
|
||||
* string "abc.123" to "ab1.c23".
|
||||
@ -44,7 +46,7 @@ import com.ibm.util.Utility;
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.33 $ $Date: 2001/10/25 23:22:15 $
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.34 $ $Date: 2001/10/30 18:04:08 $
|
||||
*/
|
||||
class TransliterationRule {
|
||||
|
||||
@ -64,20 +66,13 @@ class TransliterationRule {
|
||||
private String output;
|
||||
|
||||
/**
|
||||
* An array of integers encoding the position of the segments.
|
||||
* See RuleBasedTransliterator.Segments for more details.
|
||||
* An array of matcher objects corresponding to the input pattern
|
||||
* segments. If there are no segments this is null. N.B. This is
|
||||
* a UnicodeMatcher for generality, but in practice it is always a
|
||||
* StringMatcher. In the future we may generalize this, but for
|
||||
* now we sometimes cast down to StringMatcher.
|
||||
*/
|
||||
int[] segments;
|
||||
|
||||
/**
|
||||
* A value we compute from segments. The first index into segments[]
|
||||
* that is >= anteContextLength. That is, the first one that is within
|
||||
* the forward scanned part of the pattern -- the key or the postContext.
|
||||
* If there are no segments, this has the value -1. This index is relative
|
||||
* to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
|
||||
* segments[FIRST_SEG_POS_INDEX + firstKeySeg].
|
||||
*/
|
||||
int firstKeySeg;
|
||||
UnicodeMatcher[] segments;
|
||||
|
||||
/**
|
||||
* The length of the string that must match before the key. If
|
||||
@ -127,20 +122,6 @@ class TransliterationRule {
|
||||
private static final char APOSTROPHE = '\'';
|
||||
private static final char BACKSLASH = '\\';
|
||||
|
||||
// Macros for accessing the array of integers encoding the position of
|
||||
// the segments. See RuleBasedTransliterator.Segments for more details.
|
||||
// SEGMENTS_COUNT number of segments, n (half the number of parens)
|
||||
// SEGMENTS_LEN length of the segments array (number of elements)
|
||||
// SEGMENTS_POS position in 'pattern' of parenthesis i, where i=0..2n-1
|
||||
// SEGMENTS_NUM index into segments to access POS of $1.open,
|
||||
// $1.close, $2.open, $2.close,.., $n.open, $n.close
|
||||
// Relative to FIRST_SEG_POS_INDEX. Ranges from 0..2n-1.
|
||||
static final int FIRST_SEG_POS_INDEX = 2;
|
||||
static final int SEGMENTS_COUNT(int[] x) { return x[0]; }
|
||||
static final int SEGMENTS_LEN(int[] x) { return (SEGMENTS_COUNT(x)*4+4); }
|
||||
static final int SEGMENTS_POS(int[] x,int i) { return x[FIRST_SEG_POS_INDEX+i]; }
|
||||
static final int SEGMENTS_NUM(int[] x,int i) { return x[x[1]+i]-FIRST_SEG_POS_INDEX; }
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999-2001. All rights reserved.";
|
||||
|
||||
@ -165,12 +146,8 @@ class TransliterationRule {
|
||||
* 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
|
||||
* "xyz" and moves the cursor to before "a". It would have a cursorOffset
|
||||
* of -3.
|
||||
* @param segs array of 2n integers. Each of n pairs consists of offset,
|
||||
* limit for a segment of the input string. Characters in the output string
|
||||
* refer to these segments if they are in a special range determined by the
|
||||
* associated RuleBasedTransliterator.Data object. May be null if there are
|
||||
* no segments. The caller is responsible for validating that segments
|
||||
* are well-formed.
|
||||
* @param segs array of UnicodeMatcher corresponding to input pattern
|
||||
* segments, or null if there are none
|
||||
* @param anchorStart true if the rule is anchored on the left to
|
||||
* the context start
|
||||
* @param anchorEnd true if the rule is anchored on the right to the
|
||||
@ -180,7 +157,7 @@ class TransliterationRule {
|
||||
int anteContextPos, int postContextPos,
|
||||
String output,
|
||||
int cursorPos, int cursorOffset,
|
||||
int[] segs,
|
||||
UnicodeMatcher[] segs,
|
||||
boolean anchorStart, boolean anchorEnd,
|
||||
RuleBasedTransliterator.Data theData) {
|
||||
data = theData;
|
||||
@ -212,25 +189,11 @@ class TransliterationRule {
|
||||
this.cursorPos = cursorPos + cursorOffset;
|
||||
this.output = output;
|
||||
// We don't validate the segments array. The caller must
|
||||
// guarantee that the segments are well-formed.
|
||||
// guarantee that the segments are well-formed (that is, that
|
||||
// all $n references in the output refer to indices of this
|
||||
// array, and that no array elements are null).
|
||||
this.segments = segs;
|
||||
|
||||
// Find the position of the first segment index that is after the
|
||||
// anteContext (in the key). Note that this may be a start or a
|
||||
// limit index. If all segments are in the ante context,
|
||||
// firstKeySeg should point past the last segment -- that is, it
|
||||
// should point at the end marker, which is -1. This allows the
|
||||
// code to back up by one to obtain the last ante context segment.
|
||||
firstKeySeg = -1;
|
||||
if (segments != null) {
|
||||
firstKeySeg = FIRST_SEG_POS_INDEX;
|
||||
while (segments[firstKeySeg] >= 0 &&
|
||||
segments[firstKeySeg] < anteContextLength) {
|
||||
++firstKeySeg;
|
||||
}
|
||||
firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
|
||||
}
|
||||
|
||||
pattern = input;
|
||||
flags = 0;
|
||||
if (anchorStart) {
|
||||
@ -410,25 +373,12 @@ class TransliterationRule {
|
||||
|
||||
// ============================ MATCH ===========================
|
||||
|
||||
// Record the actual positions, in the text, of the segments.
|
||||
// These are recorded in the order that they occur in the pattern.
|
||||
|
||||
// segPos[] is an array of 2*SEGMENTS_COUNT elements. It
|
||||
// records the position in 'text' of each segment boundary, in
|
||||
// the order that they occur in 'pattern'.
|
||||
int[] segPos = null;
|
||||
// Reset segment match data
|
||||
if (segments != null) {
|
||||
segPos = new int[2*SEGMENTS_COUNT(segments)];
|
||||
for (int i=0; i<segments.length; ++i) {
|
||||
((StringMatcher) segments[i]).resetMatch();
|
||||
}
|
||||
}
|
||||
// iSeg is an index into segments[] that accesses the first
|
||||
// array. As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
|
||||
// When indexing into segments[] FIRST_SEG_POS_INDEX must be
|
||||
// added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
|
||||
int iSeg = firstKeySeg - 1;
|
||||
// nextSegPos is an offset in 'pattern'. When the cursor is
|
||||
// equal to nextSegPos, we are at a segment boundary, and we
|
||||
// record the position in the real text in segPos[].
|
||||
int nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
|
||||
|
||||
int lenDelta, keyLimit;
|
||||
int[] intRef = new int[1];
|
||||
@ -465,15 +415,6 @@ class TransliterationRule {
|
||||
}
|
||||
oText = intRef[0];
|
||||
}
|
||||
while (nextSegPos == oPattern) {
|
||||
segPos[iSeg] = oText;
|
||||
if (oText >= 0) {
|
||||
segPos[iSeg] += UTF16.getCharCount(UTF16.charAt(text, oText));
|
||||
} else {
|
||||
++segPos[iSeg];
|
||||
}
|
||||
nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
|
||||
}
|
||||
}
|
||||
|
||||
minOText = posAfter(text, oText);
|
||||
@ -486,9 +427,6 @@ class TransliterationRule {
|
||||
|
||||
// -------------------- Key and Post Context --------------------
|
||||
|
||||
iSeg = firstKeySeg;
|
||||
nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
|
||||
|
||||
oPattern = 0;
|
||||
oText = pos.start;
|
||||
keyLimit = 0;
|
||||
@ -511,10 +449,6 @@ class TransliterationRule {
|
||||
// depending on whether we're in the key or in the post
|
||||
// context.
|
||||
|
||||
while (oPattern == nextSegPos) {
|
||||
segPos[iSeg] = oText;
|
||||
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
|
||||
}
|
||||
if (oPattern == keyLength) {
|
||||
keyLimit = oText;
|
||||
}
|
||||
@ -554,10 +488,6 @@ class TransliterationRule {
|
||||
//! return UnicodeMatcher.U_MISMATCH;
|
||||
//!}
|
||||
}
|
||||
while (oPattern == nextSegPos) {
|
||||
segPos[iSeg] = oText;
|
||||
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
|
||||
}
|
||||
if (oPattern == keyLength) {
|
||||
keyLimit = oText;
|
||||
}
|
||||
@ -576,8 +506,7 @@ class TransliterationRule {
|
||||
// =========================== REPLACE ==========================
|
||||
|
||||
// We have a full match. The key is between pos.start and
|
||||
// keyLimit. Segment indices have been recorded in segPos[].
|
||||
// Perform a replacement.
|
||||
// keyLimit.
|
||||
|
||||
if (segments == null) {
|
||||
text.replace(pos.start, keyLimit, output);
|
||||
@ -629,11 +558,22 @@ class TransliterationRule {
|
||||
buf.setLength(0);
|
||||
}
|
||||
// Copy segment with out-of-band data
|
||||
b *= 2;
|
||||
int start = segPos[SEGMENTS_NUM(segments,b)];
|
||||
int limit = segPos[SEGMENTS_NUM(segments,b+1)];
|
||||
text.copy(start, limit, dest);
|
||||
dest += limit - start;
|
||||
StringMatcher m = (StringMatcher) segments[b];
|
||||
int start = m.getMatchStart();
|
||||
int limit = m.getMatchLimit();
|
||||
// If there was no match, that means that a quantifier
|
||||
// matched zero-length. E.g., x (a)* y matched "xy".
|
||||
if (start >= 0) {
|
||||
// Adjust indices for segments in post context
|
||||
// for any inserted text between the key and
|
||||
// the post context.
|
||||
if (start >= keyLimit) {
|
||||
start += dest - keyLimit;
|
||||
limit += dest - keyLimit;
|
||||
}
|
||||
text.copy(start, limit, dest);
|
||||
dest += limit - start;
|
||||
}
|
||||
}
|
||||
oOutput += UTF16.getCharCount(c);
|
||||
}
|
||||
@ -790,20 +730,6 @@ class TransliterationRule {
|
||||
|
||||
StringBuffer rule = new StringBuffer();
|
||||
|
||||
// iseg indexes into segments[] directly (not offset from FSPI)
|
||||
int iseg = FIRST_SEG_POS_INDEX-1;
|
||||
int nextSeg = -1;
|
||||
// Build an array of booleans specifying open vs. close paren
|
||||
boolean[] isOpen = null;
|
||||
if (segments != null) {
|
||||
isOpen = new boolean[2*SEGMENTS_COUNT(segments)];
|
||||
for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
|
||||
isOpen[SEGMENTS_NUM(segments,i) ] = true;
|
||||
isOpen[SEGMENTS_NUM(segments,i+1)] = false;
|
||||
}
|
||||
nextSeg = segments[++iseg];
|
||||
}
|
||||
|
||||
// Accumulate special characters (and non-specials following them)
|
||||
// into quoteBuf. Append quoteBuf, within single quotes, when
|
||||
// a non-quoted element must be inserted.
|
||||
@ -825,14 +751,6 @@ class TransliterationRule {
|
||||
appendToRule(rule, '{', true, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
|
||||
// Append either '(' or ')' if we are at a segment index
|
||||
if (i == nextSeg) {
|
||||
appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
|
||||
'(' : ')',
|
||||
true, escapeUnprintable, quoteBuf);
|
||||
nextSeg = segments[++iseg];
|
||||
}
|
||||
|
||||
if (emitBraces && i == (anteContextLength + keyLength)) {
|
||||
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
@ -847,11 +765,6 @@ class TransliterationRule {
|
||||
}
|
||||
}
|
||||
|
||||
if (i == nextSeg) {
|
||||
// assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
|
||||
appendToRule(rule, ')', true, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
|
||||
if (emitBraces && i == (anteContextLength + keyLength)) {
|
||||
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
@ -885,7 +798,7 @@ class TransliterationRule {
|
||||
} else {
|
||||
++seg; // make 1-based
|
||||
appendToRule(rule, 0x20, true, escapeUnprintable, quoteBuf);
|
||||
rule.append(0x24 /*$*/);
|
||||
rule.append('$');
|
||||
boolean show = false; // true if we should display digits
|
||||
for (int p=9; p>=0; --p) {
|
||||
int d = seg / POW10[p];
|
||||
@ -938,6 +851,9 @@ class TransliterationRule {
|
||||
|
||||
/**
|
||||
* $Log: TransliterationRule.java,v $
|
||||
* Revision 1.34 2001/10/30 18:04:08 alan
|
||||
* jitterbug 1406: make quantified segments behave like perl counterparts
|
||||
*
|
||||
* Revision 1.33 2001/10/25 23:22:15 alan
|
||||
* jitterbug 73: changes to support zero-length matchers at end of key
|
||||
*
|
||||
|
@ -4,8 +4,8 @@
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliteratorParser.java,v $
|
||||
* $Date: 2001/10/24 00:03:38 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2001/10/30 18:04:09 $
|
||||
* $Revision: 1.8 $
|
||||
**********************************************************************
|
||||
*/
|
||||
package com.ibm.text;
|
||||
@ -117,6 +117,7 @@ class TransliteratorParser {
|
||||
private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
|
||||
|
||||
private static final String OPERATORS = "=><";
|
||||
private static final String HALF_ENDERS = "=><;";
|
||||
|
||||
// Other special characters
|
||||
private static final char QUOTE = '\'';
|
||||
@ -142,7 +143,7 @@ class TransliteratorParser {
|
||||
// private static final char ANCHOR_END = '$';
|
||||
|
||||
// Segments of the input string are delimited by "(" and ")". In the
|
||||
// output string these segments are referenced as "$1" through "$9".
|
||||
// output string these segments are referenced as "$1", "$2", etc.
|
||||
private static final char SEGMENT_OPEN = '(';
|
||||
private static final char SEGMENT_CLOSE = ')';
|
||||
|
||||
@ -285,209 +286,6 @@ class TransliteratorParser {
|
||||
}
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// class Segments
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Segments are parentheses-enclosed regions of the input string.
|
||||
* These are referenced in the output string using the notation $1,
|
||||
* $2, etc. Numbering is in order of appearance of the left
|
||||
* parenthesis. Number is one-based. Segments are defined as start,
|
||||
* limit pairs. Segments may nest.
|
||||
*
|
||||
* During parsing, segment data is encoded in an object of class
|
||||
* Segments. At runtime, the same data is encoded in compact form as
|
||||
* an array of integers in a TransliterationRule. The runtime encoding
|
||||
* must satisfy three goals:
|
||||
*
|
||||
* 1. Iterate over the offsets in a pattern, from left to right,
|
||||
* and indicate all segment boundaries, in order. This is done
|
||||
* during matching.
|
||||
*
|
||||
* 2. Given a reference $n, produce the start and limit offsets
|
||||
* for that segment. This is done during replacement.
|
||||
*
|
||||
* 3. Similar to goal 1, but in addition, indicate whether each
|
||||
* segment boundary is a start or a limit, in other words, whether
|
||||
* each is an open paren or a close paren. This is required by
|
||||
* the toRule() method.
|
||||
*
|
||||
* Goal 1 must be satisfied at high speed since this is done during
|
||||
* matching. Goal 2 is next most important. Goal 3 is not performance
|
||||
* critical since it is only needed by toRule().
|
||||
*
|
||||
* The array of integers is actually two arrays concatenated. The
|
||||
* first gives the index values of the open and close parentheses in
|
||||
* the order they appear. The second maps segment numbers to the
|
||||
* indices of the first array. The two arrays have the same length.
|
||||
* Iterating over the first array satisfies goal 1. Indexing into the
|
||||
* second array satisfies goal 2. Goal 3 is satisfied by iterating
|
||||
* over the second array and constructing the required data when
|
||||
* needed. This is what toRule() does.
|
||||
*
|
||||
* Example: (a b(c d)e f)
|
||||
* 0 1 2 3 4 5 6
|
||||
*
|
||||
* First array: Indices are 0, 2, 4, and 6.
|
||||
|
||||
* Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
|
||||
* second array is 0, 3, 1 2 -- these give the indices in the
|
||||
* first array at which $1:open, $1:close, $2:open, and $2:close
|
||||
* occur.
|
||||
*
|
||||
* The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
|
||||
*
|
||||
* Each subarray is terminated with a -1, and two leading entries
|
||||
* give the number of segments and the offset to the first entry
|
||||
* of the second array. In addition, the second array values are
|
||||
* all offset by 2 so they index directly into the final array.
|
||||
* The total array size is 4*segments[0] + 4. The second index is
|
||||
* 2*segments[0] + 3.
|
||||
*
|
||||
* In the output string, a segment reference is indicated by a
|
||||
* character in a special range, as defined by
|
||||
* RuleBasedTransliterator.Data.
|
||||
*
|
||||
* Most rules have no segments, in which case segments is null, and the
|
||||
* output string need not be checked for segment reference characters.
|
||||
*
|
||||
* See also rbt_rule.h/cpp.
|
||||
*/
|
||||
private static class Segments {
|
||||
|
||||
private Vector offsets; // holds Integer objects
|
||||
|
||||
private Vector isOpenParen; // holds Boolean objects
|
||||
|
||||
private int offset(int i) {
|
||||
return ((Integer) offsets.elementAt(i)).intValue();
|
||||
}
|
||||
|
||||
private boolean isOpen(int i) {
|
||||
return ((Boolean) isOpenParen.elementAt(i)).booleanValue();
|
||||
}
|
||||
|
||||
// size of the Vectors
|
||||
private int size() {
|
||||
// assert(offset.size() == isOpenParen.size());
|
||||
return offsets.size();
|
||||
}
|
||||
|
||||
public Segments() {
|
||||
offsets = new Vector();
|
||||
isOpenParen = new Vector();
|
||||
}
|
||||
|
||||
public void addParenthesisAt(int offset, boolean isOpen) {
|
||||
offsets.addElement(new Integer(offset));
|
||||
isOpenParen.addElement(new Boolean(isOpen));
|
||||
}
|
||||
|
||||
public int getLastParenOffset(boolean[] isOpenParen) {
|
||||
if (size() == 0) {
|
||||
return -1;
|
||||
}
|
||||
isOpenParen[0] = isOpen(size()-1);
|
||||
return offset(size()-1);
|
||||
}
|
||||
|
||||
// Remove the last (rightmost) segment. Store its offsets in start
|
||||
// and limit, and then convert all offsets at or after start to be
|
||||
// equal to start. Upon failure, return FALSE. Assume that the
|
||||
// caller has already called getLastParenOffset() and validated that
|
||||
// there is at least one parenthesis and that the last one is a close
|
||||
// paren.
|
||||
public boolean extractLastParenSubstring(int[] start, int[] limit) {
|
||||
// assert(offsets.size() > 0);
|
||||
// assert(isOpenParen.elementAt(isOpenParen.size()-1) == 0);
|
||||
int i = size() - 1;
|
||||
int n = 1; // count of close parens we need to match
|
||||
// Record position of the last close paren
|
||||
limit[0] = offset(i);
|
||||
--i; // back up to the one before the last one
|
||||
while (i >= 0 && n != 0) {
|
||||
n += isOpen(i) ? -1 : 1;
|
||||
}
|
||||
if (n != 0) {
|
||||
return false;
|
||||
}
|
||||
// assert(i>=0);
|
||||
start[0] = offset(i);
|
||||
// Reset all segment pairs from i to size() - 1 to [start, start+1).
|
||||
while (i<size()) {
|
||||
int o = isOpen(i) ? start[0] : (start[0]+1);
|
||||
offsets.setElementAt(new Integer(o), i);
|
||||
++i;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Assume caller has already gotten a TRUE validate().
|
||||
public int[] createArray() {
|
||||
int c = count(); // number of segments
|
||||
int arrayLen = 4*c + 4;
|
||||
int[] array = new int[arrayLen];
|
||||
int a2offset = 2*c + 3; // offset to array 2
|
||||
|
||||
array[0] = c;
|
||||
array[1] = a2offset;
|
||||
int i;
|
||||
for (i=0; i<2*c; ++i) {
|
||||
array[2+i] = offset(i);
|
||||
}
|
||||
array[a2offset-1] = -1;
|
||||
array[arrayLen-1] = -1;
|
||||
// Now walk through and match up segment numbers with parentheses.
|
||||
// Number segments from 0. We're going to offset all entries by 2
|
||||
// to skip the first two elements, array[0] and array[1].
|
||||
Stack stack = new Stack();
|
||||
int nextOpen = 0; // seg # of next open, 0-based
|
||||
for (i=0; i<2*c; ++i) {
|
||||
boolean open = isOpen(i);
|
||||
// Let seg be the zero-based segment number.
|
||||
// Open parens are at 2*seg in array 2.
|
||||
// Close parens are at 2*seg+1 in array 2.
|
||||
if (open) {
|
||||
array[a2offset + 2*nextOpen] = 2+i;
|
||||
stack.push(new Integer(nextOpen));
|
||||
++nextOpen;
|
||||
} else {
|
||||
int nextClose = ((Integer) stack.pop()).intValue();
|
||||
array[a2offset + 2*nextClose+1] = 2+i;
|
||||
}
|
||||
}
|
||||
// assert(stack.empty());
|
||||
|
||||
return array;
|
||||
}
|
||||
|
||||
public boolean validate() {
|
||||
// want number of parens >= 2
|
||||
// want number of parens to be even
|
||||
// want first paren '('
|
||||
// want parens to match up in the end
|
||||
if ((size() < 2) || (size() % 2 != 0) || !isOpen(0)) {
|
||||
return false;
|
||||
}
|
||||
int n = 0;
|
||||
for (int i=0; i<size(); ++i) {
|
||||
n += isOpen(i) ? 1 : -1;
|
||||
if (n < 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return n == 0;
|
||||
}
|
||||
|
||||
// Number of segments
|
||||
// Assume caller has already gotten a TRUE validate().
|
||||
public int count() {
|
||||
// assert(validate());
|
||||
return size() / 2;
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// class RuleHalf
|
||||
//----------------------------------------------------------------------
|
||||
@ -505,11 +303,7 @@ class TransliteratorParser {
|
||||
public int ante = -1; // position of ante context marker '{' in text
|
||||
public int post = -1; // position of post context marker '}' in text
|
||||
|
||||
// Record the position of the segment substrings and references. A
|
||||
// given side should have segments or segment references, but not
|
||||
// both.
|
||||
public Segments segments = null;
|
||||
public int maxRef = -1; // index of largest ref (1..9)
|
||||
public int maxRef = -1; // n where maximum segment ref is $n; 1-based
|
||||
|
||||
// Record the offset to the cursor either to the left or to the
|
||||
// right of the key. This is indicated by characters on the output
|
||||
@ -521,29 +315,88 @@ class TransliteratorParser {
|
||||
// output text.
|
||||
public int cursorOffset = 0; // only nonzero on output side
|
||||
|
||||
// Position of first CURSOR_OFFSET on _right_. This will be -1
|
||||
// for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
|
||||
private int cursorOffsetPos = 0;
|
||||
|
||||
public boolean anchorStart = false;
|
||||
public boolean anchorEnd = false;
|
||||
|
||||
/**
|
||||
* UnicodeMatcher objects corresponding to each segment.
|
||||
*/
|
||||
public Vector segments = new Vector();
|
||||
|
||||
/**
|
||||
* The segment number from 0..n-1 of the next '(' we see
|
||||
* during parsing; 0-based.
|
||||
*/
|
||||
private int nextSegmentNumber = 0;
|
||||
|
||||
/**
|
||||
* Parse one side of a rule, stopping at either the limit,
|
||||
* the END_OF_RULE character, or an operator. Return
|
||||
* the pos of the terminating character (or limit).
|
||||
* the END_OF_RULE character, or an operator.
|
||||
* @return the index after the terminating character, or
|
||||
* if limit was reached, limit
|
||||
*/
|
||||
public int parse(String rule, int pos, int limit,
|
||||
TransliteratorParser parser) {
|
||||
int start = pos;
|
||||
StringBuffer buf = new StringBuffer();
|
||||
pos = parseSection(rule, pos, limit, parser, buf, false);
|
||||
text = buf.toString();
|
||||
|
||||
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
|
||||
syntaxError("Misplaced " + CURSOR_POS, rule, start);
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a section of one side of a rule, stopping at either
|
||||
* the limit, the END_OF_RULE character, an operator, or a
|
||||
* segment close character. This method parses both a
|
||||
* top-level rule half and a segment within such a rule half.
|
||||
* It calls itself recursively to parse segments and nested
|
||||
* segments.
|
||||
* @param buf buffer into which to accumulate the rule pattern
|
||||
* characters, either literal characters from the rule or
|
||||
* standins for UnicodeMatcher objects including segments.
|
||||
* @param isSegment if true, then we've already seen a '(' and
|
||||
* pos on entry points right after it. Accumulate everything
|
||||
* up to the closing ')', put it in a segment matcher object,
|
||||
* generate a standin for it, and add the standin to buf. As
|
||||
* a side effect, update the segments vector with a reference
|
||||
* to the segment matcher. This works recursively for nested
|
||||
* segments. If isSegment is false, just accumulate
|
||||
* characters into buf.
|
||||
* @return the index after the terminating character, or
|
||||
* if limit was reached, limit
|
||||
*/
|
||||
private int parseSection(String rule, int pos, int limit,
|
||||
TransliteratorParser parser,
|
||||
StringBuffer buf,
|
||||
boolean isSegment) {
|
||||
int start = pos;
|
||||
ParsePosition pp = null;
|
||||
int cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
|
||||
boolean done = false;
|
||||
int quoteStart = -1; // Most recent 'single quoted string'
|
||||
int quoteLimit = -1;
|
||||
int varStart = -1; // Most recent $variableReference
|
||||
int varLimit = -1;
|
||||
int[] iref = new int[1];
|
||||
|
||||
// If isSegment, then bufSegStart is the offset in buf to
|
||||
// the first character of the segment we are parsing.
|
||||
int bufSegStart = 0;
|
||||
int segmentNumber = 0;
|
||||
if (isSegment) {
|
||||
bufSegStart = buf.length();
|
||||
segmentNumber = nextSegmentNumber++;
|
||||
}
|
||||
|
||||
main:
|
||||
while (pos < limit && !done) {
|
||||
while (pos < limit) {
|
||||
char c = rule.charAt(pos++);
|
||||
if (Character.isWhitespace(c)) {
|
||||
// Ignore whitespace. Note that this is not Unicode
|
||||
@ -551,8 +404,11 @@ class TransliteratorParser {
|
||||
// whitespace likely to be seen in code.
|
||||
continue;
|
||||
}
|
||||
if (OPERATORS.indexOf(c) >= 0) {
|
||||
--pos; // Backup to point to operator
|
||||
// HALF_ENDERS is all chars that end a rule half: "<>=;"
|
||||
if (HALF_ENDERS.indexOf(c) >= 0) {
|
||||
if (isSegment) {
|
||||
syntaxError("Unclosed segment", rule, start);
|
||||
}
|
||||
break main;
|
||||
}
|
||||
if (anchorEnd) {
|
||||
@ -614,7 +470,12 @@ class TransliteratorParser {
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (c) {
|
||||
|
||||
//------------------------------------------------------
|
||||
// Elements allowed within and out of segments
|
||||
//------------------------------------------------------
|
||||
case ANCHOR_START:
|
||||
if (buf.length() == 0 && !anchorStart) {
|
||||
anchorStart = true;
|
||||
@ -624,17 +485,8 @@ class TransliteratorParser {
|
||||
}
|
||||
break;
|
||||
case SEGMENT_OPEN:
|
||||
case SEGMENT_CLOSE:
|
||||
// Handle segment definitions "(" and ")"
|
||||
// Parse "(", ")"
|
||||
if (segments == null) {
|
||||
segments = new Segments();
|
||||
}
|
||||
segments.addParenthesisAt(buf.length(), c == SEGMENT_OPEN);
|
||||
pos = parseSection(rule, pos, limit, parser, buf, true);
|
||||
break;
|
||||
case END_OF_RULE:
|
||||
--pos; // Backup to point to END_OF_RULE
|
||||
break main;
|
||||
case SymbolTable.SYMBOL_REF:
|
||||
// Handle variable references and segment references "$1" .. "$9"
|
||||
{
|
||||
@ -676,7 +528,7 @@ class TransliteratorParser {
|
||||
}
|
||||
pp.setIndex(pos);
|
||||
String name = parser.parseData.
|
||||
parseReference(rule, pp, limit);
|
||||
parseReference(rule, pp, limit);
|
||||
if (name == null) {
|
||||
// This means the '$' was not followed by a
|
||||
// valid name. Try to interpret it as an
|
||||
@ -697,25 +549,129 @@ class TransliteratorParser {
|
||||
}
|
||||
}
|
||||
break;
|
||||
case DOT:
|
||||
buf.append(parser.getDotStandIn());
|
||||
break;
|
||||
case KLEENE_STAR:
|
||||
case ONE_OR_MORE:
|
||||
case ZERO_OR_ONE:
|
||||
// Quantifiers. We handle single characters, quoted strings,
|
||||
// variable references, and segments.
|
||||
// a+ matches aaa
|
||||
// 'foo'+ matches foofoofoo
|
||||
// $v+ matches xyxyxy if $v == xy
|
||||
// (seg)+ matches segsegseg
|
||||
{
|
||||
if (isSegment && buf.length() == bufSegStart) {
|
||||
// The */+ immediately follows '('
|
||||
syntaxError("Misplaced quantifier", rule, start);
|
||||
break;
|
||||
}
|
||||
|
||||
int qstart, qlimit;
|
||||
// The */+ follows an isolated character or quote
|
||||
// or variable reference
|
||||
if (buf.length() == quoteLimit) {
|
||||
// The */+ follows a 'quoted string'
|
||||
qstart = quoteStart;
|
||||
qlimit = quoteLimit;
|
||||
} else if (buf.length() == varLimit) {
|
||||
// The */+ follows a $variableReference
|
||||
qstart = varStart;
|
||||
qlimit = varLimit;
|
||||
} else {
|
||||
// The */+ follows a single character, possibly
|
||||
// a segment standin
|
||||
qstart = buf.length() - 1;
|
||||
qlimit = qstart + 1;
|
||||
}
|
||||
|
||||
UnicodeMatcher m =
|
||||
new StringMatcher(buf.toString(), qstart, qlimit,
|
||||
false, parser.data);
|
||||
int min = 0;
|
||||
int max = Quantifier.MAX;
|
||||
switch (c) {
|
||||
case ONE_OR_MORE:
|
||||
min = 1;
|
||||
break;
|
||||
case ZERO_OR_ONE:
|
||||
min = 0;
|
||||
max = 1;
|
||||
break;
|
||||
// case KLEENE_STAR:
|
||||
// do nothing -- min, max already set
|
||||
}
|
||||
m = new Quantifier(m, min, max);
|
||||
buf.setLength(qstart);
|
||||
buf.append(parser.generateStandInFor(m));
|
||||
}
|
||||
break;
|
||||
|
||||
//------------------------------------------------------
|
||||
// Elements allowed ONLY WITHIN segments
|
||||
//------------------------------------------------------
|
||||
case SEGMENT_CLOSE:
|
||||
if (isSegment) {
|
||||
// We're done parsing a segment. The relevant
|
||||
// characters are in buf, starting at offset
|
||||
// bufSegStart. Extract them into a string
|
||||
// matcher, and replace them with a standin
|
||||
// for that matcher.
|
||||
StringMatcher m =
|
||||
new StringMatcher(buf.substring(bufSegStart),
|
||||
true, parser.data);
|
||||
// Since we call parseSection() recursively,
|
||||
// nested segments will result in segment i+1
|
||||
// getting parsed and stored before segment i;
|
||||
// be careful with the vector handling here.
|
||||
if ((segmentNumber+1) > segments.size()) {
|
||||
segments.setSize(segmentNumber+1);
|
||||
}
|
||||
segments.setElementAt(m, segmentNumber);
|
||||
buf.setLength(bufSegStart);
|
||||
buf.append(parser.generateStandInFor(m));
|
||||
break main;
|
||||
}
|
||||
// If we aren't in a segment, then a segment close
|
||||
// character is a syntax error.
|
||||
syntaxError("Unquoted special", rule, start);
|
||||
break;
|
||||
|
||||
//------------------------------------------------------
|
||||
// Elements allowed ONLY OUTSIDE segments
|
||||
//------------------------------------------------------
|
||||
case CONTEXT_ANTE:
|
||||
if (isSegment) {
|
||||
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||
}
|
||||
if (ante >= 0) {
|
||||
syntaxError("Multiple ante contexts", rule, start);
|
||||
}
|
||||
ante = buf.length();
|
||||
break;
|
||||
case CONTEXT_POST:
|
||||
if (isSegment) {
|
||||
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||
}
|
||||
if (post >= 0) {
|
||||
syntaxError("Multiple post contexts", rule, start);
|
||||
}
|
||||
post = buf.length();
|
||||
break;
|
||||
case CURSOR_POS:
|
||||
if (isSegment) {
|
||||
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||
}
|
||||
if (cursor >= 0) {
|
||||
syntaxError("Multiple cursors", rule, start);
|
||||
}
|
||||
cursor = buf.length();
|
||||
break;
|
||||
case CURSOR_OFFSET:
|
||||
if (isSegment) {
|
||||
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||
}
|
||||
if (cursorOffset < 0) {
|
||||
if (buf.length() > 0) {
|
||||
syntaxError("Misplaced " + c, rule, start);
|
||||
@ -737,74 +693,10 @@ class TransliteratorParser {
|
||||
}
|
||||
}
|
||||
break;
|
||||
case DOT:
|
||||
buf.append(parser.getDotStandIn());
|
||||
break;
|
||||
case KLEENE_STAR:
|
||||
case ONE_OR_MORE:
|
||||
case ZERO_OR_ONE:
|
||||
// Quantifiers. We handle single characters, quoted strings,
|
||||
// variable references, and segments.
|
||||
// a+ matches aaa
|
||||
// 'foo'+ matches foofoofoo
|
||||
// $v+ matches xyxyxy if $v == xy
|
||||
// (seg)+ matches segsegseg
|
||||
{
|
||||
int qstart, qlimit;
|
||||
boolean[] isOpenParen = new boolean[1];
|
||||
boolean isSegment = false;
|
||||
if (segments != null &&
|
||||
segments.getLastParenOffset(isOpenParen) == buf.length()) {
|
||||
// The */+ immediately follows a segment
|
||||
if (isOpenParen[0]) {
|
||||
syntaxError("Misplaced quantifier", rule, start);
|
||||
}
|
||||
int[] startparam = new int[1];
|
||||
int[] limitparam = new int[1];
|
||||
if (!segments.extractLastParenSubstring(startparam, limitparam)) {
|
||||
syntaxError("Mismatched segment delimiters", rule, start);
|
||||
}
|
||||
qstart = startparam[0];
|
||||
qlimit = limitparam[0];
|
||||
isSegment = true;
|
||||
} else {
|
||||
// The */+ follows an isolated character or quote
|
||||
// or variable reference
|
||||
if (buf.length() == quoteLimit) {
|
||||
// The */+ follows a 'quoted string'
|
||||
qstart = quoteStart;
|
||||
qlimit = quoteLimit;
|
||||
} else if (buf.length() == varLimit) {
|
||||
// The */+ follows a $variableReference
|
||||
qstart = varStart;
|
||||
qlimit = varLimit;
|
||||
} else {
|
||||
// The */+ follows a single character
|
||||
qstart = buf.length() - 1;
|
||||
qlimit = qstart + 1;
|
||||
}
|
||||
}
|
||||
UnicodeMatcher m =
|
||||
new StringMatcher(buf.toString(), qstart, qlimit,
|
||||
isSegment, parser.data);
|
||||
int min = 0;
|
||||
int max = Quantifier.MAX;
|
||||
switch (c) {
|
||||
case ONE_OR_MORE:
|
||||
min = 1;
|
||||
break;
|
||||
case ZERO_OR_ONE:
|
||||
min = 0;
|
||||
max = 1;
|
||||
break;
|
||||
// case KLEENE_STAR:
|
||||
// do nothing -- min, max already set
|
||||
}
|
||||
m = new Quantifier(m, min, max);
|
||||
buf.setLength(qstart);
|
||||
buf.append(parser.generateStandInFor(m));
|
||||
}
|
||||
break;
|
||||
|
||||
//------------------------------------------------------
|
||||
// Non-special characters
|
||||
//------------------------------------------------------
|
||||
default:
|
||||
// Disallow unquoted characters other than [0-9A-Za-z]
|
||||
// in the printable ASCII range. These characters are
|
||||
@ -819,11 +711,6 @@ class TransliteratorParser {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
|
||||
syntaxError("Misplaced " + CURSOR_POS, rule, start);
|
||||
}
|
||||
text = buf.toString();
|
||||
return pos;
|
||||
}
|
||||
|
||||
@ -838,10 +725,12 @@ class TransliteratorParser {
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and return an int[] array of segments.
|
||||
* Create and return a UnicodeMatcher[] array of segments,
|
||||
* or null if there are no segments.
|
||||
*/
|
||||
int[] createSegments() {
|
||||
return (segments == null) ? null : segments.createArray();
|
||||
UnicodeMatcher[] createSegments() {
|
||||
return (segments.size() == 0) ? null :
|
||||
(UnicodeMatcher[]) segments.toArray(new UnicodeMatcher[segments.size()]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1096,9 +985,10 @@ class TransliteratorParser {
|
||||
pos = left.parse(rule, pos, limit, this);
|
||||
|
||||
if (pos == limit ||
|
||||
OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
|
||||
syntaxError("No operator", rule, start);
|
||||
OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
|
||||
syntaxError("No operator pos=" + pos, rule, start);
|
||||
}
|
||||
++pos;
|
||||
|
||||
// Found an operator char. Check for forward-reverse operator.
|
||||
if (operator == REVERSE_RULE_OP &&
|
||||
@ -1110,7 +1000,7 @@ class TransliteratorParser {
|
||||
pos = right.parse(rule, pos, limit, this);
|
||||
|
||||
if (pos < limit) {
|
||||
if (rule.charAt(pos) == END_OF_RULE) {
|
||||
if (rule.charAt(--pos) == END_OF_RULE) {
|
||||
++pos;
|
||||
} else {
|
||||
// RuleHalf parser must have terminated at an operator
|
||||
@ -1173,7 +1063,7 @@ class TransliteratorParser {
|
||||
// apply.
|
||||
if (operator == FWDREV_RULE_OP) {
|
||||
right.removeContext();
|
||||
right.segments = null;
|
||||
right.segments.removeAllElements();
|
||||
left.cursor = left.maxRef = -1;
|
||||
left.cursorOffset = 0;
|
||||
}
|
||||
@ -1193,7 +1083,7 @@ class TransliteratorParser {
|
||||
// cannot place the cursor outside the limits of the context.
|
||||
// Anchors are only allowed on the input side.
|
||||
if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
|
||||
right.segments != null || left.maxRef >= 0 ||
|
||||
right.segments.size() > 0 || left.maxRef >= 0 ||
|
||||
(right.cursorOffset != 0 && right.cursor < 0) ||
|
||||
// - The following two checks were used to ensure that the
|
||||
// - the cursor offset stayed within the ante- or postcontext.
|
||||
@ -1208,14 +1098,8 @@ class TransliteratorParser {
|
||||
// Check integrity of segments and segment references. Each
|
||||
// segment's start must have a corresponding limit, and the
|
||||
// references must not refer to segments that do not exist.
|
||||
if (left.segments != null) {
|
||||
if (!left.segments.validate()) {
|
||||
syntaxError("Missing segment close", rule, start);
|
||||
}
|
||||
int n = left.segments.count();
|
||||
if (right.maxRef > n) {
|
||||
syntaxError("Undefined segment reference", rule, start);
|
||||
}
|
||||
if (right.maxRef > left.segments.size()) {
|
||||
syntaxError("Undefined segment reference $" + right.maxRef, rule, start);
|
||||
}
|
||||
|
||||
data.ruleSet.addRule(new TransliterationRule(
|
||||
@ -1363,7 +1247,7 @@ class TransliteratorParser {
|
||||
char generateStandInFor(UnicodeMatcher matcher) {
|
||||
// assert(matcher != null);
|
||||
if (variableNext >= variableLimit) {
|
||||
throw new RuntimeException("Private use variables exhausted");
|
||||
throw new RuntimeException("Variable range exhausted");
|
||||
}
|
||||
variablesVector.addElement(matcher);
|
||||
return variableNext++;
|
||||
|
Loading…
Reference in New Issue
Block a user