ICU-1406 make quantified segments behave like Perl counterparts
X-SVN-Rev: 6493
This commit is contained in:
parent
0d08aaadcc
commit
2c2b11dfe8
@ -63,6 +63,10 @@ static const UChar gOPERATORS[] = {
|
|||||||
0x3D, 0x3E, 0x3C, 0 // "=><"
|
0x3D, 0x3E, 0x3C, 0 // "=><"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static const UChar HALF_ENDERS[] = {
|
||||||
|
0x3D, 0x3E, 0x3C, 59, 0 // "=><;"
|
||||||
|
};
|
||||||
|
|
||||||
// These are also used in Transliterator::toRules()
|
// These are also used in Transliterator::toRules()
|
||||||
static const int32_t ID_TOKEN_LEN = 2;
|
static const int32_t ID_TOKEN_LEN = 2;
|
||||||
static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':'
|
static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':'
|
||||||
@ -147,256 +151,6 @@ UnicodeString ParseData::parseReference(const UnicodeString& text,
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
//----------------------------------------------------------------------
|
|
||||||
// Segments
|
|
||||||
//----------------------------------------------------------------------
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Segments are parentheses-enclosed regions of the input string.
|
|
||||||
* These are referenced in the output string using the notation $1,
|
|
||||||
* $2, etc. Numbering is in order of appearance of the left
|
|
||||||
* parenthesis. Number is one-based. Segments are defined as start,
|
|
||||||
* limit pairs. Segments may nest.
|
|
||||||
*
|
|
||||||
* During parsing, segment data is encoded in an object of class
|
|
||||||
* Segments. At runtime, the same data is encoded in compact form as
|
|
||||||
* an array of integers in a TransliterationRule. The runtime encoding
|
|
||||||
* must satisfy three goals:
|
|
||||||
*
|
|
||||||
* 1. Iterate over the offsets in a pattern, from left to right,
|
|
||||||
* and indicate all segment boundaries, in order. This is done
|
|
||||||
* during matching.
|
|
||||||
*
|
|
||||||
* 2. Given a reference $n, produce the start and limit offsets
|
|
||||||
* for that segment. This is done during replacement.
|
|
||||||
*
|
|
||||||
* 3. Similar to goal 1, but in addition, indicate whether each
|
|
||||||
* segment boundary is a start or a limit, in other words, whether
|
|
||||||
* each is an open paren or a close paren. This is required by
|
|
||||||
* the toRule() method.
|
|
||||||
*
|
|
||||||
* Goal 1 must be satisfied at high speed since this is done during
|
|
||||||
* matching. Goal 2 is next most important. Goal 3 is not performance
|
|
||||||
* critical since it is only needed by toRule().
|
|
||||||
*
|
|
||||||
* The array of integers is actually two arrays concatenated. The
|
|
||||||
* first gives the index values of the open and close parentheses in
|
|
||||||
* the order they appear. The second maps segment numbers to the
|
|
||||||
* indices of the first array. The two arrays have the same length.
|
|
||||||
* Iterating over the first array satisfies goal 1. Indexing into the
|
|
||||||
* second array satisfies goal 2. Goal 3 is satisfied by iterating
|
|
||||||
* over the second array and constructing the required data when
|
|
||||||
* needed. This is what toRule() does.
|
|
||||||
*
|
|
||||||
* Example: (a b(c d)e f)
|
|
||||||
* 0 1 2 3 4 5 6
|
|
||||||
*
|
|
||||||
* First array: Indices are 0, 2, 4, and 6.
|
|
||||||
|
|
||||||
* Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
|
|
||||||
* second array is 0, 3, 1, 2 -- these give the indices in the
|
|
||||||
* first array at which $1:open, $1:close, $2:open, and $2:close
|
|
||||||
* occur.
|
|
||||||
*
|
|
||||||
* The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
|
|
||||||
*
|
|
||||||
* Each subarray is terminated with a -1, and two leading entries
|
|
||||||
* give the number of segments and the offset to the first entry
|
|
||||||
* of the second array. In addition, the second array values are
|
|
||||||
* all offset by 2 so they index directly into the final array.
|
|
||||||
* The total array size is 4*segments[0] + 4. The second entry, segments[1], is
|
|
||||||
* 2*segments[0] + 3.
|
|
||||||
*
|
|
||||||
* In the output string, a segment reference is indicated by a
|
|
||||||
* character in a special range, as defined by
|
|
||||||
* RuleBasedTransliterator.Data.
|
|
||||||
*
|
|
||||||
* Most rules have no segments, in which case segments is null, and the
|
|
||||||
* output string need not be checked for segment reference characters.
|
|
||||||
*
|
|
||||||
* See also rbt_rule.h/cpp.
|
|
||||||
*/
|
|
||||||
class Segments {
|
|
||||||
UVector offsets;
|
|
||||||
UVector isOpenParen;
|
|
||||||
public:
|
|
||||||
Segments(UErrorCode &status);
|
|
||||||
~Segments();
|
|
||||||
void addParenthesisAt(int32_t offset, UBool isOpenParen, UErrorCode &status);
|
|
||||||
int32_t getLastParenOffset(UBool& isOpenParen) const;
|
|
||||||
UBool extractLastParenSubstring(int32_t& start, int32_t& limit);
|
|
||||||
int32_t* createArray(UErrorCode &status) const;
|
|
||||||
UBool validate() const;
|
|
||||||
int32_t count() const; // number of segments
|
|
||||||
private:
|
|
||||||
int32_t offset(int32_t i) const;
|
|
||||||
UBool isOpen(int32_t i) const;
|
|
||||||
int32_t size() const; // size of the UVectors
|
|
||||||
};
|
|
||||||
|
|
||||||
int32_t Segments::offset(int32_t i) const {
|
|
||||||
return offsets.elementAti(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
UBool Segments::isOpen(int32_t i) const {
|
|
||||||
return isOpenParen.elementAti(i) != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t Segments::size() const {
|
|
||||||
// assert(offset.size() == isOpenParen.size());
|
|
||||||
return offsets.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
Segments::Segments(UErrorCode &status)
|
|
||||||
: offsets(status),
|
|
||||||
isOpenParen(status)
|
|
||||||
{}
|
|
||||||
Segments::~Segments() {}
|
|
||||||
|
|
||||||
void Segments::addParenthesisAt(int32_t offset, UBool isOpen, UErrorCode &status) {
|
|
||||||
offsets.addElement(offset, status);
|
|
||||||
isOpenParen.addElement(isOpen ? 1 : 0, status);
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t Segments::getLastParenOffset(UBool& isOpenParenReturn) const {
|
|
||||||
if (size() == 0) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
isOpenParenReturn = isOpen(size()-1);
|
|
||||||
return offset(size()-1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Remove the last (rightmost) segment. Store its offsets in start
|
|
||||||
// and limit, and then convert all offsets at or after start to be
|
|
||||||
// equal to start. Upon failure, return FALSE. Assume that the
|
|
||||||
// caller has already called getLastParenOffset() and validated that
|
|
||||||
// there is at least one parenthesis and that the last one is a close
|
|
||||||
// paren.
|
|
||||||
UBool Segments::extractLastParenSubstring(int32_t& start, int32_t& limit) {
|
|
||||||
// assert(offsets.size() > 0);
|
|
||||||
// assert(isOpenParen.elementAt(isOpenParen.size()-1) == 0);
|
|
||||||
int32_t i = size() - 1;
|
|
||||||
int32_t n = 1; // count of close parens we need to match
|
|
||||||
// Record position of the last close paren
|
|
||||||
limit = offset(i);
|
|
||||||
--i; // back up to the one before the last one
|
|
||||||
while (i >= 0 && n != 0) {
|
|
||||||
n += isOpen(i) ? -1 : 1;
|
|
||||||
}
|
|
||||||
if (n != 0) {
|
|
||||||
return FALSE;
|
|
||||||
}
|
|
||||||
// assert(i>=0);
|
|
||||||
start = offset(i);
|
|
||||||
// Reset all segment pairs from i to size() - 1 to [start, start+1).
|
|
||||||
while (i<size()) {
|
|
||||||
int32_t o = isOpen(i) ? start : (start+1);
|
|
||||||
offsets.setElementAt(o, i);
|
|
||||||
++i;
|
|
||||||
}
|
|
||||||
return TRUE;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Assume caller has already gotten a TRUE validate().
|
|
||||||
int32_t* Segments::createArray(UErrorCode &status) const {
|
|
||||||
int32_t c = count(); // number of segments
|
|
||||||
int32_t arrayLen = 4*c + 4;
|
|
||||||
int32_t *array = new int32_t[arrayLen];
|
|
||||||
int32_t a2offset = 2*c + 3; // offset to array 2
|
|
||||||
|
|
||||||
if (array == NULL) {
|
|
||||||
status = U_MEMORY_ALLOCATION_ERROR;
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
array[0] = c;
|
|
||||||
array[1] = a2offset;
|
|
||||||
int32_t i;
|
|
||||||
for (i=0; i<2*c; ++i) {
|
|
||||||
array[2+i] = offset(i);
|
|
||||||
}
|
|
||||||
array[a2offset-1] = -1;
|
|
||||||
array[arrayLen-1] = -1;
|
|
||||||
// Now walk through and match up segment numbers with parentheses.
|
|
||||||
// Number segments from 0. We're going to offset all entries by 2
|
|
||||||
// to skip the first two elements, array[0] and array[1].
|
|
||||||
UStack stack(status);
|
|
||||||
int32_t nextOpen = 0; // seg # of next open, 0-based
|
|
||||||
if (U_FAILURE(status)) {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
for (i=0; i<2*c; ++i) {
|
|
||||||
UBool open = isOpen(i);
|
|
||||||
// Let seg be the zero-based segment number.
|
|
||||||
// Open parens are at 2*seg in array 2.
|
|
||||||
// Close parens are at 2*seg+1 in array 2.
|
|
||||||
if (open) {
|
|
||||||
array[a2offset + 2*nextOpen] = 2+i;
|
|
||||||
stack.push(nextOpen, status);
|
|
||||||
++nextOpen;
|
|
||||||
} else {
|
|
||||||
int32_t nextClose = stack.popi();
|
|
||||||
array[a2offset + 2*nextClose+1] = 2+i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// assert(stack.empty());
|
|
||||||
|
|
||||||
// Perform a series of checks on the array. DO NOT COMPILE INTO
|
|
||||||
// PRODUCTION CODE. Use to debug array building problems.
|
|
||||||
//
|
|
||||||
//::if (!stack.empty()) {
|
|
||||||
//:: __asm int 03;
|
|
||||||
//::}
|
|
||||||
//::// check the array
|
|
||||||
//::if (array[0] < 1) {
|
|
||||||
//:: __asm int 03;
|
|
||||||
//::}
|
|
||||||
//::if (array[1] < 5) {
|
|
||||||
//:: __asm int 03;
|
|
||||||
//::}
|
|
||||||
//::for (i=2; i<2+array[0]*2; ++i) {
|
|
||||||
//:: if (array[i] < 0) { // array[i] is an offset into the rule
|
|
||||||
//:: __asm int 03;
|
|
||||||
//:: }
|
|
||||||
//::}
|
|
||||||
//::if (array[2+array[0]*2] != -1) {
|
|
||||||
//:: __asm int 03;
|
|
||||||
//::}
|
|
||||||
//::for (i=array[1]; i<array[1]+array[0]*2; ++i) {
|
|
||||||
//:: if (array[i] < 2 || array[i] >= (2+2*array[0])) {
|
|
||||||
//:: __asm int 03;
|
|
||||||
//:: }
|
|
||||||
//::}
|
|
||||||
//::if (array[array[1]+array[0]*2] != -1) {
|
|
||||||
//:: __asm int 03;
|
|
||||||
//::}
|
|
||||||
|
|
||||||
return array;
|
|
||||||
}
|
|
||||||
|
|
||||||
UBool Segments::validate() const {
|
|
||||||
// want number of parens >= 2
|
|
||||||
// want number of parens to be even
|
|
||||||
// want first paren '('
|
|
||||||
// want parens to match up in the end
|
|
||||||
if ((size() < 2) || (size() % 2 != 0) || !isOpen(0)) {
|
|
||||||
return FALSE;
|
|
||||||
}
|
|
||||||
int32_t n = 0;
|
|
||||||
for (int32_t i=0; i<size(); ++i) {
|
|
||||||
n += isOpen(i) ? 1 : -1;
|
|
||||||
if (n < 0) {
|
|
||||||
return FALSE;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return n == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Assume caller has already gotten a TRUE validate().
|
|
||||||
int32_t Segments::count() const {
|
|
||||||
// assert(validate());
|
|
||||||
return size() / 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
// BEGIN RuleHalf
|
// BEGIN RuleHalf
|
||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
@ -416,11 +170,7 @@ public:
|
|||||||
int32_t ante; // position of ante context marker '{' in text
|
int32_t ante; // position of ante context marker '{' in text
|
||||||
int32_t post; // position of post context marker '}' in text
|
int32_t post; // position of post context marker '}' in text
|
||||||
|
|
||||||
// Record the position of the segment substrings and references. A
|
int32_t maxRef; // n where maximum segment ref is $n; 1-based
|
||||||
// given side should have segments or segment references, but not
|
|
||||||
// both.
|
|
||||||
Segments* segments;
|
|
||||||
int32_t maxRef; // index of largest ref ($n) on the right
|
|
||||||
|
|
||||||
// Record the offset to the cursor either to the left or to the
|
// Record the offset to the cursor either to the left or to the
|
||||||
// right of the key. This is indicated by characters on the output
|
// right of the key. This is indicated by characters on the output
|
||||||
@ -432,8 +182,25 @@ public:
|
|||||||
// output text.
|
// output text.
|
||||||
int32_t cursorOffset; // only nonzero on output side
|
int32_t cursorOffset; // only nonzero on output side
|
||||||
|
|
||||||
|
// Position of first CURSOR_OFFSET on _right_. This will be -1
|
||||||
|
// for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
|
||||||
|
int32_t cursorOffsetPos;
|
||||||
|
|
||||||
UBool anchorStart;
|
UBool anchorStart;
|
||||||
UBool anchorEnd;
|
UBool anchorEnd;
|
||||||
|
|
||||||
|
UErrorCode ec;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* UnicodeMatcher objects corresponding to each segment.
|
||||||
|
*/
|
||||||
|
UVector segments;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The segment number from 0..n-1 of the next '(' we see
|
||||||
|
* during parsing; 0-based.
|
||||||
|
*/
|
||||||
|
int32_t nextSegmentNumber;
|
||||||
|
|
||||||
TransliteratorParser& parser;
|
TransliteratorParser& parser;
|
||||||
|
|
||||||
@ -443,22 +210,22 @@ public:
|
|||||||
RuleHalf(TransliteratorParser& parser);
|
RuleHalf(TransliteratorParser& parser);
|
||||||
~RuleHalf();
|
~RuleHalf();
|
||||||
|
|
||||||
/**
|
|
||||||
* Parse one side of a rule, stopping at either the limit,
|
|
||||||
* the END_OF_RULE character, or an operator. Return
|
|
||||||
* the pos of the terminating character (or limit).
|
|
||||||
*/
|
|
||||||
int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit);
|
int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit);
|
||||||
|
|
||||||
|
int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
|
||||||
|
UnicodeString& buf,
|
||||||
|
UBool isSegment);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Remove context.
|
* Remove context.
|
||||||
*/
|
*/
|
||||||
void removeContext();
|
void removeContext();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create and return an int[] array of segments.
|
* Create and return a UnicodeMatcher*[] array of segments,
|
||||||
|
* or NULL if there are no segments.
|
||||||
*/
|
*/
|
||||||
int32_t* createSegments(UErrorCode& status) const;
|
UnicodeMatcher** createSegments(UErrorCode& status) const;
|
||||||
|
|
||||||
int syntaxError(UErrorCode code,
|
int syntaxError(UErrorCode code,
|
||||||
const UnicodeString& rule,
|
const UnicodeString& rule,
|
||||||
@ -472,30 +239,69 @@ private:
|
|||||||
RuleHalf& operator=(const RuleHalf&);
|
RuleHalf& operator=(const RuleHalf&);
|
||||||
};
|
};
|
||||||
|
|
||||||
RuleHalf::RuleHalf(TransliteratorParser& p) : parser(p) {
|
RuleHalf::RuleHalf(TransliteratorParser& p) :
|
||||||
|
ec(U_ZERO_ERROR),
|
||||||
|
segments(ec),
|
||||||
|
parser(p)
|
||||||
|
{
|
||||||
cursor = -1;
|
cursor = -1;
|
||||||
ante = -1;
|
ante = -1;
|
||||||
post = -1;
|
post = -1;
|
||||||
segments = NULL;
|
|
||||||
maxRef = -1;
|
maxRef = -1;
|
||||||
cursorOffset = 0;
|
cursorOffset = 0;
|
||||||
|
cursorOffsetPos = 0;
|
||||||
anchorStart = anchorEnd = FALSE;
|
anchorStart = anchorEnd = FALSE;
|
||||||
|
segments.removeAllElements();
|
||||||
|
nextSegmentNumber = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
RuleHalf::~RuleHalf() {
|
RuleHalf::~RuleHalf() {
|
||||||
delete segments;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse one side of a rule, stopping at either the limit,
|
* Parse one side of a rule, stopping at either the limit,
|
||||||
* the END_OF_RULE character, or an operator. Return
|
* the END_OF_RULE character, or an operator.
|
||||||
* the pos of the terminating character (or limit).
|
* @return the index after the terminating character, or
|
||||||
|
* if limit was reached, limit
|
||||||
*/
|
*/
|
||||||
int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
||||||
int32_t start = pos;
|
int32_t start = pos;
|
||||||
UnicodeString& buf = text;
|
text.truncate(0);
|
||||||
|
pos = parseSection(rule, pos, limit, text, FALSE);
|
||||||
|
|
||||||
|
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
|
||||||
|
return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
|
||||||
|
}
|
||||||
|
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse a section of one side of a rule, stopping at either
|
||||||
|
* the limit, the END_OF_RULE character, an operator, or a
|
||||||
|
* segment close character. This method parses both a
|
||||||
|
* top-level rule half and a segment within such a rule half.
|
||||||
|
* It calls itself recursively to parse segments and nested
|
||||||
|
* segments.
|
||||||
|
* @param buf buffer into which to accumulate the rule pattern
|
||||||
|
* characters, either literal characters from the rule or
|
||||||
|
* standins for UnicodeMatcher objects including segments.
|
||||||
|
* @param isSegment if true, then we've already seen a '(' and
|
||||||
|
* pos on entry points right after it. Accumulate everything
|
||||||
|
* up to the closing ')', put it in a segment matcher object,
|
||||||
|
* generate a standin for it, and add the standin to buf. As
|
||||||
|
* a side effect, update the segments vector with a reference
|
||||||
|
* to the segment matcher. This works recursively for nested
|
||||||
|
* segments. If isSegment is false, just accumulate
|
||||||
|
* characters into buf.
|
||||||
|
* @return the index after the terminating character, or
|
||||||
|
* if limit was reached, limit
|
||||||
|
*/
|
||||||
|
int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
|
||||||
|
UnicodeString& buf,
|
||||||
|
UBool isSegment) {
|
||||||
|
int32_t start = pos;
|
||||||
ParsePosition pp;
|
ParsePosition pp;
|
||||||
int32_t cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
|
|
||||||
UnicodeString scratch;
|
UnicodeString scratch;
|
||||||
UBool done = FALSE;
|
UBool done = FALSE;
|
||||||
int32_t quoteStart = -1; // Most recent 'single quoted string'
|
int32_t quoteStart = -1; // Most recent 'single quoted string'
|
||||||
@ -503,6 +309,15 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
|||||||
int32_t varStart = -1; // Most recent $variableReference
|
int32_t varStart = -1; // Most recent $variableReference
|
||||||
int32_t varLimit = -1;
|
int32_t varLimit = -1;
|
||||||
|
|
||||||
|
// If isSegment, then bufSegStart is the offset in buf to
|
||||||
|
// the first character of the segment we are parsing.
|
||||||
|
int32_t bufSegStart = 0;
|
||||||
|
int32_t segmentNumber = 0;
|
||||||
|
if (isSegment) {
|
||||||
|
bufSegStart = buf.length();
|
||||||
|
segmentNumber = nextSegmentNumber++;
|
||||||
|
}
|
||||||
|
|
||||||
while (pos < limit && !done) {
|
while (pos < limit && !done) {
|
||||||
UChar c = rule.charAt(pos++);
|
UChar c = rule.charAt(pos++);
|
||||||
if (u_isWhitespace(c)) {
|
if (u_isWhitespace(c)) {
|
||||||
@ -511,8 +326,11 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
|||||||
// whitespace likely to be seen in code.
|
// whitespace likely to be seen in code.
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (u_strchr(gOPERATORS, c) != NULL) {
|
if (u_strchr(HALF_ENDERS, c) != NULL) {
|
||||||
--pos; // Backup to point to operator
|
if (isSegment) {
|
||||||
|
// Unclosed segment
|
||||||
|
return syntaxError(U_UNCLOSED_SEGMENT, rule, start);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (anchorEnd) {
|
if (anchorEnd) {
|
||||||
@ -575,6 +393,10 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
switch (c) {
|
switch (c) {
|
||||||
|
|
||||||
|
//------------------------------------------------------
|
||||||
|
// Elements allowed within and out of segments
|
||||||
|
//------------------------------------------------------
|
||||||
case ANCHOR_START:
|
case ANCHOR_START:
|
||||||
if (buf.length() == 0 && !anchorStart) {
|
if (buf.length() == 0 && !anchorStart) {
|
||||||
anchorStart = TRUE;
|
anchorStart = TRUE;
|
||||||
@ -584,17 +406,7 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case SEGMENT_OPEN:
|
case SEGMENT_OPEN:
|
||||||
case SEGMENT_CLOSE:
|
pos = parseSection(rule, pos, limit, buf, TRUE);
|
||||||
// Handle segment definitions "(" and ")"
|
|
||||||
// Parse "(", ")"
|
|
||||||
if (segments == NULL) {
|
|
||||||
segments = new Segments(parser.status);
|
|
||||||
}
|
|
||||||
segments->addParenthesisAt(buf.length(), c == SEGMENT_OPEN, parser.status);
|
|
||||||
break;
|
|
||||||
case END_OF_RULE:
|
|
||||||
--pos; // Backup to point to END_OF_RULE
|
|
||||||
done = TRUE;
|
|
||||||
break;
|
break;
|
||||||
case SymbolTable::SYMBOL_REF:
|
case SymbolTable::SYMBOL_REF:
|
||||||
// Handle variable references and segment references "$1" .. "$9"
|
// Handle variable references and segment references "$1" .. "$9"
|
||||||
@ -655,25 +467,128 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case DOT:
|
||||||
|
buf.append(parser.getDotStandIn());
|
||||||
|
break;
|
||||||
|
case KLEENE_STAR:
|
||||||
|
case ONE_OR_MORE:
|
||||||
|
case ZERO_OR_ONE:
|
||||||
|
// Quantifiers. We handle single characters, quoted strings,
|
||||||
|
// variable references, and segments.
|
||||||
|
// a+ matches aaa
|
||||||
|
// 'foo'+ matches foofoofoo
|
||||||
|
// $v+ matches xyxyxy if $v == xy
|
||||||
|
// (seg)+ matches segsegseg
|
||||||
|
{
|
||||||
|
if (isSegment && buf.length() == bufSegStart) {
|
||||||
|
// The */+ immediately follows '('
|
||||||
|
return syntaxError(U_MISPLACED_QUANTIFIER, rule, start);
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t qstart, qlimit;
|
||||||
|
// The */+ follows an isolated character or quote
|
||||||
|
// or variable reference
|
||||||
|
if (buf.length() == quoteLimit) {
|
||||||
|
// The */+ follows a 'quoted string'
|
||||||
|
qstart = quoteStart;
|
||||||
|
qlimit = quoteLimit;
|
||||||
|
} else if (buf.length() == varLimit) {
|
||||||
|
// The */+ follows a $variableReference
|
||||||
|
qstart = varStart;
|
||||||
|
qlimit = varLimit;
|
||||||
|
} else {
|
||||||
|
// The */+ follows a single character, possibly
|
||||||
|
// a segment standin
|
||||||
|
qstart = buf.length() - 1;
|
||||||
|
qlimit = qstart + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
UnicodeMatcher *m =
|
||||||
|
new StringMatcher(buf, qstart, qlimit, FALSE, *parser.data);
|
||||||
|
int32_t min = 0;
|
||||||
|
int32_t max = Quantifier::MAX;
|
||||||
|
switch (c) {
|
||||||
|
case ONE_OR_MORE:
|
||||||
|
min = 1;
|
||||||
|
break;
|
||||||
|
case ZERO_OR_ONE:
|
||||||
|
min = 0;
|
||||||
|
max = 1;
|
||||||
|
break;
|
||||||
|
// case KLEENE_STAR:
|
||||||
|
// do nothing -- min, max already set
|
||||||
|
}
|
||||||
|
m = new Quantifier(m, min, max);
|
||||||
|
buf.truncate(qstart);
|
||||||
|
buf.append(parser.generateStandInFor(m));
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
//------------------------------------------------------
|
||||||
|
// Elements allowed ONLY WITHIN segments
|
||||||
|
//------------------------------------------------------
|
||||||
|
case SEGMENT_CLOSE:
|
||||||
|
if (isSegment) {
|
||||||
|
// We're done parsing a segment. The relevant
|
||||||
|
// characters are in buf, starting at offset
|
||||||
|
// bufSegStart. Extract them into a string
|
||||||
|
// matcher, and replace them with a standin
|
||||||
|
// for that matcher.
|
||||||
|
StringMatcher *m =
|
||||||
|
new StringMatcher(buf, bufSegStart, buf.length(),
|
||||||
|
TRUE, *parser.data);
|
||||||
|
// Since we call parseSection() recursively,
|
||||||
|
// nested segments will result in segment i+1
|
||||||
|
// getting parsed and stored before segment i;
|
||||||
|
// be careful with the vector handling here.
|
||||||
|
if ((segmentNumber+1) > segments.size()) {
|
||||||
|
segments.setSize(segmentNumber+1);
|
||||||
|
}
|
||||||
|
segments.setElementAt(m, segmentNumber);
|
||||||
|
buf.truncate(bufSegStart);
|
||||||
|
buf.append(parser.generateStandInFor(m));
|
||||||
|
done = TRUE;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we aren't in a segment, then a segment close
|
||||||
|
// character is a syntax error.
|
||||||
|
return syntaxError(U_UNQUOTED_SPECIAL, rule, start);
|
||||||
|
|
||||||
|
//------------------------------------------------------
|
||||||
|
// Elements allowed ONLY OUTSIDE segments
|
||||||
|
//------------------------------------------------------
|
||||||
case CONTEXT_ANTE:
|
case CONTEXT_ANTE:
|
||||||
|
if (isSegment) {
|
||||||
|
return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
|
||||||
|
}
|
||||||
if (ante >= 0) {
|
if (ante >= 0) {
|
||||||
return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start);
|
return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start);
|
||||||
}
|
}
|
||||||
ante = buf.length();
|
ante = buf.length();
|
||||||
break;
|
break;
|
||||||
case CONTEXT_POST:
|
case CONTEXT_POST:
|
||||||
|
if (isSegment) {
|
||||||
|
return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
|
||||||
|
}
|
||||||
if (post >= 0) {
|
if (post >= 0) {
|
||||||
return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start);
|
return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start);
|
||||||
}
|
}
|
||||||
post = buf.length();
|
post = buf.length();
|
||||||
break;
|
break;
|
||||||
case CURSOR_POS:
|
case CURSOR_POS:
|
||||||
|
if (isSegment) {
|
||||||
|
return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
|
||||||
|
}
|
||||||
if (cursor >= 0) {
|
if (cursor >= 0) {
|
||||||
return syntaxError(U_MULTIPLE_CURSORS, rule, start);
|
return syntaxError(U_MULTIPLE_CURSORS, rule, start);
|
||||||
}
|
}
|
||||||
cursor = buf.length();
|
cursor = buf.length();
|
||||||
break;
|
break;
|
||||||
case CURSOR_OFFSET:
|
case CURSOR_OFFSET:
|
||||||
|
if (isSegment) {
|
||||||
|
return syntaxError(U_ILLEGAL_CHAR_IN_SEGMENT, rule, start);
|
||||||
|
}
|
||||||
if (cursorOffset < 0) {
|
if (cursorOffset < 0) {
|
||||||
if (buf.length() > 0) {
|
if (buf.length() > 0) {
|
||||||
return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
|
return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
|
||||||
@ -695,69 +610,11 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case DOT:
|
|
||||||
buf.append(parser.getDotStandIn());
|
|
||||||
break;
|
//------------------------------------------------------
|
||||||
case KLEENE_STAR:
|
// Non-special characters
|
||||||
case ONE_OR_MORE:
|
//------------------------------------------------------
|
||||||
case ZERO_OR_ONE:
|
|
||||||
// Quantifiers. We handle single characters, quoted strings,
|
|
||||||
// variable references, and segments.
|
|
||||||
// a+ matches aaa
|
|
||||||
// 'foo'+ matches foofoofoo
|
|
||||||
// $v+ matches xyxyxy if $v == xy
|
|
||||||
// (seg)+ matches segsegseg
|
|
||||||
{
|
|
||||||
int32_t start, limit;
|
|
||||||
UBool isOpenParen;
|
|
||||||
UBool isSegment = FALSE;
|
|
||||||
if (segments != 0 &&
|
|
||||||
segments->getLastParenOffset(isOpenParen) == buf.length()) {
|
|
||||||
// The */+ immediately follows a segment
|
|
||||||
if (isOpenParen) {
|
|
||||||
return syntaxError(U_MISPLACED_QUANTIFIER, rule, start);
|
|
||||||
}
|
|
||||||
if (!segments->extractLastParenSubstring(start, limit)) {
|
|
||||||
return syntaxError(U_MISMATCHED_SEGMENT_DELIMITERS, rule, start);
|
|
||||||
}
|
|
||||||
isSegment = TRUE;
|
|
||||||
} else {
|
|
||||||
// The */+ follows an isolated character or quote
|
|
||||||
// or variable reference
|
|
||||||
if (buf.length() == quoteLimit) {
|
|
||||||
// The */+ follows a 'quoted string'
|
|
||||||
start = quoteStart;
|
|
||||||
limit = quoteLimit;
|
|
||||||
} else if (buf.length() == varLimit) {
|
|
||||||
// The */+ follows a $variableReference
|
|
||||||
start = varStart;
|
|
||||||
limit = varLimit;
|
|
||||||
} else {
|
|
||||||
// The */+ follows a single character
|
|
||||||
start = buf.length() - 1;
|
|
||||||
limit = start + 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
UnicodeMatcher *m =
|
|
||||||
new StringMatcher(buf, start, limit, isSegment, *parser.data);
|
|
||||||
int32_t min = 0;
|
|
||||||
int32_t max = Quantifier::MAX;
|
|
||||||
switch (c) {
|
|
||||||
case ONE_OR_MORE:
|
|
||||||
min = 1;
|
|
||||||
break;
|
|
||||||
case ZERO_OR_ONE:
|
|
||||||
min = 0;
|
|
||||||
max = 1;
|
|
||||||
break;
|
|
||||||
// case KLEENE_STAR:
|
|
||||||
// do nothing -- min, max already set
|
|
||||||
}
|
|
||||||
m = new Quantifier(m, min, max);
|
|
||||||
buf.truncate(start);
|
|
||||||
buf.append(parser.generateStandInFor(m));
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
default:
|
default:
|
||||||
// Disallow unquoted characters other than [0-9A-Za-z]
|
// Disallow unquoted characters other than [0-9A-Za-z]
|
||||||
// in the printable ASCII range. These characters are
|
// in the printable ASCII range. These characters are
|
||||||
@ -773,10 +630,6 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
|
|
||||||
return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start);
|
|
||||||
}
|
|
||||||
// text = buf.toString();
|
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -797,10 +650,15 @@ void RuleHalf::removeContext() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create and return an int32_t[] array of segments.
|
* Create and return a UnicodeMatcher*[] array of segments,
|
||||||
|
* or NULL if there are no segments.
|
||||||
*/
|
*/
|
||||||
int32_t* RuleHalf::createSegments(UErrorCode& status) const {
|
UnicodeMatcher** RuleHalf::createSegments(UErrorCode& status) const {
|
||||||
return (segments == 0) ? 0 : segments->createArray(status);
|
if (segments.size() == 0) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
UnicodeMatcher** result = new UnicodeMatcher*[segments.size()];
|
||||||
|
return (UnicodeMatcher**) segments.toArray((void**) result);
|
||||||
}
|
}
|
||||||
|
|
||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
@ -1172,9 +1030,10 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
|
|||||||
return start;
|
return start;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(pos++))) == NULL) {
|
if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == NULL) {
|
||||||
return syntaxError(U_MISSING_OPERATOR, rule, start);
|
return syntaxError(U_MISSING_OPERATOR, rule, start);
|
||||||
}
|
}
|
||||||
|
++pos;
|
||||||
|
|
||||||
// Found an operator char. Check for forward-reverse operator.
|
// Found an operator char. Check for forward-reverse operator.
|
||||||
if (op == REVERSE_RULE_OP &&
|
if (op == REVERSE_RULE_OP &&
|
||||||
@ -1189,7 +1048,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (pos < limit) {
|
if (pos < limit) {
|
||||||
if (rule.charAt(pos) == END_OF_RULE) {
|
if (rule.charAt(--pos) == END_OF_RULE) {
|
||||||
++pos;
|
++pos;
|
||||||
} else {
|
} else {
|
||||||
// RuleHalf parser must have terminated at an operator
|
// RuleHalf parser must have terminated at an operator
|
||||||
@ -1251,8 +1110,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
|
|||||||
// apply.
|
// apply.
|
||||||
if (op == FWDREV_RULE_OP) {
|
if (op == FWDREV_RULE_OP) {
|
||||||
right->removeContext();
|
right->removeContext();
|
||||||
delete right->segments;
|
right->segments.removeAllElements();
|
||||||
right->segments = NULL;
|
|
||||||
left->cursor = left->maxRef = -1;
|
left->cursor = left->maxRef = -1;
|
||||||
left->cursorOffset = 0;
|
left->cursorOffset = 0;
|
||||||
}
|
}
|
||||||
@ -1272,7 +1130,7 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
|
|||||||
// cannot place the cursor outside the limits of the context.
|
// cannot place the cursor outside the limits of the context.
|
||||||
// Anchors are only allowed on the input side.
|
// Anchors are only allowed on the input side.
|
||||||
if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 ||
|
if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 ||
|
||||||
right->segments != NULL || left->maxRef >= 0 ||
|
right->segments.size() > 0 || left->maxRef >= 0 ||
|
||||||
(right->cursorOffset != 0 && right->cursor < 0) ||
|
(right->cursorOffset != 0 && right->cursor < 0) ||
|
||||||
// - The following two checks were used to ensure that
|
// - The following two checks were used to ensure that
|
||||||
// - the cursor offset stayed within the ante- or postcontext.
|
// - the cursor offset stayed within the ante- or postcontext.
|
||||||
@ -1288,20 +1146,15 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
|
|||||||
// Check integrity of segments and segment references. Each
|
// Check integrity of segments and segment references. Each
|
||||||
// segment's start must have a corresponding limit, and the
|
// segment's start must have a corresponding limit, and the
|
||||||
// references must not refer to segments that do not exist.
|
// references must not refer to segments that do not exist.
|
||||||
if (left->segments != NULL) {
|
if (right->maxRef > left->segments.size()) {
|
||||||
if (!left->segments->validate()) {
|
return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start);
|
||||||
return syntaxError(U_MISSING_SEGMENT_CLOSE, rule, start);
|
|
||||||
}
|
|
||||||
int32_t n = left->segments->count();
|
|
||||||
if (right->maxRef > n) {
|
|
||||||
return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
data->ruleSet.addRule(new TransliterationRule(
|
data->ruleSet.addRule(new TransliterationRule(
|
||||||
left->text, left->ante, left->post,
|
left->text, left->ante, left->post,
|
||||||
right->text, right->cursor, right->cursorOffset,
|
right->text, right->cursor, right->cursorOffset,
|
||||||
left->createSegments(status),
|
left->createSegments(status),
|
||||||
|
left->segments.size(),
|
||||||
left->anchorStart, left->anchorEnd,
|
left->anchorStart, left->anchorEnd,
|
||||||
data,
|
data,
|
||||||
status), status);
|
status), status);
|
||||||
@ -1366,7 +1219,7 @@ UChar TransliteratorParser::generateStandInFor(UnicodeMatcher* adopted) {
|
|||||||
if (variableNext >= variableLimit) {
|
if (variableNext >= variableLimit) {
|
||||||
// throw new RuntimeException("Private use variables exhausted");
|
// throw new RuntimeException("Private use variables exhausted");
|
||||||
delete adopted;
|
delete adopted;
|
||||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
status = U_VARIABLE_RANGE_EXHAUSTED;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
variablesVector->addElement(adopted, status);
|
variablesVector->addElement(adopted, status);
|
||||||
|
@ -14,28 +14,11 @@
|
|||||||
#include "unicode/uniset.h"
|
#include "unicode/uniset.h"
|
||||||
#include "unicode/unicode.h"
|
#include "unicode/unicode.h"
|
||||||
#include "cmemory.h"
|
#include "cmemory.h"
|
||||||
|
#include "strmatch.h"
|
||||||
|
|
||||||
static const UChar APOSTROPHE = 0x0027; // '\''
|
static const UChar APOSTROPHE = 0x0027; // '\''
|
||||||
static const UChar BACKSLASH = 0x005C; // '\'
|
static const UChar BACKSLASH = 0x005C; // '\'
|
||||||
|
|
||||||
// To process segments we need to allocate arrays of integers. We use
|
|
||||||
// stack storage as long as the segment count is <= MAX_STATIC_SEGS.
|
|
||||||
// Otherwise, we allocate heap space.
|
|
||||||
#define MAX_STATIC_SEGS 20
|
|
||||||
|
|
||||||
// Macros for accessing the array of integers encoding the position of
|
|
||||||
// SEGMENTS_COUNT number of segments, n (half the number of parens)
|
|
||||||
// SEGMENTS_LEN length of the segments array (number of elements)
|
|
||||||
// SEGMENTS_POS position in 'pattern' of parenthesis i, where i=0..2n-1
|
|
||||||
// SEGMENTS_NUM index into segments to access POS of $1.open,
|
|
||||||
// $1.close, $2.open, $2.close,.., $n.open, $n.close
|
|
||||||
// Relative to FIRST_SEG_POS_INDEX. Ranges from 0..2n-1.
|
|
||||||
#define FIRST_SEG_POS_INDEX 2
|
|
||||||
#define SEGMENTS_COUNT(x) x[0]
|
|
||||||
#define SEGMENTS_LEN(x) (SEGMENTS_COUNT(x)*4+4)
|
|
||||||
#define SEGMENTS_POS(x,i) x[FIRST_SEG_POS_INDEX+i]
|
|
||||||
#define SEGMENTS_NUM(x,i) (x[x[1]+i]-FIRST_SEG_POS_INDEX)
|
|
||||||
|
|
||||||
U_NAMESPACE_BEGIN
|
U_NAMESPACE_BEGIN
|
||||||
|
|
||||||
const UChar TransliterationRule::ETHER = 0xFFFF;
|
const UChar TransliterationRule::ETHER = 0xFFFF;
|
||||||
@ -56,11 +39,10 @@ const UChar TransliterationRule::ETHER = 0xFFFF;
|
|||||||
* <code>output</code>; that is, -1 is equivalent to
|
* <code>output</code>; that is, -1 is equivalent to
|
||||||
* <code>output.length()</code>. If greater than
|
* <code>output.length()</code>. If greater than
|
||||||
* <code>output.length()</code> then an exception is thrown.
|
* <code>output.length()</code> then an exception is thrown.
|
||||||
* @param adoptedSegs array of 2n integers. Each of n pairs consists of offset,
|
* @param segs array of UnicodeMatcher corresponding to input pattern
|
||||||
* limit for a segment of the input string. Characters in the output string
|
* segments, or null if there are none. The array itself is adopted,
|
||||||
* refer to these segments if they are in a special range determined by the
|
* but the pointers within it are not.
|
||||||
* associated RuleBasedTransliterator.Data object. May be null if there are
|
* @param segsCount number of elements in segs[]
|
||||||
* no segments.
|
|
||||||
* @param anchorStart TRUE if the rule is anchored on the left to
|
* @param anchorStart TRUE if the rule is anchored on the left to
|
||||||
* the context start
|
* the context start
|
||||||
* @param anchorEnd TRUE if the rule is anchored on the right to the
|
* @param anchorEnd TRUE if the rule is anchored on the right to the
|
||||||
@ -70,7 +52,8 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
|
|||||||
int32_t anteContextPos, int32_t postContextPos,
|
int32_t anteContextPos, int32_t postContextPos,
|
||||||
const UnicodeString& outputStr,
|
const UnicodeString& outputStr,
|
||||||
int32_t cursorPosition, int32_t cursorOffset,
|
int32_t cursorPosition, int32_t cursorOffset,
|
||||||
int32_t* adoptedSegs,
|
UnicodeMatcher** segs,
|
||||||
|
int32_t segsCount,
|
||||||
UBool anchorStart, UBool anchorEnd,
|
UBool anchorStart, UBool anchorEnd,
|
||||||
const TransliterationRuleData* theData,
|
const TransliterationRuleData* theData,
|
||||||
UErrorCode& status) :
|
UErrorCode& status) :
|
||||||
@ -113,23 +96,11 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
|
|||||||
this->cursorPos = cursorPosition + cursorOffset;
|
this->cursorPos = cursorPosition + cursorOffset;
|
||||||
this->output = outputStr;
|
this->output = outputStr;
|
||||||
// We don't validate the segments array. The caller must
|
// We don't validate the segments array. The caller must
|
||||||
// guarantee that the segments are well-formed.
|
// guarantee that the segments are well-formed (that is, that
|
||||||
this->segments = adoptedSegs;
|
// all $n references in the output refer to indices of this
|
||||||
// Find the position of the first segment index that is after the
|
// array, and that no array elements are null).
|
||||||
// anteContext (in the key). Note that this may be a start or a
|
this->segments = segs;
|
||||||
// limit index. If all segments are in the ante context,
|
this->segmentsCount = segsCount;
|
||||||
// firstKeySeg should point past the last segment -- that is, it
|
|
||||||
// should point at the end marker, which is -1. This allows the
|
|
||||||
// code to back up by one to obtain the last ante context segment.
|
|
||||||
firstKeySeg = -1;
|
|
||||||
if (segments != 0) {
|
|
||||||
firstKeySeg = FIRST_SEG_POS_INDEX;
|
|
||||||
while (segments[firstKeySeg] >= 0 &&
|
|
||||||
segments[firstKeySeg] < anteContextLength) {
|
|
||||||
++firstKeySeg;
|
|
||||||
}
|
|
||||||
firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
|
|
||||||
}
|
|
||||||
|
|
||||||
pattern = input;
|
pattern = input;
|
||||||
flags = 0;
|
flags = 0;
|
||||||
@ -149,18 +120,17 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
|
|||||||
TransliterationRule::TransliterationRule(TransliterationRule& other) :
|
TransliterationRule::TransliterationRule(TransliterationRule& other) :
|
||||||
pattern(other.pattern),
|
pattern(other.pattern),
|
||||||
output(other.output),
|
output(other.output),
|
||||||
firstKeySeg(other.firstKeySeg),
|
|
||||||
anteContextLength(other.anteContextLength),
|
anteContextLength(other.anteContextLength),
|
||||||
keyLength(other.keyLength),
|
keyLength(other.keyLength),
|
||||||
cursorPos(other.cursorPos),
|
cursorPos(other.cursorPos),
|
||||||
flags(other.flags),
|
flags(other.flags),
|
||||||
data(other.data) {
|
data(other.data) {
|
||||||
|
|
||||||
segments = 0;
|
segments = NULL;
|
||||||
if (other.segments != 0) {
|
segmentsCount = 0;
|
||||||
int32_t len = SEGMENTS_LEN(other.segments);
|
if (other.segmentsCount > 0) {
|
||||||
segments = new int32_t[len];
|
segments = new UnicodeMatcher*[other.segmentsCount];
|
||||||
uprv_memcpy(segments, other.segments, len*sizeof(segments[0]));
|
uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -341,26 +311,12 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||||||
|
|
||||||
// ============================ MATCH ===========================
|
// ============================ MATCH ===========================
|
||||||
|
|
||||||
// Record the actual positions, in the text, of the segments.
|
// Reset segment match data
|
||||||
// These are recorded in the order that they occur in the pattern.
|
if (segments != NULL) {
|
||||||
|
for (int32_t i=0; i<segmentsCount; ++i) {
|
||||||
// segPos[] is an array of 2*SEGMENTS_COUNT elements. It
|
((StringMatcher*) segments[i])->resetMatch();
|
||||||
// records the position in 'text' of each segment boundary, in
|
}
|
||||||
// the order that they occur in 'pattern'.
|
|
||||||
int32_t _segPos[2*MAX_STATIC_SEGS];
|
|
||||||
int32_t *segPos = _segPos;
|
|
||||||
if (segments != 0 && SEGMENTS_COUNT(segments) > MAX_STATIC_SEGS) {
|
|
||||||
segPos = new int32_t[2*SEGMENTS_COUNT(segments)];
|
|
||||||
}
|
}
|
||||||
// iSeg is an index into segments[] that accesses the first
|
|
||||||
// array. As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
|
|
||||||
// When indexing into segments[] FIRST_SEG_POS_INDEX must be
|
|
||||||
// added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
|
|
||||||
int32_t iSeg = firstKeySeg - 1;
|
|
||||||
// nextSegPos is an offset in 'pattern'. When the cursor is
|
|
||||||
// equal to nextSegPos, we are at a segment boundary, and we
|
|
||||||
// record the position in the real text in segPos[].
|
|
||||||
int32_t nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
|
|
||||||
|
|
||||||
UMatchDegree m;
|
UMatchDegree m;
|
||||||
int32_t lenDelta, keyLimit;
|
int32_t lenDelta, keyLimit;
|
||||||
@ -386,26 +342,15 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||||||
keyChar == text.charAt(oText)) {
|
keyChar == text.charAt(oText)) {
|
||||||
--oText;
|
--oText;
|
||||||
} else {
|
} else {
|
||||||
m = U_MISMATCH;
|
return U_MISMATCH;
|
||||||
goto exit;
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Subtract 1 from contextStart to make it a reverse limit
|
// Subtract 1 from contextStart to make it a reverse limit
|
||||||
if (matcher->matches(text, oText, pos.contextStart-1, FALSE)
|
if (matcher->matches(text, oText, pos.contextStart-1, FALSE)
|
||||||
!= U_MATCH) {
|
!= U_MATCH) {
|
||||||
m = U_MISMATCH;
|
return U_MISMATCH;
|
||||||
goto exit;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
while (nextSegPos == oPattern) {
|
|
||||||
segPos[iSeg] = oText;
|
|
||||||
if (oText >= 0) {
|
|
||||||
segPos[iSeg] += UTF_CHAR_LENGTH(text.char32At(oText));
|
|
||||||
} else {
|
|
||||||
++segPos[iSeg];
|
|
||||||
}
|
|
||||||
nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
minOText = posAfter(text, oText);
|
minOText = posAfter(text, oText);
|
||||||
@ -413,15 +358,11 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||||||
// ------------------------ Start Anchor ------------------------
|
// ------------------------ Start Anchor ------------------------
|
||||||
|
|
||||||
if ((flags & ANCHOR_START) && oText != posBefore(text, pos.contextStart)) {
|
if ((flags & ANCHOR_START) && oText != posBefore(text, pos.contextStart)) {
|
||||||
m = U_MISMATCH;
|
return U_MISMATCH;
|
||||||
goto exit;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// -------------------- Key and Post Context --------------------
|
// -------------------- Key and Post Context --------------------
|
||||||
|
|
||||||
iSeg = firstKeySeg;
|
|
||||||
nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
|
|
||||||
|
|
||||||
oPattern = 0;
|
oPattern = 0;
|
||||||
oText = pos.start;
|
oText = pos.start;
|
||||||
keyLimit = 0;
|
keyLimit = 0;
|
||||||
@ -429,8 +370,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||||||
if (incremental && oText == pos.limit) {
|
if (incremental && oText == pos.limit) {
|
||||||
// We've reached the limit without a mismatch and
|
// We've reached the limit without a mismatch and
|
||||||
// without completing our match.
|
// without completing our match.
|
||||||
m = U_PARTIAL_MATCH;
|
return U_PARTIAL_MATCH;
|
||||||
goto exit;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// It might seem that we could do a check like this here:
|
// It might seem that we could do a check like this here:
|
||||||
@ -445,10 +385,6 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||||||
// depending on whether we're in the key or in the post
|
// depending on whether we're in the key or in the post
|
||||||
// context.
|
// context.
|
||||||
|
|
||||||
while (oPattern == nextSegPos) {
|
|
||||||
segPos[iSeg] = oText;
|
|
||||||
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
|
|
||||||
}
|
|
||||||
if (oPattern == keyLength) {
|
if (oPattern == keyLength) {
|
||||||
keyLimit = oText;
|
keyLimit = oText;
|
||||||
}
|
}
|
||||||
@ -467,13 +403,12 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||||||
keyChar == text.charAt(oText)) {
|
keyChar == text.charAt(oText)) {
|
||||||
++oText;
|
++oText;
|
||||||
} else {
|
} else {
|
||||||
m = U_MISMATCH;
|
return U_MISMATCH;
|
||||||
goto exit;
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
m = matcher->matches(text, oText, matchLimit, incremental);
|
m = matcher->matches(text, oText, matchLimit, incremental);
|
||||||
if (m != U_MATCH) {
|
if (m != U_MATCH) {
|
||||||
goto exit;
|
return m;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -486,10 +421,6 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||||||
//! // at the end of the key.
|
//! // at the end of the key.
|
||||||
//! return UnicodeMatcher.U_MISMATCH;
|
//! return UnicodeMatcher.U_MISMATCH;
|
||||||
//!}
|
//!}
|
||||||
}
|
|
||||||
while (oPattern == nextSegPos) {
|
|
||||||
segPos[iSeg] = oText;
|
|
||||||
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
|
|
||||||
}
|
}
|
||||||
if (oPattern == keyLength) {
|
if (oPattern == keyLength) {
|
||||||
keyLimit = oText;
|
keyLimit = oText;
|
||||||
@ -509,8 +440,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||||||
// =========================== REPLACE ==========================
|
// =========================== REPLACE ==========================
|
||||||
|
|
||||||
// We have a full match. The key is between pos.start and
|
// We have a full match. The key is between pos.start and
|
||||||
// keyLimit. Segment indices have been recorded in segPos[].
|
// keyLimit.
|
||||||
// Perform a replacement.
|
|
||||||
|
|
||||||
if (segments == NULL) {
|
if (segments == NULL) {
|
||||||
text.handleReplaceBetween(pos.start, keyLimit, output);
|
text.handleReplaceBetween(pos.start, keyLimit, output);
|
||||||
@ -561,12 +491,23 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||||||
dest += buf.length();
|
dest += buf.length();
|
||||||
buf.remove();
|
buf.remove();
|
||||||
}
|
}
|
||||||
// Copy segment with out-of-band data
|
// Copy segment with out-of-band data
|
||||||
b *= 2;
|
StringMatcher* m = (StringMatcher*) segments[b];
|
||||||
int32_t start = segPos[SEGMENTS_NUM(segments,b)];
|
int32_t start = m->getMatchStart();
|
||||||
int32_t limit = segPos[SEGMENTS_NUM(segments,b+1)];
|
int32_t limit = m->getMatchLimit();
|
||||||
text.copy(start, limit, dest);
|
// If there was no match, that means that a quantifier
|
||||||
dest += limit - start;
|
// matched zero-length. E.g., x (a)* y matched "xy".
|
||||||
|
if (start >= 0) {
|
||||||
|
// Adjust indices for segments in post context
|
||||||
|
// for any inserted text between the key and
|
||||||
|
// the post context.
|
||||||
|
if (start >= keyLimit) {
|
||||||
|
start += dest - keyLimit;
|
||||||
|
limit += dest - keyLimit;
|
||||||
|
}
|
||||||
|
text.copy(start, limit, dest);
|
||||||
|
dest += limit - start;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
oOutput += UTF_CHAR_LENGTH(c);
|
oOutput += UTF_CHAR_LENGTH(c);
|
||||||
}
|
}
|
||||||
@ -600,13 +541,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||||||
pos.contextLimit += lenDelta;
|
pos.contextLimit += lenDelta;
|
||||||
// Restrict new value of start to [minOText, min(oText, pos.limit)].
|
// Restrict new value of start to [minOText, min(oText, pos.limit)].
|
||||||
pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart));
|
pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart));
|
||||||
m = U_MATCH;
|
return U_MATCH;
|
||||||
|
|
||||||
exit:
|
|
||||||
if (segPos != _segPos) {
|
|
||||||
delete[] segPos;
|
|
||||||
}
|
|
||||||
return m;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -727,23 +662,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
|
|||||||
UBool escapeUnprintable) const {
|
UBool escapeUnprintable) const {
|
||||||
int32_t i;
|
int32_t i;
|
||||||
|
|
||||||
// iseg indexes into segments[] directly (not offset from FSPI)
|
|
||||||
int32_t iseg = FIRST_SEG_POS_INDEX-1;
|
|
||||||
int32_t nextSeg = -1;
|
|
||||||
// Build an array of booleans specifying open vs. close paren
|
|
||||||
UBool _isOpen[2*MAX_STATIC_SEGS];
|
|
||||||
UBool *isOpen = _isOpen;
|
|
||||||
if (segments != 0) {
|
|
||||||
if (SEGMENTS_COUNT(segments) > MAX_STATIC_SEGS) {
|
|
||||||
isOpen = new UBool[2*SEGMENTS_COUNT(segments)];
|
|
||||||
}
|
|
||||||
for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
|
|
||||||
isOpen[SEGMENTS_NUM(segments,i) ] = TRUE;
|
|
||||||
isOpen[SEGMENTS_NUM(segments,i+1)] = FALSE;
|
|
||||||
}
|
|
||||||
nextSeg = segments[++iseg];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Accumulate special characters (and non-specials following them)
|
// Accumulate special characters (and non-specials following them)
|
||||||
// into quoteBuf. Append quoteBuf, within single quotes, when
|
// into quoteBuf. Append quoteBuf, within single quotes, when
|
||||||
// a non-quoted element must be inserted.
|
// a non-quoted element must be inserted.
|
||||||
@ -765,14 +683,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
|
|||||||
appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);
|
appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Append either '(' or ')' if we are at a segment index
|
|
||||||
if (i == nextSeg) {
|
|
||||||
appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
|
|
||||||
(UChar)0x0028 : (UChar)0x0029,
|
|
||||||
TRUE, escapeUnprintable, quoteBuf);
|
|
||||||
nextSeg = segments[++iseg];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (emitBraces && i == (anteContextLength + keyLength)) {
|
if (emitBraces && i == (anteContextLength + keyLength)) {
|
||||||
appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
|
appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
|
||||||
}
|
}
|
||||||
@ -787,11 +697,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i == nextSeg) {
|
|
||||||
// assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
|
|
||||||
appendToRule(rule, (UChar)0x0029 /*)*/, TRUE, escapeUnprintable, quoteBuf);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (emitBraces && i == (anteContextLength + keyLength)) {
|
if (emitBraces && i == (anteContextLength + keyLength)) {
|
||||||
appendToRule(rule, (UChar)0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
|
appendToRule(rule, (UChar)0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
|
||||||
}
|
}
|
||||||
@ -854,9 +759,6 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
|
|||||||
|
|
||||||
appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);
|
appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);
|
||||||
|
|
||||||
if (isOpen != _isOpen) {
|
|
||||||
delete[] isOpen;
|
|
||||||
}
|
|
||||||
return rule;
|
return rule;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -33,6 +33,16 @@ class TransliterationRuleData;
|
|||||||
* Variables are detected by looking up each character in a supplied
|
* Variables are detected by looking up each character in a supplied
|
||||||
* variable list to see if it has been so defined.
|
* variable list to see if it has been so defined.
|
||||||
*
|
*
|
||||||
|
* <p>A rule may contain segments in its input string and segment
|
||||||
|
* references in its output string. A segment is a substring of the
|
||||||
|
* input pattern, indicated by an offset and limit. The segment may
|
||||||
|
* be in the preceding or following context. It may not span a
|
||||||
|
* context boundary. A segment reference is a special character in
|
||||||
|
* the output string that causes a segment of the input string (not
|
||||||
|
* the input pattern) to be copied to the output string. The range of
|
||||||
|
* special characters that represent segment references is defined by
|
||||||
|
* RuleBasedTransliterator.Data.
|
||||||
|
*
|
||||||
* @author Alan Liu
|
* @author Alan Liu
|
||||||
*/
|
*/
|
||||||
class TransliterationRule {
|
class TransliterationRule {
|
||||||
@ -65,20 +75,20 @@ private:
|
|||||||
UnicodeString output;
|
UnicodeString output;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An array of integers encoding the position of the segments.
|
* An array of matcher objects corresponding to the input pattern
|
||||||
* See rbt_pars.cpp::Segments for more details.
|
* segments. If there are no segments this is null. N.B. This is
|
||||||
|
* a UnicodeMatcher for generality, but in practice it is always a
|
||||||
|
* StringMatcher. In the future we may generalize this, but for
|
||||||
|
* now we sometimes cast down to StringMatcher.
|
||||||
|
*
|
||||||
|
* The array is owned, but the pointers within it are not.
|
||||||
*/
|
*/
|
||||||
int32_t* segments;
|
UnicodeMatcher** segments;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A value we compute from segments. The first index into segments[]
|
* The number of elements in segments[] or zero if segments is NULL.
|
||||||
* that is >= anteContextLength. That is, the first one that is within
|
|
||||||
* the forward scanned part of the pattern -- the key or the postContext.
|
|
||||||
* If there are no segments, this has the value -1. This index is relative
|
|
||||||
* to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
|
|
||||||
* segments[FIRST_SEG_POS_INDEX + firstKeySeg].
|
|
||||||
*/
|
*/
|
||||||
int32_t firstKeySeg;
|
int32_t segmentsCount;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The length of the string that must match before the key. If
|
* The length of the string that must match before the key. If
|
||||||
@ -143,11 +153,10 @@ public:
|
|||||||
* 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
|
* 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
|
||||||
* "xyz" and moves the cursor to before "a". It would have a cursorOffset
|
* "xyz" and moves the cursor to before "a". It would have a cursorOffset
|
||||||
* of -3.
|
* of -3.
|
||||||
* @param adoptedSegs array of 2n integers. Each of n pairs consists of offset,
|
* @param segs array of UnicodeMatcher corresponding to input pattern
|
||||||
* limit for a segment of the input string. Characters in the output string
|
* segments, or null if there are none. The array itself is adopted,
|
||||||
* refer to these segments if they are in a special range determined by the
|
* but the pointers within it are not.
|
||||||
* associated RuleBasedTransliterator.Data object. May be null if there are
|
* @param segsCount number of elements in segs[]
|
||||||
* no segments.
|
|
||||||
* @param anchorStart TRUE if the rule is anchored on the left to
|
* @param anchorStart TRUE if the rule is anchored on the left to
|
||||||
* the context start
|
* the context start
|
||||||
* @param anchorEnd TRUE if the rule is anchored on the right to the
|
* @param anchorEnd TRUE if the rule is anchored on the right to the
|
||||||
@ -157,7 +166,8 @@ public:
|
|||||||
int32_t anteContextPos, int32_t postContextPos,
|
int32_t anteContextPos, int32_t postContextPos,
|
||||||
const UnicodeString& outputStr,
|
const UnicodeString& outputStr,
|
||||||
int32_t cursorPosition, int32_t cursorOffset,
|
int32_t cursorPosition, int32_t cursorOffset,
|
||||||
int32_t* adoptedSegs,
|
UnicodeMatcher** segs,
|
||||||
|
int32_t segsCount,
|
||||||
UBool anchorStart, UBool anchorEnd,
|
UBool anchorStart, UBool anchorEnd,
|
||||||
const TransliterationRuleData* data,
|
const TransliterationRuleData* data,
|
||||||
UErrorCode& status);
|
UErrorCode& status);
|
||||||
|
@ -18,7 +18,9 @@ StringMatcher::StringMatcher(const UnicodeString& theString,
|
|||||||
UBool isSeg,
|
UBool isSeg,
|
||||||
const TransliterationRuleData& theData) :
|
const TransliterationRuleData& theData) :
|
||||||
data(theData),
|
data(theData),
|
||||||
isSegment(isSeg)
|
isSegment(isSeg),
|
||||||
|
matchStart(-1),
|
||||||
|
matchLimit(-1)
|
||||||
{
|
{
|
||||||
theString.extractBetween(start, limit, pattern);
|
theString.extractBetween(start, limit, pattern);
|
||||||
}
|
}
|
||||||
@ -27,7 +29,9 @@ StringMatcher::StringMatcher(const StringMatcher& o) :
|
|||||||
UnicodeMatcher(o),
|
UnicodeMatcher(o),
|
||||||
pattern(o.pattern),
|
pattern(o.pattern),
|
||||||
data(o.data),
|
data(o.data),
|
||||||
isSegment(o.isSegment)
|
isSegment(o.isSegment),
|
||||||
|
matchStart(o.matchStart),
|
||||||
|
matchLimit(o.matchStart)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -54,6 +58,7 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
|
|||||||
int32_t i;
|
int32_t i;
|
||||||
int32_t cursor = offset;
|
int32_t cursor = offset;
|
||||||
if (limit < cursor) {
|
if (limit < cursor) {
|
||||||
|
// Match in the reverse direction
|
||||||
for (i=pattern.length()-1; i>=0; --i) {
|
for (i=pattern.length()-1; i>=0; --i) {
|
||||||
UChar keyChar = pattern.charAt(i);
|
UChar keyChar = pattern.charAt(i);
|
||||||
const UnicodeMatcher* subm = data.lookup(keyChar);
|
const UnicodeMatcher* subm = data.lookup(keyChar);
|
||||||
@ -72,6 +77,14 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Record the match position, but adjust for a normal
|
||||||
|
// forward start, limit, and only if a prior match does not
|
||||||
|
// exist -- we want the rightmost match.
|
||||||
|
if (matchStart < 0) {
|
||||||
|
// cast away const -- should modify method to be non-const
|
||||||
|
((StringMatcher*)this)->matchStart = cursor+1;
|
||||||
|
((StringMatcher*)this)->matchLimit = offset+1;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
for (i=0; i<pattern.length(); ++i) {
|
for (i=0; i<pattern.length(); ++i) {
|
||||||
if (incremental && cursor == limit) {
|
if (incremental && cursor == limit) {
|
||||||
@ -99,6 +112,10 @@ UMatchDegree StringMatcher::matches(const Replaceable& text,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Record the match position
|
||||||
|
// cast away const -- should modify method to be non-const
|
||||||
|
((StringMatcher*)this)->matchStart = offset;
|
||||||
|
((StringMatcher*)this)->matchLimit = cursor;
|
||||||
}
|
}
|
||||||
|
|
||||||
offset = cursor;
|
offset = cursor;
|
||||||
@ -128,8 +145,8 @@ UnicodeString& StringMatcher::toPattern(UnicodeString& result,
|
|||||||
result.append((UChar)41); /*)*/
|
result.append((UChar)41); /*)*/
|
||||||
}
|
}
|
||||||
// Flush quoteBuf out to result
|
// Flush quoteBuf out to result
|
||||||
TransliterationRule::appendToRule(result, (UChar32)(isSegment?41/*)*/:-1),
|
TransliterationRule::appendToRule(result, -1,
|
||||||
TRUE, escapeUnprintable, quoteBuf);
|
TRUE, escapeUnprintable, quoteBuf);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -145,6 +162,32 @@ UBool StringMatcher::matchesIndexValue(uint8_t v) const {
|
|||||||
return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
|
return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove any match data. This must be called before performing a
|
||||||
|
* set of matches with this segment.
|
||||||
|
*/
|
||||||
|
void StringMatcher::resetMatch() {
|
||||||
|
matchStart = matchLimit = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the start offset, in the match text, of the <em>rightmost</em>
|
||||||
|
* match. This method may get moved up into the UnicodeMatcher if
|
||||||
|
* it turns out to be useful to generalize this.
|
||||||
|
*/
|
||||||
|
int32_t StringMatcher::getMatchStart() const {
|
||||||
|
return matchStart;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the limit offset, in the match text, of the <em>rightmost</em>
|
||||||
|
* match. This method may get moved up into the UnicodeMatcher if
|
||||||
|
* it turns out to be useful to generalize this.
|
||||||
|
*/
|
||||||
|
int32_t StringMatcher::getMatchLimit() const {
|
||||||
|
return matchLimit;
|
||||||
|
}
|
||||||
|
|
||||||
U_NAMESPACE_END
|
U_NAMESPACE_END
|
||||||
|
|
||||||
//eof
|
//eof
|
||||||
|
@ -59,6 +59,26 @@ class StringMatcher : public UnicodeMatcher {
|
|||||||
*/
|
*/
|
||||||
virtual UBool matchesIndexValue(uint8_t v) const;
|
virtual UBool matchesIndexValue(uint8_t v) const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove any match data. This must be called before performing a
|
||||||
|
* set of matches with this segment.
|
||||||
|
*/
|
||||||
|
void resetMatch();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the start offset, in the match text, of the <em>rightmost</em>
|
||||||
|
* match. This method may get moved up into the UnicodeMatcher if
|
||||||
|
* it turns out to be useful to generalize this.
|
||||||
|
*/
|
||||||
|
int32_t getMatchStart() const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the limit offset, in the match text, of the <em>rightmost</em>
|
||||||
|
* match. This method may get moved up into the UnicodeMatcher if
|
||||||
|
* it turns out to be useful to generalize this.
|
||||||
|
*/
|
||||||
|
int32_t getMatchLimit() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
UnicodeString pattern;
|
UnicodeString pattern;
|
||||||
@ -66,6 +86,10 @@ class StringMatcher : public UnicodeMatcher {
|
|||||||
const TransliterationRuleData& data;
|
const TransliterationRuleData& data;
|
||||||
|
|
||||||
UBool isSegment;
|
UBool isSegment;
|
||||||
|
|
||||||
|
int32_t matchStart;
|
||||||
|
|
||||||
|
int32_t matchLimit;
|
||||||
};
|
};
|
||||||
|
|
||||||
U_NAMESPACE_END
|
U_NAMESPACE_END
|
||||||
|
@ -5,8 +5,8 @@
|
|||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
|
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
|
||||||
* $Date: 2001/10/26 22:59:26 $
|
* $Date: 2001/10/30 18:08:19 $
|
||||||
* $Revision: 1.57 $
|
* $Revision: 1.58 $
|
||||||
*
|
*
|
||||||
*****************************************************************************************
|
*****************************************************************************************
|
||||||
*/
|
*/
|
||||||
@ -1268,9 +1268,11 @@ public class TransliteratorTest extends TestFmwk {
|
|||||||
"c abc ababc",
|
"c abc ababc",
|
||||||
"d d abd");
|
"d d abd");
|
||||||
|
|
||||||
|
// NOTE: The (ab)+ when referenced just yields a single "ab",
|
||||||
|
// not the full sequence of them. This accords with perl behavior.
|
||||||
expect("(ab)+ {x} > '(' $1 ')';",
|
expect("(ab)+ {x} > '(' $1 ')';",
|
||||||
"x abx ababxy",
|
"x abx ababxy",
|
||||||
"x ab(ab) abab(abab)y");
|
"x ab(ab) abab(ab)y");
|
||||||
|
|
||||||
expect("b+ > x;",
|
expect("b+ > x;",
|
||||||
"ac abc abbc abbbc",
|
"ac abc abbc abbbc",
|
||||||
@ -1288,12 +1290,11 @@ public class TransliteratorTest extends TestFmwk {
|
|||||||
"qa qab qaba qababc",
|
"qa qab qaba qababc",
|
||||||
"xa x xa xc");
|
"xa x xa xc");
|
||||||
|
|
||||||
// Oddity -- "(foo)* > $1" causes $1 to match the run of "foo"s
|
// NOTE: The (ab)+ when referenced just yields a single "ab",
|
||||||
// In perl, it only matches the first occurrence, so the output
|
// not the full sequence of them. This accords with perl behavior.
|
||||||
// is "()a (ab) (ab)a (ab)c".
|
|
||||||
expect("q(ab)* > '(' $1 ')';",
|
expect("q(ab)* > '(' $1 ')';",
|
||||||
"qa qab qaba qababc",
|
"qa qab qaba qababc",
|
||||||
"()a (ab) (ab)a (abab)c");
|
"()a (ab) (ab)a (ab)c");
|
||||||
|
|
||||||
// 'foo'+ and 'foo'* -- the quantifier should apply to the entire
|
// 'foo'+ and 'foo'* -- the quantifier should apply to the entire
|
||||||
// quoted string
|
// quoted string
|
||||||
@ -1574,6 +1575,46 @@ public class TransliteratorTest extends TestFmwk {
|
|||||||
expect(gr, "\u03B1\u0314", "ha");
|
expect(gr, "\u03B1\u0314", "ha");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test quantified segment behavior. We want:
|
||||||
|
* ([abc])+ > x $1 x; applied to "cba" produces "xax"
|
||||||
|
*/
|
||||||
|
public void TestQuantifiedSegment() {
|
||||||
|
// The normal case
|
||||||
|
expect("([abc]+) > x $1 x;", "cba", "xcbax");
|
||||||
|
|
||||||
|
// The tricky case; the quantifier is around the segment
|
||||||
|
expect("([abc])+ > x $1 x;", "cba", "xax");
|
||||||
|
|
||||||
|
// Tricky case in reverse direction
|
||||||
|
expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");
|
||||||
|
|
||||||
|
// Check post-context segment
|
||||||
|
expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");
|
||||||
|
|
||||||
|
// Test toRule/toPattern for non-quantified segment.
|
||||||
|
// Careful with spacing here.
|
||||||
|
String r = "([a-c]){q} > x $1 x;";
|
||||||
|
Transliterator t = Transliterator.createFromRules("ID", r, Transliterator.FORWARD);
|
||||||
|
String rr = t.toRules(true);
|
||||||
|
if (!r.equals(rr)) {
|
||||||
|
errln("FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||||
|
} else {
|
||||||
|
logln("Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test toRule/toPattern for quantified segment.
|
||||||
|
// Careful with spacing here.
|
||||||
|
r = "([a-c])+{q} > x $1 x;";
|
||||||
|
t = Transliterator.createFromRules("ID", r, Transliterator.FORWARD);
|
||||||
|
rr = t.toRules(true);
|
||||||
|
if (!r.equals(rr)) {
|
||||||
|
errln("FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||||
|
} else {
|
||||||
|
logln("Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//======================================================================
|
//======================================================================
|
||||||
// icu4j ONLY
|
// icu4j ONLY
|
||||||
// These tests are not mirrored (yet) in icu4c at
|
// These tests are not mirrored (yet) in icu4c at
|
||||||
|
@ -5,8 +5,8 @@
|
|||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/StringMatcher.java,v $
|
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/StringMatcher.java,v $
|
||||||
* $Date: 2001/10/25 22:32:02 $
|
* $Date: 2001/10/30 18:04:08 $
|
||||||
* $Revision: 1.2 $
|
* $Revision: 1.3 $
|
||||||
*
|
*
|
||||||
*****************************************************************************************
|
*****************************************************************************************
|
||||||
*/
|
*/
|
||||||
@ -18,16 +18,27 @@ class StringMatcher implements UnicodeMatcher {
|
|||||||
|
|
||||||
private boolean isSegment;
|
private boolean isSegment;
|
||||||
|
|
||||||
|
private int matchStart;
|
||||||
|
|
||||||
|
private int matchLimit;
|
||||||
|
|
||||||
private final RuleBasedTransliterator.Data data;
|
private final RuleBasedTransliterator.Data data;
|
||||||
|
|
||||||
|
public StringMatcher(String theString,
|
||||||
|
boolean isSeg,
|
||||||
|
RuleBasedTransliterator.Data theData) {
|
||||||
|
data = theData;
|
||||||
|
isSegment = isSeg;
|
||||||
|
pattern = theString;
|
||||||
|
matchStart = matchLimit = -1;
|
||||||
|
}
|
||||||
|
|
||||||
public StringMatcher(String theString,
|
public StringMatcher(String theString,
|
||||||
int start,
|
int start,
|
||||||
int limit,
|
int limit,
|
||||||
boolean isSeg,
|
boolean isSeg,
|
||||||
RuleBasedTransliterator.Data theData) {
|
RuleBasedTransliterator.Data theData) {
|
||||||
data = theData;
|
this(theString.substring(start, limit), isSeg, theData);
|
||||||
isSegment = isSeg;
|
|
||||||
pattern = theString.substring(start, limit);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -40,6 +51,7 @@ class StringMatcher implements UnicodeMatcher {
|
|||||||
int i;
|
int i;
|
||||||
int[] cursor = new int[] { offset[0] };
|
int[] cursor = new int[] { offset[0] };
|
||||||
if (limit < cursor[0]) {
|
if (limit < cursor[0]) {
|
||||||
|
// Match in the reverse direction
|
||||||
for (i=pattern.length()-1; i>=0; --i) {
|
for (i=pattern.length()-1; i>=0; --i) {
|
||||||
char keyChar = pattern.charAt(i);
|
char keyChar = pattern.charAt(i);
|
||||||
UnicodeMatcher subm = data.lookup(keyChar);
|
UnicodeMatcher subm = data.lookup(keyChar);
|
||||||
@ -58,6 +70,13 @@ class StringMatcher implements UnicodeMatcher {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Record the match position, but adjust for a normal
|
||||||
|
// forward start, limit, and only if a prior match does not
|
||||||
|
// exist -- we want the rightmost match.
|
||||||
|
if (matchStart < 0) {
|
||||||
|
matchStart = cursor[0]+1;
|
||||||
|
matchLimit = offset[0]+1;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
for (i=0; i<pattern.length(); ++i) {
|
for (i=0; i<pattern.length(); ++i) {
|
||||||
if (incremental && cursor[0] == limit) {
|
if (incremental && cursor[0] == limit) {
|
||||||
@ -85,6 +104,9 @@ class StringMatcher implements UnicodeMatcher {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Record the match position
|
||||||
|
matchStart = offset[0];
|
||||||
|
matchLimit = cursor[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
offset[0] = cursor[0];
|
offset[0] = cursor[0];
|
||||||
@ -114,7 +136,7 @@ class StringMatcher implements UnicodeMatcher {
|
|||||||
result.append(')');
|
result.append(')');
|
||||||
}
|
}
|
||||||
// Flush quoteBuf out to result
|
// Flush quoteBuf out to result
|
||||||
TransliterationRule.appendToRule(result, (isSegment?')':-1),
|
TransliterationRule.appendToRule(result, -1,
|
||||||
true, escapeUnprintable, quoteBuf);
|
true, escapeUnprintable, quoteBuf);
|
||||||
return result.toString();
|
return result.toString();
|
||||||
}
|
}
|
||||||
@ -130,6 +152,32 @@ class StringMatcher implements UnicodeMatcher {
|
|||||||
UnicodeMatcher m = data.lookup(c);
|
UnicodeMatcher m = data.lookup(c);
|
||||||
return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
|
return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove any match data. This must be called before performing a
|
||||||
|
* set of matches with this segment.
|
||||||
|
*/
|
||||||
|
public void resetMatch() {
|
||||||
|
matchStart = matchLimit = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the start offset, in the match text, of the <em>rightmost</em>
|
||||||
|
* match. This method may get moved up into the UnicodeMatcher if
|
||||||
|
* it turns out to be useful to generalize this.
|
||||||
|
*/
|
||||||
|
public int getMatchStart() {
|
||||||
|
return matchStart;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the limit offset, in the match text, of the <em>rightmost</em>
|
||||||
|
* match. This method may get moved up into the UnicodeMatcher if
|
||||||
|
* it turns out to be useful to generalize this.
|
||||||
|
*/
|
||||||
|
public int getMatchLimit() {
|
||||||
|
return matchLimit;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//eof
|
//eof
|
||||||
|
@ -5,8 +5,8 @@
|
|||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
|
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
|
||||||
* $Date: 2001/10/25 23:22:15 $
|
* $Date: 2001/10/30 18:04:08 $
|
||||||
* $Revision: 1.33 $
|
* $Revision: 1.34 $
|
||||||
*
|
*
|
||||||
*****************************************************************************************
|
*****************************************************************************************
|
||||||
*/
|
*/
|
||||||
@ -30,13 +30,15 @@ import com.ibm.util.Utility;
|
|||||||
* Variables are detected by looking up each character in a supplied
|
* Variables are detected by looking up each character in a supplied
|
||||||
* variable list to see if it has been so defined.
|
* variable list to see if it has been so defined.
|
||||||
*
|
*
|
||||||
* <p>A rule may contain segments in its input string and segment references in
|
* <p>A rule may contain segments in its input string and segment
|
||||||
* its output string. A segment is a substring of the input pattern, indicated
|
* references in its output string. A segment is a substring of the
|
||||||
* by an offset and limit. The segment may span the preceding or following
|
* input pattern, indicated by an offset and limit. The segment may
|
||||||
* context. A segment reference is a special character in the output string
|
* be in the preceding or following context. It may not span a
|
||||||
* that causes a segment of the input string (not the input pattern) to be
|
* context boundary. A segment reference is a special character in
|
||||||
* copied to the output string. The range of special characters that represent
|
* the output string that causes a segment of the input string (not
|
||||||
* segment references is defined by RuleBasedTransliterator.Data.
|
* the input pattern) to be copied to the output string. The range of
|
||||||
|
* special characters that represent segment references is defined by
|
||||||
|
* RuleBasedTransliterator.Data.
|
||||||
*
|
*
|
||||||
* <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
|
* <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
|
||||||
* string "abc.123" to "ab1.c23".
|
* string "abc.123" to "ab1.c23".
|
||||||
@ -44,7 +46,7 @@ import com.ibm.util.Utility;
|
|||||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||||
*
|
*
|
||||||
* @author Alan Liu
|
* @author Alan Liu
|
||||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.33 $ $Date: 2001/10/25 23:22:15 $
|
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.34 $ $Date: 2001/10/30 18:04:08 $
|
||||||
*/
|
*/
|
||||||
class TransliterationRule {
|
class TransliterationRule {
|
||||||
|
|
||||||
@ -64,20 +66,13 @@ class TransliterationRule {
|
|||||||
private String output;
|
private String output;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An array of integers encoding the position of the segments.
|
* An array of matcher objects corresponding to the input pattern
|
||||||
* See RuleBasedTransliterator.Segments for more details.
|
* segments. If there are no segments this is null. N.B. This is
|
||||||
|
* a UnicodeMatcher for generality, but in practice it is always a
|
||||||
|
* StringMatcher. In the future we may generalize this, but for
|
||||||
|
* now we sometimes cast down to StringMatcher.
|
||||||
*/
|
*/
|
||||||
int[] segments;
|
UnicodeMatcher[] segments;
|
||||||
|
|
||||||
/**
|
|
||||||
* A value we compute from segments. The first index into segments[]
|
|
||||||
* that is >= anteContextLength. That is, the first one that is within
|
|
||||||
* the forward scanned part of the pattern -- the key or the postContext.
|
|
||||||
* If there are no segments, this has the value -1. This index is relative
|
|
||||||
* to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
|
|
||||||
* segments[FIRST_SEG_POS_INDEX + firstKeySeg].
|
|
||||||
*/
|
|
||||||
int firstKeySeg;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The length of the string that must match before the key. If
|
* The length of the string that must match before the key. If
|
||||||
@ -127,20 +122,6 @@ class TransliterationRule {
|
|||||||
private static final char APOSTROPHE = '\'';
|
private static final char APOSTROPHE = '\'';
|
||||||
private static final char BACKSLASH = '\\';
|
private static final char BACKSLASH = '\\';
|
||||||
|
|
||||||
// Macros for accessing the array of integers encoding the position of
|
|
||||||
// the segments. See RuleBasedTransliterator.Segments for more details.
|
|
||||||
// SEGMENTS_COUNT number of segments, n (half the number of parens)
|
|
||||||
// SEGMENTS_LEN length of the segments array (number of elements)
|
|
||||||
// SEGMENTS_POS position in 'pattern' of parenthesis i, where i=0..2n-1
|
|
||||||
// SEGMENTS_NUM index into segments to access POS of $1.open,
|
|
||||||
// $1.close, $2.open, $2.close,.., $n.open, $n.close
|
|
||||||
// Relative to FIRST_SEG_POS_INDEX. Ranges from 0..2n-1.
|
|
||||||
static final int FIRST_SEG_POS_INDEX = 2;
|
|
||||||
static final int SEGMENTS_COUNT(int[] x) { return x[0]; }
|
|
||||||
static final int SEGMENTS_LEN(int[] x) { return (SEGMENTS_COUNT(x)*4+4); }
|
|
||||||
static final int SEGMENTS_POS(int[] x,int i) { return x[FIRST_SEG_POS_INDEX+i]; }
|
|
||||||
static final int SEGMENTS_NUM(int[] x,int i) { return x[x[1]+i]-FIRST_SEG_POS_INDEX; }
|
|
||||||
|
|
||||||
private static final String COPYRIGHT =
|
private static final String COPYRIGHT =
|
||||||
"\u00A9 IBM Corporation 1999-2001. All rights reserved.";
|
"\u00A9 IBM Corporation 1999-2001. All rights reserved.";
|
||||||
|
|
||||||
@ -165,12 +146,8 @@ class TransliterationRule {
|
|||||||
* 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
|
* 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
|
||||||
* "xyz" and moves the cursor to before "a". It would have a cursorOffset
|
* "xyz" and moves the cursor to before "a". It would have a cursorOffset
|
||||||
* of -3.
|
* of -3.
|
||||||
* @param segs array of 2n integers. Each of n pairs consists of offset,
|
* @param segs array of UnicodeMatcher corresponding to input pattern
|
||||||
* limit for a segment of the input string. Characters in the output string
|
* segments, or null if there are none
|
||||||
* refer to these segments if they are in a special range determined by the
|
|
||||||
* associated RuleBasedTransliterator.Data object. May be null if there are
|
|
||||||
* no segments. The caller is responsible for validating that segments
|
|
||||||
* are well-formed.
|
|
||||||
* @param anchorStart true if the the rule is anchored on the left to
|
* @param anchorStart true if the the rule is anchored on the left to
|
||||||
* the context start
|
* the context start
|
||||||
* @param anchorEnd true if the rule is anchored on the right to the
|
* @param anchorEnd true if the rule is anchored on the right to the
|
||||||
@ -180,7 +157,7 @@ class TransliterationRule {
|
|||||||
int anteContextPos, int postContextPos,
|
int anteContextPos, int postContextPos,
|
||||||
String output,
|
String output,
|
||||||
int cursorPos, int cursorOffset,
|
int cursorPos, int cursorOffset,
|
||||||
int[] segs,
|
UnicodeMatcher[] segs,
|
||||||
boolean anchorStart, boolean anchorEnd,
|
boolean anchorStart, boolean anchorEnd,
|
||||||
RuleBasedTransliterator.Data theData) {
|
RuleBasedTransliterator.Data theData) {
|
||||||
data = theData;
|
data = theData;
|
||||||
@ -212,25 +189,11 @@ class TransliterationRule {
|
|||||||
this.cursorPos = cursorPos + cursorOffset;
|
this.cursorPos = cursorPos + cursorOffset;
|
||||||
this.output = output;
|
this.output = output;
|
||||||
// We don't validate the segments array. The caller must
|
// We don't validate the segments array. The caller must
|
||||||
// guarantee that the segments are well-formed.
|
// guarantee that the segments are well-formed (that is, that
|
||||||
|
// all $n references in the output refer to indices of this
|
||||||
|
// array, and that no array elements are null).
|
||||||
this.segments = segs;
|
this.segments = segs;
|
||||||
|
|
||||||
// Find the position of the first segment index that is after the
|
|
||||||
// anteContext (in the key). Note that this may be a start or a
|
|
||||||
// limit index. If all segments are in the ante context,
|
|
||||||
// firstKeySeg should point past the last segment -- that is, it
|
|
||||||
// should point at the end marker, which is -1. This allows the
|
|
||||||
// code to back up by one to obtain the last ante context segment.
|
|
||||||
firstKeySeg = -1;
|
|
||||||
if (segments != null) {
|
|
||||||
firstKeySeg = FIRST_SEG_POS_INDEX;
|
|
||||||
while (segments[firstKeySeg] >= 0 &&
|
|
||||||
segments[firstKeySeg] < anteContextLength) {
|
|
||||||
++firstKeySeg;
|
|
||||||
}
|
|
||||||
firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
|
|
||||||
}
|
|
||||||
|
|
||||||
pattern = input;
|
pattern = input;
|
||||||
flags = 0;
|
flags = 0;
|
||||||
if (anchorStart) {
|
if (anchorStart) {
|
||||||
@ -410,25 +373,12 @@ class TransliterationRule {
|
|||||||
|
|
||||||
// ============================ MATCH ===========================
|
// ============================ MATCH ===========================
|
||||||
|
|
||||||
// Record the actual positions, in the text, of the segments.
|
// Reset segment match data
|
||||||
// These are recorded in the order that they occur in the pattern.
|
|
||||||
|
|
||||||
// segPos[] is an array of 2*SEGMENTS_COUNT elements. It
|
|
||||||
// records the position in 'text' of each segment boundary, in
|
|
||||||
// the order that they occur in 'pattern'.
|
|
||||||
int[] segPos = null;
|
|
||||||
if (segments != null) {
|
if (segments != null) {
|
||||||
segPos = new int[2*SEGMENTS_COUNT(segments)];
|
for (int i=0; i<segments.length; ++i) {
|
||||||
|
((StringMatcher) segments[i]).resetMatch();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// iSeg is an index into segments[] that accesses the first
|
|
||||||
// array. As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
|
|
||||||
// When indexing into segments[] FIRST_SEG_POS_INDEX must be
|
|
||||||
// added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
|
|
||||||
int iSeg = firstKeySeg - 1;
|
|
||||||
// nextSegPos is an offset in 'pattern'. When the cursor is
|
|
||||||
// equal to nextSegPos, we are at a segment boundary, and we
|
|
||||||
// record the position in the real text in segPos[].
|
|
||||||
int nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
|
|
||||||
|
|
||||||
int lenDelta, keyLimit;
|
int lenDelta, keyLimit;
|
||||||
int[] intRef = new int[1];
|
int[] intRef = new int[1];
|
||||||
@ -465,15 +415,6 @@ class TransliterationRule {
|
|||||||
}
|
}
|
||||||
oText = intRef[0];
|
oText = intRef[0];
|
||||||
}
|
}
|
||||||
while (nextSegPos == oPattern) {
|
|
||||||
segPos[iSeg] = oText;
|
|
||||||
if (oText >= 0) {
|
|
||||||
segPos[iSeg] += UTF16.getCharCount(UTF16.charAt(text, oText));
|
|
||||||
} else {
|
|
||||||
++segPos[iSeg];
|
|
||||||
}
|
|
||||||
nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
minOText = posAfter(text, oText);
|
minOText = posAfter(text, oText);
|
||||||
@ -486,9 +427,6 @@ class TransliterationRule {
|
|||||||
|
|
||||||
// -------------------- Key and Post Context --------------------
|
// -------------------- Key and Post Context --------------------
|
||||||
|
|
||||||
iSeg = firstKeySeg;
|
|
||||||
nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
|
|
||||||
|
|
||||||
oPattern = 0;
|
oPattern = 0;
|
||||||
oText = pos.start;
|
oText = pos.start;
|
||||||
keyLimit = 0;
|
keyLimit = 0;
|
||||||
@ -511,10 +449,6 @@ class TransliterationRule {
|
|||||||
// depending on whether we're in the key or in the post
|
// depending on whether we're in the key or in the post
|
||||||
// context.
|
// context.
|
||||||
|
|
||||||
while (oPattern == nextSegPos) {
|
|
||||||
segPos[iSeg] = oText;
|
|
||||||
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
|
|
||||||
}
|
|
||||||
if (oPattern == keyLength) {
|
if (oPattern == keyLength) {
|
||||||
keyLimit = oText;
|
keyLimit = oText;
|
||||||
}
|
}
|
||||||
@ -554,10 +488,6 @@ class TransliterationRule {
|
|||||||
//! return UnicodeMatcher.U_MISMATCH;
|
//! return UnicodeMatcher.U_MISMATCH;
|
||||||
//!}
|
//!}
|
||||||
}
|
}
|
||||||
while (oPattern == nextSegPos) {
|
|
||||||
segPos[iSeg] = oText;
|
|
||||||
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
|
|
||||||
}
|
|
||||||
if (oPattern == keyLength) {
|
if (oPattern == keyLength) {
|
||||||
keyLimit = oText;
|
keyLimit = oText;
|
||||||
}
|
}
|
||||||
@ -576,8 +506,7 @@ class TransliterationRule {
|
|||||||
// =========================== REPLACE ==========================
|
// =========================== REPLACE ==========================
|
||||||
|
|
||||||
// We have a full match. The key is between pos.start and
|
// We have a full match. The key is between pos.start and
|
||||||
// keyLimit. Segment indices have been recorded in segPos[].
|
// keyLimit.
|
||||||
// Perform a replacement.
|
|
||||||
|
|
||||||
if (segments == null) {
|
if (segments == null) {
|
||||||
text.replace(pos.start, keyLimit, output);
|
text.replace(pos.start, keyLimit, output);
|
||||||
@ -629,11 +558,22 @@ class TransliterationRule {
|
|||||||
buf.setLength(0);
|
buf.setLength(0);
|
||||||
}
|
}
|
||||||
// Copy segment with out-of-band data
|
// Copy segment with out-of-band data
|
||||||
b *= 2;
|
StringMatcher m = (StringMatcher) segments[b];
|
||||||
int start = segPos[SEGMENTS_NUM(segments,b)];
|
int start = m.getMatchStart();
|
||||||
int limit = segPos[SEGMENTS_NUM(segments,b+1)];
|
int limit = m.getMatchLimit();
|
||||||
text.copy(start, limit, dest);
|
// If there was no match, that means that a quantifier
|
||||||
dest += limit - start;
|
// matched zero-length. E.g., x (a)* y matched "xy".
|
||||||
|
if (start >= 0) {
|
||||||
|
// Adjust indices for segments in post context
|
||||||
|
// for any inserted text between the key and
|
||||||
|
// the post context.
|
||||||
|
if (start >= keyLimit) {
|
||||||
|
start += dest - keyLimit;
|
||||||
|
limit += dest - keyLimit;
|
||||||
|
}
|
||||||
|
text.copy(start, limit, dest);
|
||||||
|
dest += limit - start;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
oOutput += UTF16.getCharCount(c);
|
oOutput += UTF16.getCharCount(c);
|
||||||
}
|
}
|
||||||
@ -790,20 +730,6 @@ class TransliterationRule {
|
|||||||
|
|
||||||
StringBuffer rule = new StringBuffer();
|
StringBuffer rule = new StringBuffer();
|
||||||
|
|
||||||
// iseg indexes into segments[] directly (not offset from FSPI)
|
|
||||||
int iseg = FIRST_SEG_POS_INDEX-1;
|
|
||||||
int nextSeg = -1;
|
|
||||||
// Build an array of booleans specifying open vs. close paren
|
|
||||||
boolean[] isOpen = null;
|
|
||||||
if (segments != null) {
|
|
||||||
isOpen = new boolean[2*SEGMENTS_COUNT(segments)];
|
|
||||||
for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
|
|
||||||
isOpen[SEGMENTS_NUM(segments,i) ] = true;
|
|
||||||
isOpen[SEGMENTS_NUM(segments,i+1)] = false;
|
|
||||||
}
|
|
||||||
nextSeg = segments[++iseg];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Accumulate special characters (and non-specials following them)
|
// Accumulate special characters (and non-specials following them)
|
||||||
// into quoteBuf. Append quoteBuf, within single quotes, when
|
// into quoteBuf. Append quoteBuf, within single quotes, when
|
||||||
// a non-quoted element must be inserted.
|
// a non-quoted element must be inserted.
|
||||||
@ -825,14 +751,6 @@ class TransliterationRule {
|
|||||||
appendToRule(rule, '{', true, escapeUnprintable, quoteBuf);
|
appendToRule(rule, '{', true, escapeUnprintable, quoteBuf);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Append either '(' or ')' if we are at a segment index
|
|
||||||
if (i == nextSeg) {
|
|
||||||
appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
|
|
||||||
'(' : ')',
|
|
||||||
true, escapeUnprintable, quoteBuf);
|
|
||||||
nextSeg = segments[++iseg];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (emitBraces && i == (anteContextLength + keyLength)) {
|
if (emitBraces && i == (anteContextLength + keyLength)) {
|
||||||
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
|
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
|
||||||
}
|
}
|
||||||
@ -847,11 +765,6 @@ class TransliterationRule {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i == nextSeg) {
|
|
||||||
// assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
|
|
||||||
appendToRule(rule, ')', true, escapeUnprintable, quoteBuf);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (emitBraces && i == (anteContextLength + keyLength)) {
|
if (emitBraces && i == (anteContextLength + keyLength)) {
|
||||||
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
|
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
|
||||||
}
|
}
|
||||||
@ -885,7 +798,7 @@ class TransliterationRule {
|
|||||||
} else {
|
} else {
|
||||||
++seg; // make 1-based
|
++seg; // make 1-based
|
||||||
appendToRule(rule, 0x20, true, escapeUnprintable, quoteBuf);
|
appendToRule(rule, 0x20, true, escapeUnprintable, quoteBuf);
|
||||||
rule.append(0x24 /*$*/);
|
rule.append('$');
|
||||||
boolean show = false; // true if we should display digits
|
boolean show = false; // true if we should display digits
|
||||||
for (int p=9; p>=0; --p) {
|
for (int p=9; p>=0; --p) {
|
||||||
int d = seg / POW10[p];
|
int d = seg / POW10[p];
|
||||||
@ -938,6 +851,9 @@ class TransliterationRule {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* $Log: TransliterationRule.java,v $
|
* $Log: TransliterationRule.java,v $
|
||||||
|
* Revision 1.34 2001/10/30 18:04:08 alan
|
||||||
|
* jitterbug 1406: make quantified segments behave like perl counterparts
|
||||||
|
*
|
||||||
* Revision 1.33 2001/10/25 23:22:15 alan
|
* Revision 1.33 2001/10/25 23:22:15 alan
|
||||||
* jitterbug 73: changes to support zero-length matchers at end of key
|
* jitterbug 73: changes to support zero-length matchers at end of key
|
||||||
*
|
*
|
||||||
|
@ -4,8 +4,8 @@
|
|||||||
* Corporation and others. All Rights Reserved.
|
* Corporation and others. All Rights Reserved.
|
||||||
**********************************************************************
|
**********************************************************************
|
||||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliteratorParser.java,v $
|
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliteratorParser.java,v $
|
||||||
* $Date: 2001/10/24 00:03:38 $
|
* $Date: 2001/10/30 18:04:09 $
|
||||||
* $Revision: 1.7 $
|
* $Revision: 1.8 $
|
||||||
**********************************************************************
|
**********************************************************************
|
||||||
*/
|
*/
|
||||||
package com.ibm.text;
|
package com.ibm.text;
|
||||||
@ -117,6 +117,7 @@ class TransliteratorParser {
|
|||||||
private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
|
private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
|
||||||
|
|
||||||
private static final String OPERATORS = "=><";
|
private static final String OPERATORS = "=><";
|
||||||
|
private static final String HALF_ENDERS = "=><;";
|
||||||
|
|
||||||
// Other special characters
|
// Other special characters
|
||||||
private static final char QUOTE = '\'';
|
private static final char QUOTE = '\'';
|
||||||
@ -142,7 +143,7 @@ class TransliteratorParser {
|
|||||||
// private static final char ANCHOR_END = '$';
|
// private static final char ANCHOR_END = '$';
|
||||||
|
|
||||||
// Segments of the input string are delimited by "(" and ")". In the
|
// Segments of the input string are delimited by "(" and ")". In the
|
||||||
// output string these segments are referenced as "$1" through "$9".
|
// output string these segments are referenced as "$1", "$2", etc.
|
||||||
private static final char SEGMENT_OPEN = '(';
|
private static final char SEGMENT_OPEN = '(';
|
||||||
private static final char SEGMENT_CLOSE = ')';
|
private static final char SEGMENT_CLOSE = ')';
|
||||||
|
|
||||||
@ -285,209 +286,6 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
//----------------------------------------------------------------------
|
|
||||||
// class Segments
|
|
||||||
//----------------------------------------------------------------------
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Segments are parentheses-enclosed regions of the input string.
|
|
||||||
* These are referenced in the output string using the notation $1,
|
|
||||||
* $2, etc. Numbering is in order of appearance of the left
|
|
||||||
* parenthesis. Number is one-based. Segments are defined as start,
|
|
||||||
* limit pairs. Segments may nest.
|
|
||||||
*
|
|
||||||
* During parsing, segment data is encoded in an object of class
|
|
||||||
* Segments. At runtime, the same data is encoded in compact form as
|
|
||||||
* an array of integers in a TransliterationRule. The runtime encoding
|
|
||||||
* must satisfy three goals:
|
|
||||||
*
|
|
||||||
* 1. Iterate over the offsets in a pattern, from left to right,
|
|
||||||
* and indicate all segment boundaries, in order. This is done
|
|
||||||
* during matching.
|
|
||||||
*
|
|
||||||
* 2. Given a reference $n, produce the start and limit offsets
|
|
||||||
* for that segment. This is done during replacement.
|
|
||||||
*
|
|
||||||
* 3. Similar to goal 1, but in addition, indicate whether each
|
|
||||||
* segment boundary is a start or a limit, in other words, whether
|
|
||||||
* each is an open paren or a close paren. This is required by
|
|
||||||
* the toRule() method.
|
|
||||||
*
|
|
||||||
* Goal 1 must be satisfied at high speed since this is done during
|
|
||||||
* matching. Goal 2 is next most important. Goal 3 is not performance
|
|
||||||
* critical since it is only needed by toRule().
|
|
||||||
*
|
|
||||||
* The array of integers is actually two arrays concatenated. The
|
|
||||||
* first gives the index values of the open and close parentheses in
|
|
||||||
* the order they appear. The second maps segment numbers to the
|
|
||||||
* indices of the first array. The two arrays have the same length.
|
|
||||||
* Iterating over the first array satisfies goal 1. Indexing into the
|
|
||||||
* second array satisfies goal 2. Goal 3 is satisfied by iterating
|
|
||||||
* over the second array and constructing the required data when
|
|
||||||
* needed. This is what toRule() does.
|
|
||||||
*
|
|
||||||
* Example: (a b(c d)e f)
|
|
||||||
* 0 1 2 3 4 5 6
|
|
||||||
*
|
|
||||||
* First array: Indices are 0, 2, 4, and 6.
|
|
||||||
|
|
||||||
* Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
|
|
||||||
* second array is 0, 3, 1 2 -- these give the indices in the
|
|
||||||
* first array at which $1:open, $1:close, $2:open, and $2:close
|
|
||||||
* occur.
|
|
||||||
*
|
|
||||||
* The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
|
|
||||||
*
|
|
||||||
* Each subarray is terminated with a -1, and two leading entries
|
|
||||||
* give the number of segments and the offset to the first entry
|
|
||||||
* of the second array. In addition, the second array value are
|
|
||||||
* all offset by 2 so they index directly into the final array.
|
|
||||||
* The total array size is 4*segments[0] + 4. The second index is
|
|
||||||
* 2*segments[0] + 3.
|
|
||||||
*
|
|
||||||
* In the output string, a segment reference is indicated by a
|
|
||||||
* character in a special range, as defined by
|
|
||||||
* RuleBasedTransliterator.Data.
|
|
||||||
*
|
|
||||||
* Most rules have no segments, in which case segments is null, and the
|
|
||||||
* output string need not be checked for segment reference characters.
|
|
||||||
*
|
|
||||||
* See also rbt_rule.h/cpp.
|
|
||||||
*/
|
|
||||||
private static class Segments {
|
|
||||||
|
|
||||||
private Vector offsets; // holds Integer objects
|
|
||||||
|
|
||||||
private Vector isOpenParen; // holds Boolean objects
|
|
||||||
|
|
||||||
private int offset(int i) {
|
|
||||||
return ((Integer) offsets.elementAt(i)).intValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isOpen(int i) {
|
|
||||||
return ((Boolean) isOpenParen.elementAt(i)).booleanValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
// size of the Vectors
|
|
||||||
private int size() {
|
|
||||||
// assert(offset.size() == isOpenParen.size());
|
|
||||||
return offsets.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
public Segments() {
|
|
||||||
offsets = new Vector();
|
|
||||||
isOpenParen = new Vector();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void addParenthesisAt(int offset, boolean isOpen) {
|
|
||||||
offsets.addElement(new Integer(offset));
|
|
||||||
isOpenParen.addElement(new Boolean(isOpen));
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getLastParenOffset(boolean[] isOpenParen) {
|
|
||||||
if (size() == 0) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
isOpenParen[0] = isOpen(size()-1);
|
|
||||||
return offset(size()-1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Remove the last (rightmost) segment. Store its offsets in start
|
|
||||||
// and limit, and then convert all offsets at or after start to be
|
|
||||||
// equal to start. Upon failure, return FALSE. Assume that the
|
|
||||||
// caller has already called getLastParenOffset() and validated that
|
|
||||||
// there is at least one parenthesis and that the last one is a close
|
|
||||||
// paren.
|
|
||||||
public boolean extractLastParenSubstring(int[] start, int[] limit) {
|
|
||||||
// assert(offsets.size() > 0);
|
|
||||||
// assert(isOpenParen.elementAt(isOpenParen.size()-1) == 0);
|
|
||||||
int i = size() - 1;
|
|
||||||
int n = 1; // count of close parens we need to match
|
|
||||||
// Record position of the last close paren
|
|
||||||
limit[0] = offset(i);
|
|
||||||
--i; // back up to the one before the last one
|
|
||||||
while (i >= 0 && n != 0) {
|
|
||||||
n += isOpen(i) ? -1 : 1;
|
|
||||||
}
|
|
||||||
if (n != 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
// assert(i>=0);
|
|
||||||
start[0] = offset(i);
|
|
||||||
// Reset all segment pairs from i to size() - 1 to [start, start+1).
|
|
||||||
while (i<size()) {
|
|
||||||
int o = isOpen(i) ? start[0] : (start[0]+1);
|
|
||||||
offsets.setElementAt(new Integer(o), i);
|
|
||||||
++i;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Assume caller has already gotten a TRUE validate().
|
|
||||||
public int[] createArray() {
|
|
||||||
int c = count(); // number of segments
|
|
||||||
int arrayLen = 4*c + 4;
|
|
||||||
int[] array = new int[arrayLen];
|
|
||||||
int a2offset = 2*c + 3; // offset to array 2
|
|
||||||
|
|
||||||
array[0] = c;
|
|
||||||
array[1] = a2offset;
|
|
||||||
int i;
|
|
||||||
for (i=0; i<2*c; ++i) {
|
|
||||||
array[2+i] = offset(i);
|
|
||||||
}
|
|
||||||
array[a2offset-1] = -1;
|
|
||||||
array[arrayLen-1] = -1;
|
|
||||||
// Now walk through and match up segment numbers with parentheses.
|
|
||||||
// Number segments from 0. We're going to offset all entries by 2
|
|
||||||
// to skip the first two elements, array[0] and array[1].
|
|
||||||
Stack stack = new Stack();
|
|
||||||
int nextOpen = 0; // seg # of next open, 0-based
|
|
||||||
for (i=0; i<2*c; ++i) {
|
|
||||||
boolean open = isOpen(i);
|
|
||||||
// Let seg be the zero-based segment number.
|
|
||||||
// Open parens are at 2*seg in array 2.
|
|
||||||
// Close parens are at 2*seg+1 in array 2.
|
|
||||||
if (open) {
|
|
||||||
array[a2offset + 2*nextOpen] = 2+i;
|
|
||||||
stack.push(new Integer(nextOpen));
|
|
||||||
++nextOpen;
|
|
||||||
} else {
|
|
||||||
int nextClose = ((Integer) stack.pop()).intValue();
|
|
||||||
array[a2offset + 2*nextClose+1] = 2+i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// assert(stack.empty());
|
|
||||||
|
|
||||||
return array;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean validate() {
|
|
||||||
// want number of parens >= 2
|
|
||||||
// want number of parens to be even
|
|
||||||
// want first paren '('
|
|
||||||
// want parens to match up in the end
|
|
||||||
if ((size() < 2) || (size() % 2 != 0) || !isOpen(0)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
int n = 0;
|
|
||||||
for (int i=0; i<size(); ++i) {
|
|
||||||
n += isOpen(i) ? 1 : -1;
|
|
||||||
if (n < 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return n == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Number of segments
|
|
||||||
// Assume caller has already gotten a TRUE validate().
|
|
||||||
public int count() {
|
|
||||||
// assert(validate());
|
|
||||||
return size() / 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
// class RuleHalf
|
// class RuleHalf
|
||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
@ -505,11 +303,7 @@ class TransliteratorParser {
|
|||||||
public int ante = -1; // position of ante context marker '{' in text
|
public int ante = -1; // position of ante context marker '{' in text
|
||||||
public int post = -1; // position of post context marker '}' in text
|
public int post = -1; // position of post context marker '}' in text
|
||||||
|
|
||||||
// Record the position of the segment substrings and references. A
|
public int maxRef = -1; // n where maximum segment ref is $n; 1-based
|
||||||
// given side should have segments or segment references, but not
|
|
||||||
// both.
|
|
||||||
public Segments segments = null;
|
|
||||||
public int maxRef = -1; // index of largest ref (1..9)
|
|
||||||
|
|
||||||
// Record the offset to the cursor either to the left or to the
|
// Record the offset to the cursor either to the left or to the
|
||||||
// right of the key. This is indicated by characters on the output
|
// right of the key. This is indicated by characters on the output
|
||||||
@ -521,29 +315,88 @@ class TransliteratorParser {
|
|||||||
// output text.
|
// output text.
|
||||||
public int cursorOffset = 0; // only nonzero on output side
|
public int cursorOffset = 0; // only nonzero on output side
|
||||||
|
|
||||||
|
// Position of first CURSOR_OFFSET on _right_. This will be -1
|
||||||
|
// for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
|
||||||
|
private int cursorOffsetPos = 0;
|
||||||
|
|
||||||
public boolean anchorStart = false;
|
public boolean anchorStart = false;
|
||||||
public boolean anchorEnd = false;
|
public boolean anchorEnd = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* UnicodeMatcher objects corresponding to each segment.
|
||||||
|
*/
|
||||||
|
public Vector segments = new Vector();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The segment number from 0..n-1 of the next '(' we see
|
||||||
|
* during parsing; 0-based.
|
||||||
|
*/
|
||||||
|
private int nextSegmentNumber = 0;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse one side of a rule, stopping at either the limit,
|
* Parse one side of a rule, stopping at either the limit,
|
||||||
* the END_OF_RULE character, or an operator. Return
|
* the END_OF_RULE character, or an operator.
|
||||||
* the pos of the terminating character (or limit).
|
* @return the index after the terminating character, or
|
||||||
|
* if limit was reached, limit
|
||||||
*/
|
*/
|
||||||
public int parse(String rule, int pos, int limit,
|
public int parse(String rule, int pos, int limit,
|
||||||
TransliteratorParser parser) {
|
TransliteratorParser parser) {
|
||||||
int start = pos;
|
int start = pos;
|
||||||
StringBuffer buf = new StringBuffer();
|
StringBuffer buf = new StringBuffer();
|
||||||
|
pos = parseSection(rule, pos, limit, parser, buf, false);
|
||||||
|
text = buf.toString();
|
||||||
|
|
||||||
|
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
|
||||||
|
syntaxError("Misplaced " + CURSOR_POS, rule, start);
|
||||||
|
}
|
||||||
|
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse a section of one side of a rule, stopping at either
|
||||||
|
* the limit, the END_OF_RULE character, an operator, or a
|
||||||
|
* segment close character. This method parses both a
|
||||||
|
* top-level rule half and a segment within such a rule half.
|
||||||
|
* It calls itself recursively to parse segments and nested
|
||||||
|
* segments.
|
||||||
|
* @param buf buffer into which to accumulate the rule pattern
|
||||||
|
* characters, either literal characters from the rule or
|
||||||
|
* standins for UnicodeMatcher objects including segments.
|
||||||
|
* @param isSegment if true, then we've already seen a '(' and
|
||||||
|
* pos on entry points right after it. Accumulate everything
|
||||||
|
* up to the closing ')', put it in a segment matcher object,
|
||||||
|
* generate a standin for it, and add the standin to buf. As
|
||||||
|
* a side effect, update the segments vector with a reference
|
||||||
|
* to the segment matcher. This works recursively for nested
|
||||||
|
* segments. If isSegment is false, just accumulate
|
||||||
|
* characters into buf.
|
||||||
|
* @return the index after the terminating character, or
|
||||||
|
* if limit was reached, limit
|
||||||
|
*/
|
||||||
|
private int parseSection(String rule, int pos, int limit,
|
||||||
|
TransliteratorParser parser,
|
||||||
|
StringBuffer buf,
|
||||||
|
boolean isSegment) {
|
||||||
|
int start = pos;
|
||||||
ParsePosition pp = null;
|
ParsePosition pp = null;
|
||||||
int cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
|
|
||||||
boolean done = false;
|
|
||||||
int quoteStart = -1; // Most recent 'single quoted string'
|
int quoteStart = -1; // Most recent 'single quoted string'
|
||||||
int quoteLimit = -1;
|
int quoteLimit = -1;
|
||||||
int varStart = -1; // Most recent $variableReference
|
int varStart = -1; // Most recent $variableReference
|
||||||
int varLimit = -1;
|
int varLimit = -1;
|
||||||
int[] iref = new int[1];
|
int[] iref = new int[1];
|
||||||
|
|
||||||
|
// If isSegment, then bufSegStart is the offset in buf to
|
||||||
|
// the first character of the segment we are parsing.
|
||||||
|
int bufSegStart = 0;
|
||||||
|
int segmentNumber = 0;
|
||||||
|
if (isSegment) {
|
||||||
|
bufSegStart = buf.length();
|
||||||
|
segmentNumber = nextSegmentNumber++;
|
||||||
|
}
|
||||||
|
|
||||||
main:
|
main:
|
||||||
while (pos < limit && !done) {
|
while (pos < limit) {
|
||||||
char c = rule.charAt(pos++);
|
char c = rule.charAt(pos++);
|
||||||
if (Character.isWhitespace(c)) {
|
if (Character.isWhitespace(c)) {
|
||||||
// Ignore whitespace. Note that this is not Unicode
|
// Ignore whitespace. Note that this is not Unicode
|
||||||
@ -551,8 +404,11 @@ class TransliteratorParser {
|
|||||||
// whitespace likely to be seen in code.
|
// whitespace likely to be seen in code.
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (OPERATORS.indexOf(c) >= 0) {
|
// HALF_ENDERS is all chars that end a rule half: "<>=;"
|
||||||
--pos; // Backup to point to operator
|
if (HALF_ENDERS.indexOf(c) >= 0) {
|
||||||
|
if (isSegment) {
|
||||||
|
syntaxError("Unclosed segment", rule, start);
|
||||||
|
}
|
||||||
break main;
|
break main;
|
||||||
}
|
}
|
||||||
if (anchorEnd) {
|
if (anchorEnd) {
|
||||||
@ -614,7 +470,12 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (c) {
|
switch (c) {
|
||||||
|
|
||||||
|
//------------------------------------------------------
|
||||||
|
// Elements allowed within and out of segments
|
||||||
|
//------------------------------------------------------
|
||||||
case ANCHOR_START:
|
case ANCHOR_START:
|
||||||
if (buf.length() == 0 && !anchorStart) {
|
if (buf.length() == 0 && !anchorStart) {
|
||||||
anchorStart = true;
|
anchorStart = true;
|
||||||
@ -624,17 +485,8 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case SEGMENT_OPEN:
|
case SEGMENT_OPEN:
|
||||||
case SEGMENT_CLOSE:
|
pos = parseSection(rule, pos, limit, parser, buf, true);
|
||||||
// Handle segment definitions "(" and ")"
|
|
||||||
// Parse "(", ")"
|
|
||||||
if (segments == null) {
|
|
||||||
segments = new Segments();
|
|
||||||
}
|
|
||||||
segments.addParenthesisAt(buf.length(), c == SEGMENT_OPEN);
|
|
||||||
break;
|
break;
|
||||||
case END_OF_RULE:
|
|
||||||
--pos; // Backup to point to END_OF_RULE
|
|
||||||
break main;
|
|
||||||
case SymbolTable.SYMBOL_REF:
|
case SymbolTable.SYMBOL_REF:
|
||||||
// Handle variable references and segment references "$1" .. "$9"
|
// Handle variable references and segment references "$1" .. "$9"
|
||||||
{
|
{
|
||||||
@ -676,7 +528,7 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
pp.setIndex(pos);
|
pp.setIndex(pos);
|
||||||
String name = parser.parseData.
|
String name = parser.parseData.
|
||||||
parseReference(rule, pp, limit);
|
parseReference(rule, pp, limit);
|
||||||
if (name == null) {
|
if (name == null) {
|
||||||
// This means the '$' was not followed by a
|
// This means the '$' was not followed by a
|
||||||
// valid name. Try to interpret it as an
|
// valid name. Try to interpret it as an
|
||||||
@ -697,25 +549,129 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case DOT:
|
||||||
|
buf.append(parser.getDotStandIn());
|
||||||
|
break;
|
||||||
|
case KLEENE_STAR:
|
||||||
|
case ONE_OR_MORE:
|
||||||
|
case ZERO_OR_ONE:
|
||||||
|
// Quantifiers. We handle single characters, quoted strings,
|
||||||
|
// variable references, and segments.
|
||||||
|
// a+ matches aaa
|
||||||
|
// 'foo'+ matches foofoofoo
|
||||||
|
// $v+ matches xyxyxy if $v == xy
|
||||||
|
// (seg)+ matches segsegseg
|
||||||
|
{
|
||||||
|
if (isSegment && buf.length() == bufSegStart) {
|
||||||
|
// The */+ immediately follows '('
|
||||||
|
syntaxError("Misplaced quantifier", rule, start);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
int qstart, qlimit;
|
||||||
|
// The */+ follows an isolated character or quote
|
||||||
|
// or variable reference
|
||||||
|
if (buf.length() == quoteLimit) {
|
||||||
|
// The */+ follows a 'quoted string'
|
||||||
|
qstart = quoteStart;
|
||||||
|
qlimit = quoteLimit;
|
||||||
|
} else if (buf.length() == varLimit) {
|
||||||
|
// The */+ follows a $variableReference
|
||||||
|
qstart = varStart;
|
||||||
|
qlimit = varLimit;
|
||||||
|
} else {
|
||||||
|
// The */+ follows a single character, possibly
|
||||||
|
// a segment standin
|
||||||
|
qstart = buf.length() - 1;
|
||||||
|
qlimit = qstart + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
UnicodeMatcher m =
|
||||||
|
new StringMatcher(buf.toString(), qstart, qlimit,
|
||||||
|
false, parser.data);
|
||||||
|
int min = 0;
|
||||||
|
int max = Quantifier.MAX;
|
||||||
|
switch (c) {
|
||||||
|
case ONE_OR_MORE:
|
||||||
|
min = 1;
|
||||||
|
break;
|
||||||
|
case ZERO_OR_ONE:
|
||||||
|
min = 0;
|
||||||
|
max = 1;
|
||||||
|
break;
|
||||||
|
// case KLEENE_STAR:
|
||||||
|
// do nothing -- min, max already set
|
||||||
|
}
|
||||||
|
m = new Quantifier(m, min, max);
|
||||||
|
buf.setLength(qstart);
|
||||||
|
buf.append(parser.generateStandInFor(m));
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
//------------------------------------------------------
|
||||||
|
// Elements allowed ONLY WITHIN segments
|
||||||
|
//------------------------------------------------------
|
||||||
|
case SEGMENT_CLOSE:
|
||||||
|
if (isSegment) {
|
||||||
|
// We're done parsing a segment. The relevant
|
||||||
|
// characters are in buf, starting at offset
|
||||||
|
// bufSegStart. Extract them into a string
|
||||||
|
// matcher, and replace them with a standin
|
||||||
|
// for that matcher.
|
||||||
|
StringMatcher m =
|
||||||
|
new StringMatcher(buf.substring(bufSegStart),
|
||||||
|
true, parser.data);
|
||||||
|
// Since we call parseSection() recursively,
|
||||||
|
// nested segments will result in segment i+1
|
||||||
|
// getting parsed and stored before segment i;
|
||||||
|
// be careful with the vector handling here.
|
||||||
|
if ((segmentNumber+1) > segments.size()) {
|
||||||
|
segments.setSize(segmentNumber+1);
|
||||||
|
}
|
||||||
|
segments.setElementAt(m, segmentNumber);
|
||||||
|
buf.setLength(bufSegStart);
|
||||||
|
buf.append(parser.generateStandInFor(m));
|
||||||
|
break main;
|
||||||
|
}
|
||||||
|
// If we aren't in a segment, then a segment close
|
||||||
|
// character is a syntax error.
|
||||||
|
syntaxError("Unquoted special", rule, start);
|
||||||
|
break;
|
||||||
|
|
||||||
|
//------------------------------------------------------
|
||||||
|
// Elements allowed ONLY OUTSIDE segments
|
||||||
|
//------------------------------------------------------
|
||||||
case CONTEXT_ANTE:
|
case CONTEXT_ANTE:
|
||||||
|
if (isSegment) {
|
||||||
|
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||||
|
}
|
||||||
if (ante >= 0) {
|
if (ante >= 0) {
|
||||||
syntaxError("Multiple ante contexts", rule, start);
|
syntaxError("Multiple ante contexts", rule, start);
|
||||||
}
|
}
|
||||||
ante = buf.length();
|
ante = buf.length();
|
||||||
break;
|
break;
|
||||||
case CONTEXT_POST:
|
case CONTEXT_POST:
|
||||||
|
if (isSegment) {
|
||||||
|
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||||
|
}
|
||||||
if (post >= 0) {
|
if (post >= 0) {
|
||||||
syntaxError("Multiple post contexts", rule, start);
|
syntaxError("Multiple post contexts", rule, start);
|
||||||
}
|
}
|
||||||
post = buf.length();
|
post = buf.length();
|
||||||
break;
|
break;
|
||||||
case CURSOR_POS:
|
case CURSOR_POS:
|
||||||
|
if (isSegment) {
|
||||||
|
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||||
|
}
|
||||||
if (cursor >= 0) {
|
if (cursor >= 0) {
|
||||||
syntaxError("Multiple cursors", rule, start);
|
syntaxError("Multiple cursors", rule, start);
|
||||||
}
|
}
|
||||||
cursor = buf.length();
|
cursor = buf.length();
|
||||||
break;
|
break;
|
||||||
case CURSOR_OFFSET:
|
case CURSOR_OFFSET:
|
||||||
|
if (isSegment) {
|
||||||
|
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||||
|
}
|
||||||
if (cursorOffset < 0) {
|
if (cursorOffset < 0) {
|
||||||
if (buf.length() > 0) {
|
if (buf.length() > 0) {
|
||||||
syntaxError("Misplaced " + c, rule, start);
|
syntaxError("Misplaced " + c, rule, start);
|
||||||
@ -737,74 +693,10 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case DOT:
|
|
||||||
buf.append(parser.getDotStandIn());
|
//------------------------------------------------------
|
||||||
break;
|
// Non-special characters
|
||||||
case KLEENE_STAR:
|
//------------------------------------------------------
|
||||||
case ONE_OR_MORE:
|
|
||||||
case ZERO_OR_ONE:
|
|
||||||
// Quantifiers. We handle single characters, quoted strings,
|
|
||||||
// variable references, and segments.
|
|
||||||
// a+ matches aaa
|
|
||||||
// 'foo'+ matches foofoofoo
|
|
||||||
// $v+ matches xyxyxy if $v == xy
|
|
||||||
// (seg)+ matches segsegseg
|
|
||||||
{
|
|
||||||
int qstart, qlimit;
|
|
||||||
boolean[] isOpenParen = new boolean[1];
|
|
||||||
boolean isSegment = false;
|
|
||||||
if (segments != null &&
|
|
||||||
segments.getLastParenOffset(isOpenParen) == buf.length()) {
|
|
||||||
// The */+ immediately follows a segment
|
|
||||||
if (isOpenParen[0]) {
|
|
||||||
syntaxError("Misplaced quantifier", rule, start);
|
|
||||||
}
|
|
||||||
int[] startparam = new int[1];
|
|
||||||
int[] limitparam = new int[1];
|
|
||||||
if (!segments.extractLastParenSubstring(startparam, limitparam)) {
|
|
||||||
syntaxError("Mismatched segment delimiters", rule, start);
|
|
||||||
}
|
|
||||||
qstart = startparam[0];
|
|
||||||
qlimit = limitparam[0];
|
|
||||||
isSegment = true;
|
|
||||||
} else {
|
|
||||||
// The */+ follows an isolated character or quote
|
|
||||||
// or variable reference
|
|
||||||
if (buf.length() == quoteLimit) {
|
|
||||||
// The */+ follows a 'quoted string'
|
|
||||||
qstart = quoteStart;
|
|
||||||
qlimit = quoteLimit;
|
|
||||||
} else if (buf.length() == varLimit) {
|
|
||||||
// The */+ follows a $variableReference
|
|
||||||
qstart = varStart;
|
|
||||||
qlimit = varLimit;
|
|
||||||
} else {
|
|
||||||
// The */+ follows a single character
|
|
||||||
qstart = buf.length() - 1;
|
|
||||||
qlimit = qstart + 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
UnicodeMatcher m =
|
|
||||||
new StringMatcher(buf.toString(), qstart, qlimit,
|
|
||||||
isSegment, parser.data);
|
|
||||||
int min = 0;
|
|
||||||
int max = Quantifier.MAX;
|
|
||||||
switch (c) {
|
|
||||||
case ONE_OR_MORE:
|
|
||||||
min = 1;
|
|
||||||
break;
|
|
||||||
case ZERO_OR_ONE:
|
|
||||||
min = 0;
|
|
||||||
max = 1;
|
|
||||||
break;
|
|
||||||
// case KLEENE_STAR:
|
|
||||||
// do nothing -- min, max already set
|
|
||||||
}
|
|
||||||
m = new Quantifier(m, min, max);
|
|
||||||
buf.setLength(qstart);
|
|
||||||
buf.append(parser.generateStandInFor(m));
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
default:
|
default:
|
||||||
// Disallow unquoted characters other than [0-9A-Za-z]
|
// Disallow unquoted characters other than [0-9A-Za-z]
|
||||||
// in the printable ASCII range. These characters are
|
// in the printable ASCII range. These characters are
|
||||||
@ -819,11 +711,6 @@ class TransliteratorParser {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
|
|
||||||
syntaxError("Misplaced " + CURSOR_POS, rule, start);
|
|
||||||
}
|
|
||||||
text = buf.toString();
|
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -838,10 +725,12 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create and return an int[] array of segments.
|
* Create and return a UnicodeMatcher[] array of segments,
|
||||||
|
* or null if there are no segments.
|
||||||
*/
|
*/
|
||||||
int[] createSegments() {
|
UnicodeMatcher[] createSegments() {
|
||||||
return (segments == null) ? null : segments.createArray();
|
return (segments.size() == 0) ? null :
|
||||||
|
(UnicodeMatcher[]) segments.toArray(new UnicodeMatcher[segments.size()]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1096,9 +985,10 @@ class TransliteratorParser {
|
|||||||
pos = left.parse(rule, pos, limit, this);
|
pos = left.parse(rule, pos, limit, this);
|
||||||
|
|
||||||
if (pos == limit ||
|
if (pos == limit ||
|
||||||
OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
|
OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
|
||||||
syntaxError("No operator", rule, start);
|
syntaxError("No operator pos=" + pos, rule, start);
|
||||||
}
|
}
|
||||||
|
++pos;
|
||||||
|
|
||||||
// Found an operator char. Check for forward-reverse operator.
|
// Found an operator char. Check for forward-reverse operator.
|
||||||
if (operator == REVERSE_RULE_OP &&
|
if (operator == REVERSE_RULE_OP &&
|
||||||
@ -1110,7 +1000,7 @@ class TransliteratorParser {
|
|||||||
pos = right.parse(rule, pos, limit, this);
|
pos = right.parse(rule, pos, limit, this);
|
||||||
|
|
||||||
if (pos < limit) {
|
if (pos < limit) {
|
||||||
if (rule.charAt(pos) == END_OF_RULE) {
|
if (rule.charAt(--pos) == END_OF_RULE) {
|
||||||
++pos;
|
++pos;
|
||||||
} else {
|
} else {
|
||||||
// RuleHalf parser must have terminated at an operator
|
// RuleHalf parser must have terminated at an operator
|
||||||
@ -1173,7 +1063,7 @@ class TransliteratorParser {
|
|||||||
// apply.
|
// apply.
|
||||||
if (operator == FWDREV_RULE_OP) {
|
if (operator == FWDREV_RULE_OP) {
|
||||||
right.removeContext();
|
right.removeContext();
|
||||||
right.segments = null;
|
right.segments.removeAllElements();
|
||||||
left.cursor = left.maxRef = -1;
|
left.cursor = left.maxRef = -1;
|
||||||
left.cursorOffset = 0;
|
left.cursorOffset = 0;
|
||||||
}
|
}
|
||||||
@ -1193,7 +1083,7 @@ class TransliteratorParser {
|
|||||||
// cannot place the cursor outside the limits of the context.
|
// cannot place the cursor outside the limits of the context.
|
||||||
// Anchors are only allowed on the input side.
|
// Anchors are only allowed on the input side.
|
||||||
if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
|
if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
|
||||||
right.segments != null || left.maxRef >= 0 ||
|
right.segments.size() > 0 || left.maxRef >= 0 ||
|
||||||
(right.cursorOffset != 0 && right.cursor < 0) ||
|
(right.cursorOffset != 0 && right.cursor < 0) ||
|
||||||
// - The following two checks were used to ensure that the
|
// - The following two checks were used to ensure that the
|
||||||
// - the cursor offset stayed within the ante- or postcontext.
|
// - the cursor offset stayed within the ante- or postcontext.
|
||||||
@ -1208,14 +1098,8 @@ class TransliteratorParser {
|
|||||||
// Check integrity of segments and segment references. Each
|
// Check integrity of segments and segment references. Each
|
||||||
// segment's start must have a corresponding limit, and the
|
// segment's start must have a corresponding limit, and the
|
||||||
// references must not refer to segments that do not exist.
|
// references must not refer to segments that do not exist.
|
||||||
if (left.segments != null) {
|
if (right.maxRef > left.segments.size()) {
|
||||||
if (!left.segments.validate()) {
|
syntaxError("Undefined segment reference $" + right.maxRef, rule, start);
|
||||||
syntaxError("Missing segment close", rule, start);
|
|
||||||
}
|
|
||||||
int n = left.segments.count();
|
|
||||||
if (right.maxRef > n) {
|
|
||||||
syntaxError("Undefined segment reference", rule, start);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
data.ruleSet.addRule(new TransliterationRule(
|
data.ruleSet.addRule(new TransliterationRule(
|
||||||
@ -1363,7 +1247,7 @@ class TransliteratorParser {
|
|||||||
char generateStandInFor(UnicodeMatcher matcher) {
|
char generateStandInFor(UnicodeMatcher matcher) {
|
||||||
// assert(matcher != null);
|
// assert(matcher != null);
|
||||||
if (variableNext >= variableLimit) {
|
if (variableNext >= variableLimit) {
|
||||||
throw new RuntimeException("Private use variables exhausted");
|
throw new RuntimeException("Variable range exhausted");
|
||||||
}
|
}
|
||||||
variablesVector.addElement(matcher);
|
variablesVector.addElement(matcher);
|
||||||
return variableNext++;
|
return variableNext++;
|
||||||
@ -1379,7 +1263,7 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
return (char) dotStandIn;
|
return (char) dotStandIn;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Append the value of the given variable name to the given
|
* Append the value of the given variable name to the given
|
||||||
* StringBuffer.
|
* StringBuffer.
|
||||||
|
@ -5,8 +5,8 @@
|
|||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
|
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
|
||||||
* $Date: 2001/10/26 22:59:26 $
|
* $Date: 2001/10/30 18:08:19 $
|
||||||
* $Revision: 1.57 $
|
* $Revision: 1.58 $
|
||||||
*
|
*
|
||||||
*****************************************************************************************
|
*****************************************************************************************
|
||||||
*/
|
*/
|
||||||
@ -1268,9 +1268,11 @@ public class TransliteratorTest extends TestFmwk {
|
|||||||
"c abc ababc",
|
"c abc ababc",
|
||||||
"d d abd");
|
"d d abd");
|
||||||
|
|
||||||
|
// NOTE: The (ab)+ when referenced just yields a single "ab",
|
||||||
|
// not the full sequence of them. This accords with perl behavior.
|
||||||
expect("(ab)+ {x} > '(' $1 ')';",
|
expect("(ab)+ {x} > '(' $1 ')';",
|
||||||
"x abx ababxy",
|
"x abx ababxy",
|
||||||
"x ab(ab) abab(abab)y");
|
"x ab(ab) abab(ab)y");
|
||||||
|
|
||||||
expect("b+ > x;",
|
expect("b+ > x;",
|
||||||
"ac abc abbc abbbc",
|
"ac abc abbc abbbc",
|
||||||
@ -1288,12 +1290,11 @@ public class TransliteratorTest extends TestFmwk {
|
|||||||
"qa qab qaba qababc",
|
"qa qab qaba qababc",
|
||||||
"xa x xa xc");
|
"xa x xa xc");
|
||||||
|
|
||||||
// Oddity -- "(foo)* > $1" causes $1 to match the run of "foo"s
|
// NOTE: The (ab)+ when referenced just yields a single "ab",
|
||||||
// In perl, it only matches the first occurrence, so the output
|
// not the full sequence of them. This accords with perl behavior.
|
||||||
// is "()a (ab) (ab)a (ab)c".
|
|
||||||
expect("q(ab)* > '(' $1 ')';",
|
expect("q(ab)* > '(' $1 ')';",
|
||||||
"qa qab qaba qababc",
|
"qa qab qaba qababc",
|
||||||
"()a (ab) (ab)a (abab)c");
|
"()a (ab) (ab)a (ab)c");
|
||||||
|
|
||||||
// 'foo'+ and 'foo'* -- the quantifier should apply to the entire
|
// 'foo'+ and 'foo'* -- the quantifier should apply to the entire
|
||||||
// quoted string
|
// quoted string
|
||||||
@ -1574,6 +1575,46 @@ public class TransliteratorTest extends TestFmwk {
|
|||||||
expect(gr, "\u03B1\u0314", "ha");
|
expect(gr, "\u03B1\u0314", "ha");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test quantified segment behavior. We want:
|
||||||
|
* ([abc])+ > x $1 x; applied to "cba" produces "xax"
|
||||||
|
*/
|
||||||
|
public void TestQuantifiedSegment() {
|
||||||
|
// The normal case
|
||||||
|
expect("([abc]+) > x $1 x;", "cba", "xcbax");
|
||||||
|
|
||||||
|
// The tricky case; the quantifier is around the segment
|
||||||
|
expect("([abc])+ > x $1 x;", "cba", "xax");
|
||||||
|
|
||||||
|
// Tricky case in reverse direction
|
||||||
|
expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");
|
||||||
|
|
||||||
|
// Check post-context segment
|
||||||
|
expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");
|
||||||
|
|
||||||
|
// Test toRule/toPattern for non-quantified segment.
|
||||||
|
// Careful with spacing here.
|
||||||
|
String r = "([a-c]){q} > x $1 x;";
|
||||||
|
Transliterator t = Transliterator.createFromRules("ID", r, Transliterator.FORWARD);
|
||||||
|
String rr = t.toRules(true);
|
||||||
|
if (!r.equals(rr)) {
|
||||||
|
errln("FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||||
|
} else {
|
||||||
|
logln("Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test toRule/toPattern for quantified segment.
|
||||||
|
// Careful with spacing here.
|
||||||
|
r = "([a-c])+{q} > x $1 x;";
|
||||||
|
t = Transliterator.createFromRules("ID", r, Transliterator.FORWARD);
|
||||||
|
rr = t.toRules(true);
|
||||||
|
if (!r.equals(rr)) {
|
||||||
|
errln("FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||||
|
} else {
|
||||||
|
logln("Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//======================================================================
|
//======================================================================
|
||||||
// icu4j ONLY
|
// icu4j ONLY
|
||||||
// These tests are not mirrored (yet) in icu4c at
|
// These tests are not mirrored (yet) in icu4c at
|
||||||
|
@ -5,8 +5,8 @@
|
|||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/StringMatcher.java,v $
|
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/StringMatcher.java,v $
|
||||||
* $Date: 2001/10/25 22:32:02 $
|
* $Date: 2001/10/30 18:04:08 $
|
||||||
* $Revision: 1.2 $
|
* $Revision: 1.3 $
|
||||||
*
|
*
|
||||||
*****************************************************************************************
|
*****************************************************************************************
|
||||||
*/
|
*/
|
||||||
@ -18,16 +18,27 @@ class StringMatcher implements UnicodeMatcher {
|
|||||||
|
|
||||||
private boolean isSegment;
|
private boolean isSegment;
|
||||||
|
|
||||||
|
private int matchStart;
|
||||||
|
|
||||||
|
private int matchLimit;
|
||||||
|
|
||||||
private final RuleBasedTransliterator.Data data;
|
private final RuleBasedTransliterator.Data data;
|
||||||
|
|
||||||
|
public StringMatcher(String theString,
|
||||||
|
boolean isSeg,
|
||||||
|
RuleBasedTransliterator.Data theData) {
|
||||||
|
data = theData;
|
||||||
|
isSegment = isSeg;
|
||||||
|
pattern = theString;
|
||||||
|
matchStart = matchLimit = -1;
|
||||||
|
}
|
||||||
|
|
||||||
public StringMatcher(String theString,
|
public StringMatcher(String theString,
|
||||||
int start,
|
int start,
|
||||||
int limit,
|
int limit,
|
||||||
boolean isSeg,
|
boolean isSeg,
|
||||||
RuleBasedTransliterator.Data theData) {
|
RuleBasedTransliterator.Data theData) {
|
||||||
data = theData;
|
this(theString.substring(start, limit), isSeg, theData);
|
||||||
isSegment = isSeg;
|
|
||||||
pattern = theString.substring(start, limit);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -40,6 +51,7 @@ class StringMatcher implements UnicodeMatcher {
|
|||||||
int i;
|
int i;
|
||||||
int[] cursor = new int[] { offset[0] };
|
int[] cursor = new int[] { offset[0] };
|
||||||
if (limit < cursor[0]) {
|
if (limit < cursor[0]) {
|
||||||
|
// Match in the reverse direction
|
||||||
for (i=pattern.length()-1; i>=0; --i) {
|
for (i=pattern.length()-1; i>=0; --i) {
|
||||||
char keyChar = pattern.charAt(i);
|
char keyChar = pattern.charAt(i);
|
||||||
UnicodeMatcher subm = data.lookup(keyChar);
|
UnicodeMatcher subm = data.lookup(keyChar);
|
||||||
@ -58,6 +70,13 @@ class StringMatcher implements UnicodeMatcher {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Record the match position, but adjust for a normal
|
||||||
|
// forward start, limit, and only if a prior match does not
|
||||||
|
// exist -- we want the rightmost match.
|
||||||
|
if (matchStart < 0) {
|
||||||
|
matchStart = cursor[0]+1;
|
||||||
|
matchLimit = offset[0]+1;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
for (i=0; i<pattern.length(); ++i) {
|
for (i=0; i<pattern.length(); ++i) {
|
||||||
if (incremental && cursor[0] == limit) {
|
if (incremental && cursor[0] == limit) {
|
||||||
@ -85,6 +104,9 @@ class StringMatcher implements UnicodeMatcher {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Record the match position
|
||||||
|
matchStart = offset[0];
|
||||||
|
matchLimit = cursor[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
offset[0] = cursor[0];
|
offset[0] = cursor[0];
|
||||||
@ -114,7 +136,7 @@ class StringMatcher implements UnicodeMatcher {
|
|||||||
result.append(')');
|
result.append(')');
|
||||||
}
|
}
|
||||||
// Flush quoteBuf out to result
|
// Flush quoteBuf out to result
|
||||||
TransliterationRule.appendToRule(result, (isSegment?')':-1),
|
TransliterationRule.appendToRule(result, -1,
|
||||||
true, escapeUnprintable, quoteBuf);
|
true, escapeUnprintable, quoteBuf);
|
||||||
return result.toString();
|
return result.toString();
|
||||||
}
|
}
|
||||||
@ -130,6 +152,32 @@ class StringMatcher implements UnicodeMatcher {
|
|||||||
UnicodeMatcher m = data.lookup(c);
|
UnicodeMatcher m = data.lookup(c);
|
||||||
return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
|
return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove any match data. This must be called before performing a
|
||||||
|
* set of matches with this segment.
|
||||||
|
*/
|
||||||
|
public void resetMatch() {
|
||||||
|
matchStart = matchLimit = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the start offset, in the match text, of the <em>rightmost</em>
|
||||||
|
* match. This method may get moved up into the UnicodeMatcher if
|
||||||
|
* it turns out to be useful to generalize this.
|
||||||
|
*/
|
||||||
|
public int getMatchStart() {
|
||||||
|
return matchStart;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the limit offset, in the match text, of the <em>rightmost</em>
|
||||||
|
* match. This method may get moved up into the UnicodeMatcher if
|
||||||
|
* it turns out to be useful to generalize this.
|
||||||
|
*/
|
||||||
|
public int getMatchLimit() {
|
||||||
|
return matchLimit;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//eof
|
//eof
|
||||||
|
@ -5,8 +5,8 @@
|
|||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
|
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
|
||||||
* $Date: 2001/10/25 23:22:15 $
|
* $Date: 2001/10/30 18:04:08 $
|
||||||
* $Revision: 1.33 $
|
* $Revision: 1.34 $
|
||||||
*
|
*
|
||||||
*****************************************************************************************
|
*****************************************************************************************
|
||||||
*/
|
*/
|
||||||
@ -30,13 +30,15 @@ import com.ibm.util.Utility;
|
|||||||
* Variables are detected by looking up each character in a supplied
|
* Variables are detected by looking up each character in a supplied
|
||||||
* variable list to see if it has been so defined.
|
* variable list to see if it has been so defined.
|
||||||
*
|
*
|
||||||
* <p>A rule may contain segments in its input string and segment references in
|
* <p>A rule may contain segments in its input string and segment
|
||||||
* its output string. A segment is a substring of the input pattern, indicated
|
* references in its output string. A segment is a substring of the
|
||||||
* by an offset and limit. The segment may span the preceding or following
|
* input pattern, indicated by an offset and limit. The segment may
|
||||||
* context. A segment reference is a special character in the output string
|
* be in the preceding or following context. It may not span a
|
||||||
* that causes a segment of the input string (not the input pattern) to be
|
* context boundary. A segment reference is a special character in
|
||||||
* copied to the output string. The range of special characters that represent
|
* the output string that causes a segment of the input string (not
|
||||||
* segment references is defined by RuleBasedTransliterator.Data.
|
* the input pattern) to be copied to the output string. The range of
|
||||||
|
* special characters that represent segment references is defined by
|
||||||
|
* RuleBasedTransliterator.Data.
|
||||||
*
|
*
|
||||||
* <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
|
* <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
|
||||||
* string "abc.123" to "ab1.c23".
|
* string "abc.123" to "ab1.c23".
|
||||||
@ -44,7 +46,7 @@ import com.ibm.util.Utility;
|
|||||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||||
*
|
*
|
||||||
* @author Alan Liu
|
* @author Alan Liu
|
||||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.33 $ $Date: 2001/10/25 23:22:15 $
|
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.34 $ $Date: 2001/10/30 18:04:08 $
|
||||||
*/
|
*/
|
||||||
class TransliterationRule {
|
class TransliterationRule {
|
||||||
|
|
||||||
@ -64,20 +66,13 @@ class TransliterationRule {
|
|||||||
private String output;
|
private String output;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An array of integers encoding the position of the segments.
|
* An array of matcher objects corresponding to the input pattern
|
||||||
* See RuleBasedTransliterator.Segments for more details.
|
* segments. If there are no segments this is null. N.B. This is
|
||||||
|
* a UnicodeMatcher for generality, but in practice it is always a
|
||||||
|
* StringMatcher. In the future we may generalize this, but for
|
||||||
|
* now we sometimes cast down to StringMatcher.
|
||||||
*/
|
*/
|
||||||
int[] segments;
|
UnicodeMatcher[] segments;
|
||||||
|
|
||||||
/**
|
|
||||||
* A value we compute from segments. The first index into segments[]
|
|
||||||
* that is >= anteContextLength. That is, the first one that is within
|
|
||||||
* the forward scanned part of the pattern -- the key or the postContext.
|
|
||||||
* If there are no segments, this has the value -1. This index is relative
|
|
||||||
* to FIRST_SEG_POS_INDEX; that is, it should be used as follows:
|
|
||||||
* segments[FIRST_SEG_POS_INDEX + firstKeySeg].
|
|
||||||
*/
|
|
||||||
int firstKeySeg;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The length of the string that must match before the key. If
|
* The length of the string that must match before the key. If
|
||||||
@ -127,20 +122,6 @@ class TransliterationRule {
|
|||||||
private static final char APOSTROPHE = '\'';
|
private static final char APOSTROPHE = '\'';
|
||||||
private static final char BACKSLASH = '\\';
|
private static final char BACKSLASH = '\\';
|
||||||
|
|
||||||
// Macros for accessing the array of integers encoding the position of
|
|
||||||
// the segments. See RuleBasedTransliterator.Segments for more details.
|
|
||||||
// SEGMENTS_COUNT number of segments, n (half the number of parens)
|
|
||||||
// SEGMENTS_LEN length of the segments array (number of elements)
|
|
||||||
// SEGMENTS_POS position in 'pattern' of parenthesis i, where i=0..2n-1
|
|
||||||
// SEGMENTS_NUM index into segments to access POS of $1.open,
|
|
||||||
// $1.close, $2.open, $2.close,.., $n.open, $n.close
|
|
||||||
// Relative to FIRST_SEG_POS_INDEX. Ranges from 0..2n-1.
|
|
||||||
static final int FIRST_SEG_POS_INDEX = 2;
|
|
||||||
static final int SEGMENTS_COUNT(int[] x) { return x[0]; }
|
|
||||||
static final int SEGMENTS_LEN(int[] x) { return (SEGMENTS_COUNT(x)*4+4); }
|
|
||||||
static final int SEGMENTS_POS(int[] x,int i) { return x[FIRST_SEG_POS_INDEX+i]; }
|
|
||||||
static final int SEGMENTS_NUM(int[] x,int i) { return x[x[1]+i]-FIRST_SEG_POS_INDEX; }
|
|
||||||
|
|
||||||
private static final String COPYRIGHT =
|
private static final String COPYRIGHT =
|
||||||
"\u00A9 IBM Corporation 1999-2001. All rights reserved.";
|
"\u00A9 IBM Corporation 1999-2001. All rights reserved.";
|
||||||
|
|
||||||
@ -165,12 +146,8 @@ class TransliterationRule {
|
|||||||
* 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
|
* 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
|
||||||
* "xyz" and moves the cursor to before "a". It would have a cursorOffset
|
* "xyz" and moves the cursor to before "a". It would have a cursorOffset
|
||||||
* of -3.
|
* of -3.
|
||||||
* @param segs array of 2n integers. Each of n pairs consists of offset,
|
* @param segs array of UnicodeMatcher corresponding to input pattern
|
||||||
* limit for a segment of the input string. Characters in the output string
|
* segments, or null if there are none
|
||||||
* refer to these segments if they are in a special range determined by the
|
|
||||||
* associated RuleBasedTransliterator.Data object. May be null if there are
|
|
||||||
* no segments. The caller is responsible for validating that segments
|
|
||||||
* are well-formed.
|
|
||||||
* @param anchorStart true if the rule is anchored on the left to
|
* @param anchorStart true if the rule is anchored on the left to
|
||||||
* the context start
|
* the context start
|
||||||
* @param anchorEnd true if the rule is anchored on the right to the
|
* @param anchorEnd true if the rule is anchored on the right to the
|
||||||
@ -180,7 +157,7 @@ class TransliterationRule {
|
|||||||
int anteContextPos, int postContextPos,
|
int anteContextPos, int postContextPos,
|
||||||
String output,
|
String output,
|
||||||
int cursorPos, int cursorOffset,
|
int cursorPos, int cursorOffset,
|
||||||
int[] segs,
|
UnicodeMatcher[] segs,
|
||||||
boolean anchorStart, boolean anchorEnd,
|
boolean anchorStart, boolean anchorEnd,
|
||||||
RuleBasedTransliterator.Data theData) {
|
RuleBasedTransliterator.Data theData) {
|
||||||
data = theData;
|
data = theData;
|
||||||
@ -212,25 +189,11 @@ class TransliterationRule {
|
|||||||
this.cursorPos = cursorPos + cursorOffset;
|
this.cursorPos = cursorPos + cursorOffset;
|
||||||
this.output = output;
|
this.output = output;
|
||||||
// We don't validate the segments array. The caller must
|
// We don't validate the segments array. The caller must
|
||||||
// guarantee that the segments are well-formed.
|
// guarantee that the segments are well-formed (that is, that
|
||||||
|
// all $n references in the output refer to indices of this
|
||||||
|
// array, and that no array elements are null).
|
||||||
this.segments = segs;
|
this.segments = segs;
|
||||||
|
|
||||||
// Find the position of the first segment index that is after the
|
|
||||||
// anteContext (in the key). Note that this may be a start or a
|
|
||||||
// limit index. If all segments are in the ante context,
|
|
||||||
// firstKeySeg should point past the last segment -- that is, it
|
|
||||||
// should point at the end marker, which is -1. This allows the
|
|
||||||
// code to back up by one to obtain the last ante context segment.
|
|
||||||
firstKeySeg = -1;
|
|
||||||
if (segments != null) {
|
|
||||||
firstKeySeg = FIRST_SEG_POS_INDEX;
|
|
||||||
while (segments[firstKeySeg] >= 0 &&
|
|
||||||
segments[firstKeySeg] < anteContextLength) {
|
|
||||||
++firstKeySeg;
|
|
||||||
}
|
|
||||||
firstKeySeg -= FIRST_SEG_POS_INDEX; // make relative to FSPI
|
|
||||||
}
|
|
||||||
|
|
||||||
pattern = input;
|
pattern = input;
|
||||||
flags = 0;
|
flags = 0;
|
||||||
if (anchorStart) {
|
if (anchorStart) {
|
||||||
@ -410,25 +373,12 @@ class TransliterationRule {
|
|||||||
|
|
||||||
// ============================ MATCH ===========================
|
// ============================ MATCH ===========================
|
||||||
|
|
||||||
// Record the actual positions, in the text, of the segments.
|
// Reset segment match data
|
||||||
// These are recorded in the order that they occur in the pattern.
|
|
||||||
|
|
||||||
// segPos[] is an array of 2*SEGMENTS_COUNT elements. It
|
|
||||||
// records the position in 'text' of each segment boundary, in
|
|
||||||
// the order that they occur in 'pattern'.
|
|
||||||
int[] segPos = null;
|
|
||||||
if (segments != null) {
|
if (segments != null) {
|
||||||
segPos = new int[2*SEGMENTS_COUNT(segments)];
|
for (int i=0; i<segments.length; ++i) {
|
||||||
|
((StringMatcher) segments[i]).resetMatch();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// iSeg is an index into segments[] that accesses the first
|
|
||||||
// array. As such it ranges from 0 to SEGMENTS_COUNT*2 - 1.
|
|
||||||
// When indexing into segments[] FIRST_SEG_POS_INDEX must be
|
|
||||||
// added to it: segments[FIRST_SEG_POS_INDEX + iSeg].
|
|
||||||
int iSeg = firstKeySeg - 1;
|
|
||||||
// nextSegPos is an offset in 'pattern'. When the cursor is
|
|
||||||
// equal to nextSegPos, we are at a segment boundary, and we
|
|
||||||
// record the position in the real text in segPos[].
|
|
||||||
int nextSegPos = (iSeg >= 0) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
|
|
||||||
|
|
||||||
int lenDelta, keyLimit;
|
int lenDelta, keyLimit;
|
||||||
int[] intRef = new int[1];
|
int[] intRef = new int[1];
|
||||||
@ -465,15 +415,6 @@ class TransliterationRule {
|
|||||||
}
|
}
|
||||||
oText = intRef[0];
|
oText = intRef[0];
|
||||||
}
|
}
|
||||||
while (nextSegPos == oPattern) {
|
|
||||||
segPos[iSeg] = oText;
|
|
||||||
if (oText >= 0) {
|
|
||||||
segPos[iSeg] += UTF16.getCharCount(UTF16.charAt(text, oText));
|
|
||||||
} else {
|
|
||||||
++segPos[iSeg];
|
|
||||||
}
|
|
||||||
nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[FIRST_SEG_POS_INDEX+iSeg] : -1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
minOText = posAfter(text, oText);
|
minOText = posAfter(text, oText);
|
||||||
@ -486,9 +427,6 @@ class TransliterationRule {
|
|||||||
|
|
||||||
// -------------------- Key and Post Context --------------------
|
// -------------------- Key and Post Context --------------------
|
||||||
|
|
||||||
iSeg = firstKeySeg;
|
|
||||||
nextSegPos = (iSeg >= 0) ? (segments[FIRST_SEG_POS_INDEX+iSeg] - anteContextLength) : -1;
|
|
||||||
|
|
||||||
oPattern = 0;
|
oPattern = 0;
|
||||||
oText = pos.start;
|
oText = pos.start;
|
||||||
keyLimit = 0;
|
keyLimit = 0;
|
||||||
@ -511,10 +449,6 @@ class TransliterationRule {
|
|||||||
// depending on whether we're in the key or in the post
|
// depending on whether we're in the key or in the post
|
||||||
// context.
|
// context.
|
||||||
|
|
||||||
while (oPattern == nextSegPos) {
|
|
||||||
segPos[iSeg] = oText;
|
|
||||||
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
|
|
||||||
}
|
|
||||||
if (oPattern == keyLength) {
|
if (oPattern == keyLength) {
|
||||||
keyLimit = oText;
|
keyLimit = oText;
|
||||||
}
|
}
|
||||||
@ -554,10 +488,6 @@ class TransliterationRule {
|
|||||||
//! return UnicodeMatcher.U_MISMATCH;
|
//! return UnicodeMatcher.U_MISMATCH;
|
||||||
//!}
|
//!}
|
||||||
}
|
}
|
||||||
while (oPattern == nextSegPos) {
|
|
||||||
segPos[iSeg] = oText;
|
|
||||||
nextSegPos = segments[FIRST_SEG_POS_INDEX+(++iSeg)] - anteContextLength;
|
|
||||||
}
|
|
||||||
if (oPattern == keyLength) {
|
if (oPattern == keyLength) {
|
||||||
keyLimit = oText;
|
keyLimit = oText;
|
||||||
}
|
}
|
||||||
@ -576,8 +506,7 @@ class TransliterationRule {
|
|||||||
// =========================== REPLACE ==========================
|
// =========================== REPLACE ==========================
|
||||||
|
|
||||||
// We have a full match. The key is between pos.start and
|
// We have a full match. The key is between pos.start and
|
||||||
// keyLimit. Segment indices have been recorded in segPos[].
|
// keyLimit.
|
||||||
// Perform a replacement.
|
|
||||||
|
|
||||||
if (segments == null) {
|
if (segments == null) {
|
||||||
text.replace(pos.start, keyLimit, output);
|
text.replace(pos.start, keyLimit, output);
|
||||||
@ -629,11 +558,22 @@ class TransliterationRule {
|
|||||||
buf.setLength(0);
|
buf.setLength(0);
|
||||||
}
|
}
|
||||||
// Copy segment with out-of-band data
|
// Copy segment with out-of-band data
|
||||||
b *= 2;
|
StringMatcher m = (StringMatcher) segments[b];
|
||||||
int start = segPos[SEGMENTS_NUM(segments,b)];
|
int start = m.getMatchStart();
|
||||||
int limit = segPos[SEGMENTS_NUM(segments,b+1)];
|
int limit = m.getMatchLimit();
|
||||||
text.copy(start, limit, dest);
|
// If there was no match, that means that a quantifier
|
||||||
dest += limit - start;
|
// matched zero-length. E.g., x (a)* y matched "xy".
|
||||||
|
if (start >= 0) {
|
||||||
|
// Adjust indices for segments in post context
|
||||||
|
// for any inserted text between the key and
|
||||||
|
// the post context.
|
||||||
|
if (start >= keyLimit) {
|
||||||
|
start += dest - keyLimit;
|
||||||
|
limit += dest - keyLimit;
|
||||||
|
}
|
||||||
|
text.copy(start, limit, dest);
|
||||||
|
dest += limit - start;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
oOutput += UTF16.getCharCount(c);
|
oOutput += UTF16.getCharCount(c);
|
||||||
}
|
}
|
||||||
@ -790,20 +730,6 @@ class TransliterationRule {
|
|||||||
|
|
||||||
StringBuffer rule = new StringBuffer();
|
StringBuffer rule = new StringBuffer();
|
||||||
|
|
||||||
// iseg indexes into segments[] directly (not offset from FSPI)
|
|
||||||
int iseg = FIRST_SEG_POS_INDEX-1;
|
|
||||||
int nextSeg = -1;
|
|
||||||
// Build an array of booleans specifying open vs. close paren
|
|
||||||
boolean[] isOpen = null;
|
|
||||||
if (segments != null) {
|
|
||||||
isOpen = new boolean[2*SEGMENTS_COUNT(segments)];
|
|
||||||
for (i=0; i<2*SEGMENTS_COUNT(segments); i+=2) {
|
|
||||||
isOpen[SEGMENTS_NUM(segments,i) ] = true;
|
|
||||||
isOpen[SEGMENTS_NUM(segments,i+1)] = false;
|
|
||||||
}
|
|
||||||
nextSeg = segments[++iseg];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Accumulate special characters (and non-specials following them)
|
// Accumulate special characters (and non-specials following them)
|
||||||
// into quoteBuf. Append quoteBuf, within single quotes, when
|
// into quoteBuf. Append quoteBuf, within single quotes, when
|
||||||
// a non-quoted element must be inserted.
|
// a non-quoted element must be inserted.
|
||||||
@ -825,14 +751,6 @@ class TransliterationRule {
|
|||||||
appendToRule(rule, '{', true, escapeUnprintable, quoteBuf);
|
appendToRule(rule, '{', true, escapeUnprintable, quoteBuf);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Append either '(' or ')' if we are at a segment index
|
|
||||||
if (i == nextSeg) {
|
|
||||||
appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
|
|
||||||
'(' : ')',
|
|
||||||
true, escapeUnprintable, quoteBuf);
|
|
||||||
nextSeg = segments[++iseg];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (emitBraces && i == (anteContextLength + keyLength)) {
|
if (emitBraces && i == (anteContextLength + keyLength)) {
|
||||||
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
|
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
|
||||||
}
|
}
|
||||||
@ -847,11 +765,6 @@ class TransliterationRule {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i == nextSeg) {
|
|
||||||
// assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
|
|
||||||
appendToRule(rule, ')', true, escapeUnprintable, quoteBuf);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (emitBraces && i == (anteContextLength + keyLength)) {
|
if (emitBraces && i == (anteContextLength + keyLength)) {
|
||||||
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
|
appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
|
||||||
}
|
}
|
||||||
@ -885,7 +798,7 @@ class TransliterationRule {
|
|||||||
} else {
|
} else {
|
||||||
++seg; // make 1-based
|
++seg; // make 1-based
|
||||||
appendToRule(rule, 0x20, true, escapeUnprintable, quoteBuf);
|
appendToRule(rule, 0x20, true, escapeUnprintable, quoteBuf);
|
||||||
rule.append(0x24 /*$*/);
|
rule.append('$');
|
||||||
boolean show = false; // true if we should display digits
|
boolean show = false; // true if we should display digits
|
||||||
for (int p=9; p>=0; --p) {
|
for (int p=9; p>=0; --p) {
|
||||||
int d = seg / POW10[p];
|
int d = seg / POW10[p];
|
||||||
@ -938,6 +851,9 @@ class TransliterationRule {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* $Log: TransliterationRule.java,v $
|
* $Log: TransliterationRule.java,v $
|
||||||
|
* Revision 1.34 2001/10/30 18:04:08 alan
|
||||||
|
* jitterbug 1406: make quantified segments behave like perl counterparts
|
||||||
|
*
|
||||||
* Revision 1.33 2001/10/25 23:22:15 alan
|
* Revision 1.33 2001/10/25 23:22:15 alan
|
||||||
* jitterbug 73: changes to support zero-length matchers at end of key
|
* jitterbug 73: changes to support zero-length matchers at end of key
|
||||||
*
|
*
|
||||||
|
@ -4,8 +4,8 @@
|
|||||||
* Corporation and others. All Rights Reserved.
|
* Corporation and others. All Rights Reserved.
|
||||||
**********************************************************************
|
**********************************************************************
|
||||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliteratorParser.java,v $
|
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliteratorParser.java,v $
|
||||||
* $Date: 2001/10/24 00:03:38 $
|
* $Date: 2001/10/30 18:04:09 $
|
||||||
* $Revision: 1.7 $
|
* $Revision: 1.8 $
|
||||||
**********************************************************************
|
**********************************************************************
|
||||||
*/
|
*/
|
||||||
package com.ibm.text;
|
package com.ibm.text;
|
||||||
@ -117,6 +117,7 @@ class TransliteratorParser {
|
|||||||
private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
|
private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
|
||||||
|
|
||||||
private static final String OPERATORS = "=><";
|
private static final String OPERATORS = "=><";
|
||||||
|
private static final String HALF_ENDERS = "=><;";
|
||||||
|
|
||||||
// Other special characters
|
// Other special characters
|
||||||
private static final char QUOTE = '\'';
|
private static final char QUOTE = '\'';
|
||||||
@ -142,7 +143,7 @@ class TransliteratorParser {
|
|||||||
// private static final char ANCHOR_END = '$';
|
// private static final char ANCHOR_END = '$';
|
||||||
|
|
||||||
// Segments of the input string are delimited by "(" and ")". In the
|
// Segments of the input string are delimited by "(" and ")". In the
|
||||||
// output string these segments are referenced as "$1" through "$9".
|
// output string these segments are referenced as "$1", "$2", etc.
|
||||||
private static final char SEGMENT_OPEN = '(';
|
private static final char SEGMENT_OPEN = '(';
|
||||||
private static final char SEGMENT_CLOSE = ')';
|
private static final char SEGMENT_CLOSE = ')';
|
||||||
|
|
||||||
@ -285,209 +286,6 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
//----------------------------------------------------------------------
|
|
||||||
// class Segments
|
|
||||||
//----------------------------------------------------------------------
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Segments are parentheses-enclosed regions of the input string.
|
|
||||||
* These are referenced in the output string using the notation $1,
|
|
||||||
* $2, etc. Numbering is in order of appearance of the left
|
|
||||||
* parenthesis. Number is one-based. Segments are defined as start,
|
|
||||||
* limit pairs. Segments may nest.
|
|
||||||
*
|
|
||||||
* During parsing, segment data is encoded in an object of class
|
|
||||||
* Segments. At runtime, the same data is encoded in compact form as
|
|
||||||
* an array of integers in a TransliterationRule. The runtime encoding
|
|
||||||
* must satisfy three goals:
|
|
||||||
*
|
|
||||||
* 1. Iterate over the offsets in a pattern, from left to right,
|
|
||||||
* and indicate all segment boundaries, in order. This is done
|
|
||||||
* during matching.
|
|
||||||
*
|
|
||||||
* 2. Given a reference $n, produce the start and limit offsets
|
|
||||||
* for that segment. This is done during replacement.
|
|
||||||
*
|
|
||||||
* 3. Similar to goal 1, but in addition, indicate whether each
|
|
||||||
* segment boundary is a start or a limit, in other words, whether
|
|
||||||
* each is an open paren or a close paren. This is required by
|
|
||||||
* the toRule() method.
|
|
||||||
*
|
|
||||||
* Goal 1 must be satisfied at high speed since this is done during
|
|
||||||
* matching. Goal 2 is next most important. Goal 3 is not performance
|
|
||||||
* critical since it is only needed by toRule().
|
|
||||||
*
|
|
||||||
* The array of integers is actually two arrays concatenated. The
|
|
||||||
* first gives the index values of the open and close parentheses in
|
|
||||||
* the order they appear. The second maps segment numbers to the
|
|
||||||
* indices of the first array. The two arrays have the same length.
|
|
||||||
* Iterating over the first array satisfies goal 1. Indexing into the
|
|
||||||
* second array satisfies goal 2. Goal 3 is satisfied by iterating
|
|
||||||
* over the second array and constructing the required data when
|
|
||||||
* needed. This is what toRule() does.
|
|
||||||
*
|
|
||||||
* Example: (a b(c d)e f)
|
|
||||||
* 0 1 2 3 4 5 6
|
|
||||||
*
|
|
||||||
* First array: Indices are 0, 2, 4, and 6.
|
|
||||||
|
|
||||||
* Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
|
|
||||||
* second array is 0, 3, 1 2 -- these give the indices in the
|
|
||||||
* first array at which $1:open, $1:close, $2:open, and $2:close
|
|
||||||
* occur.
|
|
||||||
*
|
|
||||||
* The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
|
|
||||||
*
|
|
||||||
* Each subarray is terminated with a -1, and two leading entries
|
|
||||||
* give the number of segments and the offset to the first entry
|
|
||||||
* of the second array. In addition, the second array values are
|
|
||||||
* all offset by 2 so they index directly into the final array.
|
|
||||||
* The total array size is 4*segments[0] + 4. The second index is
|
|
||||||
* 2*segments[0] + 3.
|
|
||||||
*
|
|
||||||
* In the output string, a segment reference is indicated by a
|
|
||||||
* character in a special range, as defined by
|
|
||||||
* RuleBasedTransliterator.Data.
|
|
||||||
*
|
|
||||||
* Most rules have no segments, in which case segments is null, and the
|
|
||||||
* output string need not be checked for segment reference characters.
|
|
||||||
*
|
|
||||||
* See also rbt_rule.h/cpp.
|
|
||||||
*/
|
|
||||||
private static class Segments {
|
|
||||||
|
|
||||||
private Vector offsets; // holds Integer objects
|
|
||||||
|
|
||||||
private Vector isOpenParen; // holds Boolean objects
|
|
||||||
|
|
||||||
private int offset(int i) {
|
|
||||||
return ((Integer) offsets.elementAt(i)).intValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isOpen(int i) {
|
|
||||||
return ((Boolean) isOpenParen.elementAt(i)).booleanValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
// size of the Vectors
|
|
||||||
private int size() {
|
|
||||||
// assert(offset.size() == isOpenParen.size());
|
|
||||||
return offsets.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
public Segments() {
|
|
||||||
offsets = new Vector();
|
|
||||||
isOpenParen = new Vector();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void addParenthesisAt(int offset, boolean isOpen) {
|
|
||||||
offsets.addElement(new Integer(offset));
|
|
||||||
isOpenParen.addElement(new Boolean(isOpen));
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getLastParenOffset(boolean[] isOpenParen) {
|
|
||||||
if (size() == 0) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
isOpenParen[0] = isOpen(size()-1);
|
|
||||||
return offset(size()-1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Remove the last (rightmost) segment. Store its offsets in start
|
|
||||||
// and limit, and then convert all offsets at or after start to be
|
|
||||||
// equal to start. Upon failure, return FALSE. Assume that the
|
|
||||||
// caller has already called getLastParenOffset() and validated that
|
|
||||||
// there is at least one parenthesis and that the last one is a close
|
|
||||||
// paren.
|
|
||||||
public boolean extractLastParenSubstring(int[] start, int[] limit) {
|
|
||||||
// assert(offsets.size() > 0);
|
|
||||||
// assert(isOpenParen.elementAt(isOpenParen.size()-1) == 0);
|
|
||||||
int i = size() - 1;
|
|
||||||
int n = 1; // count of close parens we need to match
|
|
||||||
// Record position of the last close paren
|
|
||||||
limit[0] = offset(i);
|
|
||||||
--i; // back up to the one before the last one
|
|
||||||
while (i >= 0 && n != 0) {
|
|
||||||
n += isOpen(i) ? -1 : 1;
|
|
||||||
}
|
|
||||||
if (n != 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
// assert(i>=0);
|
|
||||||
start[0] = offset(i);
|
|
||||||
// Reset all segment pairs from i to size() - 1 to [start, start+1).
|
|
||||||
while (i<size()) {
|
|
||||||
int o = isOpen(i) ? start[0] : (start[0]+1);
|
|
||||||
offsets.setElementAt(new Integer(o), i);
|
|
||||||
++i;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Assume caller has already gotten a TRUE validate().
|
|
||||||
public int[] createArray() {
|
|
||||||
int c = count(); // number of segments
|
|
||||||
int arrayLen = 4*c + 4;
|
|
||||||
int[] array = new int[arrayLen];
|
|
||||||
int a2offset = 2*c + 3; // offset to array 2
|
|
||||||
|
|
||||||
array[0] = c;
|
|
||||||
array[1] = a2offset;
|
|
||||||
int i;
|
|
||||||
for (i=0; i<2*c; ++i) {
|
|
||||||
array[2+i] = offset(i);
|
|
||||||
}
|
|
||||||
array[a2offset-1] = -1;
|
|
||||||
array[arrayLen-1] = -1;
|
|
||||||
// Now walk through and match up segment numbers with parentheses.
|
|
||||||
// Number segments from 0. We're going to offset all entries by 2
|
|
||||||
// to skip the first two elements, array[0] and array[1].
|
|
||||||
Stack stack = new Stack();
|
|
||||||
int nextOpen = 0; // seg # of next open, 0-based
|
|
||||||
for (i=0; i<2*c; ++i) {
|
|
||||||
boolean open = isOpen(i);
|
|
||||||
// Let seg be the zero-based segment number.
|
|
||||||
// Open parens are at 2*seg in array 2.
|
|
||||||
// Close parens are at 2*seg+1 in array 2.
|
|
||||||
if (open) {
|
|
||||||
array[a2offset + 2*nextOpen] = 2+i;
|
|
||||||
stack.push(new Integer(nextOpen));
|
|
||||||
++nextOpen;
|
|
||||||
} else {
|
|
||||||
int nextClose = ((Integer) stack.pop()).intValue();
|
|
||||||
array[a2offset + 2*nextClose+1] = 2+i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// assert(stack.empty());
|
|
||||||
|
|
||||||
return array;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean validate() {
|
|
||||||
// want number of parens >= 2
|
|
||||||
// want number of parens to be even
|
|
||||||
// want first paren '('
|
|
||||||
// want parens to match up in the end
|
|
||||||
if ((size() < 2) || (size() % 2 != 0) || !isOpen(0)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
int n = 0;
|
|
||||||
for (int i=0; i<size(); ++i) {
|
|
||||||
n += isOpen(i) ? 1 : -1;
|
|
||||||
if (n < 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return n == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Number of segments
|
|
||||||
// Assume caller has already gotten a TRUE validate().
|
|
||||||
public int count() {
|
|
||||||
// assert(validate());
|
|
||||||
return size() / 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
// class RuleHalf
|
// class RuleHalf
|
||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
@ -505,11 +303,7 @@ class TransliteratorParser {
|
|||||||
public int ante = -1; // position of ante context marker '{' in text
|
public int ante = -1; // position of ante context marker '{' in text
|
||||||
public int post = -1; // position of post context marker '}' in text
|
public int post = -1; // position of post context marker '}' in text
|
||||||
|
|
||||||
// Record the position of the segment substrings and references. A
|
public int maxRef = -1; // n where maximum segment ref is $n; 1-based
|
||||||
// given side should have segments or segment references, but not
|
|
||||||
// both.
|
|
||||||
public Segments segments = null;
|
|
||||||
public int maxRef = -1; // index of largest ref (1..9)
|
|
||||||
|
|
||||||
// Record the offset to the cursor either to the left or to the
|
// Record the offset to the cursor either to the left or to the
|
||||||
// right of the key. This is indicated by characters on the output
|
// right of the key. This is indicated by characters on the output
|
||||||
@ -521,29 +315,88 @@ class TransliteratorParser {
|
|||||||
// output text.
|
// output text.
|
||||||
public int cursorOffset = 0; // only nonzero on output side
|
public int cursorOffset = 0; // only nonzero on output side
|
||||||
|
|
||||||
|
// Position of first CURSOR_OFFSET on _right_. This will be -1
|
||||||
|
// for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
|
||||||
|
private int cursorOffsetPos = 0;
|
||||||
|
|
||||||
public boolean anchorStart = false;
|
public boolean anchorStart = false;
|
||||||
public boolean anchorEnd = false;
|
public boolean anchorEnd = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* UnicodeMatcher objects corresponding to each segment.
|
||||||
|
*/
|
||||||
|
public Vector segments = new Vector();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The segment number from 0..n-1 of the next '(' we see
|
||||||
|
* during parsing; 0-based.
|
||||||
|
*/
|
||||||
|
private int nextSegmentNumber = 0;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse one side of a rule, stopping at either the limit,
|
* Parse one side of a rule, stopping at either the limit,
|
||||||
* the END_OF_RULE character, or an operator. Return
|
* the END_OF_RULE character, or an operator.
|
||||||
* the pos of the terminating character (or limit).
|
* @return the index after the terminating character, or
|
||||||
|
* if limit was reached, limit
|
||||||
*/
|
*/
|
||||||
public int parse(String rule, int pos, int limit,
|
public int parse(String rule, int pos, int limit,
|
||||||
TransliteratorParser parser) {
|
TransliteratorParser parser) {
|
||||||
int start = pos;
|
int start = pos;
|
||||||
StringBuffer buf = new StringBuffer();
|
StringBuffer buf = new StringBuffer();
|
||||||
|
pos = parseSection(rule, pos, limit, parser, buf, false);
|
||||||
|
text = buf.toString();
|
||||||
|
|
||||||
|
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
|
||||||
|
syntaxError("Misplaced " + CURSOR_POS, rule, start);
|
||||||
|
}
|
||||||
|
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse a section of one side of a rule, stopping at either
|
||||||
|
* the limit, the END_OF_RULE character, an operator, or a
|
||||||
|
* segment close character. This method parses both a
|
||||||
|
* top-level rule half and a segment within such a rule half.
|
||||||
|
* It calls itself recursively to parse segments and nested
|
||||||
|
* segments.
|
||||||
|
* @param buf buffer into which to accumulate the rule pattern
|
||||||
|
* characters, either literal characters from the rule or
|
||||||
|
* standins for UnicodeMatcher objects including segments.
|
||||||
|
* @param isSegment if true, then we've already seen a '(' and
|
||||||
|
* pos on entry points right after it. Accumulate everything
|
||||||
|
* up to the closing ')', put it in a segment matcher object,
|
||||||
|
* generate a standin for it, and add the standin to buf. As
|
||||||
|
* a side effect, update the segments vector with a reference
|
||||||
|
* to the segment matcher. This works recursively for nested
|
||||||
|
* segments. If isSegment is false, just accumulate
|
||||||
|
* characters into buf.
|
||||||
|
* @return the index after the terminating character, or
|
||||||
|
* if limit was reached, limit
|
||||||
|
*/
|
||||||
|
private int parseSection(String rule, int pos, int limit,
|
||||||
|
TransliteratorParser parser,
|
||||||
|
StringBuffer buf,
|
||||||
|
boolean isSegment) {
|
||||||
|
int start = pos;
|
||||||
ParsePosition pp = null;
|
ParsePosition pp = null;
|
||||||
int cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
|
|
||||||
boolean done = false;
|
|
||||||
int quoteStart = -1; // Most recent 'single quoted string'
|
int quoteStart = -1; // Most recent 'single quoted string'
|
||||||
int quoteLimit = -1;
|
int quoteLimit = -1;
|
||||||
int varStart = -1; // Most recent $variableReference
|
int varStart = -1; // Most recent $variableReference
|
||||||
int varLimit = -1;
|
int varLimit = -1;
|
||||||
int[] iref = new int[1];
|
int[] iref = new int[1];
|
||||||
|
|
||||||
|
// If isSegment, then bufSegStart is the offset in buf to
|
||||||
|
// the first character of the segment we are parsing.
|
||||||
|
int bufSegStart = 0;
|
||||||
|
int segmentNumber = 0;
|
||||||
|
if (isSegment) {
|
||||||
|
bufSegStart = buf.length();
|
||||||
|
segmentNumber = nextSegmentNumber++;
|
||||||
|
}
|
||||||
|
|
||||||
main:
|
main:
|
||||||
while (pos < limit && !done) {
|
while (pos < limit) {
|
||||||
char c = rule.charAt(pos++);
|
char c = rule.charAt(pos++);
|
||||||
if (Character.isWhitespace(c)) {
|
if (Character.isWhitespace(c)) {
|
||||||
// Ignore whitespace. Note that this is not Unicode
|
// Ignore whitespace. Note that this is not Unicode
|
||||||
@ -551,8 +404,11 @@ class TransliteratorParser {
|
|||||||
// whitespace likely to be seen in code.
|
// whitespace likely to be seen in code.
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (OPERATORS.indexOf(c) >= 0) {
|
// HALF_ENDERS is all chars that end a rule half: "<>=;"
|
||||||
--pos; // Backup to point to operator
|
if (HALF_ENDERS.indexOf(c) >= 0) {
|
||||||
|
if (isSegment) {
|
||||||
|
syntaxError("Unclosed segment", rule, start);
|
||||||
|
}
|
||||||
break main;
|
break main;
|
||||||
}
|
}
|
||||||
if (anchorEnd) {
|
if (anchorEnd) {
|
||||||
@ -614,7 +470,12 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (c) {
|
switch (c) {
|
||||||
|
|
||||||
|
//------------------------------------------------------
|
||||||
|
// Elements allowed within and out of segments
|
||||||
|
//------------------------------------------------------
|
||||||
case ANCHOR_START:
|
case ANCHOR_START:
|
||||||
if (buf.length() == 0 && !anchorStart) {
|
if (buf.length() == 0 && !anchorStart) {
|
||||||
anchorStart = true;
|
anchorStart = true;
|
||||||
@ -624,17 +485,8 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case SEGMENT_OPEN:
|
case SEGMENT_OPEN:
|
||||||
case SEGMENT_CLOSE:
|
pos = parseSection(rule, pos, limit, parser, buf, true);
|
||||||
// Handle segment definitions "(" and ")"
|
|
||||||
// Parse "(", ")"
|
|
||||||
if (segments == null) {
|
|
||||||
segments = new Segments();
|
|
||||||
}
|
|
||||||
segments.addParenthesisAt(buf.length(), c == SEGMENT_OPEN);
|
|
||||||
break;
|
break;
|
||||||
case END_OF_RULE:
|
|
||||||
--pos; // Backup to point to END_OF_RULE
|
|
||||||
break main;
|
|
||||||
case SymbolTable.SYMBOL_REF:
|
case SymbolTable.SYMBOL_REF:
|
||||||
// Handle variable references and segment references "$1" .. "$9"
|
// Handle variable references and segment references "$1" .. "$9"
|
||||||
{
|
{
|
||||||
@ -676,7 +528,7 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
pp.setIndex(pos);
|
pp.setIndex(pos);
|
||||||
String name = parser.parseData.
|
String name = parser.parseData.
|
||||||
parseReference(rule, pp, limit);
|
parseReference(rule, pp, limit);
|
||||||
if (name == null) {
|
if (name == null) {
|
||||||
// This means the '$' was not followed by a
|
// This means the '$' was not followed by a
|
||||||
// valid name. Try to interpret it as an
|
// valid name. Try to interpret it as an
|
||||||
@ -697,25 +549,129 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case DOT:
|
||||||
|
buf.append(parser.getDotStandIn());
|
||||||
|
break;
|
||||||
|
case KLEENE_STAR:
|
||||||
|
case ONE_OR_MORE:
|
||||||
|
case ZERO_OR_ONE:
|
||||||
|
// Quantifiers. We handle single characters, quoted strings,
|
||||||
|
// variable references, and segments.
|
||||||
|
// a+ matches aaa
|
||||||
|
// 'foo'+ matches foofoofoo
|
||||||
|
// $v+ matches xyxyxy if $v == xy
|
||||||
|
// (seg)+ matches segsegseg
|
||||||
|
{
|
||||||
|
if (isSegment && buf.length() == bufSegStart) {
|
||||||
|
// The */+ immediately follows '('
|
||||||
|
syntaxError("Misplaced quantifier", rule, start);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
int qstart, qlimit;
|
||||||
|
// The */+ follows an isolated character or quote
|
||||||
|
// or variable reference
|
||||||
|
if (buf.length() == quoteLimit) {
|
||||||
|
// The */+ follows a 'quoted string'
|
||||||
|
qstart = quoteStart;
|
||||||
|
qlimit = quoteLimit;
|
||||||
|
} else if (buf.length() == varLimit) {
|
||||||
|
// The */+ follows a $variableReference
|
||||||
|
qstart = varStart;
|
||||||
|
qlimit = varLimit;
|
||||||
|
} else {
|
||||||
|
// The */+ follows a single character, possibly
|
||||||
|
// a segment standin
|
||||||
|
qstart = buf.length() - 1;
|
||||||
|
qlimit = qstart + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
UnicodeMatcher m =
|
||||||
|
new StringMatcher(buf.toString(), qstart, qlimit,
|
||||||
|
false, parser.data);
|
||||||
|
int min = 0;
|
||||||
|
int max = Quantifier.MAX;
|
||||||
|
switch (c) {
|
||||||
|
case ONE_OR_MORE:
|
||||||
|
min = 1;
|
||||||
|
break;
|
||||||
|
case ZERO_OR_ONE:
|
||||||
|
min = 0;
|
||||||
|
max = 1;
|
||||||
|
break;
|
||||||
|
// case KLEENE_STAR:
|
||||||
|
// do nothing -- min, max already set
|
||||||
|
}
|
||||||
|
m = new Quantifier(m, min, max);
|
||||||
|
buf.setLength(qstart);
|
||||||
|
buf.append(parser.generateStandInFor(m));
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
//------------------------------------------------------
|
||||||
|
// Elements allowed ONLY WITHIN segments
|
||||||
|
//------------------------------------------------------
|
||||||
|
case SEGMENT_CLOSE:
|
||||||
|
if (isSegment) {
|
||||||
|
// We're done parsing a segment. The relevant
|
||||||
|
// characters are in buf, starting at offset
|
||||||
|
// bufSegStart. Extract them into a string
|
||||||
|
// matcher, and replace them with a standin
|
||||||
|
// for that matcher.
|
||||||
|
StringMatcher m =
|
||||||
|
new StringMatcher(buf.substring(bufSegStart),
|
||||||
|
true, parser.data);
|
||||||
|
// Since we call parseSection() recursively,
|
||||||
|
// nested segments will result in segment i+1
|
||||||
|
// getting parsed and stored before segment i;
|
||||||
|
// be careful with the vector handling here.
|
||||||
|
if ((segmentNumber+1) > segments.size()) {
|
||||||
|
segments.setSize(segmentNumber+1);
|
||||||
|
}
|
||||||
|
segments.setElementAt(m, segmentNumber);
|
||||||
|
buf.setLength(bufSegStart);
|
||||||
|
buf.append(parser.generateStandInFor(m));
|
||||||
|
break main;
|
||||||
|
}
|
||||||
|
// If we aren't in a segment, then a segment close
|
||||||
|
// character is a syntax error.
|
||||||
|
syntaxError("Unquoted special", rule, start);
|
||||||
|
break;
|
||||||
|
|
||||||
|
//------------------------------------------------------
|
||||||
|
// Elements allowed ONLY OUTSIDE segments
|
||||||
|
//------------------------------------------------------
|
||||||
case CONTEXT_ANTE:
|
case CONTEXT_ANTE:
|
||||||
|
if (isSegment) {
|
||||||
|
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||||
|
}
|
||||||
if (ante >= 0) {
|
if (ante >= 0) {
|
||||||
syntaxError("Multiple ante contexts", rule, start);
|
syntaxError("Multiple ante contexts", rule, start);
|
||||||
}
|
}
|
||||||
ante = buf.length();
|
ante = buf.length();
|
||||||
break;
|
break;
|
||||||
case CONTEXT_POST:
|
case CONTEXT_POST:
|
||||||
|
if (isSegment) {
|
||||||
|
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||||
|
}
|
||||||
if (post >= 0) {
|
if (post >= 0) {
|
||||||
syntaxError("Multiple post contexts", rule, start);
|
syntaxError("Multiple post contexts", rule, start);
|
||||||
}
|
}
|
||||||
post = buf.length();
|
post = buf.length();
|
||||||
break;
|
break;
|
||||||
case CURSOR_POS:
|
case CURSOR_POS:
|
||||||
|
if (isSegment) {
|
||||||
|
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||||
|
}
|
||||||
if (cursor >= 0) {
|
if (cursor >= 0) {
|
||||||
syntaxError("Multiple cursors", rule, start);
|
syntaxError("Multiple cursors", rule, start);
|
||||||
}
|
}
|
||||||
cursor = buf.length();
|
cursor = buf.length();
|
||||||
break;
|
break;
|
||||||
case CURSOR_OFFSET:
|
case CURSOR_OFFSET:
|
||||||
|
if (isSegment) {
|
||||||
|
syntaxError("Illegal character '" + c + "' in segment", rule, start);
|
||||||
|
}
|
||||||
if (cursorOffset < 0) {
|
if (cursorOffset < 0) {
|
||||||
if (buf.length() > 0) {
|
if (buf.length() > 0) {
|
||||||
syntaxError("Misplaced " + c, rule, start);
|
syntaxError("Misplaced " + c, rule, start);
|
||||||
@ -737,74 +693,10 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case DOT:
|
|
||||||
buf.append(parser.getDotStandIn());
|
//------------------------------------------------------
|
||||||
break;
|
// Non-special characters
|
||||||
case KLEENE_STAR:
|
//------------------------------------------------------
|
||||||
case ONE_OR_MORE:
|
|
||||||
case ZERO_OR_ONE:
|
|
||||||
// Quantifiers. We handle single characters, quoted strings,
|
|
||||||
// variable references, and segments.
|
|
||||||
// a+ matches aaa
|
|
||||||
// 'foo'+ matches foofoofoo
|
|
||||||
// $v+ matches xyxyxy if $v == xy
|
|
||||||
// (seg)+ matches segsegseg
|
|
||||||
{
|
|
||||||
int qstart, qlimit;
|
|
||||||
boolean[] isOpenParen = new boolean[1];
|
|
||||||
boolean isSegment = false;
|
|
||||||
if (segments != null &&
|
|
||||||
segments.getLastParenOffset(isOpenParen) == buf.length()) {
|
|
||||||
// The */+ immediately follows a segment
|
|
||||||
if (isOpenParen[0]) {
|
|
||||||
syntaxError("Misplaced quantifier", rule, start);
|
|
||||||
}
|
|
||||||
int[] startparam = new int[1];
|
|
||||||
int[] limitparam = new int[1];
|
|
||||||
if (!segments.extractLastParenSubstring(startparam, limitparam)) {
|
|
||||||
syntaxError("Mismatched segment delimiters", rule, start);
|
|
||||||
}
|
|
||||||
qstart = startparam[0];
|
|
||||||
qlimit = limitparam[0];
|
|
||||||
isSegment = true;
|
|
||||||
} else {
|
|
||||||
// The */+ follows an isolated character or quote
|
|
||||||
// or variable reference
|
|
||||||
if (buf.length() == quoteLimit) {
|
|
||||||
// The */+ follows a 'quoted string'
|
|
||||||
qstart = quoteStart;
|
|
||||||
qlimit = quoteLimit;
|
|
||||||
} else if (buf.length() == varLimit) {
|
|
||||||
// The */+ follows a $variableReference
|
|
||||||
qstart = varStart;
|
|
||||||
qlimit = varLimit;
|
|
||||||
} else {
|
|
||||||
// The */+ follows a single character
|
|
||||||
qstart = buf.length() - 1;
|
|
||||||
qlimit = qstart + 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
UnicodeMatcher m =
|
|
||||||
new StringMatcher(buf.toString(), qstart, qlimit,
|
|
||||||
isSegment, parser.data);
|
|
||||||
int min = 0;
|
|
||||||
int max = Quantifier.MAX;
|
|
||||||
switch (c) {
|
|
||||||
case ONE_OR_MORE:
|
|
||||||
min = 1;
|
|
||||||
break;
|
|
||||||
case ZERO_OR_ONE:
|
|
||||||
min = 0;
|
|
||||||
max = 1;
|
|
||||||
break;
|
|
||||||
// case KLEENE_STAR:
|
|
||||||
// do nothing -- min, max already set
|
|
||||||
}
|
|
||||||
m = new Quantifier(m, min, max);
|
|
||||||
buf.setLength(qstart);
|
|
||||||
buf.append(parser.generateStandInFor(m));
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
default:
|
default:
|
||||||
// Disallow unquoted characters other than [0-9A-Za-z]
|
// Disallow unquoted characters other than [0-9A-Za-z]
|
||||||
// in the printable ASCII range. These characters are
|
// in the printable ASCII range. These characters are
|
||||||
@ -819,11 +711,6 @@ class TransliteratorParser {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
|
|
||||||
syntaxError("Misplaced " + CURSOR_POS, rule, start);
|
|
||||||
}
|
|
||||||
text = buf.toString();
|
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -838,10 +725,12 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create and return an int[] array of segments.
|
* Create and return a UnicodeMatcher[] array of segments,
|
||||||
|
* or null if there are no segments.
|
||||||
*/
|
*/
|
||||||
int[] createSegments() {
|
UnicodeMatcher[] createSegments() {
|
||||||
return (segments == null) ? null : segments.createArray();
|
return (segments.size() == 0) ? null :
|
||||||
|
(UnicodeMatcher[]) segments.toArray(new UnicodeMatcher[segments.size()]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1096,9 +985,10 @@ class TransliteratorParser {
|
|||||||
pos = left.parse(rule, pos, limit, this);
|
pos = left.parse(rule, pos, limit, this);
|
||||||
|
|
||||||
if (pos == limit ||
|
if (pos == limit ||
|
||||||
OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
|
OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
|
||||||
syntaxError("No operator", rule, start);
|
syntaxError("No operator pos=" + pos, rule, start);
|
||||||
}
|
}
|
||||||
|
++pos;
|
||||||
|
|
||||||
// Found an operator char. Check for forward-reverse operator.
|
// Found an operator char. Check for forward-reverse operator.
|
||||||
if (operator == REVERSE_RULE_OP &&
|
if (operator == REVERSE_RULE_OP &&
|
||||||
@ -1110,7 +1000,7 @@ class TransliteratorParser {
|
|||||||
pos = right.parse(rule, pos, limit, this);
|
pos = right.parse(rule, pos, limit, this);
|
||||||
|
|
||||||
if (pos < limit) {
|
if (pos < limit) {
|
||||||
if (rule.charAt(pos) == END_OF_RULE) {
|
if (rule.charAt(--pos) == END_OF_RULE) {
|
||||||
++pos;
|
++pos;
|
||||||
} else {
|
} else {
|
||||||
// RuleHalf parser must have terminated at an operator
|
// RuleHalf parser must have terminated at an operator
|
||||||
@ -1173,7 +1063,7 @@ class TransliteratorParser {
|
|||||||
// apply.
|
// apply.
|
||||||
if (operator == FWDREV_RULE_OP) {
|
if (operator == FWDREV_RULE_OP) {
|
||||||
right.removeContext();
|
right.removeContext();
|
||||||
right.segments = null;
|
right.segments.removeAllElements();
|
||||||
left.cursor = left.maxRef = -1;
|
left.cursor = left.maxRef = -1;
|
||||||
left.cursorOffset = 0;
|
left.cursorOffset = 0;
|
||||||
}
|
}
|
||||||
@ -1193,7 +1083,7 @@ class TransliteratorParser {
|
|||||||
// cannot place the cursor outside the limits of the context.
|
// cannot place the cursor outside the limits of the context.
|
||||||
// Anchors are only allowed on the input side.
|
// Anchors are only allowed on the input side.
|
||||||
if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
|
if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
|
||||||
right.segments != null || left.maxRef >= 0 ||
|
right.segments.size() > 0 || left.maxRef >= 0 ||
|
||||||
(right.cursorOffset != 0 && right.cursor < 0) ||
|
(right.cursorOffset != 0 && right.cursor < 0) ||
|
||||||
// - The following two checks were used to ensure that the
|
// - The following two checks were used to ensure that the
|
||||||
// - the cursor offset stayed within the ante- or postcontext.
|
// - the cursor offset stayed within the ante- or postcontext.
|
||||||
@ -1208,14 +1098,8 @@ class TransliteratorParser {
|
|||||||
// Check integrity of segments and segment references. Each
|
// Check integrity of segments and segment references. Each
|
||||||
// segment's start must have a corresponding limit, and the
|
// segment's start must have a corresponding limit, and the
|
||||||
// references must not refer to segments that do not exist.
|
// references must not refer to segments that do not exist.
|
||||||
if (left.segments != null) {
|
if (right.maxRef > left.segments.size()) {
|
||||||
if (!left.segments.validate()) {
|
syntaxError("Undefined segment reference $" + right.maxRef, rule, start);
|
||||||
syntaxError("Missing segment close", rule, start);
|
|
||||||
}
|
|
||||||
int n = left.segments.count();
|
|
||||||
if (right.maxRef > n) {
|
|
||||||
syntaxError("Undefined segment reference", rule, start);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
data.ruleSet.addRule(new TransliterationRule(
|
data.ruleSet.addRule(new TransliterationRule(
|
||||||
@ -1363,7 +1247,7 @@ class TransliteratorParser {
|
|||||||
char generateStandInFor(UnicodeMatcher matcher) {
|
char generateStandInFor(UnicodeMatcher matcher) {
|
||||||
// assert(matcher != null);
|
// assert(matcher != null);
|
||||||
if (variableNext >= variableLimit) {
|
if (variableNext >= variableLimit) {
|
||||||
throw new RuntimeException("Private use variables exhausted");
|
throw new RuntimeException("Variable range exhausted");
|
||||||
}
|
}
|
||||||
variablesVector.addElement(matcher);
|
variablesVector.addElement(matcher);
|
||||||
return variableNext++;
|
return variableNext++;
|
||||||
@ -1379,7 +1263,7 @@ class TransliteratorParser {
|
|||||||
}
|
}
|
||||||
return (char) dotStandIn;
|
return (char) dotStandIn;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Append the value of the given variable name to the given
|
* Append the value of the given variable name to the given
|
||||||
* StringBuffer.
|
* StringBuffer.
|
||||||
|
Loading…
Reference in New Issue
Block a user