ICU-2128 fix inf loop bug in RBBI Line Monkey; simplify RBBI word monkey

X-SVN-Rev: 13272
This commit is contained in:
Andy Heninger 2003-10-02 00:18:13 +00:00
parent 82753b8a48
commit 34357bdd81

View File

@ -2129,10 +2129,6 @@ private:
RegexMatcher *fMatcher;
const UnicodeString *fText;
UChar32 *fMungedText;
int32_t fMungedLen;
int32_t *fMungedPositions;
int32_t *fOrigPositions;
RegexMatcher *fGCFMatcher;
RegexMatcher *fGCMatcher;
@ -2140,10 +2136,7 @@ private:
};
RBBIWordMonkey::RBBIWordMonkey() : fMungedText(0),
fMungedPositions(0),
fOrigPositions(0),
fGCFMatcher(0),
RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0),
fGCMatcher(0)
{
UErrorCode status = U_ZERO_ERROR;
@ -2185,12 +2178,8 @@ RBBIWordMonkey::RBBIWordMonkey() : fMungedText(0),
fSets->addElement(fFormatSet, status);
fSets->addElement(fOtherSet, status);
fMungedText = NULL;
fMungedLen = 0;
fMungedPositions = NULL;
fOrigPositions = NULL;
fGCFMatcher = new RegexMatcher("\\X(?:\\p{Format}\\p{Grapheme_Extend}*)*", 0, status);
fGCFMatcher = new RegexMatcher("\\X(?:\\p{Format})*", 0, status);
fGCMatcher = new RegexMatcher("\\X", 0, status);
if (U_FAILURE(status)) {
@ -2200,162 +2189,141 @@ RBBIWordMonkey::RBBIWordMonkey() : fMungedText(0),
// Set the text to be examined: stores a pointer to s in fText (s itself is not
// copied here) and resets the grapheme cluster matchers onto it.
// NOTE(review): this span comes from a diff view in which removed and added lines
// appear together without +/- markers. The fMunged*/fOrigPositions statements look
// like the removed version (the same members are dropped in the neighbouring hunks);
// confirm against the actual file.
void RBBIWordMonkey::setText(const UnicodeString &s) {
fText = &s;
delete [] fMungedText;
fMungedText = new UChar32[s.length()];
fMungedLen = 0;
delete [] fMungedPositions;
fMungedPositions = new int32_t[s.length()];
delete [] fOrigPositions;
fOrigPositions = new int32_t[s.length()];
// Filling every byte with 0xFF makes each int32_t entry read as -1, the value that
// next() tests for to detect "no mapping". The *4 is presumably sizeof(int32_t).
memset(fOrigPositions, -1, s.length()*4);
// Precompute the "Munged Text", which is the test text,
// converted to an array of UChar32 for easier indexing,
// and with all but the first char of each Grapheme Cluster removed (rule 3)
// and with format chars removed (rule 4)
fGCFMatcher->reset(s);
fGCMatcher ->reset(s);
int32_t pos=0;
while (fGCFMatcher->find()) {
// pos is the start index of this match in s; c is the code point at that index.
pos = fGCFMatcher->start(deferredStatus);
UChar32 c = s.char32At(pos);
// Record the mapping in both directions: munged index -> original index, and back.
fMungedPositions[fMungedLen] = pos;
fOrigPositions[pos] = fMungedLen;
fMungedText[fMungedLen++] = c;
}
fGCMatcher->reset(*fText);
fGCFMatcher->reset(*fText);
}
// Find the next word break position following prevPos in the text referenced by fText.
// Returns the index of the break, or -1 when prevPos is already at or beyond the
// end of the text.
// NOTE(review): this span comes from a diff view in which removed and added lines
// appear together without +/- markers, so two implementations overlap here (one
// indexing fMungedText with mpos, one tracking p0..p3 / c0..c3) and the braces do
// not balance as shown. Confirm against the actual file before relying on it.
int32_t RBBIWordMonkey::next(int32_t prevPos) {
UErrorCode status = U_ZERO_ERROR;
int p0, p1, p2, p3; // Indices of the significant code points around the
// break position being tested. The candidate break
// location is before p2.
int breakPos = -1;
UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
// Prev break at end of string. return DONE.
if (prevPos >= fText->length()) {
return -1;
}
p0 = p1 = p2 = p3 = prevPos;
c3 = fText->char32At(prevPos);
c0 = c1 = c2 = 0;
// If the previous position doesn't map to a position in the munged text,
// it means that the prev position was pointing to a trailing format char
// Advance, looking for additional format chars while doing so.
if (fOrigPositions[prevPos] == -1) {
// Advance by one grapheme cluster (could include combining marks)
fGCMatcher->reset();
fGCMatcher->find(prevPos, status);
int32_t pos = fGCMatcher->end(status);
if (U_FAILURE(status)) {
pos = -1;
}
// TODO: Don't return extend chars here!!!
return pos;
// Format char after prev break? Special case, see last Note for Word Boundaries TR.
// break immediately after the format char.
if (fFormatSet->contains(c3)) {
breakPos = fText->moveIndex32(prevPos, 1);
return breakPos;
}
// Loop runs once per position in the munged test text, until a break position
// is found.
int32_t mpos = fOrigPositions[prevPos];
for (; ; mpos++) {
UChar32 letter = fMungedText[mpos];
// Break at end of text.
if (mpos >= fMungedLen-1) {
mpos = fMungedLen;
// Loop runs once per "significant" character position in the input text.
for (;;) {
// Move all of the positions forward in the input string.
p0 = p1; c0 = c1;
p1 = p2; c1 = c2;
p2 = p3; c2 = c3;
// Advance p3 by (GC Format*) Rules 3, 4
status = U_ZERO_ERROR;
if (fGCFMatcher->find(p3, status) == FALSE) {
p3 = fText->length();
c3 = 0;
} else {
p3 = fGCFMatcher->end(0, status);
U_ASSERT(U_SUCCESS(status));
c3 = fText->char32At(p3);
}
if (p1 == p2) {
// Still warming up the loop. (won't work with zero length strings, but we don't care)
continue;
}
if (p2 == fText->length()) {
// Reached end of string. Always a break position.
break;
}
// Rule (5). ALetter x ALetter
if (fALetterSet->contains(fMungedText[mpos]) &&
fALetterSet->contains(fMungedText[mpos+1])) {
if (fALetterSet->contains(c1) &&
fALetterSet->contains(c2)) {
continue;
}
// Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
if ((mpos+2) < fMungedLen &&
fALetterSet->contains(fMungedText[mpos]) &&
(fMidLetterSet->contains(fMungedText[mpos+1]) ||
fMidNumLetSet->contains(fMungedText[mpos+1]) ) &&
fALetterSet->contains(fMungedText[mpos+2]))
//
// Also incorporates rule 7 by skipping pos ahead to position of the
// terminating ALetter.
if ( fALetterSet->contains(c1) &&
(fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
fALetterSet->contains(c3)) {
continue;
}
// Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
if (mpos >= 1 &&
fALetterSet->contains(fMungedText[mpos-1]) &&
(fMidLetterSet->contains(fMungedText[mpos]) ||
fMidNumLetSet->contains(fMungedText[mpos]) ) &&
fALetterSet->contains(fMungedText[mpos+1]))
if (fALetterSet->contains(c0) &&
(fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) ) &&
fALetterSet->contains(c2)) {
continue;
}
// Rule (8) Numeric x Numeric
if (fNumericSet->contains(fMungedText[mpos]) &&
fNumericSet->contains(fMungedText[mpos+1])) {
if (fNumericSet->contains(c1) &&
fNumericSet->contains(c2)) {
continue;
}
// Rule (9) ALetter x Numeric
if (fALetterSet->contains(fMungedText[mpos]) &&
fNumericSet->contains(fMungedText[mpos+1])) {
if (fALetterSet->contains(c1) &&
fNumericSet->contains(c2)) {
continue;
}
// Rule (10) Numeric x ALetter
if (fNumericSet->contains(fMungedText[mpos]) &&
fALetterSet->contains(fMungedText[mpos+1])) {
if (fNumericSet->contains(c1) &&
fALetterSet->contains(c2)) {
continue;
}
// Rule (11) Numeric (MidNum | MidNumLet) x Numeric
if (mpos >= 1 &&
fNumericSet->contains(fMungedText[mpos-1]) &&
(fMidNumSet->contains(fMungedText[mpos]) ||
fMidNumLetSet->contains(fMungedText[mpos]) ) &&
fNumericSet->contains(fMungedText[mpos+1]))
if ( fNumericSet->contains(c0) &&
(fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1)) &&
fNumericSet->contains(c2)) {
continue;
}
// Rule (12) Numeric x (MidNum | MidNumLet) Numeric
if ((mpos+2) < fMungedLen &&
fNumericSet->contains(fMungedText[mpos]) &&
(fMidNumSet->contains(fMungedText[mpos+1]) ||
fMidNumLetSet->contains(fMungedText[mpos+1]) ) &&
fNumericSet->contains(fMungedText[mpos+2]))
if (fNumericSet->contains(c1) &&
(fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
fNumericSet->contains(c3)) {
continue;
}
// Rule (13) Katakana x Katakana
if (fKatakanaSet->contains(fMungedText[mpos]) &&
fKatakanaSet->contains(fMungedText[mpos+1])) {
if (fKatakanaSet->contains(c1) &&
fKatakanaSet->contains(c2)) {
continue;
}
// Rule 14. Break found here.
mpos++;
break;
}
// We have a break position in terms of an index in the munged data.
// Get the corresponding index in the original test text.
int32_t breakPos;
if (mpos == fMungedLen) {
breakPos = fText->length();
} else {
breakPos = fMungedPositions[mpos];
}
// Rule 4 fixup, back up before any trailing
// format characters at the end of the word.
int32_t t = breakPos;
for (;;) {
t = fText->moveIndex32(t, -1);
if (t <= prevPos) {
break;
}
UChar32 prevC = fText->char32At(t);
if (fExtendSet->contains(prevC)) {
continue;
}
if (fFormatSet->contains(prevC) == FALSE) {
break;
}
breakPos = t;
// format characters at the end of the word.
// Default to breaking at p2; if fGCMatcher finds a match when searching from p1,
// use the end of that match as the break position instead.
breakPos = p2;
status = U_ZERO_ERROR;
if (fGCMatcher->find(p1, status)) {
breakPos = fGCMatcher->end(0, status);
U_ASSERT(U_SUCCESS(status));
}
return breakPos;
}
@ -2377,10 +2345,6 @@ RBBIWordMonkey::~RBBIWordMonkey() {
delete fExtendSet;
delete fOtherSet;
delete [] fMungedText;
delete [] fMungedPositions;
delete [] fOrigPositions;
delete fGCFMatcher;
delete fGCMatcher;
}
@ -2798,7 +2762,7 @@ fall_through_11:
nextPos = numEndIdx;
pos = numEndIdx;
do {
pos = fText->moveIndex32(nextPos, -1);
pos = fText->moveIndex32(pos, -1);
lastCharInNumber = fText->char32At(pos);
} while (fCM->contains(lastCharInNumber));
continue;
@ -2879,7 +2843,7 @@ RBBILineMonkey::~RBBILineMonkey() {
// -1: run forever.
// 0 or greater: run length.
//
// type = char | work | line | sent | title
// type = char | word | line | sent | title
//
//-------------------------------------------------------------------------------------------
@ -2966,6 +2930,10 @@ void RBBITest::TestMonkey(char *params) {
logln("Line Break Monkey Test");
RBBILineMonkey m;
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
if (params == NULL) {
// TODO: Resolve rule ambiguities, unpin loop count.
loopCount = 2;
}
RunMonkey(bi, m, "line", seed, loopCount);
delete bi;
}