scuffed-code/icu4c/source/i18n/rematch.cpp
Andy Heninger 14bcaaf58e ICU-20876 Regex Grapheme Cluster matching with Break Iterators.
Change the implementation of grapheme cluster matching in regex to use an ICU
break iterator instead of a little one-off state machine.

The old implementation had fallen behind the Unicode UAX-29 specification for
graphem clusters, and could not be easily updated.

The implementation follows the same general pattern that is used for finding
word boundaries with an ICU break iterator. In reviewing that code, a few
improvements to the handling of ICU error codes were also made.

Also note that this change adds a new dependency on Break Iteration.  Regex
patterns that previously would work with ICU builds that were configured with
no break iteration will now fail. But only if they include \X for matching
grapheme cluster boundaries.
2020-02-18 18:28:10 -08:00

5716 lines
214 KiB
C++

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**************************************************************************
* Copyright (C) 2002-2016 International Business Machines Corporation
* and others. All rights reserved.
**************************************************************************
*/
//
// file: rematch.cpp
//
// Contains the implementation of class RegexMatcher,
// which is one of the main API classes for the ICU regular expression package.
//
#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/regex.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "unicode/rbbi.h"
#include "unicode/utf.h"
#include "unicode/utf16.h"
#include "uassert.h"
#include "cmemory.h"
#include "cstr.h"
#include "uvector.h"
#include "uvectr32.h"
#include "uvectr64.h"
#include "regeximp.h"
#include "regexst.h"
#include "regextxt.h"
#include "ucase.h"
// #include <malloc.h> // Needed for heapcheck testing
U_NAMESPACE_BEGIN
// Default limit for the size of the back track stack, to avoid system
// failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
// This value puts ICU's limits higher than most other regexp implementations,
// which use recursion rather than the heap, and take more storage per
// backtrack point.
//
static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
// Time limit counter constant.
// Time limits for expression evaluation are in terms of quanta of work by
// the engine, each of which is 10,000 state saves.
// This constant determines that state saves per tick number.
static const int32_t TIMER_INITIAL_VALUE = 10000;
// Test for any of the Unicode line terminating characters.
static inline UBool isLineTerminator(UChar32 c) {
if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
return false;
}
return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
}
//-----------------------------------------------------------------------------
//
// Constructor and Destructor
//
//-----------------------------------------------------------------------------
RegexMatcher::RegexMatcher(const RegexPattern *pat) {
fDeferredStatus = U_ZERO_ERROR;
init(fDeferredStatus);
if (U_FAILURE(fDeferredStatus)) {
return;
}
if (pat==NULL) {
fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
fPattern = pat;
init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
}
RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
uint32_t flags, UErrorCode &status) {
init(status);
if (U_FAILURE(status)) {
return;
}
UParseError pe;
fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
fPattern = fPatternOwned;
UText inputText = UTEXT_INITIALIZER;
utext_openConstUnicodeString(&inputText, &input, &status);
init2(&inputText, status);
utext_close(&inputText);
fInputUniStrMaybeMutable = TRUE;
}
RegexMatcher::RegexMatcher(UText *regexp, UText *input,
uint32_t flags, UErrorCode &status) {
init(status);
if (U_FAILURE(status)) {
return;
}
UParseError pe;
fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
if (U_FAILURE(status)) {
return;
}
fPattern = fPatternOwned;
init2(input, status);
}
RegexMatcher::RegexMatcher(const UnicodeString &regexp,
uint32_t flags, UErrorCode &status) {
init(status);
if (U_FAILURE(status)) {
return;
}
UParseError pe;
fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
if (U_FAILURE(status)) {
return;
}
fPattern = fPatternOwned;
init2(RegexStaticSets::gStaticSets->fEmptyText, status);
}
RegexMatcher::RegexMatcher(UText *regexp,
uint32_t flags, UErrorCode &status) {
init(status);
if (U_FAILURE(status)) {
return;
}
UParseError pe;
fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
if (U_FAILURE(status)) {
return;
}
fPattern = fPatternOwned;
init2(RegexStaticSets::gStaticSets->fEmptyText, status);
}
RegexMatcher::~RegexMatcher() {
delete fStack;
if (fData != fSmallData) {
uprv_free(fData);
fData = NULL;
}
if (fPatternOwned) {
delete fPatternOwned;
fPatternOwned = NULL;
fPattern = NULL;
}
if (fInput) {
delete fInput;
}
if (fInputText) {
utext_close(fInputText);
}
if (fAltInputText) {
utext_close(fAltInputText);
}
#if UCONFIG_NO_BREAK_ITERATION==0
delete fWordBreakItr;
delete fGCBreakItr;
#endif
}
//
// init() common initialization for use by all constructors.
// Initialize all fields, get the object into a consistent state.
// This must be done even when the initial status shows an error,
// so that the object is initialized sufficiently well for the destructor
// to run safely.
//
void RegexMatcher::init(UErrorCode &status) {
fPattern = NULL;
fPatternOwned = NULL;
fFrameSize = 0;
fRegionStart = 0;
fRegionLimit = 0;
fAnchorStart = 0;
fAnchorLimit = 0;
fLookStart = 0;
fLookLimit = 0;
fActiveStart = 0;
fActiveLimit = 0;
fTransparentBounds = FALSE;
fAnchoringBounds = TRUE;
fMatch = FALSE;
fMatchStart = 0;
fMatchEnd = 0;
fLastMatchEnd = -1;
fAppendPosition = 0;
fHitEnd = FALSE;
fRequireEnd = FALSE;
fStack = NULL;
fFrame = NULL;
fTimeLimit = 0;
fTime = 0;
fTickCounter = 0;
fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
fCallbackFn = NULL;
fCallbackContext = NULL;
fFindProgressCallbackFn = NULL;
fFindProgressCallbackContext = NULL;
fTraceDebug = FALSE;
fDeferredStatus = status;
fData = fSmallData;
fWordBreakItr = NULL;
fGCBreakItr = NULL;
fStack = NULL;
fInputText = NULL;
fAltInputText = NULL;
fInput = NULL;
fInputLength = 0;
fInputUniStrMaybeMutable = FALSE;
}
//
// init2() Common initialization for use by RegexMatcher constructors, part 2.
// This handles the common setup to be done after the Pattern is available.
//
void RegexMatcher::init2(UText *input, UErrorCode &status) {
if (U_FAILURE(status)) {
fDeferredStatus = status;
return;
}
if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
if (fData == NULL) {
status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
return;
}
}
fStack = new UVector64(status);
if (fStack == NULL) {
status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
return;
}
reset(input);
setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
if (U_FAILURE(status)) {
fDeferredStatus = status;
return;
}
}
static const UChar BACKSLASH = 0x5c;
static const UChar DOLLARSIGN = 0x24;
static const UChar LEFTBRACKET = 0x7b;
static const UChar RIGHTBRACKET = 0x7d;
//--------------------------------------------------------------------------------
//
// appendReplacement
//
//--------------------------------------------------------------------------------
RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
const UnicodeString &replacement,
UErrorCode &status) {
UText replacementText = UTEXT_INITIALIZER;
utext_openConstUnicodeString(&replacementText, &replacement, &status);
if (U_SUCCESS(status)) {
UText resultText = UTEXT_INITIALIZER;
utext_openUnicodeString(&resultText, &dest, &status);
if (U_SUCCESS(status)) {
appendReplacement(&resultText, &replacementText, status);
utext_close(&resultText);
}
utext_close(&replacementText);
}
return *this;
}
//
// appendReplacement, UText mode
//
RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
UText *replacement,
UErrorCode &status) {
if (U_FAILURE(status)) {
return *this;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return *this;
}
if (fMatch == FALSE) {
status = U_REGEX_INVALID_STATE;
return *this;
}
// Copy input string from the end of previous match to start of current match
int64_t destLen = utext_nativeLength(dest);
if (fMatchStart > fAppendPosition) {
if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
(int32_t)(fMatchStart-fAppendPosition), &status);
} else {
int32_t len16;
if (UTEXT_USES_U16(fInputText)) {
len16 = (int32_t)(fMatchStart-fAppendPosition);
} else {
UErrorCode lengthStatus = U_ZERO_ERROR;
len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
}
UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
if (inputChars == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return *this;
}
utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
uprv_free(inputChars);
}
}
fAppendPosition = fMatchEnd;
// scan the replacement text, looking for substitutions ($n) and \escapes.
// TODO: optimize this loop by efficiently scanning for '$' or '\',
// move entire ranges not containing substitutions.
UTEXT_SETNATIVEINDEX(replacement, 0);
for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
if (c == BACKSLASH) {
// Backslash Escape. Copy the following char out without further checks.
// Note: Surrogate pairs don't need any special handling
// The second half wont be a '$' or a '\', and
// will move to the dest normally on the next
// loop iteration.
c = UTEXT_CURRENT32(replacement);
if (c == U_SENTINEL) {
break;
}
if (c==0x55/*U*/ || c==0x75/*u*/) {
// We have a \udddd or \Udddddddd escape sequence.
int32_t offset = 0;
struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
if (escapedChar != (UChar32)0xFFFFFFFF) {
if (U_IS_BMP(escapedChar)) {
UChar c16 = (UChar)escapedChar;
destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
} else {
UChar surrogate[2];
surrogate[0] = U16_LEAD(escapedChar);
surrogate[1] = U16_TRAIL(escapedChar);
if (U_SUCCESS(status)) {
destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
}
}
// TODO: Report errors for mal-formed \u escapes?
// As this is, the original sequence is output, which may be OK.
if (context.lastOffset == offset) {
(void)UTEXT_PREVIOUS32(replacement);
} else if (context.lastOffset != offset-1) {
utext_moveIndex32(replacement, offset - context.lastOffset - 1);
}
}
} else {
(void)UTEXT_NEXT32(replacement);
// Plain backslash escape. Just put out the escaped character.
if (U_IS_BMP(c)) {
UChar c16 = (UChar)c;
destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
} else {
UChar surrogate[2];
surrogate[0] = U16_LEAD(c);
surrogate[1] = U16_TRAIL(c);
if (U_SUCCESS(status)) {
destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
}
}
}
} else if (c != DOLLARSIGN) {
// Normal char, not a $. Copy it out without further checks.
if (U_IS_BMP(c)) {
UChar c16 = (UChar)c;
destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
} else {
UChar surrogate[2];
surrogate[0] = U16_LEAD(c);
surrogate[1] = U16_TRAIL(c);
if (U_SUCCESS(status)) {
destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
}
}
} else {
// We've got a $. Pick up a capture group name or number if one follows.
// Consume digits so long as the resulting group number <= the number of
// number of capture groups in the pattern.
int32_t groupNum = 0;
int32_t numDigits = 0;
UChar32 nextChar = utext_current32(replacement);
if (nextChar == LEFTBRACKET) {
// Scan for a Named Capture Group, ${name}.
UnicodeString groupName;
utext_next32(replacement);
while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
nextChar = utext_next32(replacement);
if (nextChar == U_SENTINEL) {
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
} else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
(nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
(nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
groupName.append(nextChar);
} else if (nextChar == RIGHTBRACKET) {
groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
if (groupNum == 0) {
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
} else {
// Character was something other than a name char or a closing '}'
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
}
} else if (u_isdigit(nextChar)) {
// $n Scan for a capture group number
int32_t numCaptureGroups = fPattern->fGroupMap->size();
for (;;) {
nextChar = UTEXT_CURRENT32(replacement);
if (nextChar == U_SENTINEL) {
break;
}
if (u_isdigit(nextChar) == FALSE) {
break;
}
int32_t nextDigitVal = u_charDigitValue(nextChar);
if (groupNum*10 + nextDigitVal > numCaptureGroups) {
// Don't consume the next digit if it makes the capture group number too big.
if (numDigits == 0) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
}
break;
}
(void)UTEXT_NEXT32(replacement);
groupNum=groupNum*10 + nextDigitVal;
++numDigits;
}
} else {
// $ not followed by capture group name or number.
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
if (U_SUCCESS(status)) {
destLen += appendGroup(groupNum, dest, status);
}
} // End of $ capture group handling
} // End of per-character loop through the replacement string.
return *this;
}
//--------------------------------------------------------------------------------
//
// appendTail Intended to be used in conjunction with appendReplacement()
// To the destination string, append everything following
// the last match position from the input string.
//
// Note: Match ranges do not affect appendTail or appendReplacement
//
//--------------------------------------------------------------------------------
UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
UErrorCode status = U_ZERO_ERROR;
UText resultText = UTEXT_INITIALIZER;
utext_openUnicodeString(&resultText, &dest, &status);
if (U_SUCCESS(status)) {
appendTail(&resultText, status);
utext_close(&resultText);
}
return dest;
}
//
// appendTail, UText mode
//
UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
if (U_FAILURE(status)) {
return dest;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return dest;
}
if (fInputLength > fAppendPosition) {
if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
int64_t destLen = utext_nativeLength(dest);
utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
(int32_t)(fInputLength-fAppendPosition), &status);
} else {
int32_t len16;
if (UTEXT_USES_U16(fInputText)) {
len16 = (int32_t)(fInputLength-fAppendPosition);
} else {
len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
status = U_ZERO_ERROR; // buffer overflow
}
UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
if (inputChars == NULL) {
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
} else {
utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
int64_t destLen = utext_nativeLength(dest);
utext_replace(dest, destLen, destLen, inputChars, len16, &status);
uprv_free(inputChars);
}
}
}
return dest;
}
//--------------------------------------------------------------------------------
//
// end
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::end(UErrorCode &err) const {
return end(0, err);
}
int64_t RegexMatcher::end64(UErrorCode &err) const {
return end64(0, err);
}
int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
if (U_FAILURE(err)) {
return -1;
}
if (fMatch == FALSE) {
err = U_REGEX_INVALID_STATE;
return -1;
}
if (group < 0 || group > fPattern->fGroupMap->size()) {
err = U_INDEX_OUTOFBOUNDS_ERROR;
return -1;
}
int64_t e = -1;
if (group == 0) {
e = fMatchEnd;
} else {
// Get the position within the stack frame of the variables for
// this capture group.
int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
U_ASSERT(groupOffset < fPattern->fFrameSize);
U_ASSERT(groupOffset >= 0);
e = fFrame->fExtra[groupOffset + 1];
}
return e;
}
int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
return (int32_t)end64(group, err);
}
//--------------------------------------------------------------------------------
//
// findProgressInterrupt This function is called once for each advance in the target
// string from the find() function, and calls the user progress callback
// function if there is one installed.
//
// Return: TRUE if the find operation is to be terminated.
// FALSE if the find operation is to continue running.
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
status = U_REGEX_STOPPED_BY_CALLER;
return TRUE;
}
return FALSE;
}
//--------------------------------------------------------------------------------
//
// find()
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::find() {
if (U_FAILURE(fDeferredStatus)) {
return FALSE;
}
UErrorCode status = U_ZERO_ERROR;
UBool result = find(status);
return result;
}
//--------------------------------------------------------------------------------
//
// find()
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::find(UErrorCode &status) {
// Start at the position of the last match end. (Will be zero if the
// matcher has been reset.)
//
if (U_FAILURE(status)) {
return FALSE;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return FALSE;
}
if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
return findUsingChunk(status);
}
int64_t startPos = fMatchEnd;
if (startPos==0) {
startPos = fActiveStart;
}
if (fMatch) {
// Save the position of any previous successful match.
fLastMatchEnd = fMatchEnd;
if (fMatchStart == fMatchEnd) {
// Previous match had zero length. Move start position up one position
// to avoid sending find() into a loop on zero-length matches.
if (startPos >= fActiveLimit) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
(void)UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
}
} else {
if (fLastMatchEnd >= 0) {
// A previous find() failed to match. Don't try again.
// (without this test, a pattern with a zero-length match
// could match again at the end of an input string.)
fHitEnd = TRUE;
return FALSE;
}
}
// Compute the position in the input string beyond which a match can not begin, because
// the minimum length match would extend past the end of the input.
// Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
// Be aware of possible overflows if making changes here.
int64_t testStartLimit;
if (UTEXT_USES_U16(fInputText)) {
testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
if (startPos > testStartLimit) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
}
} else {
// We don't know exactly how long the minimum match length is in native characters.
// Treat anything > 0 as 1.
testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
}
UChar32 c;
U_ASSERT(startPos >= 0);
switch (fPattern->fStartType) {
case START_NO_INFO:
// No optimization was found.
// Try a match at each input position.
for (;;) {
MatchAt(startPos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
if (startPos >= testStartLimit) {
fHitEnd = TRUE;
return FALSE;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
(void)UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testStartLimit the last time through.
if (findProgressInterrupt(startPos, status))
return FALSE;
}
UPRV_UNREACHABLE;
case START_START:
// Matches are only possible at the start of the input string
// (pattern begins with ^ or \A)
if (startPos > fActiveStart) {
fMatch = FALSE;
return FALSE;
}
MatchAt(startPos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
return fMatch;
case START_SET:
{
// Match may start on any char from a pre-computed set.
U_ASSERT(fPattern->fMinMatchLen > 0);
UTEXT_SETNATIVEINDEX(fInputText, startPos);
for (;;) {
int64_t pos = startPos;
c = UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
// c will be -1 (U_SENTINEL) at end of text, in which case we
// skip this next block (so we don't have a negative array index)
// and handle end of text in the following block.
if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
(c>=256 && fPattern->fInitialChars->contains(c)))) {
MatchAt(pos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
UTEXT_SETNATIVEINDEX(fInputText, pos);
}
if (startPos > testStartLimit) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
}
if (findProgressInterrupt(startPos, status))
return FALSE;
}
}
UPRV_UNREACHABLE;
case START_STRING:
case START_CHAR:
{
// Match starts on exactly one char.
U_ASSERT(fPattern->fMinMatchLen > 0);
UChar32 theChar = fPattern->fInitialChar;
UTEXT_SETNATIVEINDEX(fInputText, startPos);
for (;;) {
int64_t pos = startPos;
c = UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
if (c == theChar) {
MatchAt(pos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
}
if (startPos > testStartLimit) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
}
if (findProgressInterrupt(startPos, status))
return FALSE;
}
}
UPRV_UNREACHABLE;
case START_LINE:
{
UChar32 ch;
if (startPos == fAnchorStart) {
MatchAt(startPos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
ch = UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
} else {
UTEXT_SETNATIVEINDEX(fInputText, startPos);
ch = UTEXT_PREVIOUS32(fInputText);
UTEXT_SETNATIVEINDEX(fInputText, startPos);
}
if (fPattern->fFlags & UREGEX_UNIX_LINES) {
for (;;) {
if (ch == 0x0a) {
MatchAt(startPos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
}
if (startPos >= testStartLimit) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
}
ch = UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testStartLimit the last time through.
if (findProgressInterrupt(startPos, status))
return FALSE;
}
} else {
for (;;) {
if (isLineTerminator(ch)) {
if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
(void)UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
}
MatchAt(startPos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
}
if (startPos >= testStartLimit) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
}
ch = UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testStartLimit the last time through.
if (findProgressInterrupt(startPos, status))
return FALSE;
}
}
}
default:
UPRV_UNREACHABLE;
}
UPRV_UNREACHABLE;
}
UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return FALSE;
}
this->reset(); // Note: Reset() is specified by Java Matcher documentation.
// This will reset the region to be the full input length.
if (start < 0) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
int64_t nativeStart = start;
if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
fMatchEnd = nativeStart;
return find(status);
}
//--------------------------------------------------------------------------------
//
// findUsingChunk() -- like find(), but with the advance knowledge that the
// entire string is available in the UText's chunk buffer.
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
// Start at the position of the last match end. (Will be zero if the
// matcher has been reset.
//
int32_t startPos = (int32_t)fMatchEnd;
if (startPos==0) {
startPos = (int32_t)fActiveStart;
}
const UChar *inputBuf = fInputText->chunkContents;
if (fMatch) {
// Save the position of any previous successful match.
fLastMatchEnd = fMatchEnd;
if (fMatchStart == fMatchEnd) {
// Previous match had zero length. Move start position up one position
// to avoid sending find() into a loop on zero-length matches.
if (startPos >= fActiveLimit) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
}
U16_FWD_1(inputBuf, startPos, fInputLength);
}
} else {
if (fLastMatchEnd >= 0) {
// A previous find() failed to match. Don't try again.
// (without this test, a pattern with a zero-length match
// could match again at the end of an input string.)
fHitEnd = TRUE;
return FALSE;
}
}
// Compute the position in the input string beyond which a match can not begin, because
// the minimum length match would extend past the end of the input.
// Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
// Be aware of possible overflows if making changes here.
// Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
if (startPos > testLen) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
}
UChar32 c;
U_ASSERT(startPos >= 0);
switch (fPattern->fStartType) {
case START_NO_INFO:
// No optimization was found.
// Try a match at each input position.
for (;;) {
MatchChunkAt(startPos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
if (startPos >= testLen) {
fHitEnd = TRUE;
return FALSE;
}
U16_FWD_1(inputBuf, startPos, fActiveLimit);
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testLen the last time through.
if (findProgressInterrupt(startPos, status))
return FALSE;
}
UPRV_UNREACHABLE;
case START_START:
// Matches are only possible at the start of the input string
// (pattern begins with ^ or \A)
if (startPos > fActiveStart) {
fMatch = FALSE;
return FALSE;
}
MatchChunkAt(startPos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
return fMatch;
case START_SET:
{
// Match may start on any char from a pre-computed set.
U_ASSERT(fPattern->fMinMatchLen > 0);
for (;;) {
int32_t pos = startPos;
U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
(c>=256 && fPattern->fInitialChars->contains(c))) {
MatchChunkAt(pos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
}
if (startPos > testLen) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
}
if (findProgressInterrupt(startPos, status))
return FALSE;
}
}
UPRV_UNREACHABLE;
case START_STRING:
case START_CHAR:
{
// Match starts on exactly one char.
U_ASSERT(fPattern->fMinMatchLen > 0);
UChar32 theChar = fPattern->fInitialChar;
for (;;) {
int32_t pos = startPos;
U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
if (c == theChar) {
MatchChunkAt(pos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
}
if (startPos > testLen) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
}
if (findProgressInterrupt(startPos, status))
return FALSE;
}
}
UPRV_UNREACHABLE;
case START_LINE:
{
UChar32 ch;
if (startPos == fAnchorStart) {
MatchChunkAt(startPos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
U16_FWD_1(inputBuf, startPos, fActiveLimit);
}
if (fPattern->fFlags & UREGEX_UNIX_LINES) {
for (;;) {
ch = inputBuf[startPos-1];
if (ch == 0x0a) {
MatchChunkAt(startPos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
}
if (startPos >= testLen) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
}
U16_FWD_1(inputBuf, startPos, fActiveLimit);
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testLen the last time through.
if (findProgressInterrupt(startPos, status))
return FALSE;
}
} else {
for (;;) {
ch = inputBuf[startPos-1];
if (isLineTerminator(ch)) {
if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
startPos++;
}
MatchChunkAt(startPos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
}
if (startPos >= testLen) {
fMatch = FALSE;
fHitEnd = TRUE;
return FALSE;
}
U16_FWD_1(inputBuf, startPos, fActiveLimit);
// Note that it's perfectly OK for a pattern to have a zero-length
// match at the end of a string, so we must make sure that the loop
// runs with startPos == testLen the last time through.
if (findProgressInterrupt(startPos, status))
return FALSE;
}
}
}
default:
UPRV_UNREACHABLE;
}
UPRV_UNREACHABLE;
}
//--------------------------------------------------------------------------------
//
// group()
//
//--------------------------------------------------------------------------------
UnicodeString RegexMatcher::group(UErrorCode &status) const {
return group(0, status);
}
// Return immutable shallow clone
UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
return group(0, dest, group_len, status);
}
// Return immutable shallow clone
UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
group_len = 0;
if (U_FAILURE(status)) {
return dest;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
} else if (fMatch == FALSE) {
status = U_REGEX_INVALID_STATE;
} else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
}
if (U_FAILURE(status)) {
return dest;
}
int64_t s, e;
if (groupNum == 0) {
s = fMatchStart;
e = fMatchEnd;
} else {
int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
U_ASSERT(groupOffset < fPattern->fFrameSize);
U_ASSERT(groupOffset >= 0);
s = fFrame->fExtra[groupOffset];
e = fFrame->fExtra[groupOffset+1];
}
if (s < 0) {
// A capture group wasn't part of the match
return utext_clone(dest, fInputText, FALSE, TRUE, &status);
}
U_ASSERT(s <= e);
group_len = e - s;
dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
if (dest)
UTEXT_SETNATIVEINDEX(dest, s);
return dest;
}
UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
UnicodeString result;
int64_t groupStart = start64(groupNum, status);
int64_t groupEnd = end64(groupNum, status);
if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
return result;
}
// Get the group length using a utext_extract preflight.
// UText is actually pretty efficient at this when underlying encoding is UTF-16.
int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
if (status != U_BUFFER_OVERFLOW_ERROR) {
return result;
}
status = U_ZERO_ERROR;
UChar *buf = result.getBuffer(length);
if (buf == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
} else {
int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
result.releaseBuffer(extractLength);
U_ASSERT(length == extractLength);
}
return result;
}
//--------------------------------------------------------------------------------
//
// appendGroup() -- currently internal only, appends a group to a UText rather
// than replacing its contents
//
//--------------------------------------------------------------------------------
int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
if (U_FAILURE(status)) {
return 0;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return 0;
}
int64_t destLen = utext_nativeLength(dest);
if (fMatch == FALSE) {
status = U_REGEX_INVALID_STATE;
return utext_replace(dest, destLen, destLen, NULL, 0, &status);
}
if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return utext_replace(dest, destLen, destLen, NULL, 0, &status);
}
int64_t s, e;
if (groupNum == 0) {
s = fMatchStart;
e = fMatchEnd;
} else {
int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
U_ASSERT(groupOffset < fPattern->fFrameSize);
U_ASSERT(groupOffset >= 0);
s = fFrame->fExtra[groupOffset];
e = fFrame->fExtra[groupOffset+1];
}
if (s < 0) {
// A capture group wasn't part of the match
return utext_replace(dest, destLen, destLen, NULL, 0, &status);
}
U_ASSERT(s <= e);
int64_t deltaLen;
if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
U_ASSERT(e <= fInputLength);
deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
} else {
int32_t len16;
if (UTEXT_USES_U16(fInputText)) {
len16 = (int32_t)(e-s);
} else {
UErrorCode lengthStatus = U_ZERO_ERROR;
len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
}
UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
if (groupChars == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
utext_extract(fInputText, s, e, groupChars, len16+1, &status);
deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
uprv_free(groupChars);
}
return deltaLen;
}
//--------------------------------------------------------------------------------
//
// groupCount()
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::groupCount() const {
return fPattern->fGroupMap->size();
}
//--------------------------------------------------------------------------------
//
// hasAnchoringBounds()
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::hasAnchoringBounds() const {
return fAnchoringBounds;
}
//--------------------------------------------------------------------------------
//
// hasTransparentBounds()
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::hasTransparentBounds() const {
return fTransparentBounds;
}
//--------------------------------------------------------------------------------
//
// hitEnd()
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::hitEnd() const {
return fHitEnd;
}
//--------------------------------------------------------------------------------
//
// input()
//
//--------------------------------------------------------------------------------
const UnicodeString &RegexMatcher::input() const {
if (!fInput) {
UErrorCode status = U_ZERO_ERROR;
int32_t len16;
if (UTEXT_USES_U16(fInputText)) {
len16 = (int32_t)fInputLength;
} else {
len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
status = U_ZERO_ERROR; // overflow, length status
}
UnicodeString *result = new UnicodeString(len16, 0, 0);
UChar *inputChars = result->getBuffer(len16);
utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
result->releaseBuffer(len16);
(*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
}
return *fInput;
}
//--------------------------------------------------------------------------------
//
// inputText()
//
//--------------------------------------------------------------------------------
UText *RegexMatcher::inputText() const {
return fInputText;
}
//--------------------------------------------------------------------------------
//
// getInput() -- like inputText(), but makes a clone or copies into another UText
//
//--------------------------------------------------------------------------------
UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
if (U_FAILURE(status)) {
return dest;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return dest;
}
if (dest) {
if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
} else {
int32_t input16Len;
if (UTEXT_USES_U16(fInputText)) {
input16Len = (int32_t)fInputLength;
} else {
UErrorCode lengthStatus = U_ZERO_ERROR;
input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
}
UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
if (inputChars == NULL) {
return dest;
}
status = U_ZERO_ERROR;
utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
status = U_ZERO_ERROR;
utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
uprv_free(inputChars);
}
return dest;
} else {
return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
}
}
static UBool compat_SyncMutableUTextContents(UText *ut);
static UBool compat_SyncMutableUTextContents(UText *ut) {
UBool retVal = FALSE;
// In the following test, we're really only interested in whether the UText should switch
// between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
// will still point to the correct data.
if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
UnicodeString *us=(UnicodeString *)ut->context;
// Update to the latest length.
// For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
int32_t newLength = us->length();
// Update the chunk description.
// The buffer may have switched between stack- and heap-based.
ut->chunkContents = us->getBuffer();
ut->chunkLength = newLength;
ut->chunkNativeLimit = newLength;
ut->nativeIndexingLimit = newLength;
retVal = TRUE;
}
return retVal;
}
//--------------------------------------------------------------------------------
//
// lookingAt()
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::lookingAt(UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return FALSE;
}
if (fInputUniStrMaybeMutable) {
if (compat_SyncMutableUTextContents(fInputText)) {
fInputLength = utext_nativeLength(fInputText);
reset();
}
}
else {
resetPreserveRegion();
}
if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
MatchChunkAt((int32_t)fActiveStart, FALSE, status);
} else {
MatchAt(fActiveStart, FALSE, status);
}
return fMatch;
}
UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return FALSE;
}
reset();
if (start < 0) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
if (fInputUniStrMaybeMutable) {
if (compat_SyncMutableUTextContents(fInputText)) {
fInputLength = utext_nativeLength(fInputText);
reset();
}
}
int64_t nativeStart;
nativeStart = start;
if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
MatchChunkAt((int32_t)nativeStart, FALSE, status);
} else {
MatchAt(nativeStart, FALSE, status);
}
return fMatch;
}
//--------------------------------------------------------------------------------
//
// matches()
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::matches(UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return FALSE;
}
if (fInputUniStrMaybeMutable) {
if (compat_SyncMutableUTextContents(fInputText)) {
fInputLength = utext_nativeLength(fInputText);
reset();
}
}
else {
resetPreserveRegion();
}
if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
MatchChunkAt((int32_t)fActiveStart, TRUE, status);
} else {
MatchAt(fActiveStart, TRUE, status);
}
return fMatch;
}
UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return FALSE;
}
reset();
if (start < 0) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
if (fInputUniStrMaybeMutable) {
if (compat_SyncMutableUTextContents(fInputText)) {
fInputLength = utext_nativeLength(fInputText);
reset();
}
}
int64_t nativeStart;
nativeStart = start;
if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
MatchChunkAt((int32_t)nativeStart, TRUE, status);
} else {
MatchAt(nativeStart, TRUE, status);
}
return fMatch;
}
//--------------------------------------------------------------------------------
//
// pattern
//
//--------------------------------------------------------------------------------
const RegexPattern &RegexMatcher::pattern() const {
return *fPattern;
}
//--------------------------------------------------------------------------------
//
// region
//
//--------------------------------------------------------------------------------
RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
if (U_FAILURE(status)) {
return *this;
}
if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
status = U_ILLEGAL_ARGUMENT_ERROR;
}
int64_t nativeStart = regionStart;
int64_t nativeLimit = regionLimit;
if (nativeStart > fInputLength || nativeLimit > fInputLength) {
status = U_ILLEGAL_ARGUMENT_ERROR;
}
if (startIndex == -1)
this->reset();
else
resetPreserveRegion();
fRegionStart = nativeStart;
fRegionLimit = nativeLimit;
fActiveStart = nativeStart;
fActiveLimit = nativeLimit;
if (startIndex != -1) {
if (startIndex < fActiveStart || startIndex > fActiveLimit) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
}
fMatchEnd = startIndex;
}
if (!fTransparentBounds) {
fLookStart = nativeStart;
fLookLimit = nativeLimit;
}
if (fAnchoringBounds) {
fAnchorStart = nativeStart;
fAnchorLimit = nativeLimit;
}
return *this;
}
RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
return region(start, limit, -1, status);
}
//--------------------------------------------------------------------------------
//
// regionEnd
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::regionEnd() const {
return (int32_t)fRegionLimit;
}
int64_t RegexMatcher::regionEnd64() const {
return fRegionLimit;
}
//--------------------------------------------------------------------------------
//
// regionStart
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::regionStart() const {
return (int32_t)fRegionStart;
}
int64_t RegexMatcher::regionStart64() const {
return fRegionStart;
}
//--------------------------------------------------------------------------------
//
// replaceAll
//
//--------------------------------------------------------------------------------
UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
UText replacementText = UTEXT_INITIALIZER;
UText resultText = UTEXT_INITIALIZER;
UnicodeString resultString;
if (U_FAILURE(status)) {
return resultString;
}
utext_openConstUnicodeString(&replacementText, &replacement, &status);
utext_openUnicodeString(&resultText, &resultString, &status);
replaceAll(&replacementText, &resultText, status);
utext_close(&resultText);
utext_close(&replacementText);
return resultString;
}
//
// replaceAll, UText mode
//
UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
if (U_FAILURE(status)) {
return dest;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return dest;
}
if (dest == NULL) {
UnicodeString emptyString;
UText empty = UTEXT_INITIALIZER;
utext_openUnicodeString(&empty, &emptyString, &status);
dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
utext_close(&empty);
}
if (U_SUCCESS(status)) {
reset();
while (find()) {
appendReplacement(dest, replacement, status);
if (U_FAILURE(status)) {
break;
}
}
appendTail(dest, status);
}
return dest;
}
//--------------------------------------------------------------------------------
//
// replaceFirst
//
//--------------------------------------------------------------------------------
UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
UText replacementText = UTEXT_INITIALIZER;
UText resultText = UTEXT_INITIALIZER;
UnicodeString resultString;
utext_openConstUnicodeString(&replacementText, &replacement, &status);
utext_openUnicodeString(&resultText, &resultString, &status);
replaceFirst(&replacementText, &resultText, status);
utext_close(&resultText);
utext_close(&replacementText);
return resultString;
}
//
// replaceFirst, UText mode
//
UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
if (U_FAILURE(status)) {
return dest;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return dest;
}
reset();
if (!find()) {
return getInput(dest, status);
}
if (dest == NULL) {
UnicodeString emptyString;
UText empty = UTEXT_INITIALIZER;
utext_openUnicodeString(&empty, &emptyString, &status);
dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
utext_close(&empty);
}
appendReplacement(dest, replacement, status);
appendTail(dest, status);
return dest;
}
//--------------------------------------------------------------------------------
//
// requireEnd
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::requireEnd() const {
return fRequireEnd;
}
//--------------------------------------------------------------------------------
//
// reset
//
//--------------------------------------------------------------------------------
RegexMatcher &RegexMatcher::reset() {
fRegionStart = 0;
fRegionLimit = fInputLength;
fActiveStart = 0;
fActiveLimit = fInputLength;
fAnchorStart = 0;
fAnchorLimit = fInputLength;
fLookStart = 0;
fLookLimit = fInputLength;
resetPreserveRegion();
return *this;
}
void RegexMatcher::resetPreserveRegion() {
fMatchStart = 0;
fMatchEnd = 0;
fLastMatchEnd = -1;
fAppendPosition = 0;
fMatch = FALSE;
fHitEnd = FALSE;
fRequireEnd = FALSE;
fTime = 0;
fTickCounter = TIMER_INITIAL_VALUE;
//resetStack(); // more expensive than it looks...
}
RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
if (fPattern->fNeedsAltInput) {
fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
}
if (U_FAILURE(fDeferredStatus)) {
return *this;
}
fInputLength = utext_nativeLength(fInputText);
reset();
delete fInput;
fInput = NULL;
// Do the following for any UnicodeString.
// This is for compatibility for those clients who modify the input string "live" during regex operations.
fInputUniStrMaybeMutable = TRUE;
#if UCONFIG_NO_BREAK_ITERATION==0
if (fWordBreakItr) {
fWordBreakItr->setText(fInputText, fDeferredStatus);
}
if (fGCBreakItr) {
fGCBreakItr->setText(fInputText, fDeferredStatus);
}
#endif
return *this;
}
RegexMatcher &RegexMatcher::reset(UText *input) {
if (fInputText != input) {
fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
if (U_FAILURE(fDeferredStatus)) {
return *this;
}
fInputLength = utext_nativeLength(fInputText);
delete fInput;
fInput = NULL;
#if UCONFIG_NO_BREAK_ITERATION==0
if (fWordBreakItr) {
fWordBreakItr->setText(input, fDeferredStatus);
}
if (fGCBreakItr) {
fGCBreakItr->setText(fInputText, fDeferredStatus);
}
#endif
}
reset();
fInputUniStrMaybeMutable = FALSE;
return *this;
}
/*RegexMatcher &RegexMatcher::reset(const UChar *) {
fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
return *this;
}*/
RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
if (U_FAILURE(status)) {
return *this;
}
reset(); // Reset also resets the region to be the entire string.
if (position < 0 || position > fActiveLimit) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return *this;
}
fMatchEnd = position;
return *this;
}
//--------------------------------------------------------------------------------
//
// refresh
//
//--------------------------------------------------------------------------------
RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
if (U_FAILURE(status)) {
return *this;
}
if (input == NULL) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return *this;
}
if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return *this;
}
int64_t pos = utext_getNativeIndex(fInputText);
// Shallow read-only clone of the new UText into the existing input UText
fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
if (U_FAILURE(status)) {
return *this;
}
utext_setNativeIndex(fInputText, pos);
if (fAltInputText != NULL) {
pos = utext_getNativeIndex(fAltInputText);
fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
if (U_FAILURE(status)) {
return *this;
}
utext_setNativeIndex(fAltInputText, pos);
}
return *this;
}
//--------------------------------------------------------------------------------
//
// setTrace
//
//--------------------------------------------------------------------------------
void RegexMatcher::setTrace(UBool state) {
fTraceDebug = state;
}
/**
* UText, replace entire contents of the destination UText with a substring of the source UText.
*
* @param src The source UText
* @param dest The destination UText. Must be writable.
* May be NULL, in which case a new UText will be allocated.
* @param start Start index of source substring.
* @param limit Limit index of source substring.
* @param status An error code.
*/
static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
if (U_FAILURE(*status)) {
return dest;
}
if (start == limit) {
if (dest) {
utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
return dest;
} else {
return utext_openUChars(NULL, NULL, 0, status);
}
}
int32_t length = utext_extract(src, start, limit, NULL, 0, status);
if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
return dest;
}
*status = U_ZERO_ERROR;
MaybeStackArray<UChar, 40> buffer;
if (length >= buffer.getCapacity()) {
UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
if (newBuf == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
}
}
utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
if (dest) {
utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
return dest;
}
// Caller did not provide a prexisting UText.
// Open a new one, and have it adopt the text buffer storage.
if (U_FAILURE(*status)) {
return NULL;
}
int32_t ownedLength = 0;
UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
if (ownedBuf == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
UText *result = utext_openUChars(NULL, ownedBuf, length, status);
if (U_FAILURE(*status)) {
uprv_free(ownedBuf);
return NULL;
}
result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
return result;
}
//---------------------------------------------------------------------
//
// split
//
//---------------------------------------------------------------------
int32_t RegexMatcher::split(const UnicodeString &input,
UnicodeString dest[],
int32_t destCapacity,
UErrorCode &status)
{
UText inputText = UTEXT_INITIALIZER;
utext_openConstUnicodeString(&inputText, &input, &status);
if (U_FAILURE(status)) {
return 0;
}
UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
if (destText == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
int32_t i;
for (i = 0; i < destCapacity; i++) {
destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
}
int32_t fieldCount = split(&inputText, destText, destCapacity, status);
for (i = 0; i < destCapacity; i++) {
utext_close(destText[i]);
}
uprv_free(destText);
utext_close(&inputText);
return fieldCount;
}
//
// split, UText mode
//
int32_t RegexMatcher::split(UText *input,
UText *dest[],
int32_t destCapacity,
UErrorCode &status)
{
//
// Check arguements for validity
//
if (U_FAILURE(status)) {
return 0;
}
if (destCapacity < 1) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
//
// Reset for the input text
//
reset(input);
int64_t nextOutputStringStart = 0;
if (fActiveLimit == 0) {
return 0;
}
//
// Loop through the input text, searching for the delimiter pattern
//
int32_t i;
int32_t numCaptureGroups = fPattern->fGroupMap->size();
for (i=0; ; i++) {
if (i>=destCapacity-1) {
// There is one or zero output string left.
// Fill the last output string with whatever is left from the input, then exit the loop.
// ( i will be == destCapacity if we filled the output array while processing
// capture groups of the delimiter expression, in which case we will discard the
// last capture group saved in favor of the unprocessed remainder of the
// input string.)
i = destCapacity-1;
if (fActiveLimit > nextOutputStringStart) {
if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
if (dest[i]) {
utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
input->chunkContents+nextOutputStringStart,
(int32_t)(fActiveLimit-nextOutputStringStart), &status);
} else {
UText remainingText = UTEXT_INITIALIZER;
utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
fActiveLimit-nextOutputStringStart, &status);
dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
utext_close(&remainingText);
}
} else {
UErrorCode lengthStatus = U_ZERO_ERROR;
int32_t remaining16Length =
utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
if (remainingChars == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
break;
}
utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
if (dest[i]) {
utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
} else {
UText remainingText = UTEXT_INITIALIZER;
utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
utext_close(&remainingText);
}
uprv_free(remainingChars);
}
}
break;
}
if (find()) {
// We found another delimiter. Move everything from where we started looking
// up until the start of the delimiter into the next output string.
if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
if (dest[i]) {
utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
input->chunkContents+nextOutputStringStart,
(int32_t)(fMatchStart-nextOutputStringStart), &status);
} else {
UText remainingText = UTEXT_INITIALIZER;
utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
fMatchStart-nextOutputStringStart, &status);
dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
utext_close(&remainingText);
}
} else {
UErrorCode lengthStatus = U_ZERO_ERROR;
int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
if (remainingChars == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
break;
}
utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
if (dest[i]) {
utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
} else {
UText remainingText = UTEXT_INITIALIZER;
utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
utext_close(&remainingText);
}
uprv_free(remainingChars);
}
nextOutputStringStart = fMatchEnd;
// If the delimiter pattern has capturing parentheses, the captured
// text goes out into the next n destination strings.
int32_t groupNum;
for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
if (i >= destCapacity-2) {
// Never fill the last available output string with capture group text.
// It will filled with the last field, the remainder of the
// unsplit input text.
break;
}
i++;
dest[i] = utext_extract_replace(fInputText, dest[i],
start64(groupNum, status), end64(groupNum, status), &status);
}
if (nextOutputStringStart == fActiveLimit) {
// The delimiter was at the end of the string. We're done, but first
// we output one last empty string, for the empty field following
// the delimiter at the end of input.
if (i+1 < destCapacity) {
++i;
if (dest[i] == NULL) {
dest[i] = utext_openUChars(NULL, NULL, 0, &status);
} else {
static const UChar emptyString[] = {(UChar)0};
utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
}
}
break;
}
}
else
{
// We ran off the end of the input while looking for the next delimiter.
// All the remaining text goes into the current output string.
if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
if (dest[i]) {
utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
input->chunkContents+nextOutputStringStart,
(int32_t)(fActiveLimit-nextOutputStringStart), &status);
} else {
UText remainingText = UTEXT_INITIALIZER;
utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
fActiveLimit-nextOutputStringStart, &status);
dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
utext_close(&remainingText);
}
} else {
UErrorCode lengthStatus = U_ZERO_ERROR;
int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
if (remainingChars == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
break;
}
utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
if (dest[i]) {
utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
} else {
UText remainingText = UTEXT_INITIALIZER;
utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
utext_close(&remainingText);
}
uprv_free(remainingChars);
}
break;
}
if (U_FAILURE(status)) {
break;
}
} // end of for loop
return i+1;
}
//--------------------------------------------------------------------------------
//
// start
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::start(UErrorCode &status) const {
return start(0, status);
}
int64_t RegexMatcher::start64(UErrorCode &status) const {
return start64(0, status);
}
//--------------------------------------------------------------------------------
//
// start(int32_t group, UErrorCode &status)
//
//--------------------------------------------------------------------------------
int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
if (U_FAILURE(status)) {
return -1;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return -1;
}
if (fMatch == FALSE) {
status = U_REGEX_INVALID_STATE;
return -1;
}
if (group < 0 || group > fPattern->fGroupMap->size()) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return -1;
}
int64_t s;
if (group == 0) {
s = fMatchStart;
} else {
int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
U_ASSERT(groupOffset < fPattern->fFrameSize);
U_ASSERT(groupOffset >= 0);
s = fFrame->fExtra[groupOffset];
}
return s;
}
int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
return (int32_t)start64(group, status);
}
//--------------------------------------------------------------------------------
//
// useAnchoringBounds
//
//--------------------------------------------------------------------------------
RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
fAnchoringBounds = b;
fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
return *this;
}
//--------------------------------------------------------------------------------
//
// useTransparentBounds
//
//--------------------------------------------------------------------------------
RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
fTransparentBounds = b;
fLookStart = (fTransparentBounds ? 0 : fRegionStart);
fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
return *this;
}
//--------------------------------------------------------------------------------
//
// setTimeLimit
//
//--------------------------------------------------------------------------------
void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return;
}
if (limit < 0) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
fTimeLimit = limit;
}
//--------------------------------------------------------------------------------
//
// getTimeLimit
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::getTimeLimit() const {
return fTimeLimit;
}
//--------------------------------------------------------------------------------
//
// setStackLimit
//
//--------------------------------------------------------------------------------
void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return;
}
if (limit < 0) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
// Reset the matcher. This is needed here in case there is a current match
// whose final stack frame (containing the match results, pointed to by fFrame)
// would be lost by resizing to a smaller stack size.
reset();
if (limit == 0) {
// Unlimited stack expansion
fStack->setMaxCapacity(0);
} else {
// Change the units of the limit from bytes to ints, and bump the size up
// to be big enough to hold at least one stack frame for the pattern,
// if it isn't there already.
int32_t adjustedLimit = limit / sizeof(int32_t);
if (adjustedLimit < fPattern->fFrameSize) {
adjustedLimit = fPattern->fFrameSize;
}
fStack->setMaxCapacity(adjustedLimit);
}
fStackLimit = limit;
}
//--------------------------------------------------------------------------------
//
// getStackLimit
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::getStackLimit() const {
return fStackLimit;
}
//--------------------------------------------------------------------------------
//
// setMatchCallback
//
//--------------------------------------------------------------------------------
void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
const void *context,
UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
fCallbackFn = callback;
fCallbackContext = context;
}
//--------------------------------------------------------------------------------
//
// getMatchCallback
//
//--------------------------------------------------------------------------------
void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
const void *&context,
UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
callback = fCallbackFn;
context = fCallbackContext;
}
//--------------------------------------------------------------------------------
//
// setMatchCallback
//
//--------------------------------------------------------------------------------
void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
const void *context,
UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
fFindProgressCallbackFn = callback;
fFindProgressCallbackContext = context;
}
//--------------------------------------------------------------------------------
//
// getMatchCallback
//
//--------------------------------------------------------------------------------
void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
const void *&context,
UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
callback = fFindProgressCallbackFn;
context = fFindProgressCallbackContext;
}
//================================================================================
//
// Code following this point in this file is the internal
// Match Engine Implementation.
//
//================================================================================
//--------------------------------------------------------------------------------
//
// resetStack
// Discard any previous contents of the state save stack, and initialize a
// new stack frame to all -1. The -1s are needed for capture group limits,
// where they indicate that a group has not yet matched anything.
//--------------------------------------------------------------------------------
REStackFrame *RegexMatcher::resetStack() {
// Discard any previous contents of the state save stack, and initialize a
// new stack frame with all -1 data. The -1s are needed for capture group limits,
// where they indicate that a group has not yet matched anything.
fStack->removeAllElements();
REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
if(U_FAILURE(fDeferredStatus)) {
return NULL;
}
int32_t i;
for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
iFrame->fExtra[i] = -1;
}
return iFrame;
}
//--------------------------------------------------------------------------------
//
// isWordBoundary
// in perl, "xab..cd..", \b is true at positions 0,3,5,7
// For us,
// If the current char is a combining mark,
// \b is FALSE.
// Else Scan backwards to the first non-combining char.
// We are at a boundary if the this char and the original chars are
// opposite in membership in \w set
//
// parameters: pos - the current position in the input buffer
//
// TODO: double-check edge cases at region boundaries.
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::isWordBoundary(int64_t pos) {
UBool isBoundary = FALSE;
UBool cIsWord = FALSE;
if (pos >= fLookLimit) {
fHitEnd = TRUE;
} else {
// Determine whether char c at current position is a member of the word set of chars.
// If we're off the end of the string, behave as though we're not at a word char.
UTEXT_SETNATIVEINDEX(fInputText, pos);
UChar32 c = UTEXT_CURRENT32(fInputText);
if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
// Current char is a combining one. Not a boundary.
return FALSE;
}
cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
}
// Back up until we come to a non-combining char, determine whether
// that char is a word char.
UBool prevCIsWord = FALSE;
for (;;) {
if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
break;
}
UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
|| u_charType(prevChar) == U_FORMAT_CHAR)) {
prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
break;
}
}
isBoundary = cIsWord ^ prevCIsWord;
return isBoundary;
}
UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
UBool isBoundary = FALSE;
UBool cIsWord = FALSE;
const UChar *inputBuf = fInputText->chunkContents;
if (pos >= fLookLimit) {
fHitEnd = TRUE;
} else {
// Determine whether char c at current position is a member of the word set of chars.
// If we're off the end of the string, behave as though we're not at a word char.
UChar32 c;
U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
// Current char is a combining one. Not a boundary.
return FALSE;
}
cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
}
// Back up until we come to a non-combining char, determine whether
// that char is a word char.
UBool prevCIsWord = FALSE;
for (;;) {
if (pos <= fLookStart) {
break;
}
UChar32 prevChar;
U16_PREV(inputBuf, fLookStart, pos, prevChar);
if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
|| u_charType(prevChar) == U_FORMAT_CHAR)) {
prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
break;
}
}
isBoundary = cIsWord ^ prevCIsWord;
return isBoundary;
}
//--------------------------------------------------------------------------------
//
// isUWordBoundary
//
// Test for a word boundary using RBBI word break.
//
// parameters: pos - the current position in the input buffer
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) {
UBool returnVal = FALSE;
#if UCONFIG_NO_BREAK_ITERATION==0
// Note: this point will never be reached if break iteration is configured out.
// Regex patterns that would require this function will fail to compile.
// If we haven't yet created a break iterator for this matcher, do it now.
if (fWordBreakItr == nullptr) {
fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
if (U_FAILURE(status)) {
return FALSE;
}
fWordBreakItr->setText(fInputText, status);
}
// Note: zero width boundary tests like \b see through transparent region bounds,
// which is why fLookLimit is used here, rather than fActiveLimit.
if (pos >= fLookLimit) {
fHitEnd = TRUE;
returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
// words are not boundaries. All non-word chars stand by themselves,
// with word boundaries on both sides.
} else {
returnVal = fWordBreakItr->isBoundary((int32_t)pos);
}
#endif
return returnVal;
}
int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) {
int64_t result = pos;
#if UCONFIG_NO_BREAK_ITERATION==0
// Note: this point will never be reached if break iteration is configured out.
// Regex patterns that would require this function will fail to compile.
// If we haven't yet created a break iterator for this matcher, do it now.
if (fGCBreakItr == nullptr) {
fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
if (U_FAILURE(status)) {
return pos;
}
fGCBreakItr->setText(fInputText, status);
}
result = fGCBreakItr->following(pos);
if (result == BreakIterator::DONE) {
result = pos;
}
#endif
return result;
}
//--------------------------------------------------------------------------------
//
// IncrementTime This function is called once each TIMER_INITIAL_VALUE state
// saves. Increment the "time" counter, and call the
// user callback function if there is one installed.
//
// If the match operation needs to be aborted, either for a time-out
// or because the user callback asked for it, just set an error status.
// The engine will pick that up and stop in its outer loop.
//
//--------------------------------------------------------------------------------
void RegexMatcher::IncrementTime(UErrorCode &status) {
fTickCounter = TIMER_INITIAL_VALUE;
fTime++;
if (fCallbackFn != NULL) {
if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
status = U_REGEX_STOPPED_BY_CALLER;
return;
}
}
if (fTimeLimit > 0 && fTime >= fTimeLimit) {
status = U_REGEX_TIME_OUT;
}
}
//--------------------------------------------------------------------------------
//
// StateSave
// Make a new stack frame, initialized as a copy of the current stack frame.
// Set the pattern index in the original stack frame from the operand value
// in the opcode. Execution of the engine continues with the state in
// the newly created stack frame
//
// Note that reserveBlock() may grow the stack, resulting in the
// whole thing being relocated in memory.
//
// Parameters:
// fp The top frame pointer when called. At return, a new
// fame will be present
// savePatIdx An index into the compiled pattern. Goes into the original
// (not new) frame. If execution ever back-tracks out of the
// new frame, this will be where we continue from in the pattern.
// Return
// The new frame pointer.
//
//--------------------------------------------------------------------------------
inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
if (U_FAILURE(status)) {
return fp;
}
// push storage for a new frame.
int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
if (U_FAILURE(status)) {
// Failure on attempted stack expansion.
// Stack function set some other error code, change it to a more
// specific one for regular expressions.
status = U_REGEX_STACK_OVERFLOW;
// We need to return a writable stack frame, so just return the
// previous frame. The match operation will stop quickly
// because of the error status, after which the frame will never
// be looked at again.
return fp;
}
fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
// New stack frame = copy of old top frame.
int64_t *source = (int64_t *)fp;
int64_t *dest = newFP;
for (;;) {
*dest++ = *source++;
if (source == newFP) {
break;
}
}
fTickCounter--;
if (fTickCounter <= 0) {
IncrementTime(status); // Re-initializes fTickCounter
}
fp->fPatIdx = savePatIdx;
return (REStackFrame *)newFP;
}
#if defined(REGEX_DEBUG)
namespace {
UnicodeString StringFromUText(UText *ut) {
UnicodeString result;
for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
result.append(c);
}
return result;
}
}
#endif // REGEX_DEBUG
//--------------------------------------------------------------------------------
//
// MatchAt This is the actual matching engine.
//
// startIdx: begin matching a this index.
// toEnd: if true, match must extend to end of the input region
//
//--------------------------------------------------------------------------------
void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
UBool isMatch = FALSE; // True if the we have a match.
int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
int32_t op; // Operation from the compiled pattern, split into
int32_t opType; // the opcode
int32_t opValue; // and the operand value.
#ifdef REGEX_RUN_DEBUG
if (fTraceDebug) {
printf("MatchAt(startIdx=%ld)\n", startIdx);
printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
}
#endif
if (U_FAILURE(status)) {
return;
}
// Cache frequently referenced items from the compiled pattern
//
int64_t *pat = fPattern->fCompiledPat->getBuffer();
const UChar *litText = fPattern->fLiteralText.getBuffer();
UVector *fSets = fPattern->fSets;
fFrameSize = fPattern->fFrameSize;
REStackFrame *fp = resetStack();
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return;
}
fp->fPatIdx = 0;
fp->fInputIdx = startIdx;
// Zero out the pattern's static data
int32_t i;
for (i = 0; i<fPattern->fDataSize; i++) {
fData[i] = 0;
}
//
// Main loop for interpreting the compiled pattern.
// One iteration of the loop per pattern operation performed.
//
for (;;) {
op = (int32_t)pat[fp->fPatIdx];
opType = URX_TYPE(op);
opValue = URX_VAL(op);
#ifdef REGEX_RUN_DEBUG
if (fTraceDebug) {
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
fPattern->dumpOp(fp->fPatIdx);
}
#endif
fp->fPatIdx++;
switch (opType) {
case URX_NOP:
break;
case URX_BACKTRACK:
// Force a backtrack. In some circumstances, the pattern compiler
// will notice that the pattern can't possibly match anything, and will
// emit one of these at that point.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
case URX_ONECHAR:
if (fp->fInputIdx < fActiveLimit) {
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_NEXT32(fInputText);
if (c == opValue) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
break;
}
} else {
fHitEnd = TRUE;
}
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
case URX_STRING:
{
// Test input against a literal string.
// Strings require two slots in the compiled pattern, one for the
// offset to the string text, and one for the length.
int32_t stringStartIdx = opValue;
op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
fp->fPatIdx++;
opType = URX_TYPE(op);
int32_t stringLen = URX_VAL(op);
U_ASSERT(opType == URX_STRING_LEN);
U_ASSERT(stringLen >= 2);
const UChar *patternString = litText+stringStartIdx;
int32_t patternStringIndex = 0;
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 inputChar;
UChar32 patternChar;
UBool success = TRUE;
while (patternStringIndex < stringLen) {
if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
success = FALSE;
fHitEnd = TRUE;
break;
}
inputChar = UTEXT_NEXT32(fInputText);
U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
if (patternChar != inputChar) {
success = FALSE;
break;
}
}
if (success) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_STATE_SAVE:
fp = StateSave(fp, opValue, status);
break;
case URX_END:
// The match loop will exit via this path on a successful match,
// when we reach the end of the pattern.
if (toEnd && fp->fInputIdx != fActiveLimit) {
// The pattern matched, but not to the end of input. Try some more.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
isMatch = TRUE;
goto breakFromLoop;
// Start and End Capture stack frame variables are laid out out like this:
// fp->fExtra[opValue] - The start of a completed capture group
// opValue+1 - The end of a completed capture group
// opValue+2 - the start of a capture group whose end
// has not yet been reached (and might not ever be).
case URX_START_CAPTURE:
U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
fp->fExtra[opValue+2] = fp->fInputIdx;
break;
case URX_END_CAPTURE:
U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
fp->fExtra[opValue+1] = fp->fInputIdx; // End position
U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
break;
case URX_DOLLAR: // $, test for End of line
// or for position before new line at end of input
{
if (fp->fInputIdx >= fAnchorLimit) {
// We really are at the end of input. Success.
fHitEnd = TRUE;
fRequireEnd = TRUE;
break;
}
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
// If we are positioned just before a new-line that is located at the
// end of input, succeed.
UChar32 c = UTEXT_NEXT32(fInputText);
if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
if (isLineTerminator(c)) {
// If not in the middle of a CR/LF sequence
if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
// At new-line at end of input. Success
fHitEnd = TRUE;
fRequireEnd = TRUE;
break;
}
}
} else {
UChar32 nextC = UTEXT_NEXT32(fInputText);
if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
fHitEnd = TRUE;
fRequireEnd = TRUE;
break; // At CR/LF at end of input. Success
}
}
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
if (fp->fInputIdx >= fAnchorLimit) {
// Off the end of input. Success.
fHitEnd = TRUE;
fRequireEnd = TRUE;
break;
} else {
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_NEXT32(fInputText);
// Either at the last character of input, or off the end.
if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
fHitEnd = TRUE;
fRequireEnd = TRUE;
break;
}
}
// Not at end of input. Back-track out.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
case URX_DOLLAR_M: // $, test for End of line in multi-line mode
{
if (fp->fInputIdx >= fAnchorLimit) {
// We really are at the end of input. Success.
fHitEnd = TRUE;
fRequireEnd = TRUE;
break;
}
// If we are positioned just before a new-line, succeed.
// It makes no difference where the new-line is within the input.
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_CURRENT32(fInputText);
if (isLineTerminator(c)) {
// At a line end, except for the odd chance of being in the middle of a CR/LF sequence
// In multi-line mode, hitting a new-line just before the end of input does not
// set the hitEnd or requireEnd flags
if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
break;
}
}
// not at a new line. Fail.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
{
if (fp->fInputIdx >= fAnchorLimit) {
// We really are at the end of input. Success.
fHitEnd = TRUE;
fRequireEnd = TRUE; // Java set requireEnd in this case, even though
break; // adding a new-line would not lose the match.
}
// If we are not positioned just before a new-line, the test fails; backtrack out.
// It makes no difference where the new-line is within the input.
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
if (UTEXT_CURRENT32(fInputText) != 0x0a) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_CARET: // ^, test for start of line
if (fp->fInputIdx != fAnchorStart) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_CARET_M: // ^, test for start of line in mulit-line mode
{
if (fp->fInputIdx == fAnchorStart) {
// We are at the start input. Success.
break;
}
// Check whether character just before the current pos is a new-line
// unless we are at the end of input
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_PREVIOUS32(fInputText);
if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
// It's a new-line. ^ is true. Success.
// TODO: what should be done with positions between a CR and LF?
break;
}
// Not at the start of a line. Fail.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
{
U_ASSERT(fp->fInputIdx >= fAnchorStart);
if (fp->fInputIdx <= fAnchorStart) {
// We are at the start input. Success.
break;
}
// Check whether character just before the current pos is a new-line
U_ASSERT(fp->fInputIdx <= fAnchorLimit);
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_PREVIOUS32(fInputText);
if (c != 0x0a) {
// Not at the start of a line. Back-track out.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_B: // Test for word boundaries
{
UBool success = isWordBoundary(fp->fInputIdx);
success ^= (UBool)(opValue != 0); // flip sense for \B
if (!success) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
{
UBool success = isUWordBoundary(fp->fInputIdx, status);
success ^= (UBool)(opValue != 0); // flip sense for \B
if (!success) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_D: // Test for decimal digit
{
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_NEXT32(fInputText);
int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
success ^= (UBool)(opValue != 0); // flip sense for \D
if (success) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_G: // Test for position at end of previous match
if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_BACKSLASH_H: // Test for \h, horizontal white space.
{
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_NEXT32(fInputText);
int8_t ctype = u_charType(c);
UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
success ^= (UBool)(opValue != 0); // flip sense for \H
if (success) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_R: // Test for \R, any line break sequence.
{
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_NEXT32(fInputText);
if (isLineTerminator(c)) {
if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
utext_next32(fInputText);
}
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_V: // \v, any single line ending character.
{
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_NEXT32(fInputText);
UBool success = isLineTerminator(c);
success ^= (UBool)(opValue != 0); // flip sense for \V
if (success) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_X:
// Match a Grapheme, as defined by Unicode UAX 29.
// Fail if at end of input
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp->fInputIdx = fActiveLimit;
}
break;
case URX_BACKSLASH_Z: // Test for end of Input
if (fp->fInputIdx < fAnchorLimit) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
} else {
fHitEnd = TRUE;
fRequireEnd = TRUE;
}
break;
case URX_STATIC_SETREF:
{
// Test input character against one of the predefined sets
// (Word Characters, for example)
// The high bit of the op value is a flag for the match polarity.
// 0: success if input char is in set.
// 1: success if input char is not in set.
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
opValue &= ~URX_NEG_SET;
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_NEXT32(fInputText);
if (c < 256) {
Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
if (s8.contains(c)) {
success = !success;
}
} else {
const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
if (s.contains(c)) {
success = !success;
}
}
if (success) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
// the character wasn't in the set.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_STAT_SETREF_N:
{
// Test input character for NOT being a member of one of
// the predefined sets (Word Characters, for example)
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_NEXT32(fInputText);
if (c < 256) {
Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
if (s8.contains(c) == FALSE) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
break;
}
} else {
const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
if (s.contains(c) == FALSE) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
break;
}
}
// the character wasn't in the set.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_SETREF:
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
} else {
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
// There is input left. Pick up one char and test it for set membership.
UChar32 c = UTEXT_NEXT32(fInputText);
U_ASSERT(opValue > 0 && opValue < fSets->size());
if (c<256) {
Regex8BitSet *s8 = &fPattern->fSets8[opValue];
if (s8->contains(c)) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
break;
}
} else {
UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
if (s->contains(c)) {
// The character is in the set. A Match.
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
break;
}
}
// the character wasn't in the set.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_DOTANY:
{
// . matches anything, but stops at end-of-line.
if (fp->fInputIdx >= fActiveLimit) {
// At end of input. Match failed. Backtrack out.
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
// There is input left. Advance over one char, unless we've hit end-of-line
UChar32 c = UTEXT_NEXT32(fInputText);
if (isLineTerminator(c)) {
// End of line in normal mode. . does not match.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
break;
case URX_DOTANY_ALL:
{
// ., in dot-matches-all (including new lines) mode
if (fp->fInputIdx >= fActiveLimit) {
// At end of input. Match failed. Backtrack out.
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
// There is input left. Advance over one char, except if we are
// at a cr/lf, advance over both of them.
UChar32 c;
c = UTEXT_NEXT32(fInputText);
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
if (c==0x0d && fp->fInputIdx < fActiveLimit) {
// In the case of a CR/LF, we need to advance over both.
UChar32 nextc = UTEXT_CURRENT32(fInputText);
if (nextc == 0x0a) {
(void)UTEXT_NEXT32(fInputText);
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
}
}
break;
case URX_DOTANY_UNIX:
{
// '.' operator, matches all, but stops at end-of-line.
// UNIX_LINES mode, so 0x0a is the only recognized line ending.
if (fp->fInputIdx >= fActiveLimit) {
// At end of input. Match failed. Backtrack out.
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
// There is input left. Advance over one char, unless we've hit end-of-line
UChar32 c = UTEXT_NEXT32(fInputText);
if (c == 0x0a) {
// End of line in normal mode. '.' does not match the \n
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
} else {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
}
break;
case URX_JMP:
fp->fPatIdx = opValue;
break;
case URX_FAIL:
isMatch = FALSE;
goto breakFromLoop;
case URX_JMP_SAV:
U_ASSERT(opValue < fPattern->fCompiledPat->size());
fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
fp->fPatIdx = opValue; // Then JMP.
break;
case URX_JMP_SAV_X:
// This opcode is used with (x)+, when x can match a zero length string.
// Same as JMP_SAV, except conditional on the match having made forward progress.
// Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
// data address of the input position at the start of the loop.
{
U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
int32_t stoOp = (int32_t)pat[opValue-1];
U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
int32_t frameLoc = URX_VAL(stoOp);
U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
int64_t prevInputIdx = fp->fExtra[frameLoc];
U_ASSERT(prevInputIdx <= fp->fInputIdx);
if (prevInputIdx < fp->fInputIdx) {
// The match did make progress. Repeat the loop.
fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
fp->fPatIdx = opValue;
fp->fExtra[frameLoc] = fp->fInputIdx;
}
// If the input position did not advance, we do nothing here,
// execution will fall out of the loop.
}
break;
case URX_CTR_INIT:
{
U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
// Pick up the three extra operands that CTR_INIT has, and
// skip the pattern location counter past
int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
fp->fPatIdx += 3;
int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
int32_t minCount = (int32_t)pat[instrOperandLoc+1];
int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
U_ASSERT(minCount>=0);
U_ASSERT(maxCount>=minCount || maxCount==-1);
U_ASSERT(loopLoc>=fp->fPatIdx);
if (minCount == 0) {
fp = StateSave(fp, loopLoc+1, status);
}
if (maxCount == -1) {
fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
} else if (maxCount == 0) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_CTR_LOOP:
{
U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
int32_t initOp = (int32_t)pat[opValue];
U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
int32_t minCount = (int32_t)pat[opValue+2];
int32_t maxCount = (int32_t)pat[opValue+3];
(*pCounter)++;
if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
U_ASSERT(*pCounter == maxCount);
break;
}
if (*pCounter >= minCount) {
if (maxCount == -1) {
// Loop has no hard upper bound.
// Check that it is progressing through the input, break if it is not.
int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
if (fp->fInputIdx == *pLastInputIdx) {
break;
} else {
*pLastInputIdx = fp->fInputIdx;
}
}
fp = StateSave(fp, fp->fPatIdx, status);
} else {
// Increment time-out counter. (StateSave() does it if count >= minCount)
fTickCounter--;
if (fTickCounter <= 0) {
IncrementTime(status); // Re-initializes fTickCounter
}
}
fp->fPatIdx = opValue + 4; // Loop back.
}
break;
case URX_CTR_INIT_NG:
{
// Initialize a non-greedy loop
U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
// Pick up the three extra operands that CTR_INIT_NG has, and
// skip the pattern location counter past
int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
fp->fPatIdx += 3;
int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
int32_t minCount = (int32_t)pat[instrOperandLoc+1];
int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
U_ASSERT(minCount>=0);
U_ASSERT(maxCount>=minCount || maxCount==-1);
U_ASSERT(loopLoc>fp->fPatIdx);
if (maxCount == -1) {
fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
}
if (minCount == 0) {
if (maxCount != 0) {
fp = StateSave(fp, fp->fPatIdx, status);
}
fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
}
}
break;
case URX_CTR_LOOP_NG:
{
// Non-greedy {min, max} loops
U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
int32_t initOp = (int32_t)pat[opValue];
U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
int32_t minCount = (int32_t)pat[opValue+2];
int32_t maxCount = (int32_t)pat[opValue+3];
(*pCounter)++;
if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
// The loop has matched the maximum permitted number of times.
// Break out of here with no action. Matching will
// continue with the following pattern.
U_ASSERT(*pCounter == maxCount);
break;
}
if (*pCounter < minCount) {
// We haven't met the minimum number of matches yet.
// Loop back for another one.
fp->fPatIdx = opValue + 4; // Loop back.
// Increment time-out counter. (StateSave() does it if count >= minCount)
fTickCounter--;
if (fTickCounter <= 0) {
IncrementTime(status); // Re-initializes fTickCounter
}
} else {
// We do have the minimum number of matches.
// If there is no upper bound on the loop iterations, check that the input index
// is progressing, and stop the loop if it is not.
if (maxCount == -1) {
int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
if (fp->fInputIdx == *pLastInputIdx) {
break;
}
*pLastInputIdx = fp->fInputIdx;
}
// Loop Continuation: we will fall into the pattern following the loop
// (non-greedy, don't execute loop body first), but first do
// a state save to the top of the loop, so that a match failure
// in the following pattern will try another iteration of the loop.
fp = StateSave(fp, opValue + 4, status);
}
}
break;
case URX_STO_SP:
U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
fData[opValue] = fStack->size();
break;
case URX_LD_SP:
{
U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
int32_t newStackSize = (int32_t)fData[opValue];
U_ASSERT(newStackSize <= fStack->size());
int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
if (newFP == (int64_t *)fp) {
break;
}
int32_t j;
for (j=0; j<fFrameSize; j++) {
newFP[j] = ((int64_t *)fp)[j];
}
fp = (REStackFrame *)newFP;
fStack->setSize(newStackSize);
}
break;
case URX_BACKREF:
{
U_ASSERT(opValue < fFrameSize);
int64_t groupStartIdx = fp->fExtra[opValue];
int64_t groupEndIdx = fp->fExtra[opValue+1];
U_ASSERT(groupStartIdx <= groupEndIdx);
if (groupStartIdx < 0) {
// This capture group has not participated in the match thus far,
fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
break;
}
UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
// Note: if the capture group match was of an empty string the backref
// match succeeds. Verified by testing: Perl matches succeed
// in this case, so we do too.
UBool success = TRUE;
for (;;) {
if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
success = TRUE;
break;
}
if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
success = FALSE;
fHitEnd = TRUE;
break;
}
UChar32 captureGroupChar = utext_next32(fAltInputText);
UChar32 inputChar = utext_next32(fInputText);
if (inputChar != captureGroupChar) {
success = FALSE;
break;
}
}
if (success) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKREF_I:
{
U_ASSERT(opValue < fFrameSize);
int64_t groupStartIdx = fp->fExtra[opValue];
int64_t groupEndIdx = fp->fExtra[opValue+1];
U_ASSERT(groupStartIdx <= groupEndIdx);
if (groupStartIdx < 0) {
// This capture group has not participated in the match thus far,
fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
break;
}
utext_setNativeIndex(fAltInputText, groupStartIdx);
utext_setNativeIndex(fInputText, fp->fInputIdx);
CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
CaseFoldingUTextIterator inputItr(*fInputText);
// Note: if the capture group match was of an empty string the backref
// match succeeds. Verified by testing: Perl matches succeed
// in this case, so we do too.
UBool success = TRUE;
for (;;) {
if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
success = TRUE;
break;
}
if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
success = FALSE;
fHitEnd = TRUE;
break;
}
UChar32 captureGroupChar = captureGroupItr.next();
UChar32 inputChar = inputItr.next();
if (inputChar != captureGroupChar) {
success = FALSE;
break;
}
}
if (success && inputItr.inExpansion()) {
// We otained a match by consuming part of a string obtained from
// case-folding a single code point of the input text.
// This does not count as an overall match.
success = FALSE;
}
if (success) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_STO_INP_LOC:
{
U_ASSERT(opValue >= 0 && opValue < fFrameSize);
fp->fExtra[opValue] = fp->fInputIdx;
}
break;
case URX_JMPX:
{
int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
fp->fPatIdx += 1;
int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
int64_t savedInputIdx = fp->fExtra[dataLoc];
U_ASSERT(savedInputIdx <= fp->fInputIdx);
if (savedInputIdx < fp->fInputIdx) {
fp->fPatIdx = opValue; // JMP
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
}
}
break;
case URX_LA_START:
{
// Entering a look around block.
// Save Stack Ptr, Input Pos.
U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
fData[opValue] = fStack->size();
fData[opValue+1] = fp->fInputIdx;
fData[opValue+2] = fActiveStart;
fData[opValue+3] = fActiveLimit;
fActiveStart = fLookStart; // Set the match region change for
fActiveLimit = fLookLimit; // transparent bounds.
}
break;
case URX_LA_END:
{
// Leaving a look-ahead block.
// restore Stack Ptr, Input Pos to positions they had on entry to block.
U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
int32_t stackSize = fStack->size();
int32_t newStackSize =(int32_t)fData[opValue];
U_ASSERT(stackSize >= newStackSize);
if (stackSize > newStackSize) {
// Copy the current top frame back to the new (cut back) top frame.
// This makes the capture groups from within the look-ahead
// expression available.
int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
int32_t j;
for (j=0; j<fFrameSize; j++) {
newFP[j] = ((int64_t *)fp)[j];
}
fp = (REStackFrame *)newFP;
fStack->setSize(newStackSize);
}
fp->fInputIdx = fData[opValue+1];
// Restore the active region bounds in the input string; they may have
// been changed because of transparent bounds on a Region.
fActiveStart = fData[opValue+2];
fActiveLimit = fData[opValue+3];
U_ASSERT(fActiveStart >= 0);
U_ASSERT(fActiveLimit <= fInputLength);
}
break;
case URX_ONECHAR_I:
// Case insensitive one char. The char from the pattern is already case folded.
// Input text is not, but case folding the input can not reduce two or more code
// points to one.
if (fp->fInputIdx < fActiveLimit) {
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_NEXT32(fInputText);
if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
break;
}
} else {
fHitEnd = TRUE;
}
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
case URX_STRING_I:
{
// Case-insensitive test input against a literal string.
// Strings require two slots in the compiled pattern, one for the
// offset to the string text, and one for the length.
// The compiled string has already been case folded.
{
const UChar *patternString = litText + opValue;
int32_t patternStringIdx = 0;
op = (int32_t)pat[fp->fPatIdx];
fp->fPatIdx++;
opType = URX_TYPE(op);
opValue = URX_VAL(op);
U_ASSERT(opType == URX_STRING_LEN);
int32_t patternStringLen = opValue; // Length of the string from the pattern.
UChar32 cPattern;
UChar32 cText;
UBool success = TRUE;
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
CaseFoldingUTextIterator inputIterator(*fInputText);
while (patternStringIdx < patternStringLen) {
if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
success = FALSE;
fHitEnd = TRUE;
break;
}
U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
cText = inputIterator.next();
if (cText != cPattern) {
success = FALSE;
break;
}
}
if (inputIterator.inExpansion()) {
success = FALSE;
}
if (success) {
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
}
break;
case URX_LB_START:
{
// Entering a look-behind block.
// Save Stack Ptr, Input Pos and active input region.
// TODO: implement transparent bounds. Ticket #6067
U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
fData[opValue] = fStack->size();
fData[opValue+1] = fp->fInputIdx;
// Save input string length, then reset to pin any matches to end at
// the current position.
fData[opValue+2] = fActiveStart;
fData[opValue+3] = fActiveLimit;
fActiveStart = fRegionStart;
fActiveLimit = fp->fInputIdx;
// Init the variable containing the start index for attempted matches.
fData[opValue+4] = -1;
}
break;
case URX_LB_CONT:
{
// Positive Look-Behind, at top of loop checking for matches of LB expression
// at all possible input starting positions.
// Fetch the min and max possible match lengths. They are the operands
// of this op in the pattern.
int32_t minML = (int32_t)pat[fp->fPatIdx++];
int32_t maxML = (int32_t)pat[fp->fPatIdx++];
if (!UTEXT_USES_U16(fInputText)) {
// utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
// The max length need not be exact; it just needs to be >= actual maximum.
maxML *= 3;
}
U_ASSERT(minML <= maxML);
U_ASSERT(minML >= 0);
// Fetch (from data) the last input index where a match was attempted.
U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
int64_t &lbStartIdx = fData[opValue+4];
if (lbStartIdx < 0) {
// First time through loop.
lbStartIdx = fp->fInputIdx - minML;
if (lbStartIdx > 0) {
// move index to a code point boudary, if it's not on one already.
UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
} else {
// 2nd through nth time through the loop.
// Back up start position for match by one.
if (lbStartIdx == 0) {
(lbStartIdx)--;
} else {
UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
(void)UTEXT_PREVIOUS32(fInputText);
lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
}
if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
// We have tried all potential match starting points without
// getting a match. Backtrack out, and out of the
// Look Behind altogether.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
fActiveStart = fData[opValue+2];
fActiveLimit = fData[opValue+3];
U_ASSERT(fActiveStart >= 0);
U_ASSERT(fActiveLimit <= fInputLength);
break;
}
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
// (successful match will fall off the end of the loop.)
fp = StateSave(fp, fp->fPatIdx-3, status);
fp->fInputIdx = lbStartIdx;
}
break;
case URX_LB_END:
// End of a look-behind block, after a successful match.
{
U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
if (fp->fInputIdx != fActiveLimit) {
// The look-behind expression matched, but the match did not
// extend all the way to the point that we are looking behind from.
// FAIL out of here, which will take us back to the LB_CONT, which
// will retry the match starting at another position or fail
// the look-behind altogether, whichever is appropriate.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
// Look-behind match is good. Restore the orignal input string region,
// which had been truncated to pin the end of the lookbehind match to the
// position being looked-behind.
fActiveStart = fData[opValue+2];
fActiveLimit = fData[opValue+3];
U_ASSERT(fActiveStart >= 0);
U_ASSERT(fActiveLimit <= fInputLength);
}
break;
case URX_LBN_CONT:
{
// Negative Look-Behind, at top of loop checking for matches of LB expression
// at all possible input starting positions.
// Fetch the extra parameters of this op.
int32_t minML = (int32_t)pat[fp->fPatIdx++];
int32_t maxML = (int32_t)pat[fp->fPatIdx++];
if (!UTEXT_USES_U16(fInputText)) {
// utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
// The max length need not be exact; it just needs to be >= actual maximum.
maxML *= 3;
}
int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
continueLoc = URX_VAL(continueLoc);
U_ASSERT(minML <= maxML);
U_ASSERT(minML >= 0);
U_ASSERT(continueLoc > fp->fPatIdx);
// Fetch (from data) the last input index where a match was attempted.
U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
int64_t &lbStartIdx = fData[opValue+4];
if (lbStartIdx < 0) {
// First time through loop.
lbStartIdx = fp->fInputIdx - minML;
if (lbStartIdx > 0) {
// move index to a code point boudary, if it's not on one already.
UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
} else {
// 2nd through nth time through the loop.
// Back up start position for match by one.
if (lbStartIdx == 0) {
(lbStartIdx)--;
} else {
UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
(void)UTEXT_PREVIOUS32(fInputText);
lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
}
if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
// We have tried all potential match starting points without
// getting a match, which means that the negative lookbehind as
// a whole has succeeded. Jump forward to the continue location
fActiveStart = fData[opValue+2];
fActiveLimit = fData[opValue+3];
U_ASSERT(fActiveStart >= 0);
U_ASSERT(fActiveLimit <= fInputLength);
fp->fPatIdx = continueLoc;
break;
}
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
// (successful match will cause a FAIL out of the loop altogether.)
fp = StateSave(fp, fp->fPatIdx-4, status);
fp->fInputIdx = lbStartIdx;
}
break;
case URX_LBN_END:
// End of a negative look-behind block, after a successful match.
{
U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
if (fp->fInputIdx != fActiveLimit) {
// The look-behind expression matched, but the match did not
// extend all the way to the point that we are looking behind from.
// FAIL out of here, which will take us back to the LB_CONT, which
// will retry the match starting at another position or succeed
// the look-behind altogether, whichever is appropriate.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
// Look-behind expression matched, which means look-behind test as
// a whole Fails
// Restore the orignal input string length, which had been truncated
// inorder to pin the end of the lookbehind match
// to the position being looked-behind.
fActiveStart = fData[opValue+2];
fActiveLimit = fData[opValue+3];
U_ASSERT(fActiveStart >= 0);
U_ASSERT(fActiveLimit <= fInputLength);
// Restore original stack position, discarding any state saved
// by the successful pattern match.
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
int32_t newStackSize = (int32_t)fData[opValue];
U_ASSERT(fStack->size() > newStackSize);
fStack->setSize(newStackSize);
// FAIL, which will take control back to someplace
// prior to entering the look-behind test.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_LOOP_SR_I:
// Loop Initialization for the optimized implementation of
// [some character set]*
// This op scans through all matching input.
// The following LOOP_C op emulates stack unwinding if the following pattern fails.
{
U_ASSERT(opValue > 0 && opValue < fSets->size());
Regex8BitSet *s8 = &fPattern->fSets8[opValue];
UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
// Loop through input, until either the input is exhausted or
// we reach a character that is not a member of the set.
int64_t ix = fp->fInputIdx;
UTEXT_SETNATIVEINDEX(fInputText, ix);
for (;;) {
if (ix >= fActiveLimit) {
fHitEnd = TRUE;
break;
}
UChar32 c = UTEXT_NEXT32(fInputText);
if (c<256) {
if (s8->contains(c) == FALSE) {
break;
}
} else {
if (s->contains(c) == FALSE) {
break;
}
}
ix = UTEXT_GETNATIVEINDEX(fInputText);
}
// If there were no matching characters, skip over the loop altogether.
// The loop doesn't run at all, a * op always succeeds.
if (ix == fp->fInputIdx) {
fp->fPatIdx++; // skip the URX_LOOP_C op.
break;
}
// Peek ahead in the compiled pattern, to the URX_LOOP_C that
// must follow. It's operand is the stack location
// that holds the starting input index for the match of this [set]*
int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
int32_t stackLoc = URX_VAL(loopcOp);
U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
fp->fExtra[stackLoc] = fp->fInputIdx;
fp->fInputIdx = ix;
// Save State to the URX_LOOP_C op that follows this one,
// so that match failures in the following code will return to there.
// Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
fp = StateSave(fp, fp->fPatIdx, status);
fp->fPatIdx++;
}
break;
case URX_LOOP_DOT_I:
// Loop Initialization for the optimized implementation of .*
// This op scans through all remaining input.
// The following LOOP_C op emulates stack unwinding if the following pattern fails.
{
// Loop through input until the input is exhausted (we reach an end-of-line)
// In DOTALL mode, we can just go straight to the end of the input.
int64_t ix;
if ((opValue & 1) == 1) {
// Dot-matches-All mode. Jump straight to the end of the string.
ix = fActiveLimit;
fHitEnd = TRUE;
} else {
// NOT DOT ALL mode. Line endings do not match '.'
// Scan forward until a line ending or end of input.
ix = fp->fInputIdx;
UTEXT_SETNATIVEINDEX(fInputText, ix);
for (;;) {
if (ix >= fActiveLimit) {
fHitEnd = TRUE;
break;
}
UChar32 c = UTEXT_NEXT32(fInputText);
if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
if ((c == 0x0a) || // 0x0a is newline in both modes.
(((opValue & 2) == 0) && // IF not UNIX_LINES mode
isLineTerminator(c))) {
// char is a line ending. Exit the scanning loop.
break;
}
}
ix = UTEXT_GETNATIVEINDEX(fInputText);
}
}
// If there were no matching characters, skip over the loop altogether.
// The loop doesn't run at all, a * op always succeeds.
if (ix == fp->fInputIdx) {
fp->fPatIdx++; // skip the URX_LOOP_C op.
break;
}
// Peek ahead in the compiled pattern, to the URX_LOOP_C that
// must follow. It's operand is the stack location
// that holds the starting input index for the match of this .*
int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
int32_t stackLoc = URX_VAL(loopcOp);
U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
fp->fExtra[stackLoc] = fp->fInputIdx;
fp->fInputIdx = ix;
// Save State to the URX_LOOP_C op that follows this one,
// so that match failures in the following code will return to there.
// Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
fp = StateSave(fp, fp->fPatIdx, status);
fp->fPatIdx++;
}
break;
case URX_LOOP_C:
{
U_ASSERT(opValue>=0 && opValue<fFrameSize);
backSearchIndex = fp->fExtra[opValue];
U_ASSERT(backSearchIndex <= fp->fInputIdx);
if (backSearchIndex == fp->fInputIdx) {
// We've backed up the input idx to the point that the loop started.
// The loop is done. Leave here without saving state.
// Subsequent failures won't come back here.
break;
}
// Set up for the next iteration of the loop, with input index
// backed up by one from the last time through,
// and a state save to this instruction in case the following code fails again.
// (We're going backwards because this loop emulates stack unwinding, not
// the initial scan forward.)
U_ASSERT(fp->fInputIdx > 0);
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
if (prevC == 0x0a &&
fp->fInputIdx > backSearchIndex &&
twoPrevC == 0x0d) {
int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
// .*, stepping back over CRLF pair.
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
}
fp = StateSave(fp, fp->fPatIdx-1, status);
}
break;
default:
// Trouble. The compiled pattern contains an entry with an
// unrecognized type tag.
UPRV_UNREACHABLE;
}
if (U_FAILURE(status)) {
isMatch = FALSE;
break;
}
}
breakFromLoop:
fMatch = isMatch;
if (isMatch) {
fLastMatchEnd = fMatchEnd;
fMatchStart = startIdx;
fMatchEnd = fp->fInputIdx;
}
#ifdef REGEX_RUN_DEBUG
if (fTraceDebug) {
if (isMatch) {
printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
} else {
printf("No match\n\n");
}
}
#endif
fFrame = fp; // The active stack frame when the engine stopped.
// Contains the capture group results that we need to
// access later.
return;
}
//--------------------------------------------------------------------------------
//
// MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
// assumption that the entire string is available in the UText's
// chunk buffer. For now, that means we can use int32_t indexes,
// except for anything that needs to be saved (like group starts
// and ends).
//
// startIdx: begin matching a this index.
// toEnd: if true, match must extend to end of the input region
//
//--------------------------------------------------------------------------------
void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
UBool isMatch = FALSE; // True if the we have a match.
int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
int32_t op; // Operation from the compiled pattern, split into
int32_t opType; // the opcode
int32_t opValue; // and the operand value.
#ifdef REGEX_RUN_DEBUG
if (fTraceDebug) {
printf("MatchAt(startIdx=%d)\n", startIdx);
printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
}
#endif
if (U_FAILURE(status)) {
return;
}
// Cache frequently referenced items from the compiled pattern
//
int64_t *pat = fPattern->fCompiledPat->getBuffer();
const UChar *litText = fPattern->fLiteralText.getBuffer();
UVector *fSets = fPattern->fSets;
const UChar *inputBuf = fInputText->chunkContents;
fFrameSize = fPattern->fFrameSize;
REStackFrame *fp = resetStack();
if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus;
return;
}
fp->fPatIdx = 0;
fp->fInputIdx = startIdx;
// Zero out the pattern's static data
int32_t i;
for (i = 0; i<fPattern->fDataSize; i++) {
fData[i] = 0;
}
//
// Main loop for interpreting the compiled pattern.
// One iteration of the loop per pattern operation performed.
//
for (;;) {
op = (int32_t)pat[fp->fPatIdx];
opType = URX_TYPE(op);
opValue = URX_VAL(op);
#ifdef REGEX_RUN_DEBUG
if (fTraceDebug) {
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
fPattern->dumpOp(fp->fPatIdx);
}
#endif
fp->fPatIdx++;
switch (opType) {
case URX_NOP:
break;
case URX_BACKTRACK:
// Force a backtrack. In some circumstances, the pattern compiler
// will notice that the pattern can't possibly match anything, and will
// emit one of these at that point.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
case URX_ONECHAR:
if (fp->fInputIdx < fActiveLimit) {
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (c == opValue) {
break;
}
} else {
fHitEnd = TRUE;
}
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
case URX_STRING:
{
// Test input against a literal string.
// Strings require two slots in the compiled pattern, one for the
// offset to the string text, and one for the length.
int32_t stringStartIdx = opValue;
int32_t stringLen;
op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
fp->fPatIdx++;
opType = URX_TYPE(op);
stringLen = URX_VAL(op);
U_ASSERT(opType == URX_STRING_LEN);
U_ASSERT(stringLen >= 2);
const UChar * pInp = inputBuf + fp->fInputIdx;
const UChar * pInpLimit = inputBuf + fActiveLimit;
const UChar * pPat = litText+stringStartIdx;
const UChar * pEnd = pInp + stringLen;
UBool success = TRUE;
while (pInp < pEnd) {
if (pInp >= pInpLimit) {
fHitEnd = TRUE;
success = FALSE;
break;
}
if (*pInp++ != *pPat++) {
success = FALSE;
break;
}
}
if (success) {
fp->fInputIdx += stringLen;
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_STATE_SAVE:
fp = StateSave(fp, opValue, status);
break;
case URX_END:
// The match loop will exit via this path on a successful match,
// when we reach the end of the pattern.
if (toEnd && fp->fInputIdx != fActiveLimit) {
// The pattern matched, but not to the end of input. Try some more.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
isMatch = TRUE;
goto breakFromLoop;
// Start and End Capture stack frame variables are laid out out like this:
// fp->fExtra[opValue] - The start of a completed capture group
// opValue+1 - The end of a completed capture group
// opValue+2 - the start of a capture group whose end
// has not yet been reached (and might not ever be).
case URX_START_CAPTURE:
U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
fp->fExtra[opValue+2] = fp->fInputIdx;
break;
case URX_END_CAPTURE:
U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
fp->fExtra[opValue+1] = fp->fInputIdx; // End position
U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
break;
case URX_DOLLAR: // $, test for End of line
// or for position before new line at end of input
if (fp->fInputIdx < fAnchorLimit-2) {
// We are no where near the end of input. Fail.
// This is the common case. Keep it first.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
if (fp->fInputIdx >= fAnchorLimit) {
// We really are at the end of input. Success.
fHitEnd = TRUE;
fRequireEnd = TRUE;
break;
}
// If we are positioned just before a new-line that is located at the
// end of input, succeed.
if (fp->fInputIdx == fAnchorLimit-1) {
UChar32 c;
U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
if (isLineTerminator(c)) {
if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
// At new-line at end of input. Success
fHitEnd = TRUE;
fRequireEnd = TRUE;
break;
}
}
} else if (fp->fInputIdx == fAnchorLimit-2 &&
inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
fHitEnd = TRUE;
fRequireEnd = TRUE;
break; // At CR/LF at end of input. Success
}
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
if (fp->fInputIdx >= fAnchorLimit-1) {
// Either at the last character of input, or off the end.
if (fp->fInputIdx == fAnchorLimit-1) {
// At last char of input. Success if it's a new line.
if (inputBuf[fp->fInputIdx] == 0x0a) {
fHitEnd = TRUE;
fRequireEnd = TRUE;
break;
}
} else {
// Off the end of input. Success.
fHitEnd = TRUE;
fRequireEnd = TRUE;
break;
}
}
// Not at end of input. Back-track out.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
case URX_DOLLAR_M: // $, test for End of line in multi-line mode
{
if (fp->fInputIdx >= fAnchorLimit) {
// We really are at the end of input. Success.
fHitEnd = TRUE;
fRequireEnd = TRUE;
break;
}
// If we are positioned just before a new-line, succeed.
// It makes no difference where the new-line is within the input.
UChar32 c = inputBuf[fp->fInputIdx];
if (isLineTerminator(c)) {
// At a line end, except for the odd chance of being in the middle of a CR/LF sequence
// In multi-line mode, hitting a new-line just before the end of input does not
// set the hitEnd or requireEnd flags
if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
break;
}
}
// not at a new line. Fail.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
{
if (fp->fInputIdx >= fAnchorLimit) {
// We really are at the end of input. Success.
fHitEnd = TRUE;
fRequireEnd = TRUE; // Java set requireEnd in this case, even though
break; // adding a new-line would not lose the match.
}
// If we are not positioned just before a new-line, the test fails; backtrack out.
// It makes no difference where the new-line is within the input.
if (inputBuf[fp->fInputIdx] != 0x0a) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_CARET: // ^, test for start of line
if (fp->fInputIdx != fAnchorStart) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_CARET_M: // ^, test for start of line in mulit-line mode
{
if (fp->fInputIdx == fAnchorStart) {
// We are at the start input. Success.
break;
}
// Check whether character just before the current pos is a new-line
// unless we are at the end of input
UChar c = inputBuf[fp->fInputIdx - 1];
if ((fp->fInputIdx < fAnchorLimit) &&
isLineTerminator(c)) {
// It's a new-line. ^ is true. Success.
// TODO: what should be done with positions between a CR and LF?
break;
}
// Not at the start of a line. Fail.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
{
U_ASSERT(fp->fInputIdx >= fAnchorStart);
if (fp->fInputIdx <= fAnchorStart) {
// We are at the start input. Success.
break;
}
// Check whether character just before the current pos is a new-line
U_ASSERT(fp->fInputIdx <= fAnchorLimit);
UChar c = inputBuf[fp->fInputIdx - 1];
if (c != 0x0a) {
// Not at the start of a line. Back-track out.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_B: // Test for word boundaries
{
UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
success ^= (UBool)(opValue != 0); // flip sense for \B
if (!success) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
{
UBool success = isUWordBoundary(fp->fInputIdx, status);
success ^= (UBool)(opValue != 0); // flip sense for \B
if (!success) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_D: // Test for decimal digit
{
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
success ^= (UBool)(opValue != 0); // flip sense for \D
if (!success) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_G: // Test for position at end of previous match
if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_BACKSLASH_H: // Test for \h, horizontal white space.
{
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
int8_t ctype = u_charType(c);
UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
success ^= (UBool)(opValue != 0); // flip sense for \H
if (!success) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_R: // Test for \R, any line break sequence.
{
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (isLineTerminator(c)) {
if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
// Check for CR/LF sequence. Consume both together when found.
UChar c2;
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
if (c2 != 0x0a) {
U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
}
}
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_V: // Any single code point line ending.
{
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
UBool success = isLineTerminator(c);
success ^= (UBool)(opValue != 0); // flip sense for \V
if (!success) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_X:
// Match a Grapheme, as defined by Unicode UAX 29.
// Fail if at end of input
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp->fInputIdx = fActiveLimit;
}
break;
case URX_BACKSLASH_Z: // Test for end of Input
if (fp->fInputIdx < fAnchorLimit) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
} else {
fHitEnd = TRUE;
fRequireEnd = TRUE;
}
break;
case URX_STATIC_SETREF:
{
// Test input character against one of the predefined sets
// (Word Characters, for example)
// The high bit of the op value is a flag for the match polarity.
// 0: success if input char is in set.
// 1: success if input char is not in set.
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
opValue &= ~URX_NEG_SET;
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (c < 256) {
Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
if (s8.contains(c)) {
success = !success;
}
} else {
const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
if (s.contains(c)) {
success = !success;
}
}
if (!success) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_STAT_SETREF_N:
{
// Test input character for NOT being a member of one of
// the predefined sets (Word Characters, for example)
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (c < 256) {
Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
if (s8.contains(c) == FALSE) {
break;
}
} else {
const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
if (s.contains(c) == FALSE) {
break;
}
}
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_SETREF:
{
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
U_ASSERT(opValue > 0 && opValue < fSets->size());
// There is input left. Pick up one char and test it for set membership.
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (c<256) {
Regex8BitSet *s8 = &fPattern->fSets8[opValue];
if (s8->contains(c)) {
// The character is in the set. A Match.
break;
}
} else {
UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
if (s->contains(c)) {
// The character is in the set. A Match.
break;
}
}
// the character wasn't in the set.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_DOTANY:
{
// . matches anything, but stops at end-of-line.
if (fp->fInputIdx >= fActiveLimit) {
// At end of input. Match failed. Backtrack out.
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
// There is input left. Advance over one char, unless we've hit end-of-line
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (isLineTerminator(c)) {
// End of line in normal mode. . does not match.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
}
break;
case URX_DOTANY_ALL:
{
// . in dot-matches-all (including new lines) mode
if (fp->fInputIdx >= fActiveLimit) {
// At end of input. Match failed. Backtrack out.
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
// There is input left. Advance over one char, except if we are
// at a cr/lf, advance over both of them.
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (c==0x0d && fp->fInputIdx < fActiveLimit) {
// In the case of a CR/LF, we need to advance over both.
if (inputBuf[fp->fInputIdx] == 0x0a) {
U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
}
}
}
break;
case URX_DOTANY_UNIX:
{
// '.' operator, matches all, but stops at end-of-line.
// UNIX_LINES mode, so 0x0a is the only recognized line ending.
if (fp->fInputIdx >= fActiveLimit) {
// At end of input. Match failed. Backtrack out.
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
// There is input left. Advance over one char, unless we've hit end-of-line
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (c == 0x0a) {
// End of line in normal mode. '.' does not match the \n
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_JMP:
fp->fPatIdx = opValue;
break;
case URX_FAIL:
isMatch = FALSE;
goto breakFromLoop;
case URX_JMP_SAV:
U_ASSERT(opValue < fPattern->fCompiledPat->size());
fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
fp->fPatIdx = opValue; // Then JMP.
break;
case URX_JMP_SAV_X:
// This opcode is used with (x)+, when x can match a zero length string.
// Same as JMP_SAV, except conditional on the match having made forward progress.
// Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
// data address of the input position at the start of the loop.
{
U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
int32_t stoOp = (int32_t)pat[opValue-1];
U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
int32_t frameLoc = URX_VAL(stoOp);
U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
U_ASSERT(prevInputIdx <= fp->fInputIdx);
if (prevInputIdx < fp->fInputIdx) {
// The match did make progress. Repeat the loop.
fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
fp->fPatIdx = opValue;
fp->fExtra[frameLoc] = fp->fInputIdx;
}
// If the input position did not advance, we do nothing here,
// execution will fall out of the loop.
}
break;
case URX_CTR_INIT:
{
U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
// Pick up the three extra operands that CTR_INIT has, and
// skip the pattern location counter past
int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
fp->fPatIdx += 3;
int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
int32_t minCount = (int32_t)pat[instrOperandLoc+1];
int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
U_ASSERT(minCount>=0);
U_ASSERT(maxCount>=minCount || maxCount==-1);
U_ASSERT(loopLoc>=fp->fPatIdx);
if (minCount == 0) {
fp = StateSave(fp, loopLoc+1, status);
}
if (maxCount == -1) {
fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
} else if (maxCount == 0) {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_CTR_LOOP:
{
U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
int32_t initOp = (int32_t)pat[opValue];
U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
int32_t minCount = (int32_t)pat[opValue+2];
int32_t maxCount = (int32_t)pat[opValue+3];
(*pCounter)++;
if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
U_ASSERT(*pCounter == maxCount);
break;
}
if (*pCounter >= minCount) {
if (maxCount == -1) {
// Loop has no hard upper bound.
// Check that it is progressing through the input, break if it is not.
int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
if (fp->fInputIdx == *pLastInputIdx) {
break;
} else {
*pLastInputIdx = fp->fInputIdx;
}
}
fp = StateSave(fp, fp->fPatIdx, status);
} else {
// Increment time-out counter. (StateSave() does it if count >= minCount)
fTickCounter--;
if (fTickCounter <= 0) {
IncrementTime(status); // Re-initializes fTickCounter
}
}
fp->fPatIdx = opValue + 4; // Loop back.
}
break;
case URX_CTR_INIT_NG:
{
// Initialize a non-greedy loop
U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
// Pick up the three extra operands that CTR_INIT_NG has, and
// skip the pattern location counter past
int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
fp->fPatIdx += 3;
int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
int32_t minCount = (int32_t)pat[instrOperandLoc+1];
int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
U_ASSERT(minCount>=0);
U_ASSERT(maxCount>=minCount || maxCount==-1);
U_ASSERT(loopLoc>fp->fPatIdx);
if (maxCount == -1) {
fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
}
if (minCount == 0) {
if (maxCount != 0) {
fp = StateSave(fp, fp->fPatIdx, status);
}
fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
}
}
break;
case URX_CTR_LOOP_NG:
{
// Non-greedy {min, max} loops
U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
int32_t initOp = (int32_t)pat[opValue];
U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
int32_t minCount = (int32_t)pat[opValue+2];
int32_t maxCount = (int32_t)pat[opValue+3];
(*pCounter)++;
if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
// The loop has matched the maximum permitted number of times.
// Break out of here with no action. Matching will
// continue with the following pattern.
U_ASSERT(*pCounter == maxCount);
break;
}
if (*pCounter < minCount) {
// We haven't met the minimum number of matches yet.
// Loop back for another one.
fp->fPatIdx = opValue + 4; // Loop back.
fTickCounter--;
if (fTickCounter <= 0) {
IncrementTime(status); // Re-initializes fTickCounter
}
} else {
// We do have the minimum number of matches.
// If there is no upper bound on the loop iterations, check that the input index
// is progressing, and stop the loop if it is not.
if (maxCount == -1) {
int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
if (fp->fInputIdx == *pLastInputIdx) {
break;
}
*pLastInputIdx = fp->fInputIdx;
}
// Loop Continuation: we will fall into the pattern following the loop
// (non-greedy, don't execute loop body first), but first do
// a state save to the top of the loop, so that a match failure
// in the following pattern will try another iteration of the loop.
fp = StateSave(fp, opValue + 4, status);
}
}
break;
case URX_STO_SP:
U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
fData[opValue] = fStack->size();
break;
case URX_LD_SP:
{
U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
int32_t newStackSize = (int32_t)fData[opValue];
U_ASSERT(newStackSize <= fStack->size());
int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
if (newFP == (int64_t *)fp) {
break;
}
int32_t j;
for (j=0; j<fFrameSize; j++) {
newFP[j] = ((int64_t *)fp)[j];
}
fp = (REStackFrame *)newFP;
fStack->setSize(newStackSize);
}
break;
case URX_BACKREF:
{
U_ASSERT(opValue < fFrameSize);
int64_t groupStartIdx = fp->fExtra[opValue];
int64_t groupEndIdx = fp->fExtra[opValue+1];
U_ASSERT(groupStartIdx <= groupEndIdx);
int64_t inputIndex = fp->fInputIdx;
if (groupStartIdx < 0) {
// This capture group has not participated in the match thus far,
fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
break;
}
UBool success = TRUE;
for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
if (inputIndex >= fActiveLimit) {
success = FALSE;
fHitEnd = TRUE;
break;
}
if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
success = FALSE;
break;
}
}
if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
// Capture group ended with an unpaired lead surrogate.
// Back reference is not permitted to match lead only of a surrogatge pair.
success = FALSE;
}
if (success) {
fp->fInputIdx = inputIndex;
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKREF_I:
{
U_ASSERT(opValue < fFrameSize);
int64_t groupStartIdx = fp->fExtra[opValue];
int64_t groupEndIdx = fp->fExtra[opValue+1];
U_ASSERT(groupStartIdx <= groupEndIdx);
if (groupStartIdx < 0) {
// This capture group has not participated in the match thus far,
fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
break;
}
CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
// Note: if the capture group match was of an empty string the backref
// match succeeds. Verified by testing: Perl matches succeed
// in this case, so we do too.
UBool success = TRUE;
for (;;) {
UChar32 captureGroupChar = captureGroupItr.next();
if (captureGroupChar == U_SENTINEL) {
success = TRUE;
break;
}
UChar32 inputChar = inputItr.next();
if (inputChar == U_SENTINEL) {
success = FALSE;
fHitEnd = TRUE;
break;
}
if (inputChar != captureGroupChar) {
success = FALSE;
break;
}
}
if (success && inputItr.inExpansion()) {
// We otained a match by consuming part of a string obtained from
// case-folding a single code point of the input text.
// This does not count as an overall match.
success = FALSE;
}
if (success) {
fp->fInputIdx = inputItr.getIndex();
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_STO_INP_LOC:
{
U_ASSERT(opValue >= 0 && opValue < fFrameSize);
fp->fExtra[opValue] = fp->fInputIdx;
}
break;
case URX_JMPX:
{
int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
fp->fPatIdx += 1;
int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
U_ASSERT(savedInputIdx <= fp->fInputIdx);
if (savedInputIdx < fp->fInputIdx) {
fp->fPatIdx = opValue; // JMP
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
}
}
break;
case URX_LA_START:
{
// Entering a look around block.
// Save Stack Ptr, Input Pos.
U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
fData[opValue] = fStack->size();
fData[opValue+1] = fp->fInputIdx;
fData[opValue+2] = fActiveStart;
fData[opValue+3] = fActiveLimit;
fActiveStart = fLookStart; // Set the match region change for
fActiveLimit = fLookLimit; // transparent bounds.
}
break;
case URX_LA_END:
{
// Leaving a look around block.
// restore Stack Ptr, Input Pos to positions they had on entry to block.
U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
int32_t stackSize = fStack->size();
int32_t newStackSize = (int32_t)fData[opValue];
U_ASSERT(stackSize >= newStackSize);
if (stackSize > newStackSize) {
// Copy the current top frame back to the new (cut back) top frame.
// This makes the capture groups from within the look-ahead
// expression available.
int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
int32_t j;
for (j=0; j<fFrameSize; j++) {
newFP[j] = ((int64_t *)fp)[j];
}
fp = (REStackFrame *)newFP;
fStack->setSize(newStackSize);
}
fp->fInputIdx = fData[opValue+1];
// Restore the active region bounds in the input string; they may have
// been changed because of transparent bounds on a Region.
fActiveStart = fData[opValue+2];
fActiveLimit = fData[opValue+3];
U_ASSERT(fActiveStart >= 0);
U_ASSERT(fActiveLimit <= fInputLength);
}
break;
case URX_ONECHAR_I:
if (fp->fInputIdx < fActiveLimit) {
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
break;
}
} else {
fHitEnd = TRUE;
}
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
case URX_STRING_I:
// Case-insensitive test input against a literal string.
// Strings require two slots in the compiled pattern, one for the
// offset to the string text, and one for the length.
// The compiled string has already been case folded.
{
const UChar *patternString = litText + opValue;
op = (int32_t)pat[fp->fPatIdx];
fp->fPatIdx++;
opType = URX_TYPE(op);
opValue = URX_VAL(op);
U_ASSERT(opType == URX_STRING_LEN);
int32_t patternStringLen = opValue; // Length of the string from the pattern.
UChar32 cText;
UChar32 cPattern;
UBool success = TRUE;
int32_t patternStringIdx = 0;
CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
while (patternStringIdx < patternStringLen) {
U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
cText = inputIterator.next();
if (cText != cPattern) {
success = FALSE;
if (cText == U_SENTINEL) {
fHitEnd = TRUE;
}
break;
}
}
if (inputIterator.inExpansion()) {
success = FALSE;
}
if (success) {
fp->fInputIdx = inputIterator.getIndex();
} else {
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_LB_START:
{
// Entering a look-behind block.
// Save Stack Ptr, Input Pos and active input region.
// TODO: implement transparent bounds. Ticket #6067
U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
fData[opValue] = fStack->size();
fData[opValue+1] = fp->fInputIdx;
// Save input string length, then reset to pin any matches to end at
// the current position.
fData[opValue+2] = fActiveStart;
fData[opValue+3] = fActiveLimit;
fActiveStart = fRegionStart;
fActiveLimit = fp->fInputIdx;
// Init the variable containing the start index for attempted matches.
fData[opValue+4] = -1;
}
break;
case URX_LB_CONT:
{
// Positive Look-Behind, at top of loop checking for matches of LB expression
// at all possible input starting positions.
// Fetch the min and max possible match lengths. They are the operands
// of this op in the pattern.
int32_t minML = (int32_t)pat[fp->fPatIdx++];
int32_t maxML = (int32_t)pat[fp->fPatIdx++];
U_ASSERT(minML <= maxML);
U_ASSERT(minML >= 0);
// Fetch (from data) the last input index where a match was attempted.
U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
int64_t &lbStartIdx = fData[opValue+4];
if (lbStartIdx < 0) {
// First time through loop.
lbStartIdx = fp->fInputIdx - minML;
if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
U16_SET_CP_START(inputBuf, 0, lbStartIdx);
}
} else {
// 2nd through nth time through the loop.
// Back up start position for match by one.
if (lbStartIdx == 0) {
lbStartIdx--;
} else {
U16_BACK_1(inputBuf, 0, lbStartIdx);
}
}
if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
// We have tried all potential match starting points without
// getting a match. Backtrack out, and out of the
// Look Behind altogether.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
fActiveStart = fData[opValue+2];
fActiveLimit = fData[opValue+3];
U_ASSERT(fActiveStart >= 0);
U_ASSERT(fActiveLimit <= fInputLength);
break;
}
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
// (successful match will fall off the end of the loop.)
fp = StateSave(fp, fp->fPatIdx-3, status);
fp->fInputIdx = lbStartIdx;
}
break;
case URX_LB_END:
// End of a look-behind block, after a successful match.
{
U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
if (fp->fInputIdx != fActiveLimit) {
// The look-behind expression matched, but the match did not
// extend all the way to the point that we are looking behind from.
// FAIL out of here, which will take us back to the LB_CONT, which
// will retry the match starting at another position or fail
// the look-behind altogether, whichever is appropriate.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
// Look-behind match is good. Restore the orignal input string region,
// which had been truncated to pin the end of the lookbehind match to the
// position being looked-behind.
fActiveStart = fData[opValue+2];
fActiveLimit = fData[opValue+3];
U_ASSERT(fActiveStart >= 0);
U_ASSERT(fActiveLimit <= fInputLength);
}
break;
case URX_LBN_CONT:
{
// Negative Look-Behind, at top of loop checking for matches of LB expression
// at all possible input starting positions.
// Fetch the extra parameters of this op.
int32_t minML = (int32_t)pat[fp->fPatIdx++];
int32_t maxML = (int32_t)pat[fp->fPatIdx++];
int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
continueLoc = URX_VAL(continueLoc);
U_ASSERT(minML <= maxML);
U_ASSERT(minML >= 0);
U_ASSERT(continueLoc > fp->fPatIdx);
// Fetch (from data) the last input index where a match was attempted.
U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
int64_t &lbStartIdx = fData[opValue+4];
if (lbStartIdx < 0) {
// First time through loop.
lbStartIdx = fp->fInputIdx - minML;
if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
U16_SET_CP_START(inputBuf, 0, lbStartIdx);
}
} else {
// 2nd through nth time through the loop.
// Back up start position for match by one.
if (lbStartIdx == 0) {
lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
} else {
U16_BACK_1(inputBuf, 0, lbStartIdx);
}
}
if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
// We have tried all potential match starting points without
// getting a match, which means that the negative lookbehind as
// a whole has succeeded. Jump forward to the continue location
fActiveStart = fData[opValue+2];
fActiveLimit = fData[opValue+3];
U_ASSERT(fActiveStart >= 0);
U_ASSERT(fActiveLimit <= fInputLength);
fp->fPatIdx = continueLoc;
break;
}
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
// (successful match will cause a FAIL out of the loop altogether.)
fp = StateSave(fp, fp->fPatIdx-4, status);
fp->fInputIdx = lbStartIdx;
}
break;
case URX_LBN_END:
// End of a negative look-behind block, after a successful match.
{
U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
if (fp->fInputIdx != fActiveLimit) {
// The look-behind expression matched, but the match did not
// extend all the way to the point that we are looking behind from.
// FAIL out of here, which will take us back to the LB_CONT, which
// will retry the match starting at another position or succeed
// the look-behind altogether, whichever is appropriate.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
// Look-behind expression matched, which means look-behind test as
// a whole Fails
// Restore the orignal input string length, which had been truncated
// inorder to pin the end of the lookbehind match
// to the position being looked-behind.
fActiveStart = fData[opValue+2];
fActiveLimit = fData[opValue+3];
U_ASSERT(fActiveStart >= 0);
U_ASSERT(fActiveLimit <= fInputLength);
// Restore original stack position, discarding any state saved
// by the successful pattern match.
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
int32_t newStackSize = (int32_t)fData[opValue];
U_ASSERT(fStack->size() > newStackSize);
fStack->setSize(newStackSize);
// FAIL, which will take control back to someplace
// prior to entering the look-behind test.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
break;
case URX_LOOP_SR_I:
// Loop Initialization for the optimized implementation of
// [some character set]*
// This op scans through all matching input.
// The following LOOP_C op emulates stack unwinding if the following pattern fails.
{
U_ASSERT(opValue > 0 && opValue < fSets->size());
Regex8BitSet *s8 = &fPattern->fSets8[opValue];
UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
// Loop through input, until either the input is exhausted or
// we reach a character that is not a member of the set.
int32_t ix = (int32_t)fp->fInputIdx;
for (;;) {
if (ix >= fActiveLimit) {
fHitEnd = TRUE;
break;
}
UChar32 c;
U16_NEXT(inputBuf, ix, fActiveLimit, c);
if (c<256) {
if (s8->contains(c) == FALSE) {
U16_BACK_1(inputBuf, 0, ix);
break;
}
} else {
if (s->contains(c) == FALSE) {
U16_BACK_1(inputBuf, 0, ix);
break;
}
}
}
// If there were no matching characters, skip over the loop altogether.
// The loop doesn't run at all, a * op always succeeds.
if (ix == fp->fInputIdx) {
fp->fPatIdx++; // skip the URX_LOOP_C op.
break;
}
// Peek ahead in the compiled pattern, to the URX_LOOP_C that
// must follow. It's operand is the stack location
// that holds the starting input index for the match of this [set]*
int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
int32_t stackLoc = URX_VAL(loopcOp);
U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
fp->fExtra[stackLoc] = fp->fInputIdx;
fp->fInputIdx = ix;
// Save State to the URX_LOOP_C op that follows this one,
// so that match failures in the following code will return to there.
// Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
fp = StateSave(fp, fp->fPatIdx, status);
fp->fPatIdx++;
}
break;
case URX_LOOP_DOT_I:
// Loop Initialization for the optimized implementation of .*
// This op scans through all remaining input.
// The following LOOP_C op emulates stack unwinding if the following pattern fails.
{
// Loop through input until the input is exhausted (we reach an end-of-line)
// In DOTALL mode, we can just go straight to the end of the input.
int32_t ix;
if ((opValue & 1) == 1) {
// Dot-matches-All mode. Jump straight to the end of the string.
ix = (int32_t)fActiveLimit;
fHitEnd = TRUE;
} else {
// NOT DOT ALL mode. Line endings do not match '.'
// Scan forward until a line ending or end of input.
ix = (int32_t)fp->fInputIdx;
for (;;) {
if (ix >= fActiveLimit) {
fHitEnd = TRUE;
break;
}
UChar32 c;
U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
if ((c == 0x0a) || // 0x0a is newline in both modes.
(((opValue & 2) == 0) && // IF not UNIX_LINES mode
isLineTerminator(c))) {
// char is a line ending. Put the input pos back to the
// line ending char, and exit the scanning loop.
U16_BACK_1(inputBuf, 0, ix);
break;
}
}
}
}
// If there were no matching characters, skip over the loop altogether.
// The loop doesn't run at all, a * op always succeeds.
if (ix == fp->fInputIdx) {
fp->fPatIdx++; // skip the URX_LOOP_C op.
break;
}
// Peek ahead in the compiled pattern, to the URX_LOOP_C that
// must follow. It's operand is the stack location
// that holds the starting input index for the match of this .*
int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
int32_t stackLoc = URX_VAL(loopcOp);
U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
fp->fExtra[stackLoc] = fp->fInputIdx;
fp->fInputIdx = ix;
// Save State to the URX_LOOP_C op that follows this one,
// so that match failures in the following code will return to there.
// Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
fp = StateSave(fp, fp->fPatIdx, status);
fp->fPatIdx++;
}
break;
case URX_LOOP_C:
{
U_ASSERT(opValue>=0 && opValue<fFrameSize);
backSearchIndex = (int32_t)fp->fExtra[opValue];
U_ASSERT(backSearchIndex <= fp->fInputIdx);
if (backSearchIndex == fp->fInputIdx) {
// We've backed up the input idx to the point that the loop started.
// The loop is done. Leave here without saving state.
// Subsequent failures won't come back here.
break;
}
// Set up for the next iteration of the loop, with input index
// backed up by one from the last time through,
// and a state save to this instruction in case the following code fails again.
// (We're going backwards because this loop emulates stack unwinding, not
// the initial scan forward.)
U_ASSERT(fp->fInputIdx > 0);
UChar32 prevC;
U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
if (prevC == 0x0a &&
fp->fInputIdx > backSearchIndex &&
inputBuf[fp->fInputIdx-1] == 0x0d) {
int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
// .*, stepping back over CRLF pair.
U16_BACK_1(inputBuf, 0, fp->fInputIdx);
}
}
fp = StateSave(fp, fp->fPatIdx-1, status);
}
break;
default:
// Trouble. The compiled pattern contains an entry with an
// unrecognized type tag.
UPRV_UNREACHABLE;
}
if (U_FAILURE(status)) {
isMatch = FALSE;
break;
}
}
breakFromLoop:
fMatch = isMatch;
if (isMatch) {
fLastMatchEnd = fMatchEnd;
fMatchStart = startIdx;
fMatchEnd = fp->fInputIdx;
}
#ifdef REGEX_RUN_DEBUG
if (fTraceDebug) {
if (isMatch) {
printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
} else {
printf("No match\n\n");
}
}
#endif
fFrame = fp; // The active stack frame when the engine stopped.
// Contains the capture group results that we need to
// access later.
return;
}
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
U_NAMESPACE_END
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS