ICU-10463 Regular Expressions, rework debug conditionals to fix build failures on clang, and to somewhat simplify.
X-SVN-Rev: 34565
This commit is contained in:
parent
f1df548fc4
commit
10dd7ed47b
@ -109,7 +109,7 @@ void RegexCompile::compile(
|
||||
fRXPat->fPatternString = new UnicodeString(pat);
|
||||
UText patternText = UTEXT_INITIALIZER;
|
||||
utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
|
||||
|
||||
|
||||
if (U_SUCCESS(e)) {
|
||||
compile(&patternText, pp, e);
|
||||
utext_close(&patternText);
|
||||
@ -568,13 +568,13 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
|
||||
op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
|
||||
|
||||
op = URX_BUILD(URX_LA_END, dataLoc);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
|
||||
op = URX_BUILD(URX_BACKTRACK, 0);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
|
||||
|
||||
op = URX_BUILD(URX_NOP, 0);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
@ -1147,7 +1147,7 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
} else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
|
||||
op = URX_CARET_M;
|
||||
} else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
|
||||
op = URX_CARET; // Only testing true start of input.
|
||||
op = URX_CARET; // Only testing true start of input.
|
||||
} else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
|
||||
op = URX_CARET_M_UNIX;
|
||||
}
|
||||
@ -1281,7 +1281,7 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
literalChar(c);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case doBackRef:
|
||||
// BackReference. Somewhat unusual in that the front-end can not completely parse
|
||||
@ -1643,7 +1643,7 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
compileSet(theSet);
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
case doSetIntersection2:
|
||||
// Have scanned something like [abc&&
|
||||
setPushOp(setIntersection2);
|
||||
@ -1654,7 +1654,7 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
// This operation is the highest precedence set operation, so we can always do
|
||||
// it immediately, without waiting to see what follows. It is necessary to perform
|
||||
// any pending '-' or '&' operation first, because these have the same precedence
|
||||
// as union-ing in a literal'
|
||||
// as union-ing in a literal'
|
||||
{
|
||||
setEval(setUnion);
|
||||
UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
|
||||
@ -1749,7 +1749,7 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
} // else error. scanProp() reported the error status already.
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case doSetProp:
|
||||
// Scanned a \p \P within [brackets].
|
||||
{
|
||||
@ -1771,7 +1771,7 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
// and ICU UnicodeSet behavior.
|
||||
{
|
||||
if (fLastSetLiteral > fC.fChar) {
|
||||
error(U_REGEX_INVALID_RANGE);
|
||||
error(U_REGEX_INVALID_RANGE);
|
||||
}
|
||||
UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
|
||||
s->add(fLastSetLiteral, fC.fChar);
|
||||
@ -1830,7 +1830,7 @@ void RegexCompile::fixLiterals(UBool split) {
|
||||
int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
|
||||
UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
|
||||
|
||||
// Split: We need to ensure that the last item in the compiled pattern
|
||||
// Split: We need to ensure that the last item in the compiled pattern
|
||||
// refers only to the last literal scanned in the pattern, so that
|
||||
// quantifiers (*, +, etc.) affect only it, and not a longer string.
|
||||
// Split before case folding for case insensitive matches.
|
||||
@ -1856,7 +1856,7 @@ void RegexCompile::fixLiterals(UBool split) {
|
||||
|
||||
if (indexOfLastCodePoint == 0) {
|
||||
// Single character, emit a URX_ONECHAR op to match it.
|
||||
if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
|
||||
if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
|
||||
u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
|
||||
op = URX_BUILD(URX_ONECHAR_I, lastCodePoint);
|
||||
} else {
|
||||
@ -1875,7 +1875,7 @@ void RegexCompile::fixLiterals(UBool split) {
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length());
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
|
||||
|
||||
// Add this string into the accumulated strings of the compiled pattern.
|
||||
fRXPat->fLiteralText.append(fLiteralChars);
|
||||
}
|
||||
@ -2449,7 +2449,7 @@ void RegexCompile::matchStartType() {
|
||||
case URX_STO_INP_LOC:
|
||||
case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
|
||||
case URX_BACKREF_I:
|
||||
|
||||
|
||||
case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
|
||||
case URX_LD_SP:
|
||||
break;
|
||||
@ -2762,7 +2762,7 @@ void RegexCompile::matchStartType() {
|
||||
{
|
||||
// Look-around. Scan forward until the matching look-ahead end,
|
||||
// without processing the look-around block. This is overly pessimistic.
|
||||
|
||||
|
||||
// Keep track of the nesting depth of look-around blocks. Boilerplate code for
|
||||
// lookahead contains two LA_END instructions, so count goes up by two
|
||||
// for each LA_START.
|
||||
@ -3322,7 +3322,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
||||
// compiled (folded) string. Folding may add code points, but
|
||||
// not remove them.
|
||||
//
|
||||
// There is a potential problem if a supplemental code point
|
||||
// There is a potential problem if a supplemental code point
|
||||
// case-folds to a BMP code point. In this case our compiled string
|
||||
// could be shorter (in code units) than a matching user string.
|
||||
//
|
||||
@ -3353,7 +3353,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
||||
loc = loopEndLoc;
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3);
|
||||
if (maxLoopCount == -1) {
|
||||
// Unbounded Loop. No upper bound on match length.
|
||||
@ -3471,7 +3471,7 @@ void RegexCompile::stripNOPs() {
|
||||
d++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
UnicodeString caseStringBuffer;
|
||||
|
||||
// Make a second pass over the code, removing the NOPs by moving following
|
||||
@ -3518,7 +3518,7 @@ void RegexCompile::stripNOPs() {
|
||||
op = URX_BUILD(opType, where);
|
||||
fRXPat->fCompiledPat->setElementAt(op, dst);
|
||||
dst++;
|
||||
|
||||
|
||||
fRXPat->fNeedsAltInput = TRUE;
|
||||
break;
|
||||
}
|
||||
@ -3609,7 +3609,7 @@ void RegexCompile::error(UErrorCode e) {
|
||||
fParseErr->line = (int32_t)fLineNum;
|
||||
fParseErr->offset = (int32_t)fCharNum;
|
||||
}
|
||||
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
|
||||
|
||||
// Fill in the context.
|
||||
@ -3663,7 +3663,7 @@ UChar32 RegexCompile::nextCharLL() {
|
||||
fPeekChar = -1;
|
||||
return ch;
|
||||
}
|
||||
|
||||
|
||||
// assume we're already in the right place
|
||||
ch = UTEXT_NEXT32(fRXPat->fPattern);
|
||||
if (ch == U_SENTINEL) {
|
||||
@ -3719,7 +3719,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
||||
|
||||
if (fQuoteMode) {
|
||||
c.fQuoted = TRUE;
|
||||
if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
|
||||
if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
|
||||
c.fChar == (UChar32)-1) {
|
||||
fQuoteMode = FALSE; // Exit quote mode,
|
||||
nextCharLL(); // discard the E
|
||||
@ -3780,11 +3780,11 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
||||
//
|
||||
nextCharLL(); // get & discard the peeked char.
|
||||
c.fQuoted = TRUE;
|
||||
|
||||
|
||||
if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
|
||||
int32_t endIndex = (int32_t)pos;
|
||||
c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
|
||||
|
||||
|
||||
if (endIndex == pos) {
|
||||
error(U_REGEX_BAD_ESCAPE_SEQUENCE);
|
||||
}
|
||||
@ -3793,7 +3793,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
||||
} else {
|
||||
int32_t offset = 0;
|
||||
struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
|
||||
|
||||
|
||||
UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
|
||||
c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
|
||||
|
||||
@ -3836,8 +3836,8 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
||||
c.fChar >>= 3;
|
||||
}
|
||||
}
|
||||
c.fQuoted = TRUE;
|
||||
}
|
||||
c.fQuoted = TRUE;
|
||||
}
|
||||
else if (peekCharLL() == chQ) {
|
||||
// "\Q" enter quote mode, which will continue until "\E"
|
||||
fQuoteMode = TRUE;
|
||||
@ -3885,7 +3885,7 @@ UChar32 RegexCompile::scanNamedChar() {
|
||||
error(U_REGEX_PROPERTY_SYNTAX);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
UnicodeString charName;
|
||||
for (;;) {
|
||||
nextChar(fC);
|
||||
@ -3898,7 +3898,7 @@ UChar32 RegexCompile::scanNamedChar() {
|
||||
}
|
||||
charName.append(fC.fChar);
|
||||
}
|
||||
|
||||
|
||||
char name[100];
|
||||
if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
|
||||
(uint32_t)charName.length()>=sizeof(name)) {
|
||||
@ -4006,7 +4006,7 @@ UnicodeSet *RegexCompile::scanPosixProp() {
|
||||
|
||||
// Scan for a closing ]. A little tricky because there are some perverse
|
||||
// edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
|
||||
// ending on the second closing ].
|
||||
// ending on the second closing ].
|
||||
|
||||
UnicodeString propName;
|
||||
UBool negated = FALSE;
|
||||
@ -4017,7 +4017,7 @@ UnicodeSet *RegexCompile::scanPosixProp() {
|
||||
negated = TRUE;
|
||||
nextChar(fC);
|
||||
}
|
||||
|
||||
|
||||
// Scan for the closing ":]", collecting the property name along the way.
|
||||
UBool sawPropSetTerminator = FALSE;
|
||||
for (;;) {
|
||||
@ -4035,7 +4035,7 @@ UnicodeSet *RegexCompile::scanPosixProp() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (sawPropSetTerminator) {
|
||||
uset = createSetForProperty(propName, negated);
|
||||
}
|
||||
@ -4068,7 +4068,7 @@ static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
|
||||
// Create a Unicode Set from a Unicode Property expression.
|
||||
// This is common code underlying both \p{...} and [:...:] expressions.
|
||||
// Includes trying the Java "properties" that aren't supported as
|
||||
// normal ICU UnicodeSet properties
|
||||
// normal ICU UnicodeSet properties
|
||||
//
|
||||
static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
|
||||
static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
|
||||
@ -4076,7 +4076,7 @@ UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB
|
||||
UnicodeString setExpr;
|
||||
UnicodeSet *set;
|
||||
uint32_t usetFlags = 0;
|
||||
|
||||
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return NULL;
|
||||
}
|
||||
@ -4101,13 +4101,13 @@ UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB
|
||||
}
|
||||
delete set;
|
||||
set = NULL;
|
||||
|
||||
|
||||
//
|
||||
// The property as it was didn't work.
|
||||
|
||||
// Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX
|
||||
// Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX
|
||||
// or standard Java, but many other regular expression packages do recognize it.
|
||||
|
||||
|
||||
if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
|
||||
*fStatus = U_ZERO_ERROR;
|
||||
set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
|
||||
@ -4127,7 +4127,7 @@ UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB
|
||||
// InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
|
||||
//
|
||||
// Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
|
||||
// is accepted by Java. The property part of the name is compared
|
||||
// is accepted by Java. The property part of the name is compared
|
||||
// case-insensitively. The spaces must be exactly as shown, either
|
||||
// all there, or all omitted, with exactly one at each position
|
||||
// if they are present. From checking against JDK 1.6
|
||||
@ -4146,7 +4146,7 @@ UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB
|
||||
else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
|
||||
mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
|
||||
}
|
||||
|
||||
|
||||
// See if the property looks like a Java "InBlockName", which
|
||||
// we will recast as "Block=BlockName"
|
||||
//
|
||||
@ -4270,7 +4270,7 @@ UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB
|
||||
set = NULL;
|
||||
}
|
||||
error(*fStatus);
|
||||
return NULL;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
//
|
||||
// Copyright (C) 2002-2012 International Business Machines Corporation
|
||||
// Copyright (C) 2002-2013 International Business Machines Corporation
|
||||
// and others. All rights reserved.
|
||||
//
|
||||
// file: regeximp.h
|
||||
@ -22,11 +22,11 @@
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// For debugging, define REGEX_DEBUG
|
||||
// For debugging, define REGEX_DEBUG
|
||||
// To define with configure,
|
||||
// ./runConfigureICU --enable-debug --disable-release Linux CPPFLAGS="-DREGEX_DEBUG"
|
||||
// CPPFLAGS="-DREGEX_DEBUG" ./runConfigureICU --enable-debug --disable-release Linux
|
||||
|
||||
#ifdef REGEX_DEBUG
|
||||
#ifdef REGEX_DEBUG
|
||||
//
|
||||
// debugging options. Enable one or more of the three #defines immediately following
|
||||
//
|
||||
@ -46,19 +46,6 @@ U_NAMESPACE_BEGIN
|
||||
#define REGEX_SCAN_DEBUG_PRINTF(a)
|
||||
#endif
|
||||
|
||||
#ifdef REGEX_DUMP_DEBUG
|
||||
#define REGEX_DUMP_DEBUG_PRINTF(a) printf a
|
||||
#else
|
||||
#define REGEX_DUMP_DEBUG_PRINTF(a)
|
||||
#endif
|
||||
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
#define REGEX_RUN_DEBUG_PRINTF(a) printf a
|
||||
#define REGEX_DUMP_DEBUG_PRINTF(a) printf a
|
||||
#else
|
||||
#define REGEX_RUN_DEBUG_PRINTF(a)
|
||||
#endif
|
||||
|
||||
|
||||
//
|
||||
// Opcode types In the compiled form of the regexp, these are the type, or opcodes,
|
||||
@ -373,9 +360,9 @@ class CaseFoldingUTextIterator: public UMemory {
|
||||
CaseFoldingUTextIterator(UText &text);
|
||||
~CaseFoldingUTextIterator();
|
||||
|
||||
UChar32 next(); // Next case folded character
|
||||
UChar32 next(); // Next case folded character
|
||||
|
||||
UBool inExpansion(); // True if last char returned from next() and the
|
||||
UBool inExpansion(); // True if last char returned from next() and the
|
||||
// next to be returned both originated from a string
|
||||
// folding of the same code point from the original UText.
|
||||
private:
|
||||
@ -398,9 +385,9 @@ class CaseFoldingUCharIterator: public UMemory {
|
||||
CaseFoldingUCharIterator(const UChar *chars, int64_t start, int64_t limit);
|
||||
~CaseFoldingUCharIterator();
|
||||
|
||||
UChar32 next(); // Next case folded character
|
||||
UChar32 next(); // Next case folded character
|
||||
|
||||
UBool inExpansion(); // True if last char returned from next() and the
|
||||
UBool inExpansion(); // True if last char returned from next() and the
|
||||
// next to be returned both originated from a string
|
||||
// folding of the same code point from the original UText.
|
||||
|
||||
|
@ -2720,7 +2720,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
||||
int32_t opType; // the opcode
|
||||
int32_t opValue; // and the operand value.
|
||||
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
if (fTraceDebug)
|
||||
{
|
||||
printf("MatchAt(startIdx=%ld)\n", startIdx);
|
||||
@ -2730,7 +2730,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
||||
if (c<32 || c>256) {
|
||||
c = '.';
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c", c));
|
||||
printf("%c", c);
|
||||
|
||||
c = UTEXT_NEXT32(fPattern->fPattern);
|
||||
}
|
||||
@ -2748,7 +2748,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
||||
printf("\n");
|
||||
printf("\n");
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
@ -2778,23 +2778,17 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
||||
// One iteration of the loop per pattern operation performed.
|
||||
//
|
||||
for (;;) {
|
||||
#if 0
|
||||
if (_heapchk() != _HEAPOK) {
|
||||
fprintf(stderr, "Heap Trouble\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
op = (int32_t)pat[fp->fPatIdx];
|
||||
opType = URX_TYPE(op);
|
||||
opValue = URX_VAL(op);
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
if (fTraceDebug) {
|
||||
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
||||
printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
|
||||
UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
|
||||
fPattern->dumpOp(fp->fPatIdx);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
fp->fPatIdx++;
|
||||
|
||||
switch (opType) {
|
||||
@ -4188,16 +4182,17 @@ breakFromLoop:
|
||||
fLastMatchEnd = fMatchEnd;
|
||||
fMatchStart = startIdx;
|
||||
fMatchEnd = fp->fInputIdx;
|
||||
if (fTraceDebug) {
|
||||
REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (fTraceDebug) {
|
||||
REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
|
||||
}
|
||||
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
if (fTraceDebug) {
|
||||
if (isMatch) {
|
||||
printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
|
||||
} else {
|
||||
printf("No match\n\n");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
fFrame = fp; // The active stack frame when the engine stopped.
|
||||
// Contains the capture group results that we need to
|
||||
@ -4228,8 +4223,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
||||
int32_t opValue; // and the operand value.
|
||||
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
if (fTraceDebug)
|
||||
{
|
||||
if (fTraceDebug) {
|
||||
printf("MatchAt(startIdx=%d)\n", startIdx);
|
||||
printf("Original Pattern: ");
|
||||
UChar32 c = utext_next32From(fPattern->fPattern, 0);
|
||||
@ -4237,7 +4231,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
||||
if (c<32 || c>256) {
|
||||
c = '.';
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c", c));
|
||||
printf("%c", c);
|
||||
|
||||
c = UTEXT_NEXT32(fPattern->fPattern);
|
||||
}
|
||||
@ -4287,12 +4281,6 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
||||
// One iteration of the loop per pattern operation performed.
|
||||
//
|
||||
for (;;) {
|
||||
#if 0
|
||||
if (_heapchk() != _HEAPOK) {
|
||||
fprintf(stderr, "Heap Trouble\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
op = (int32_t)pat[fp->fPatIdx];
|
||||
opType = URX_TYPE(op);
|
||||
opValue = URX_VAL(op);
|
||||
@ -5627,20 +5615,21 @@ breakFromLoop:
|
||||
fLastMatchEnd = fMatchEnd;
|
||||
fMatchStart = startIdx;
|
||||
fMatchEnd = fp->fInputIdx;
|
||||
if (fTraceDebug) {
|
||||
REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (fTraceDebug) {
|
||||
REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
if (fTraceDebug) {
|
||||
if (isMatch) {
|
||||
printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
|
||||
} else {
|
||||
printf("No match\n\n");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
fFrame = fp; // The active stack frame when the engine stopped.
|
||||
// Contains the capture group results that we need to
|
||||
// access later.
|
||||
// Contains the capture group results that we need to
|
||||
// access later.
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -3,7 +3,7 @@
|
||||
//
|
||||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 2002-2012 International Business Machines Corporation *
|
||||
* Copyright (C) 2002-2013 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
***************************************************************************
|
||||
*/
|
||||
@ -275,21 +275,21 @@ RegexPattern::compile(const UnicodeString ®ex,
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
|
||||
UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
|
||||
UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL;
|
||||
|
||||
|
||||
if ((flags & ~allFlags) != 0) {
|
||||
status = U_REGEX_INVALID_FLAG;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
if ((flags & UREGEX_CANON_EQ) != 0) {
|
||||
status = U_REGEX_UNIMPLEMENTED;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
RegexPattern *This = new RegexPattern;
|
||||
if (This == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
@ -301,15 +301,15 @@ RegexPattern::compile(const UnicodeString ®ex,
|
||||
return NULL;
|
||||
}
|
||||
This->fFlags = flags;
|
||||
|
||||
|
||||
RegexCompile compiler(This, status);
|
||||
compiler.compile(regex, pe, status);
|
||||
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
delete This;
|
||||
This = NULL;
|
||||
}
|
||||
|
||||
|
||||
return This;
|
||||
}
|
||||
|
||||
@ -355,7 +355,7 @@ RegexPattern::compile(UText *regex,
|
||||
|
||||
RegexCompile compiler(This, status);
|
||||
compiler.compile(regex, pe, status);
|
||||
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
delete This;
|
||||
This = NULL;
|
||||
@ -538,12 +538,12 @@ UnicodeString RegexPattern::pattern() const {
|
||||
int64_t nativeLen = utext_nativeLength(fPattern);
|
||||
int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error
|
||||
UnicodeString result;
|
||||
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
UChar *resultChars = result.getBuffer(len16);
|
||||
utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning
|
||||
result.releaseBuffer(len16);
|
||||
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
@ -622,8 +622,9 @@ int32_t RegexPattern::split(UText *input,
|
||||
// Debugging function only.
|
||||
//
|
||||
//---------------------------------------------------------------------
|
||||
#if defined(REGEX_DEBUG)
|
||||
void RegexPattern::dumpOp(int32_t index) const {
|
||||
(void)index; // Suppress warnings in non-debug build.
|
||||
#if defined(REGEX_DEBUG)
|
||||
static const char * const opNames[] = {URX_OPCODE_NAMES};
|
||||
int32_t op = fCompiledPat->elementAti(index);
|
||||
int32_t val = URX_VAL(op);
|
||||
@ -633,7 +634,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
pinnedType = 0;
|
||||
}
|
||||
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[pinnedType]));
|
||||
printf("%4d %08x %-15s ", index, op, opNames[pinnedType]);
|
||||
switch (type) {
|
||||
case URX_NOP:
|
||||
case URX_DOTANY:
|
||||
@ -682,12 +683,12 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
case URX_LOOP_C:
|
||||
case URX_LOOP_DOT_I:
|
||||
// types with an integer operand field.
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%d", val));
|
||||
printf("%d", val);
|
||||
break;
|
||||
|
||||
case URX_ONECHAR:
|
||||
case URX_ONECHAR_I:
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?'));
|
||||
printf("%c", val<256?val:'?');
|
||||
break;
|
||||
|
||||
case URX_STRING:
|
||||
@ -700,7 +701,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
for (i=val; i<val+length; i++) {
|
||||
UChar c = fLiteralText[i];
|
||||
if (c < 32 || c >= 256) {c = '.';}
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c", c));
|
||||
printf("%c", c);
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -712,7 +713,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
|
||||
set->toPattern(s, TRUE);
|
||||
for (int32_t i=0; i<s.length(); i++) {
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i)));
|
||||
printf("%c", s.charAt(i));
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -722,89 +723,89 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
{
|
||||
UnicodeString s;
|
||||
if (val & URX_NEG_SET) {
|
||||
REGEX_DUMP_DEBUG_PRINTF(("NOT "));
|
||||
printf("NOT ");
|
||||
val &= ~URX_NEG_SET;
|
||||
}
|
||||
UnicodeSet *set = fStaticSets[val];
|
||||
set->toPattern(s, TRUE);
|
||||
for (int32_t i=0; i<s.length(); i++) {
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i)));
|
||||
printf("%c", s.charAt(i));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
default:
|
||||
REGEX_DUMP_DEBUG_PRINTF(("??????"));
|
||||
printf("??????");
|
||||
break;
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF(("\n"));
|
||||
}
|
||||
printf("\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
#if defined(REGEX_DEBUG)
|
||||
U_CAPI void U_EXPORT2
|
||||
RegexPatternDump(const RegexPattern *This) {
|
||||
RegexPattern::dumpPattern() const {
|
||||
#if defined(REGEX_DEBUG)
|
||||
int index;
|
||||
int i;
|
||||
|
||||
REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: "));
|
||||
UChar32 c = utext_next32From(This->fPattern, 0);
|
||||
printf("Original Pattern: ");
|
||||
UChar32 c = utext_next32From(fPattern, 0);
|
||||
while (c != U_SENTINEL) {
|
||||
if (c<32 || c>256) {
|
||||
c = '.';
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c", c));
|
||||
|
||||
c = UTEXT_NEXT32(This->fPattern);
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF(("\n"));
|
||||
REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen));
|
||||
REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));
|
||||
if (This->fStartType == START_STRING) {
|
||||
REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \""));
|
||||
for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i])); // TODO: non-printables, surrogates.
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF(("\"\n"));
|
||||
printf("%c", c);
|
||||
|
||||
} else if (This->fStartType == START_SET) {
|
||||
int32_t numSetChars = This->fInitialChars->size();
|
||||
c = UTEXT_NEXT32(fPattern);
|
||||
}
|
||||
printf("\n");
|
||||
printf(" Min Match Length: %d\n", fMinMatchLen);
|
||||
printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType));
|
||||
if (fStartType == START_STRING) {
|
||||
printf(" Initial match string: \"");
|
||||
for (i=fInitialStringIdx; i<fInitialStringIdx+fInitialStringLen; i++) {
|
||||
printf("%c", fLiteralText[i]); // TODO: non-printables, surrogates.
|
||||
}
|
||||
printf("\"\n");
|
||||
|
||||
} else if (fStartType == START_SET) {
|
||||
int32_t numSetChars = fInitialChars->size();
|
||||
if (numSetChars > 20) {
|
||||
numSetChars = 20;
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : "));
|
||||
printf(" Match First Chars : ");
|
||||
for (i=0; i<numSetChars; i++) {
|
||||
UChar32 c = This->fInitialChars->charAt(i);
|
||||
UChar32 c = fInitialChars->charAt(i);
|
||||
if (0x20<c && c <0x7e) {
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c ", c));
|
||||
printf("%c ", c);
|
||||
} else {
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%#x ", c));
|
||||
printf("%#x ", c);
|
||||
}
|
||||
}
|
||||
if (numSetChars < This->fInitialChars->size()) {
|
||||
REGEX_DUMP_DEBUG_PRINTF((" ..."));
|
||||
if (numSetChars < fInitialChars->size()) {
|
||||
printf(" ...");
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF(("\n"));
|
||||
printf("\n");
|
||||
|
||||
} else if (This->fStartType == START_CHAR) {
|
||||
REGEX_DUMP_DEBUG_PRINTF((" First char of Match : "));
|
||||
if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) {
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar));
|
||||
} else if (fStartType == START_CHAR) {
|
||||
printf(" First char of Match : ");
|
||||
if (0x20 < fInitialChar && fInitialChar<0x7e) {
|
||||
printf("%c\n", fInitialChar);
|
||||
} else {
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar));
|
||||
printf("%#x\n", fInitialChar);
|
||||
}
|
||||
}
|
||||
|
||||
REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \
|
||||
"-------------------------------------------\n"));
|
||||
for (index = 0; index<This->fCompiledPat->size(); index++) {
|
||||
This->dumpOp(index);
|
||||
printf("\nIndex Binary Type Operand\n" \
|
||||
"-------------------------------------------\n");
|
||||
for (index = 0; index<fCompiledPat->size(); index++) {
|
||||
dumpOp(index);
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
|
||||
}
|
||||
printf("\n\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
@ -68,21 +68,6 @@ class UVector;
|
||||
class UVector32;
|
||||
class UVector64;
|
||||
|
||||
#ifndef U_HIDE_INTERNAL_API
|
||||
/**
|
||||
* RBBIPatternDump Debug function, displays the compiled form of a pattern.
|
||||
* @internal
|
||||
*/
|
||||
#ifdef REGEX_DEBUG
|
||||
U_INTERNAL void U_EXPORT2
|
||||
RegexPatternDump(const RegexPattern *pat);
|
||||
#else
|
||||
#undef RegexPatternDump
|
||||
#define RegexPatternDump(pat)
|
||||
#endif
|
||||
#endif /* U_HIDE_INTERNAL_API */
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Class <code>RegexPattern</code> represents a compiled regular expression. It includes
|
||||
@ -613,11 +598,17 @@ private:
|
||||
//
|
||||
void init(); // Common initialization, for use by constructors.
|
||||
void zap(); // Common cleanup
|
||||
#ifdef REGEX_DEBUG
|
||||
void dumpOp(int32_t index) const;
|
||||
friend void U_EXPORT2 RegexPatternDump(const RegexPattern *);
|
||||
#endif
|
||||
|
||||
void dumpOp(int32_t index) const;
|
||||
|
||||
public:
|
||||
#ifndef U_HIDE_INTERNAL_API
|
||||
/**
|
||||
* Dump a compiled pattern. Internal debug function.
|
||||
* @internal
|
||||
*/
|
||||
void dumpPattern() const;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user