ICU-10024 Regexp: fix bug where a look-behind pattern fails to compile.

X-SVN-Rev: 33835
This commit is contained in:
Andy Heninger 2013-06-18 20:38:08 +00:00
parent b64a61baba
commit b129b40280
3 changed files with 59 additions and 12 deletions

View File

@ -1,7 +1,7 @@
// //
// file: regexcmp.cpp // file: regexcmp.cpp
// //
// Copyright (C) 2002-2012 International Business Machines Corporation and others. // Copyright (C) 2002-2013 International Business Machines Corporation and others.
// All Rights Reserved. // All Rights Reserved.
// //
// This file contains the ICU regular expression compiler, which is responsible // This file contains the ICU regular expression compiler, which is responsible
@ -3335,14 +3335,46 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
case URX_CTR_INIT: case URX_CTR_INIT:
case URX_CTR_INIT_NG: case URX_CTR_INIT_NG:
// For Loops, recursively call this function on the pattern for the loop body,
// then multiply the result by the maximum loop count.
{
int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
if (loopEndLoc == loc+4) {
// Loop has an empty body. No effect on max match length.
// Continue processing with code after the loop end.
loc = loopEndLoc;
break;
}
int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3);
if (maxLoopCount == -1) {
// Unbounded Loop. No upper bound on match length.
currentLen = INT32_MAX;
break;
}
U_ASSERT(loopEndLoc >= loc+4);
int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
if (blockLen == INT32_MAX) {
currentLen = blockLen;
break;
}
currentLen += blockLen * maxLoopCount;
loc = loopEndLoc;
break;
}
case URX_CTR_LOOP: case URX_CTR_LOOP:
case URX_CTR_LOOP_NG: case URX_CTR_LOOP_NG:
// These opcodes will be skipped over by code for URX_CTR_INIT.
// We shouldn't encounter them here.
U_ASSERT(FALSE);
break;
case URX_LOOP_SR_I: case URX_LOOP_SR_I:
case URX_LOOP_DOT_I: case URX_LOOP_DOT_I:
case URX_LOOP_C: case URX_LOOP_C:
// For anything to do with loops, make the match length unbounded. // For anything to do with loops, make the match length unbounded.
// Note: INIT instructions are multi-word. Can ignore because
// INT32_MAX length will stop the per-instruction loop.
currentLen = INT32_MAX; currentLen = INT32_MAX;
break; break;

View File

@ -1,6 +1,6 @@
/* /*
************************************************************************** **************************************************************************
* Copyright (C) 2002-2012 International Business Machines Corporation * * Copyright (C) 2002-2013 International Business Machines Corporation *
* and others. All rights reserved. * * and others. All rights reserved. *
************************************************************************** **************************************************************************
*/ */
@ -2827,7 +2827,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
#ifdef REGEX_RUN_DEBUG #ifdef REGEX_RUN_DEBUG
if (fTraceDebug) { if (fTraceDebug) {
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
fPattern->dumpOp(fp->fPatIdx); fPattern->dumpOp(fp->fPatIdx);
} }
@ -3492,7 +3492,7 @@ GC_Done:
int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
U_ASSERT(minCount>=0); U_ASSERT(minCount>=0);
U_ASSERT(maxCount>=minCount || maxCount==-1); U_ASSERT(maxCount>=minCount || maxCount==-1);
U_ASSERT(loopLoc>fp->fPatIdx); U_ASSERT(loopLoc>=fp->fPatIdx);
if (minCount == 0) { if (minCount == 0) {
fp = StateSave(fp, loopLoc+1, status); fp = StateSave(fp, loopLoc+1, status);
@ -4211,7 +4211,7 @@ breakFromLoop:
fMatchStart = startIdx; fMatchStart = startIdx;
fMatchEnd = fp->fInputIdx; fMatchEnd = fp->fInputIdx;
if (fTraceDebug) { if (fTraceDebug) {
REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd));
} }
} }
else else
@ -4252,7 +4252,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
#ifdef REGEX_RUN_DEBUG #ifdef REGEX_RUN_DEBUG
if (fTraceDebug) if (fTraceDebug)
{ {
printf("MatchAt(startIdx=%ld)\n", startIdx); printf("MatchAt(startIdx=%d)\n", startIdx);
printf("Original Pattern: "); printf("Original Pattern: ");
UChar32 c = utext_next32From(fPattern->fPattern, 0); UChar32 c = utext_next32From(fPattern->fPattern, 0);
while (c != U_SENTINEL) { while (c != U_SENTINEL) {
@ -4321,7 +4321,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
#ifdef REGEX_RUN_DEBUG #ifdef REGEX_RUN_DEBUG
if (fTraceDebug) { if (fTraceDebug) {
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
fPattern->dumpOp(fp->fPatIdx); fPattern->dumpOp(fp->fPatIdx);
} }
@ -4951,7 +4951,7 @@ GC_Done:
int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
U_ASSERT(minCount>=0); U_ASSERT(minCount>=0);
U_ASSERT(maxCount>=minCount || maxCount==-1); U_ASSERT(maxCount>=minCount || maxCount==-1);
U_ASSERT(loopLoc>fp->fPatIdx); U_ASSERT(loopLoc>=fp->fPatIdx);
if (minCount == 0) { if (minCount == 0) {
fp = StateSave(fp, loopLoc+1, status); fp = StateSave(fp, loopLoc+1, status);
@ -5635,7 +5635,7 @@ breakFromLoop:
fMatchStart = startIdx; fMatchStart = startIdx;
fMatchEnd = fp->fInputIdx; fMatchEnd = fp->fInputIdx;
if (fTraceDebug) { if (fTraceDebug) {
REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd));
} }
} }
else else

View File

@ -1,4 +1,4 @@
# Copyright (c) 2001-2012 International Business Machines # Copyright (c) 2001-2013 International Business Machines
# Corporation and others. All Rights Reserved. # Corporation and others. All Rights Reserved.
# #
# file: # file:
@ -1146,6 +1146,21 @@
"(ab)?(?<=ab)cd|ef" i "<0><1>ab</1>cd</0>" "(ab)?(?<=ab)cd|ef" i "<0><1>ab</1>cd</0>"
# Bug 10024
# Incorrect (unbounded) longest match length with {1, 20} style quantifiers.
# Unbounded match is disallowed in look-behind expressions.
# Max match length is used to limit where to check for look-behind matches.
"(?<=a{1,5})bc" "aaaa<0>bc</0>def"
"(?<=(?:aa){3,20})bc" "aaaaaa<0>bc</0>def"
"(?<!abc {1,100}|def {1,100}|ghi {1,100})jkl" "def jkl"
"(?<!abc {1,100}|def {1,100}|ghi {1,100})jkl" "rst <0>jkl</0>"
"(?<=a{11})bc" "aaaaaaaaaaa<0>bc</0>"
"(?<=a{11})bc" "aaaaaaaaaabc"
"(?<=a{1,})bc" E "aaaa<0>bc</0>def" # U_REGEX_LOOK_BEHIND_LIMIT error.
"(?<=(?:){11})bc" "<0>bc</0>" # Empty (?:) expression.
# Random debugging, Temporary # Random debugging, Temporary
# #
#"^(?:a?b?)*$" "a--" #"^(?:a?b?)*$" "a--"