ICU-10024 Regexp: fix bug where a look-behind pattern with a bounded {min,max} quantifier fails to compile.
X-SVN-Rev: 33835
This commit is contained in:
parent
b64a61baba
commit
b129b40280
@ -1,7 +1,7 @@
|
|||||||
//
|
//
|
||||||
// file: regexcmp.cpp
|
// file: regexcmp.cpp
|
||||||
//
|
//
|
||||||
// Copyright (C) 2002-2012 International Business Machines Corporation and others.
|
// Copyright (C) 2002-2013 International Business Machines Corporation and others.
|
||||||
// All Rights Reserved.
|
// All Rights Reserved.
|
||||||
//
|
//
|
||||||
// This file contains the ICU regular expression compiler, which is responsible
|
// This file contains the ICU regular expression compiler, which is responsible
|
||||||
@ -3335,14 +3335,46 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
|||||||
|
|
||||||
case URX_CTR_INIT:
|
case URX_CTR_INIT:
|
||||||
case URX_CTR_INIT_NG:
|
case URX_CTR_INIT_NG:
|
||||||
|
// For Loops, recursively call this function on the pattern for the loop body,
|
||||||
|
// then multiply the result by the maximum loop count.
|
||||||
|
{
|
||||||
|
int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
|
||||||
|
if (loopEndLoc == loc+4) {
|
||||||
|
// Loop has an empty body. No effect on max match length.
|
||||||
|
// Continue processing with code after the loop end.
|
||||||
|
loc = loopEndLoc;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3);
|
||||||
|
if (maxLoopCount == -1) {
|
||||||
|
// Unbounded Loop. No upper bound on match length.
|
||||||
|
currentLen = INT32_MAX;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
U_ASSERT(loopEndLoc >= loc+4);
|
||||||
|
int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
|
||||||
|
if (blockLen == INT32_MAX) {
|
||||||
|
currentLen = blockLen;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
currentLen += blockLen * maxLoopCount;
|
||||||
|
loc = loopEndLoc;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case URX_CTR_LOOP:
|
case URX_CTR_LOOP:
|
||||||
case URX_CTR_LOOP_NG:
|
case URX_CTR_LOOP_NG:
|
||||||
|
// These opcodes will be skipped over by code for URX_CTR_INIT.
|
||||||
|
// We shouldn't encounter them here.
|
||||||
|
U_ASSERT(FALSE);
|
||||||
|
break;
|
||||||
|
|
||||||
case URX_LOOP_SR_I:
|
case URX_LOOP_SR_I:
|
||||||
case URX_LOOP_DOT_I:
|
case URX_LOOP_DOT_I:
|
||||||
case URX_LOOP_C:
|
case URX_LOOP_C:
|
||||||
// For anything to do with loops, make the match length unbounded.
|
// For anything to do with loops, make the match length unbounded.
|
||||||
// Note: INIT instructions are multi-word. Can ignore because
|
|
||||||
// INT32_MAX length will stop the per-instruction loop.
|
|
||||||
currentLen = INT32_MAX;
|
currentLen = INT32_MAX;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
**************************************************************************
|
**************************************************************************
|
||||||
* Copyright (C) 2002-2012 International Business Machines Corporation *
|
* Copyright (C) 2002-2013 International Business Machines Corporation *
|
||||||
* and others. All rights reserved. *
|
* and others. All rights reserved. *
|
||||||
**************************************************************************
|
**************************************************************************
|
||||||
*/
|
*/
|
||||||
@ -2827,7 +2827,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
|||||||
#ifdef REGEX_RUN_DEBUG
|
#ifdef REGEX_RUN_DEBUG
|
||||||
if (fTraceDebug) {
|
if (fTraceDebug) {
|
||||||
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
||||||
printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx,
|
printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
|
||||||
UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
|
UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
|
||||||
fPattern->dumpOp(fp->fPatIdx);
|
fPattern->dumpOp(fp->fPatIdx);
|
||||||
}
|
}
|
||||||
@ -3492,7 +3492,7 @@ GC_Done:
|
|||||||
int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
|
int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
|
||||||
U_ASSERT(minCount>=0);
|
U_ASSERT(minCount>=0);
|
||||||
U_ASSERT(maxCount>=minCount || maxCount==-1);
|
U_ASSERT(maxCount>=minCount || maxCount==-1);
|
||||||
U_ASSERT(loopLoc>fp->fPatIdx);
|
U_ASSERT(loopLoc>=fp->fPatIdx);
|
||||||
|
|
||||||
if (minCount == 0) {
|
if (minCount == 0) {
|
||||||
fp = StateSave(fp, loopLoc+1, status);
|
fp = StateSave(fp, loopLoc+1, status);
|
||||||
@ -4211,7 +4211,7 @@ breakFromLoop:
|
|||||||
fMatchStart = startIdx;
|
fMatchStart = startIdx;
|
||||||
fMatchEnd = fp->fInputIdx;
|
fMatchEnd = fp->fInputIdx;
|
||||||
if (fTraceDebug) {
|
if (fTraceDebug) {
|
||||||
REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
|
REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -4252,7 +4252,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
|||||||
#ifdef REGEX_RUN_DEBUG
|
#ifdef REGEX_RUN_DEBUG
|
||||||
if (fTraceDebug)
|
if (fTraceDebug)
|
||||||
{
|
{
|
||||||
printf("MatchAt(startIdx=%ld)\n", startIdx);
|
printf("MatchAt(startIdx=%d)\n", startIdx);
|
||||||
printf("Original Pattern: ");
|
printf("Original Pattern: ");
|
||||||
UChar32 c = utext_next32From(fPattern->fPattern, 0);
|
UChar32 c = utext_next32From(fPattern->fPattern, 0);
|
||||||
while (c != U_SENTINEL) {
|
while (c != U_SENTINEL) {
|
||||||
@ -4321,7 +4321,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
|||||||
#ifdef REGEX_RUN_DEBUG
|
#ifdef REGEX_RUN_DEBUG
|
||||||
if (fTraceDebug) {
|
if (fTraceDebug) {
|
||||||
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
||||||
printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx,
|
printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
|
||||||
UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
|
UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
|
||||||
fPattern->dumpOp(fp->fPatIdx);
|
fPattern->dumpOp(fp->fPatIdx);
|
||||||
}
|
}
|
||||||
@ -4951,7 +4951,7 @@ GC_Done:
|
|||||||
int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
|
int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
|
||||||
U_ASSERT(minCount>=0);
|
U_ASSERT(minCount>=0);
|
||||||
U_ASSERT(maxCount>=minCount || maxCount==-1);
|
U_ASSERT(maxCount>=minCount || maxCount==-1);
|
||||||
U_ASSERT(loopLoc>fp->fPatIdx);
|
U_ASSERT(loopLoc>=fp->fPatIdx);
|
||||||
|
|
||||||
if (minCount == 0) {
|
if (minCount == 0) {
|
||||||
fp = StateSave(fp, loopLoc+1, status);
|
fp = StateSave(fp, loopLoc+1, status);
|
||||||
@ -5635,7 +5635,7 @@ breakFromLoop:
|
|||||||
fMatchStart = startIdx;
|
fMatchStart = startIdx;
|
||||||
fMatchEnd = fp->fInputIdx;
|
fMatchEnd = fp->fInputIdx;
|
||||||
if (fTraceDebug) {
|
if (fTraceDebug) {
|
||||||
REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
|
REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
17
icu4c/source/test/testdata/regextst.txt
vendored
17
icu4c/source/test/testdata/regextst.txt
vendored
@ -1,4 +1,4 @@
|
|||||||
# Copyright (c) 2001-2012 International Business Machines
|
# Copyright (c) 2001-2013 International Business Machines
|
||||||
# Corporation and others. All Rights Reserved.
|
# Corporation and others. All Rights Reserved.
|
||||||
#
|
#
|
||||||
# file:
|
# file:
|
||||||
@ -1146,6 +1146,21 @@
|
|||||||
|
|
||||||
"(ab)?(?<=ab)cd|ef" i "<0><1>ab</1>cd</0>"
|
"(ab)?(?<=ab)cd|ef" i "<0><1>ab</1>cd</0>"
|
||||||
|
|
||||||
|
# Bug 10024
|
||||||
|
# Incorrect (unbounded) longest match length with {1, 20} style quantifiers.
|
||||||
|
# Unbounded match is disallowed in look-behind expressions.
|
||||||
|
# Max match length is used to limit where to check for look-behind matches.
|
||||||
|
|
||||||
|
"(?<=a{1,5})bc" "aaaa<0>bc</0>def"
|
||||||
|
"(?<=(?:aa){3,20})bc" "aaaaaa<0>bc</0>def"
|
||||||
|
"(?<!abc {1,100}|def {1,100}|ghi {1,100})jkl" "def jkl"
|
||||||
|
"(?<!abc {1,100}|def {1,100}|ghi {1,100})jkl" "rst <0>jkl</0>"
|
||||||
|
"(?<=a{11})bc" "aaaaaaaaaaa<0>bc</0>"
|
||||||
|
"(?<=a{11})bc" "aaaaaaaaaabc"
|
||||||
|
"(?<=a{1,})bc" E "aaaa<0>bc</0>def" # U_REGEX_LOOK_BEHIND_LIMIT error.
|
||||||
|
"(?<=(?:){11})bc" "<0>bc</0>" # Empty (?:) expression.
|
||||||
|
|
||||||
|
|
||||||
# Random debugging, Temporary
|
# Random debugging, Temporary
|
||||||
#
|
#
|
||||||
#"^(?:a?b?)*$" "a--"
|
#"^(?:a?b?)*$" "a--"
|
||||||
|
Loading…
Reference in New Issue
Block a user