ICU-10024 Fix regexp bug where a look-behind pattern fails to compile.
X-SVN-Rev: 33835
This commit is contained in:
parent
b64a61baba
commit
b129b40280
@ -1,7 +1,7 @@
|
||||
//
|
||||
// file: regexcmp.cpp
|
||||
//
|
||||
// Copyright (C) 2002-2012 International Business Machines Corporation and others.
|
||||
// Copyright (C) 2002-2013 International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains the ICU regular expression compiler, which is responsible
|
||||
@ -3335,14 +3335,46 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
||||
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
// For Loops, recursively call this function on the pattern for the loop body,
|
||||
// then multiply the result by the maximum loop count.
|
||||
{
|
||||
int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
|
||||
if (loopEndLoc == loc+4) {
|
||||
// Loop has an empty body. No effect on max match length.
|
||||
// Continue processing with code after the loop end.
|
||||
loc = loopEndLoc;
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3);
|
||||
if (maxLoopCount == -1) {
|
||||
// Unbounded Loop. No upper bound on match length.
|
||||
currentLen = INT32_MAX;
|
||||
break;
|
||||
}
|
||||
|
||||
U_ASSERT(loopEndLoc >= loc+4);
|
||||
int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
|
||||
if (blockLen == INT32_MAX) {
|
||||
currentLen = blockLen;
|
||||
break;
|
||||
}
|
||||
currentLen += blockLen * maxLoopCount;
|
||||
loc = loopEndLoc;
|
||||
break;
|
||||
}
|
||||
|
||||
case URX_CTR_LOOP:
|
||||
case URX_CTR_LOOP_NG:
|
||||
// These opcodes will be skipped over by code for URX_CTR_INIT.
|
||||
// We shouldn't encounter them here.
|
||||
U_ASSERT(FALSE);
|
||||
break;
|
||||
|
||||
case URX_LOOP_SR_I:
|
||||
case URX_LOOP_DOT_I:
|
||||
case URX_LOOP_C:
|
||||
// For anything to do with loops, make the match length unbounded.
|
||||
// Note: INIT instructions are multi-word. Can ignore because
|
||||
// INT32_MAX length will stop the per-instruction loop.
|
||||
currentLen = INT32_MAX;
|
||||
break;
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**************************************************************************
|
||||
* Copyright (C) 2002-2012 International Business Machines Corporation *
|
||||
* Copyright (C) 2002-2013 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
**************************************************************************
|
||||
*/
|
||||
@ -2827,7 +2827,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
if (fTraceDebug) {
|
||||
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
||||
printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx,
|
||||
printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
|
||||
UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
|
||||
fPattern->dumpOp(fp->fPatIdx);
|
||||
}
|
||||
@ -3492,7 +3492,7 @@ GC_Done:
|
||||
int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
|
||||
U_ASSERT(minCount>=0);
|
||||
U_ASSERT(maxCount>=minCount || maxCount==-1);
|
||||
U_ASSERT(loopLoc>fp->fPatIdx);
|
||||
U_ASSERT(loopLoc>=fp->fPatIdx);
|
||||
|
||||
if (minCount == 0) {
|
||||
fp = StateSave(fp, loopLoc+1, status);
|
||||
@ -4211,7 +4211,7 @@ breakFromLoop:
|
||||
fMatchStart = startIdx;
|
||||
fMatchEnd = fp->fInputIdx;
|
||||
if (fTraceDebug) {
|
||||
REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
|
||||
REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd));
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -4252,7 +4252,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
if (fTraceDebug)
|
||||
{
|
||||
printf("MatchAt(startIdx=%ld)\n", startIdx);
|
||||
printf("MatchAt(startIdx=%d)\n", startIdx);
|
||||
printf("Original Pattern: ");
|
||||
UChar32 c = utext_next32From(fPattern->fPattern, 0);
|
||||
while (c != U_SENTINEL) {
|
||||
@ -4321,7 +4321,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
if (fTraceDebug) {
|
||||
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
||||
printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx,
|
||||
printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
|
||||
UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
|
||||
fPattern->dumpOp(fp->fPatIdx);
|
||||
}
|
||||
@ -4951,7 +4951,7 @@ GC_Done:
|
||||
int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
|
||||
U_ASSERT(minCount>=0);
|
||||
U_ASSERT(maxCount>=minCount || maxCount==-1);
|
||||
U_ASSERT(loopLoc>fp->fPatIdx);
|
||||
U_ASSERT(loopLoc>=fp->fPatIdx);
|
||||
|
||||
if (minCount == 0) {
|
||||
fp = StateSave(fp, loopLoc+1, status);
|
||||
@ -5635,7 +5635,7 @@ breakFromLoop:
|
||||
fMatchStart = startIdx;
|
||||
fMatchEnd = fp->fInputIdx;
|
||||
if (fTraceDebug) {
|
||||
REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
|
||||
REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd));
|
||||
}
|
||||
}
|
||||
else
|
||||
|
17
icu4c/source/test/testdata/regextst.txt
vendored
17
icu4c/source/test/testdata/regextst.txt
vendored
@ -1,4 +1,4 @@
|
||||
# Copyright (c) 2001-2012 International Business Machines
|
||||
# Copyright (c) 2001-2013 International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file:
|
||||
@ -1146,6 +1146,21 @@
|
||||
|
||||
"(ab)?(?<=ab)cd|ef" i "<0><1>ab</1>cd</0>"
|
||||
|
||||
# Bug 10024
|
||||
# Incorrect (unbounded) longest match length with {1, 20} style quantifiers.
|
||||
# Unbounded match is disallowed in look-behind expressions.
|
||||
# Max match length is used to limit where to check for look-behind matches.
|
||||
|
||||
"(?<=a{1,5})bc" "aaaa<0>bc</0>def"
|
||||
"(?<=(?:aa){3,20})bc" "aaaaaa<0>bc</0>def"
|
||||
"(?<!abc {1,100}|def {1,100}|ghi {1,100})jkl" "def jkl"
|
||||
"(?<!abc {1,100}|def {1,100}|ghi {1,100})jkl" "rst <0>jkl</0>"
|
||||
"(?<=a{11})bc" "aaaaaaaaaaa<0>bc</0>"
|
||||
"(?<=a{11})bc" "aaaaaaaaaabc"
|
||||
"(?<=a{1,})bc" E "aaaa<0>bc</0>def" # U_REGEX_LOOK_BEHIND_LIMIT error.
|
||||
"(?<=(?:){11})bc" "<0>bc</0>" # Empty (?:) expression.
|
||||
|
||||
|
||||
# Random debugging, Temporary
|
||||
#
|
||||
#"^(?:a?b?)*$" "a--"
|
||||
|
Loading…
Reference in New Issue
Block a user