From 9c02edfe2d41e9bba08768aeb881f4f7a9ba5d8f Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Fri, 23 May 2003 01:32:25 +0000 Subject: [PATCH] ICU-2908 fix crash on regexp patterns with quantifiers on a surrogate pair e.g. \ud800\udc00* X-SVN-Rev: 12065 --- icu4c/source/i18n/regexcmp.cpp | 27 +++++++++++++++++++------ icu4c/source/test/testdata/regextst.txt | 11 ++++++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index ae70f94fdf..f949527d91 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -1005,7 +1005,7 @@ UBool RegexCompile::doParseActions(EParseAction action) } break; - case doPossesiveInterval: + case doPossessiveInterval: // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it. { // Remember the loc for the top of the block being looped over. @@ -1215,7 +1215,7 @@ UBool RegexCompile::doParseActions(EParseAction action) - case doPossesivePlus: + case doPossessivePlus: // Possessive ++ quantifier. // Compiles to // 1. STO_SP @@ -1250,7 +1250,7 @@ UBool RegexCompile::doParseActions(EParseAction action) } break; - case doPossesiveStar: + case doPossessiveStar: // Possessive *+ quantifier. // Compiles to // 1. STO_SP loc @@ -1286,7 +1286,7 @@ UBool RegexCompile::doParseActions(EParseAction action) } break; - case doPossesiveOpt: + case doPossessiveOpt: // Possessive ?+ quantifier. // Compiles to // 1. STO_SP loc @@ -1445,11 +1445,26 @@ void RegexCompile::literalChar(UChar32 c) { // We are adding onto an existing string fRXPat->fLiteralText.append(c); - // If the most recently emitted op is a URX_ONECHAR, change it to a string op. op = fRXPat->fCompiledPat->lastElementi(); opType = URX_TYPE(op); U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN); + + // If the most recently emitted op is a URX_ONECHAR, if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) { + if (U16_IS_TRAIL(c) && U16_IS_LEAD(URX_VAL(op))) { + // The most recently emitted op is a ONECHAR that was the first half + // of a surrogate pair. Update the ONECHAR's operand to be the + // supplementary code point resulting from both halves of the pair. + c = U16_GET_SUPPLEMENTARY(URX_VAL(op), c); + op = URX_BUILD(opType, c); + patternLoc = fRXPat->fCompiledPat->size() - 1; + fRXPat->fCompiledPat->setElementAt(op, patternLoc); + return; + } + + // The most recently emitted op is a ONECHAR. + // We've now received another adjacent char. Change the ONECHAR op + // to a string op. if (fModeFlags & UREGEX_CASE_INSENSITIVE) { op = URX_BUILD(URX_STRING_I, fStringOpStart); } else { @@ -1460,7 +1475,7 @@ void RegexCompile::literalChar(UChar32 c) { op = URX_BUILD(URX_STRING_LEN, 0); fRXPat->fCompiledPat->addElement(op, *fStatus); } - + // The pattern contains a URX_SRING / URX_STRING_LEN. Update the // string length to reflect the new char we just added to the string. stringLen = fRXPat->fLiteralText.length() - fStringOpStart; diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt index a0ee9ee0e8..cf08f6be9f 100644 --- a/icu4c/source/test/testdata/regextst.txt +++ b/icu4c/source/test/testdata/regextst.txt @@ -338,6 +338,17 @@ "ab\x{101234}c" "<0>ab\U00101234c" "abα" "<0>abα" + +# +# \u Surrogate Pairs +# +"\ud800\udc00" "<0>\U00010000" +"\ud800\udc00*" "<0>\U00010000\U00010000\U00010000\U00010001" +"\ud800\ud800\udc00" "<0>\ud800\U00010000\U00010000\U00010000\U00010001" +"(\ud800)(\udc00)" "\u00010000" + + + # # Random debugging, Temporary #