diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index 156b1d241c..5a15ab6b9d 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -106,7 +106,7 @@ void RegexCompile::compile( UParseError &pp, // Error position info UErrorCode &e) // Error Code { - fRXPat->fPatternString = new UnicodeString(pat); + fRXPat->fPatternString = new UnicodeString(pat); UText patternText = UTEXT_INITIALIZER; utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); @@ -147,6 +147,12 @@ void RegexCompile::compile( fPatternLength = utext_nativeLength(pat); uint16_t state = 1; const RegexTableEl *tableEl; + + // UREGEX_LITERAL force entire pattern to be treated as a literal string. + if (fModeFlags & UREGEX_LITERAL) { + fQuoteMode = TRUE; + } + nextChar(fC); // Fetch the first char from the pattern string. // @@ -3652,10 +3658,11 @@ void RegexCompile::nextChar(RegexPatternChar &c) { if (fQuoteMode) { c.fQuoted = TRUE; - if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) { + if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) || + c.fChar == (UChar32)-1) { fQuoteMode = FALSE; // Exit quote mode, - nextCharLL(); // discard the E - nextChar(c); // recurse to get the real next char + nextCharLL(); // discard the E + nextChar(c); // recurse to get the real next char } } else if (fInBackslashQuote) { diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp index fe1ab35a4f..1454a093a3 100644 --- a/icu4c/source/i18n/repattrn.cpp +++ b/icu4c/source/i18n/repattrn.cpp @@ -30,8 +30,6 @@ U_NAMESPACE_BEGIN // //-------------------------------------------------------------------------- RegexPattern::RegexPattern() { - UErrorCode status = U_ZERO_ERROR; - // Init all of this instances data. 
init(); } @@ -287,7 +285,7 @@ RegexPattern::compile(const UnicodeString &regex, return NULL; } - if ((flags & (UREGEX_CANON_EQ | UREGEX_LITERAL)) != 0) { + if ((flags & UREGEX_CANON_EQ) != 0) { status = U_REGEX_UNIMPLEMENTED; return NULL; } @@ -338,7 +336,7 @@ RegexPattern::compile(UText *regex, return NULL; } - if ((flags & (UREGEX_CANON_EQ | UREGEX_LITERAL)) != 0) { + if ((flags & UREGEX_CANON_EQ) != 0) { status = U_REGEX_UNIMPLEMENTED; return NULL; } diff --git a/icu4c/source/i18n/unicode/uregex.h b/icu4c/source/i18n/unicode/uregex.h index 853725a277..d1c279422b 100644 --- a/icu4c/source/i18n/unicode/uregex.h +++ b/icu4c/source/i18n/unicode/uregex.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2004-2011, International Business Machines +* Copyright (C) 2004-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: uregex.h @@ -64,13 +64,12 @@ typedef enum URegexpFlag{ /** If set, treat the entire pattern as a literal string. * Metacharacters or escape sequences in the input sequence will be given - * no special meaning. Not implemented yet as of ICU 4.4. + * no special meaning. * - * The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact + * The flag UREGEX_CASE_INSENSITIVE retains its impact * on matching when used in conjunction with this flag. * The other flags become superfluous. - * TODO: say which escapes are still handled; anything Java does - * early (\\u) we should still do. 
+ * * @stable ICU 4.0 */ UREGEX_LITERAL = 16, diff --git a/icu4c/source/test/cintltst/reapits.c b/icu4c/source/test/cintltst/reapits.c index 52f19a438c..c1a3afc888 100644 --- a/icu4c/source/test/cintltst/reapits.c +++ b/icu4c/source/test/cintltst/reapits.c @@ -228,7 +228,7 @@ static void TestRegexCAPI(void) { /* Open with an unimplemented flag */ status = U_ZERO_ERROR; - re = uregex_open(pat, -1, UREGEX_LITERAL, 0, &status); + re = uregex_open(pat, -1, UREGEX_CANON_EQ, 0, &status); TEST_ASSERT(status == U_REGEX_UNIMPLEMENTED); uregex_close(re); diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index 01f674d1df..f5f55e3d86 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -3119,7 +3119,7 @@ void RegexTest::Extended() { RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status); RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status); - RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status); + RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status); RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status); UnicodeString testPattern; // The pattern for test from the test file. @@ -3329,6 +3329,9 @@ void RegexTest::regex_find(const UnicodeString &pattern, if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag bflags |= UREGEX_UNIX_LINES; } + if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag + bflags |= UREGEX_LITERAL; + } callerPattern = RegexPattern::compile(pattern, bflags, pe, status); diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt index 6c6232d8e9..21ecae9377 100644 --- a/icu4c/source/test/testdata/regextst.txt +++ b/icu4c/source/test/testdata/regextst.txt @@ -23,6 +23,7 @@ # m multi-line mode. 
# ($ and ^ match at embedded new-lines) # D Unix Lines mode (only recognize 0x0a as new-line) +# Q UREGEX_LITERAL flag. Entire pattern is literal string. # v If icu configured without break iteration, this # regex test pattern should not compile. # e set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag @@ -282,6 +283,11 @@ "\Q$*^^(*)?\A\E(a*)" "<0>$*^^(*)?\\A<1>aaaaaaaaaaaaaaa" "[abc\Q]\r\E]+" "<0>aaaccc]]]\\\\\\\r..." # \Q ... \E escape in a [set] +# UREGEX_LITERAL - entire pattern is a literal string, no escapes recognized. +# Note that data strings in test cases still get escape processing. +"abc\an\r\E\\abcd\u0031bye" Q "lead<0>abc\\an\\r\\E\\\\abcd\\u0031byeextra" +"case insensitive \\ (l)iteral" Qi "stuff!! <0>cAsE InSenSiTiVE \\\\ (L)ITeral" + # \S and \s space characters "\s+" "not_space<0> \t \r \n \u3000 \u2004 \u2028 \u2029xyz" "(\S+).*?(\S+).*" "<0><1>Not-spaces <2>more-non-spaces "