ICU-6947 implement UREGEX_LITERAL flag.

X-SVN-Rev: 31398
This commit is contained in:
Andy Heninger 2012-02-15 01:30:55 +00:00
parent 3f18f96246
commit c74df646b7
6 changed files with 28 additions and 15 deletions

View File

@ -106,7 +106,7 @@ void RegexCompile::compile(
UParseError &pp, // Error position info
UErrorCode &e) // Error Code
{
fRXPat->fPatternString = new UnicodeString(pat);
fRXPat->fPatternString = new UnicodeString(pat);
UText patternText = UTEXT_INITIALIZER;
utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
@ -147,6 +147,12 @@ void RegexCompile::compile(
fPatternLength = utext_nativeLength(pat);
uint16_t state = 1;
const RegexTableEl *tableEl;
// UREGEX_LITERAL forces the entire pattern to be treated as a literal string.
if (fModeFlags & UREGEX_LITERAL) {
fQuoteMode = TRUE;
}
nextChar(fC); // Fetch the first char from the pattern string.
//
@ -3652,10 +3658,11 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
if (fQuoteMode) {
c.fQuoted = TRUE;
if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) {
if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
c.fChar == (UChar32)-1) {
fQuoteMode = FALSE; // Exit quote mode,
nextCharLL(); // discard the E
nextChar(c); // recurse to get the real next char
nextCharLL(); // discard the E
nextChar(c); // recurse to get the real next char
}
}
else if (fInBackslashQuote) {

View File

@ -30,8 +30,6 @@ U_NAMESPACE_BEGIN
//
//--------------------------------------------------------------------------
RegexPattern::RegexPattern() {
UErrorCode status = U_ZERO_ERROR;
// Init all of this instance's data.
init();
}
@ -287,7 +285,7 @@ RegexPattern::compile(const UnicodeString &regex,
return NULL;
}
if ((flags & (UREGEX_CANON_EQ | UREGEX_LITERAL)) != 0) {
if ((flags & UREGEX_CANON_EQ) != 0) {
status = U_REGEX_UNIMPLEMENTED;
return NULL;
}
@ -338,7 +336,7 @@ RegexPattern::compile(UText *regex,
return NULL;
}
if ((flags & (UREGEX_CANON_EQ | UREGEX_LITERAL)) != 0) {
if ((flags & UREGEX_CANON_EQ) != 0) {
status = U_REGEX_UNIMPLEMENTED;
return NULL;
}

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2004-2011, International Business Machines
* Copyright (C) 2004-2012, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: uregex.h
@ -64,13 +64,12 @@ typedef enum URegexpFlag{
/** If set, treat the entire pattern as a literal string.
* Metacharacters or escape sequences in the input sequence will be given
* no special meaning. Not implemented yet as of ICU 4.4.
* no special meaning.
*
* The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact
* The flag UREGEX_CASE_INSENSITIVE retains its impact
* on matching when used in conjunction with this flag.
* The other flags become superfluous.
* TODO: say which escapes are still handled; anything Java does
* early (\\u) we should still do.
*
* @stable ICU 4.0
*/
UREGEX_LITERAL = 16,

View File

@ -228,7 +228,7 @@ static void TestRegexCAPI(void) {
/* Open with an unimplemented flag */
status = U_ZERO_ERROR;
re = uregex_open(pat, -1, UREGEX_LITERAL, 0, &status);
re = uregex_open(pat, -1, UREGEX_CANON_EQ, 0, &status);
TEST_ASSERT(status == U_REGEX_UNIMPLEMENTED);
uregex_close(re);

View File

@ -3119,7 +3119,7 @@ void RegexTest::Extended() {
RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
UnicodeString testPattern; // The pattern for test from the test file.
@ -3329,6 +3329,9 @@ void RegexTest::regex_find(const UnicodeString &pattern,
if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
bflags |= UREGEX_UNIX_LINES;
}
if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
bflags |= UREGEX_LITERAL;
}
callerPattern = RegexPattern::compile(pattern, bflags, pe, status);

View File

@ -23,6 +23,7 @@
# m multi-line mode.
# ($ and ^ match at embedded new-lines)
# D Unix Lines mode (only recognize 0x0a as new-line)
# Q UREGEX_LITERAL flag. Entire pattern is a literal string.
# v If icu configured without break iteration, this
# regex test pattern should not compile.
# e set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag
@ -282,6 +283,11 @@
"\Q$*^^(*)?\A\E(a*)" "<0>$*^^(*)?\\A<1>aaaaaaaaaaaaaaa</1></0>"
"[abc\Q]\r\E]+" "<0>aaaccc]]]\\\\\\</0>\r..." # \Q ... \E escape in a [set]
# UREGEX_LITERAL - entire pattern is a literal string, no escapes recognized.
# Note that data strings in test cases still get escape processing.
"abc\an\r\E\\abcd\u0031bye" Q "lead<0>abc\\an\\r\\E\\\\abcd\\u0031bye</0>extra"
"case insensitive \\ (l)iteral" Qi "stuff!! <0>cAsE InSenSiTiVE \\\\ (L)ITeral</0>"
# \S and \s space characters
"\s+" "not_space<0> \t \r \n \u3000 \u2004 \u2028 \u2029</0>xyz"
"(\S+).*?(\S+).*" "<0><1>Not-spaces</1> <2>more-non-spaces</2> </0>"