/******************************************************************** * COPYRIGHT: * Copyright (c) 2002, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ // // regex.cpp // // ICU Regular Expressions test, part of intltest. // #include "unicode/utypes.h" #include "intltest.h" #include "regextst.h" RegexTest::RegexTest() { }; RegexTest::~RegexTest() { }; void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) { if (exec) logln("TestSuite RegexTest: "); switch (index) { case 0: name = "Basic"; if (exec) Basic(); break; case 1: name = "API_Match"; if (exec) API_Match(); break; case 2: name = "API_Replace"; if (exec) API_Replace(); break; case 3: name = "API_Pattern"; if (exec) API_Pattern(); break; default: name = ""; break; //needed to end loop } } //--------------------------------------------------------------------------- // // REGEX_TESTLM Macro + invocation function to simplify writing quick tests // for the LookingAt() and Match() functions. // // usage: // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected); // // The expected results are UBool - TRUE or FALSE. // The input text is unescaped. The pattern is not. // // //--------------------------------------------------------------------------- #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {errln("RegexTest failure at line %d. status=%d\n", \ __LINE__, status); return;}} #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};} #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\ if (status!=errcode) {errln("RegexTest failure at line %d.\n", __LINE__);};} #define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__); UBool RegexTest::doRegexLMTest(char *pat, char *text, UBool looking, UBool match, int line) { const UnicodeString pattern(pat); const UnicodeString inputText(text); UErrorCode status = U_ZERO_ERROR; UParseError pe; RegexPattern *REPattern = NULL; RegexMatcher *REMatcher = NULL; UBool retVal = TRUE; UnicodeString patString(pat); REPattern = RegexPattern::compile(patString, 0, pe, status); if (U_FAILURE(status)) { errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %d\n", line, status); return FALSE; } // REPattern->dump(); UnicodeString inputString(inputText); UnicodeString unEscapedInput = inputString.unescape(); REMatcher = REPattern->matcher(unEscapedInput, status); if (U_FAILURE(status)) { errln("RegexTest failure in REPattern::matcher() at line %d. Status = %d\n", line, status); return FALSE; } UBool actualmatch; actualmatch = REMatcher->lookingAt(status); if (U_FAILURE(status)) { errln("RegexTest failure in lookingAt() at line %d. Status = %d\n", line, status); retVal = FALSE; } if (actualmatch != looking) { errln("RegexTest: wrong return from lookingAt() at line %d.\n", line); retVal = FALSE; } status = U_ZERO_ERROR; actualmatch = REMatcher->matches(status); if (U_FAILURE(status)) { errln("RegexTest failure in matches() at line %d. Status = %d\n", line, status); retVal = FALSE; } if (actualmatch != match) { errln("RegexTest: wrong return from matches() at line %d.\n", line); retVal = FALSE; } if (retVal == FALSE) { REPattern->dump(); } delete REPattern; delete REMatcher; return retVal; } //--------------------------------------------------------------------------- // // API_Match // //--------------------------------------------------------------------------- void RegexTest::API_Match() { UParseError pe; UErrorCode status=U_ZERO_ERROR; int32_t flags = 0; // // Debug - slide failing test cases early // #if 0 { } return; #endif // // Simple pattern compilation // { UnicodeString re("abc"); RegexPattern *pat2; pat2 = RegexPattern::compile(re, flags, pe, status); REGEX_CHECK_STATUS; UnicodeString inStr1 = "abcdef this is a test"; UnicodeString instr2 = "not abc"; UnicodeString empty = ""; // // Matcher creation and reset. // RegexMatcher *m1 = pat2->matcher(inStr1, status); REGEX_CHECK_STATUS; REGEX_ASSERT(m1->lookingAt(status) == TRUE); REGEX_ASSERT(m1->input() == inStr1); m1->reset(instr2); REGEX_ASSERT(m1->lookingAt(status) == FALSE); REGEX_ASSERT(m1->input() == instr2); m1->reset(inStr1); REGEX_ASSERT(m1->input() == inStr1); REGEX_ASSERT(m1->lookingAt(status) == TRUE); m1->reset(empty); REGEX_ASSERT(m1->lookingAt(status) == FALSE); REGEX_ASSERT(m1->input() == empty); REGEX_ASSERT(&m1->pattern() == pat2); delete m1; delete pat2; } // // Capture Group. // RegexMatcher::start(); // RegexMatcher::end(); // RegexMatcher::groupCount(); // { int32_t flags=0; UParseError pe; UErrorCode status=U_ZERO_ERROR; UnicodeString re("01(23(45)67)(.*)"); RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); REGEX_CHECK_STATUS; UnicodeString data = "0123456789"; RegexMatcher *matcher = pat->matcher(data, status); REGEX_CHECK_STATUS; REGEX_ASSERT(matcher->lookingAt(status) == TRUE); int matchStarts[] = {0, 2, 4, 8}; int matchEnds[] = {10, 8, 6, 10}; int i; for (i=0; i<4; i++) { int32_t actualStart = matcher->start(i, status); REGEX_CHECK_STATUS; if (actualStart != matchStarts[i]) { errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n", __LINE__, i, matchStarts[i], actualStart); } int32_t actualEnd = matcher->end(i, status); REGEX_CHECK_STATUS; if (actualEnd != matchEnds[i]) { errln("RegexTest failure at line %d index %d. Expected %d, got %d\n", __LINE__, i, matchEnds[i], actualEnd); } } REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); matcher->reset(); REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); matcher->lookingAt(status); REGEX_ASSERT(matcher->group(status) == "0123456789"); REGEX_ASSERT(matcher->group(0, status) == "0123456789"); REGEX_ASSERT(matcher->group(1, status) == "234567" ); REGEX_ASSERT(matcher->group(2, status) == "45" ); REGEX_ASSERT(matcher->group(3, status) == "89" ); REGEX_CHECK_STATUS; REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); matcher->reset(); REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); delete matcher; delete pat; } // // find // { int32_t flags=0; UParseError pe; UErrorCode status=U_ZERO_ERROR; UnicodeString re("abc"); RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); REGEX_CHECK_STATUS; UnicodeString data = ".abc..abc...abc.."; // 012345678901234567 RegexMatcher *matcher = pat->matcher(data, status); REGEX_CHECK_STATUS; REGEX_ASSERT(matcher->find()); REGEX_ASSERT(matcher->start(status) == 1); REGEX_ASSERT(matcher->find()); REGEX_ASSERT(matcher->start(status) == 6); REGEX_ASSERT(matcher->find()); REGEX_ASSERT(matcher->start(status) == 12); REGEX_ASSERT(matcher->find() == FALSE); REGEX_ASSERT(matcher->find() == FALSE); matcher->reset(); REGEX_ASSERT(matcher->find()); REGEX_ASSERT(matcher->start(status) == 1); REGEX_ASSERT(matcher->find(0, status)); REGEX_ASSERT(matcher->start(status) == 1); REGEX_ASSERT(matcher->find(1, status)); REGEX_ASSERT(matcher->start(status) == 1); REGEX_ASSERT(matcher->find(2, status)); REGEX_ASSERT(matcher->start(status) == 6); REGEX_ASSERT(matcher->find(12, status)); REGEX_ASSERT(matcher->start(status) == 12); REGEX_ASSERT(matcher->find(13, status) == FALSE); REGEX_ASSERT(matcher->find(16, status) == FALSE); REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); REGEX_CHECK_STATUS; REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); REGEX_ASSERT_FAIL(matcher->find(17, status), U_INDEX_OUTOFBOUNDS_ERROR); REGEX_ASSERT(matcher->groupCount() == 0); delete matcher; delete pat; } // // Replace // { int32_t flags=0; UParseError pe; UErrorCode status=U_ZERO_ERROR; UnicodeString re("abc"); RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); REGEX_CHECK_STATUS; UnicodeString data = ".abc..abc...abc.."; // 012345678901234567 RegexMatcher *matcher = pat->matcher(data, status); // // Plain vanilla matches. // UnicodeString dest; dest = matcher->replaceFirst("yz", status); REGEX_CHECK_STATUS; REGEX_ASSERT(dest == ".yz..abc...abc.."); dest = matcher->replaceAll("yz", status); REGEX_CHECK_STATUS; REGEX_ASSERT(dest == ".yz..yz...yz.."); // // Plain vanilla non-matches. // UnicodeString d2 = ".abx..abx...abx.."; matcher->reset(d2); dest = matcher->replaceFirst("yz", status); REGEX_CHECK_STATUS; REGEX_ASSERT(dest == ".abx..abx...abx.."); dest = matcher->replaceAll("yz", status); REGEX_CHECK_STATUS; REGEX_ASSERT(dest == ".abx..abx...abx.."); // // Empty source string // UnicodeString d3 = ""; matcher->reset(d3); dest = matcher->replaceFirst("yz", status); REGEX_CHECK_STATUS; REGEX_ASSERT(dest == ""); dest = matcher->replaceAll("yz", status); REGEX_CHECK_STATUS; REGEX_ASSERT(dest == ""); // // Empty substitution string // matcher->reset(data); // ".abc..abc...abc.." dest = matcher->replaceFirst("", status); REGEX_CHECK_STATUS; REGEX_ASSERT(dest == "...abc...abc.."); dest = matcher->replaceAll("", status); REGEX_CHECK_STATUS; REGEX_ASSERT(dest == "........"); // // match whole string // UnicodeString d4 = "abc"; matcher->reset(d4); dest = matcher->replaceFirst("xyz", status); REGEX_CHECK_STATUS; REGEX_ASSERT(dest == "xyz"); dest = matcher->replaceAll("xyz", status); REGEX_CHECK_STATUS; REGEX_ASSERT(dest == "xyz"); // // Capture Group, simple case // UnicodeString re2("a(..)"); RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status); REGEX_CHECK_STATUS; UnicodeString d5 = "abcdefg"; RegexMatcher *matcher2 = pat2->matcher(d5, status); REGEX_CHECK_STATUS; dest = matcher2->replaceFirst("$1$1", status); REGEX_CHECK_STATUS; REGEX_ASSERT(dest == "bcbcdefg"); } } //--------------------------------------------------------------------------- // // Basic Check for basic functionality of // regex pattern matching. // //--------------------------------------------------------------------------- void RegexTest::Basic() { // // Debug - slide failing test cases early // #if 0 { REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input } return; #endif // // Pattern with parentheses // REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE); REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE); REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE); // // Patterns with * // REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE); REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE); REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE); REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE); REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE); REGEX_TESTLM("a*", "", TRUE, TRUE); REGEX_TESTLM("a*", "b", TRUE, FALSE); // // Patterns with "." // REGEX_TESTLM(".", "abc", TRUE, FALSE); REGEX_TESTLM("...", "abc", TRUE, TRUE); REGEX_TESTLM("....", "abc", FALSE, FALSE); REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE); REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE); REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE); REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE); REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE); // // Patterns with * applied to chars at end of literal string // REGEX_TESTLM("abc*", "ab", TRUE, TRUE); REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE); // // Supplemental chars match as single chars, not a pair of surrogates. // REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE); REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE); REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE); // // UnicodeSets in the pattern // REGEX_TESTLM("[1-6]", "1", TRUE, TRUE); REGEX_TESTLM("[1-6]", "3", TRUE, TRUE); REGEX_TESTLM("[1-6]", "7", FALSE, FALSE); REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE); REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE); REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE); REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE); REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE); REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE); REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences. REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE); // // OR operator in patterns // REGEX_TESTLM("(a|b)", "a", TRUE, TRUE); REGEX_TESTLM("(a|b)", "b", TRUE, TRUE); REGEX_TESTLM("(a|b)", "c", FALSE, FALSE); REGEX_TESTLM("a|b", "b", TRUE, TRUE); REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE); REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE); REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE); REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE); REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE); REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE); // // + // REGEX_TESTLM("ab+", "abbc", TRUE, FALSE); REGEX_TESTLM("ab+c", "ac", FALSE, FALSE); REGEX_TESTLM("b+", "", FALSE, FALSE); REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE); REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE); REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE); // // ? // REGEX_TESTLM("ab?", "ab", TRUE, TRUE); REGEX_TESTLM("ab?", "a", TRUE, TRUE); REGEX_TESTLM("ab?", "ac", TRUE, FALSE); REGEX_TESTLM("ab?", "abb", TRUE, FALSE); REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE); REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE); REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE); REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE); REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE); // // Escape sequences that become single literal chars, handled internally // by ICU's Unescape. // // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet. REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL REGEX_TESTLM("\\b", "\\u0008", TRUE, TRUE); // BS // REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L (or whatever) TODO: bug in Unescape // REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape TODO: bug in Unescape REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE); REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE); REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input }; //--------------------------------------------------------------------------- // // API_Replace // //--------------------------------------------------------------------------- void RegexTest::API_Replace() { } //--------------------------------------------------------------------------- // // API_Pattern // //--------------------------------------------------------------------------- void RegexTest::API_Pattern() { RegexPattern pata; // Test default constructor to not crash. RegexPattern patb; REGEX_ASSERT(pata == patb); REGEX_ASSERT(pata == pata); UnicodeString re1("abc[a-l][m-z]"); UnicodeString re2("def"); UErrorCode status = U_ZERO_ERROR; UParseError pe; RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status); RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status); REGEX_CHECK_STATUS; REGEX_ASSERT(*pat1 == *pat1); REGEX_ASSERT(*pat1 != pata); // Assign patb = *pat1; REGEX_ASSERT(patb == *pat1); // Copy Construct RegexPattern patc(*pat1); REGEX_ASSERT(patc == *pat1); REGEX_ASSERT(patb == patc); REGEX_ASSERT(pat1 != pat2); patb = *pat2; REGEX_ASSERT(patb != patc); REGEX_ASSERT(patb == *pat2); // Compile with no flags. RegexPattern *pat1a = RegexPattern::compile(re1, pe, status); REGEX_ASSERT(*pat1a == *pat1); // Compile with different flags should be not equal RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status); REGEX_CHECK_STATUS; REGEX_ASSERT(*pat1b != *pat1a); REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); REGEX_ASSERT(pat1a->flags() == 0); // clone RegexPattern *pat1c = pat1b->clone(); REGEX_ASSERT(*pat1b == *pat1c); REGEX_ASSERT(*pat1a != *pat1c); // TODO: Actually do some matches with the cloned/copied/assigned patterns. delete pat1c; delete pat1b; delete pat1a; delete pat1; delete pat2; // // matches convenience API // REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE); REGEX_CHECK_STATUS; REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); REGEX_CHECK_STATUS; REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); REGEX_CHECK_STATUS; REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); REGEX_CHECK_STATUS; REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); REGEX_CHECK_STATUS; status = U_INDEX_OUTOFBOUNDS_ERROR; REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); // // Split() // status = U_ZERO_ERROR; pat1 = RegexPattern::compile(" +", pe, status); REGEX_CHECK_STATUS; UnicodeString fields[10]; int32_t n; n = pat1->split("Now is the time", fields, 10, status); REGEX_CHECK_STATUS; REGEX_ASSERT(n==4); REGEX_ASSERT(fields[0]=="Now"); REGEX_ASSERT(fields[1]=="is"); REGEX_ASSERT(fields[2]=="the"); REGEX_ASSERT(fields[3]=="time"); REGEX_ASSERT(fields[4]==""); n = pat1->split("Now is the time", fields, 2, status); REGEX_CHECK_STATUS; REGEX_ASSERT(n==2); REGEX_ASSERT(fields[0]=="Now"); REGEX_ASSERT(fields[1]=="is the time"); REGEX_ASSERT(fields[2]=="the"); // left over from previous test fields[1] = "*"; n = pat1->split("Now is the time", fields, 1, status); REGEX_CHECK_STATUS; REGEX_ASSERT(n==1); REGEX_ASSERT(fields[0]=="Now is the time"); REGEX_ASSERT(fields[1]=="*"); n = pat1->split(" Now is the time ", fields, 10, status); REGEX_CHECK_STATUS; REGEX_ASSERT(n==5); REGEX_ASSERT(fields[0]==""); REGEX_ASSERT(fields[1]=="Now"); REGEX_ASSERT(fields[2]=="is"); REGEX_ASSERT(fields[3]=="the"); REGEX_ASSERT(fields[4]=="time"); REGEX_ASSERT(fields[5]==""); n = pat1->split(" ", fields, 10, status); REGEX_CHECK_STATUS; REGEX_ASSERT(n==1); REGEX_ASSERT(fields[0]==""); fields[0] = "foo"; n = pat1->split("", fields, 10, status); REGEX_CHECK_STATUS; REGEX_ASSERT(n==0); REGEX_ASSERT(fields[0]=="foo"); }