2003-02-05 02:17:32 +00:00
|
|
|
#
|
2003-02-05 06:50:32 +00:00
|
|
|
# file:
|
2003-02-05 02:17:32 +00:00
|
|
|
#
|
|
|
|
# ICU regular expression test cases.
|
|
|
|
#
|
|
|
|
# format: one test case per line,
|
|
|
|
# <test case> = <pattern> <flags> <match string> [# comment]
|
|
|
|
# <pattern> = "<regular expression pattern>"
|
|
|
|
# <match string> = "<tagged string>"
|
2003-02-06 01:55:17 +00:00
|
|
|
# the quotes on the pattern and match string can be " or ' or /
|
2003-02-05 02:17:32 +00:00
|
|
|
# <tagged string> = text, with the start and end of each
|
|
|
|
# capture group tagged with <n>...</n>. The overall match,
|
2003-02-06 01:55:17 +00:00
|
|
|
# if any, is group 0, as in <0>matched text</0>
|
2003-02-05 02:17:32 +00:00
|
|
|
# <flags> = any combination of
|
|
|
|
# i case insensitive match
|
|
|
|
# x free spacing and comments
|
|
|
|
# s dot-matches-all mode
|
|
|
|
# m multi-line mode. $ and ^ match at embedded new-lines
|
|
|
|
# d dump the compiled pattern
|
|
|
|
# t trace operation of match engine.
|
|
|
|
# White space must be present between the flags and the match string.
|
|
|
|
#
|
|
|
|
|
|
|
|
# Capturing parens
|
|
|
|
".(..)." "<0>a<1>bc</1>d</0>"
|
2003-02-05 06:50:32 +00:00
|
|
|
".*\A( +hello)" "<0><1> hello</1></0>"
|
2003-02-05 02:17:32 +00:00
|
|
|
"(hello)|(goodbye)" "<0><1>hello</1></0>"
|
|
|
|
"(hello)|(goodbye)" "<0><2>goodbye</2></0>"
|
|
|
|
"abc( +( inner(X?) +) xyz)" "leading cruft <0>abc<1> <2> inner<3></3> </2> xyz</1></0> cruft"
|
2003-02-06 01:55:17 +00:00
|
|
|
"\s*([ixsmdt]*)([:letter:]*)" "<0> <1>d</1><2></2></0> "
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# Non-capturing parens (?: stuff). Groups, but does not capture.
|
|
|
|
"(?:abc)*(tail)" "<0>abcabcabc<1>tail</1></0>"
|
|
|
|
|
|
|
|
# Non-greedy *? quantifier
|
|
|
|
".*?(abc)" "<0> abx <1>abc</1></0> abc abc abc"
|
|
|
|
".*(abc)" "<0> abx abc abc abc <1>abc</1></0>"
|
|
|
|
|
|
|
|
"((?:abc |xyz )*?)abc " "<0><1>xyz </1>abc </0>abc abc "
|
|
|
|
"((?:abc |xyz )*)abc " "<0><1>xyz abc abc </1>abc </0>"
|
|
|
|
|
|
|
|
# Non-greedy +? quantifier
|
|
|
|
"(a+?)(a*)" "<0><1>a</1><2>aaaaaaaaaaaa</2></0>"
|
|
|
|
"(a+)(a*)" "<0><1>aaaaaaaaaaaaa</1><2></2></0>"
|
|
|
|
|
|
|
|
"((ab)+?)((ab)*)" "<0><1><2>ab</2></1><3>ababababab<4>ab</4></3></0>"
|
|
|
|
"((ab)+)((ab)*)" "<0><1>abababababab<2>ab</2></1><3></3></0>"
|
|
|
|
|
|
|
|
# Non-greedy ?? quantifier
|
2003-02-06 01:55:17 +00:00
|
|
|
"(ab)(ab)??(ab)??(ab)??(ab)??c" "<0><1>ab</1><4>ab</4><5>ab</5>c</0>"
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# Unicode Properties as naked elements in a pattern
|
2003-02-05 06:50:32 +00:00
|
|
|
"\p{Lu}+" "here we go ... <0>ABC</0> and no more."
|
|
|
|
"(\p{L}+)(\P{L}*?) (\p{Zs}*)" "7999<0><1>letters</1><2>4949%^&*(</2> <3> </3></0>"
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# \w and \W
|
2003-02-05 06:50:32 +00:00
|
|
|
"\w+" " $%^&*( <0>hello123</0>%^&*("
|
|
|
|
"\W+" "<0> $%^&*( </0>hello123%^&*("
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# \A match at beginning of input only.
|
2003-02-05 06:50:32 +00:00
|
|
|
".*\Ahello" "<0>hello</0> hello"
|
2003-02-05 02:17:32 +00:00
|
|
|
".*hello" "<0>hello hello</0>"
|
2003-02-05 06:50:32 +00:00
|
|
|
".*\Ahello" "stuff\nhello" # don't match after embedded new-line.
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# \b \B
|
2003-02-05 06:50:32 +00:00
|
|
|
".*?\b(.).*" "<0> $%^&*( <1>h</1>ello123%^&*()gxx</0>"
|
|
|
|
"\ba\b" "-<0>a</0>"
|
|
|
|
"\by\b" "xy"
|
2003-02-05 02:17:32 +00:00
|
|
|
|
2003-02-06 01:55:17 +00:00
|
|
|
# Finds first chars of up to 5 words
|
2003-02-05 06:50:32 +00:00
|
|
|
"(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?" "<0><1>T</1>the <2>q</2>ick <3>b</3>rown <4>f</4></0>ox"
|
2003-02-06 01:55:17 +00:00
|
|
|
|
2003-02-05 06:50:32 +00:00
|
|
|
"H.*?((?:\B.)+)" "<0>H<1>ello</1></0> "
|
|
|
|
".*?((?:\B.)+).*?((?:\B.)+).*?((?:\B.)+)" "<0>H<1>ello</1> <2> </2>g<3>oodbye</3></0> "
|
2003-02-05 02:17:32 +00:00
|
|
|
|
2003-02-05 06:50:32 +00:00
|
|
|
"(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?.*" "<0> \u0301 \u0301<1>A</1>\u0302BC\u0303\u0304<2> </2>\u0305 \u0306<3>X</3>\u0307Y\u0308</0>"
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# . does not match new-lines
|
2003-02-05 06:50:32 +00:00
|
|
|
"." "\u000a\u000d\u0085\u000c\u2028\u2029<0>X</0>\u000aY"
|
|
|
|
"A." "A\u000a "# no match
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# \d for decimal digits
|
2003-02-05 06:50:32 +00:00
|
|
|
"\d*" "<0>0123456789\u0660\u06F9\u0969\u0A66\u1369\u17E2\uFF10\U0001D7CE\U0001D7FF</0>non-digits"
|
|
|
|
"\D+" "<0>non digits</0>"
|
|
|
|
"\D*(\d*)(\D*)" "<0>non-digits<1>3456666</1><2>more non digits</2></0>"
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# \Q...\E quote mode
|
2003-02-05 06:50:32 +00:00
|
|
|
"hel\Qlo, worl\Ed" "<0>hello, world</0>"
|
2003-02-06 01:55:17 +00:00
|
|
|
"\Q$*^^(*)?\A\E(a*)" "<0>$*^^(*)?\\A<1>aaaaaaaaaaaaaaa</1></0>"
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# \S and \s space characters
|
2003-02-05 06:50:32 +00:00
|
|
|
"\s+" "not_space<0> \t \r \n \u3000 \u2004 \u2028 \u2029</0>xyz"
|
|
|
|
"(\S+).*?(\S+).*" "<0><1>Not-spaces</1> <2>more-non-spaces</2> </0>"
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# \X consumes one combining character sequence.
|
2003-02-05 06:50:32 +00:00
|
|
|
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>A</1><2>B</2><3> </3><4>\r\n</4></0>"
|
|
|
|
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>A\u0301</1><2>\n</2><3>\u0305</3><4>a\u0302\u0303\u0304</4></0>"
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# ^ matches only at beginning of line
|
|
|
|
".*^(Hello)" "<0><1>Hello</1></0> Hello Hello Hello Goodbye"
|
|
|
|
".*(Hello)" "<0>Hello Hello Hello <1>Hello</1></0> Goodbye"
|
|
|
|
".*^(Hello)" " Hello Hello Hello Hello Goodbye"# No Match
|
|
|
|
|
|
|
|
# $ matches only at end of line, or before a newline preceding the end of line
|
|
|
|
".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
|
|
|
|
".*?(Goodbye)" "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
|
|
|
|
".*?(Goodbye)$" "Hello Goodbye> Goodbye Goodbye "# No Match
|
|
|
|
|
2003-02-05 06:50:32 +00:00
|
|
|
".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
|
|
|
|
".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
|
|
|
|
".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\r\n"
|
|
|
|
".*?(Goodbye)$" "Hello Goodbye Goodbye Goodbye\n\n"# No Match
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# \Z matches at end of input, like $ with default flags.
|
2003-02-05 06:50:32 +00:00
|
|
|
".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
|
2003-02-05 02:17:32 +00:00
|
|
|
".*?(Goodbye)" "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
|
2003-02-05 06:50:32 +00:00
|
|
|
".*?(Goodbye)\Z" "Hello Goodbye> Goodbye Goodbye "# No Match
|
|
|
|
"here$" "here\nthe end"# No Match
|
2003-02-05 02:17:32 +00:00
|
|
|
|
2003-02-05 06:50:32 +00:00
|
|
|
".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
|
|
|
|
".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
|
|
|
|
".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\r\n"
|
|
|
|
".*?(Goodbye)\Z" "Hello Goodbye Goodbye Goodbye\n\n"# No Match
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# \z matches only at the end of string.
|
|
|
|
# no special treatment of new lines.
|
|
|
|
# no dependencies on flag settings.
|
2003-02-05 06:50:32 +00:00
|
|
|
".*?(Goodbye)\z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
|
|
|
|
".*?(Goodbye)\z" "Hello Goodbye Goodbye Goodbye "# No Match
|
|
|
|
"here$" "here\nthe end"# No Match
|
2003-02-05 02:17:32 +00:00
|
|
|
|
2003-02-05 06:50:32 +00:00
|
|
|
".*?(Goodbye)\z" "Hello Goodbye Goodbye Goodbye\n"# No Match
|
|
|
|
".*?(Goodbye)\n\z" "<0>Hello Goodbye Goodbye <1>Goodbye</1>\n</0>"
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# (?# comment) doesn't muck up pattern
|
|
|
|
"Hello (?# this is a comment) world" " <0>Hello world</0>..."
|
|
|
|
|
|
|
|
# Check some implementation corner cases based on the way literal strings are compiled.
|
|
|
|
"A" "<0>A</0>"
|
|
|
|
"AB" "<0>AB</0>ABABAB"
|
|
|
|
"AB+" "<0>ABBB</0>A"
|
|
|
|
"AB+" "<0>AB</0>ABAB"
|
|
|
|
"ABC+" "<0>ABC</0>ABC"
|
|
|
|
"ABC+" "<0>ABCCCC</0>ABC"
|
|
|
|
"(?:ABC)+" "<0>ABCABCABC</0>D"
|
|
|
|
"(?:ABC)DEF+" "<0>ABCDEFFF</0>D"
|
2003-02-05 06:50:32 +00:00
|
|
|
"AB\.C\eD\u0666E" "<0>AB.C\u001BD\u0666E</0>F"
|
2003-02-07 02:04:14 +00:00
|
|
|
"ab\Bde" "<0>abde</0>"
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
|
|
|
|
# {min,max} iteration quantifier
|
|
|
|
"A{3}BC" "<0>AAABC</0>"
|
|
|
|
|
|
|
|
"(ABC){2,3}AB" "no matchAB"
|
|
|
|
"(ABC){2,3}AB" "ABCAB"
|
|
|
|
"(ABC){2,3}AB" "<0>ABC<1>ABC</1>AB</0>"
|
|
|
|
"(ABC){2,3}AB" "<0>ABCABC<1>ABC</1>AB</0>"
|
|
|
|
"(ABC){2,3}AB" "<0>ABCABC<1>ABC</1>AB</0>CAB"
|
|
|
|
|
|
|
|
"(ABC){2}AB" "ABCAB"
|
|
|
|
"(ABC){2}AB" "<0>ABC<1>ABC</1>AB</0>"
|
|
|
|
"(ABC){2}AB" "<0>ABC<1>ABC</1>AB</0>CAB"
|
|
|
|
"(ABC){2}AB" "<0>ABC<1>ABC</1>AB</0>CABCAB"
|
|
|
|
|
|
|
|
"(ABC){2,}AB" "ABCAB"
|
|
|
|
"(ABC){2,}AB" "<0>ABC<1>ABC</1>AB</0>"
|
|
|
|
"(ABC){2,}AB" "<0>ABCABC<1>ABC</1>AB</0>"
|
|
|
|
"(ABC){2,}AB" "<0>ABCABCABC<1>ABC</1>AB</0>"
|
|
|
|
|
|
|
|
"X{0,0}ABC" "<0>ABC</0>"
|
|
|
|
"X{0,1}ABC" "<0>ABC</0>"
|
|
|
|
|
|
|
|
"(?:Hello(!{1,3}) there){1}" "Hello there"
|
|
|
|
"(?:Hello(!{1,3}) there){1}" "<0>Hello<1>!</1> there</0>"
|
|
|
|
"(?:Hello(!{1,3}) there){1}" "<0>Hello<1>!!</1> there</0>"
|
|
|
|
"(?:Hello(!{1,3}) there){1}" "<0>Hello<1>!!!</1> there</0>"
|
|
|
|
"(?:Hello(!{1,3}) there){1}" "Hello!!!! there"
|
|
|
|
|
|
|
|
# Non-greedy {min,max}? intervals
|
|
|
|
"(ABC){2,3}?AB" "no matchAB"
|
|
|
|
"(ABC){2,3}?AB" "ABCAB"
|
|
|
|
"(ABC){2,3}?AB" "<0>ABC<1>ABC</1>AB</0>"
|
|
|
|
"(ABC){2,3}?AB" "<0>ABC<1>ABC</1>AB</0>CAB"
|
|
|
|
"(ABC){2,3}?AB" "<0>ABC<1>ABC</1>AB</0>CABCAB"
|
|
|
|
"(ABC){2,3}?AX" "<0>ABCABC<1>ABC</1>AX</0>"
|
|
|
|
"(ABC){2,3}?AX" "ABC<0>ABCABC<1>ABC</1>AX</0>"
|
|
|
|
|
|
|
|
# Atomic Grouping
|
|
|
|
"(?>.*)abc" "abcabcabc" # no match. .* consumed entire string.
|
|
|
|
"(?>(abc{2,4}?))(c*)" "<0><1>abcc</1><2>ccc</2></0>ddd"
|
2003-02-05 06:50:32 +00:00
|
|
|
"(\.\d\d(?>[1-9]?))\d+" "1.625"
|
|
|
|
"(\.\d\d(?>[1-9]?))\d+" "1<0><1>.625</1>0</0>"
|
2003-02-05 02:17:32 +00:00
|
|
|
|
|
|
|
# Possessive *+
|
|
|
|
"(abc)*+a" "abcabcabc"
|
|
|
|
"(abc)*+a" "<0>abc<1>abc</1>a</0>b"
|
|
|
|
"(a*b)*+a" "<0><1>aaaab</1>a</0>aaa"
|
|
|
|
|
|
|
|
# Possessive ?+
|
|
|
|
"c?+ddd" "<0>cddd</0>"
|
|
|
|
"c?+cddd" "cddd"
|
|
|
|
"c?cddd" "<0>cddd</0>"
|
|
|
|
|
|
|
|
# Back Reference
|
2003-02-05 06:50:32 +00:00
|
|
|
"(?:ab(..)cd\1)*" "<0>ab23cd23ab<1>ww</1>cdww</0>abxxcdyy"
|
|
|
|
"ab(?:c|(d?))(\1)" "<0>ab<1><2></2></1></0>c"
|
|
|
|
"ab(?:c|(d?))(\1)" "<0>ab<1>d</1><2>d</2></0>"
|
|
|
|
"ab(?:c|(d?))(\1)" "<0>ab<1></1><2></2></0>e"
|
|
|
|
"ab(?:c|(d?))(\1)" "<0>ab<1></1><2></2></0>"
|