# # file: # # ICU regular expression test cases. # # format: one test case per line, # <test case> = <pattern> <flags> <match string> [# comment] # <pattern> = "<regular expression pattern>" # <match string> = "<tagged string>" # the quotes on the pattern and match string can be " or ' or / # <tagged string> = text, with the start and end of each # capture group tagged with <n>...</n>. The overall match, # if any, is group 0, as in <0>matched text</0> # <flags> = any combination of # i case insensitive match # x free spacing and comments # s dot-matches-all mode # m multi-line mode. $ and ^ match at embedded new-lines # d dump the compiled pattern # t trace operation of match engine. # White space must be present between the flags and the match string. # # Capturing parens ".(..)." "<0>a<1>bcd" ".*\A( +hello)" "<0><1> hello" "(hello)|(goodbye)" "<0><1>hello" "(hello)|(goodbye)" "<0><2>goodbye" "abc( +( inner(X?) +) xyz)" "leading cruft <0>abc<1> <2> inner<3> xyz cruft" "\s*([ixsmdt]*)([:letter:]*)" "<0> <1>d<2> " # Non-capturing parens (?: stuff). Groups, but does not capture. "(?:abc)*(tail)" "<0>abcabcabc<1>tail" # Non-greedy *? quantifier ".*?(abc)" "<0> abx <1>abc abc abc abc" ".*(abc)" "<0> abx abc abc abc <1>abc" "((?:abc |xyz )*?)abc " "<0><1>xyz abc abc abc " "((?:abc |xyz )*)abc " "<0><1>xyz abc abc abc " # Non-greedy +? quantifier "(a+?)(a*)" "<0><1>a<2>aaaaaaaaaaaa" "(a+)(a*)" "<0><1>aaaaaaaaaaaaa<2>" "((ab)+?)((ab)*)" "<0><1><2>ab<3>ababababab<4>ab" "((ab)+)((ab)*)" "<0><1>abababababab<2>ab<3>" # Non-greedy ?? quantifier "(ab)(ab)??(ab)??(ab)??(ab)??c" "<0><1>ab<4>ab<5>abc" # Unicode Properties as naked elements in a pattern "\p{Lu}+" "here we go ... <0>ABC and no more." "(\p{L}+)(\P{L}*?) (\p{Zs}*)" "7999<0><1>letters<2>4949%^&*( <3> " # \w and \W "\w+" " $%^&*( <0>hello123%^&*(" "\W+" "<0> $%^&*( hello123%^&*(" # \A match at beginning of input only. ".*\Ahello" "<0>hello hello" ".*hello" "<0>hello hello" ".*\Ahello" "stuff\nhello" # don't match after embedded new-line. 
# \b \B ".*?\b(.).*" "<0> $%^&*( <1>hello123%^&*()gxx" "\ba\b" "-<0>a" "\by\b" "xy" # Finds first chars of up to 5 words "(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?" "<0><1>Tthe <2>qick <3>brown <4>fox" "H.*?((?:\B.)+)" "<0>H<1>ello " ".*?((?:\B.)+).*?((?:\B.)+).*?((?:\B.)+)" "<0>H<1>ello <2> g<3>oodbye " "(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?.*" "<0> \u0301 \u0301<1>A\u0302BC\u0303\u0304<2> \u0305 \u0306<3>X\u0307Y\u0308" # . does not match new-lines "." "\u000a\u000d\u0085\u000c\u2028\u2029<0>X\u000aY" "A." "A\u000a "# no match # \d for decimal digits "\d*" "<0>0123456789\u0660\u06F9\u0969\u0A66\u1369\u17E2\uFF10\U0001D7CE\U0001D7FFnon-digits" "\D+" "<0>non digits" "\D*(\d*)(\D*)" "<0>non-digits<1>3456666<2>more non digits" # \Q...\E quote mode "hel\Qlo, worl\Ed" "<0>hello, world" "\Q$*^^(*)?\A\E(a*)" "<0>$*^^(*)?\\A<1>aaaaaaaaaaaaaaa" # \S and \s space characters "\s+" "not_space<0> \t \r \n \u3000 \u2004 \u2028 \u2029xyz" "(\S+).*?(\S+).*" "<0><1>Not-spaces <2>more-non-spaces " # \X consume one combining char sequence. "(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>A<2>B<3> <4>\r\n" "(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" 
"<0><1>A\u0301<2>\n<3>\u0305<4>a\u0302\u0303\u0304" # ^ matches only at beginning of line ".*^(Hello)" "<0><1>Hello Hello Hello Hello Goodbye" ".*(Hello)" "<0>Hello Hello Hello <1>Hello Goodbye" ".*^(Hello)" " Hello Hello Hello Hello Goodbye"# No Match # $ matches only at end of line, or before a newline preceding the end of line ".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye" ".*?(Goodbye)" "<0>Hello <1>Goodbye Goodbye Goodbye" ".*?(Goodbye)$" "Hello Goodbye> Goodbye Goodbye "# No Match ".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye\n" ".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye\n" ".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye\r\n" ".*?(Goodbye)$" "Hello Goodbye Goodbye Goodbye\n\n"# No Match # \Z matches at end of input, like $ with default flags. ".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye" ".*?(Goodbye)" "<0>Hello <1>Goodbye Goodbye Goodbye" ".*?(Goodbye)\Z" "Hello Goodbye> Goodbye Goodbye "# No Match "here$" "here\nthe end"# No Match ".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye\n" ".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye\n" ".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye\r\n" ".*?(Goodbye)\Z" "Hello Goodbye Goodbye Goodbye\n\n"# No Match # \z matches only at the end of string. # no special treatment of new lines. # no dependencies on flag settings. ".*?(Goodbye)\z" "<0>Hello Goodbye Goodbye <1>Goodbye" ".*?(Goodbye)\z" "Hello Goodbye Goodbye Goodbye "# No Match "here$" "here\nthe end"# No Match ".*?(Goodbye)\z" "Hello Goodbye Goodbye Goodbye\n"# No Match ".*?(Goodbye)\n\z" "<0>Hello Goodbye Goodbye <1>Goodbye\n" # (?# comment) doesn't muck up pattern "Hello (?# this is a comment) world" " <0>Hello world..." # Check some implementation corner cases based on the way literal strings are compiled. 
"A" "<0>A" "AB" "<0>ABABABAB" "AB+" "<0>ABBBA" "AB+" "<0>ABABAB" "ABC+" "<0>ABCABC" "ABC+" "<0>ABCCCCABC" "(?:ABC)+" "<0>ABCABCABCD" "(?:ABC)DEF+" "<0>ABCDEFFFD" "AB\.C\eD\u0666E" "<0>AB.C\u001BD\u0666EF" "ab\Bde" "<0>abde" # {min,max} iteration qualifier "A{3}BC" "<0>AAABC" "(ABC){2,3}AB" "no matchAB" "(ABC){2,3}AB" "ABCAB" "(ABC){2,3}AB" "<0>ABC<1>ABCAB" "(ABC){2,3}AB" "<0>ABCABC<1>ABCAB" "(ABC){2,3}AB" "<0>ABCABC<1>ABCABCAB" "(ABC){2}AB" "ABCAB" "(ABC){2}AB" "<0>ABC<1>ABCAB" "(ABC){2}AB" "<0>ABC<1>ABCABCAB" "(ABC){2}AB" "<0>ABC<1>ABCABCABCAB" "(ABC){2,}AB" "ABCAB" "(ABC){2,}AB" "<0>ABC<1>ABCAB" "(ABC){2,}AB" "<0>ABCABC<1>ABCAB" "(ABC){2,}AB" "<0>ABCABCABC<1>ABCAB" "X{0,0}ABC" "<0>ABC" "X{0,1}ABC" "<0>ABC" "(?:Hello(!{1,3}) there){1}" "Hello there" "(?:Hello(!{1,3}) there){1}" "<0>Hello<1>! there" "(?:Hello(!{1,3}) there){1}" "<0>Hello<1>!! there" "(?:Hello(!{1,3}) there){1}" "<0>Hello<1>!!! there" "(?:Hello(!{1,3}) there){1}" "Hello!!!! there" # Nongreedy {min,max}? intervals "(ABC){2,3}?AB" "no matchAB" "(ABC){2,3}?AB" "ABCAB" "(ABC){2,3}?AB" "<0>ABC<1>ABCAB" "(ABC){2,3}?AB" "<0>ABC<1>ABCABCAB" "(ABC){2,3}?AB" "<0>ABC<1>ABCABCABCAB" "(ABC){2,3}?AX" "<0>ABCABC<1>ABCAX" "(ABC){2,3}?AX" "ABC<0>ABCABC<1>ABCAX" # Atomic Grouping "(?>.*)abc" "abcabcabc" # no match. .* consumed entire string. "(?>(abc{2,4}?))(c*)" "<0><1>abcc<2>cccddd" "(\.\d\d(?>[1-9]?))\d+" "1.625" "(\.\d\d(?>[1-9]?))\d+" "1<0><1>.6250" # Possessive *+ "(abc)*+a" "abcabcabc" "(abc)*+a" "<0>abc<1>abcab" "(a*b)*+a" "<0><1>aaaabaaaa" # Possessive ?+ "c?+ddd" "<0>cddd" "c?+cddd" "cddd" "c?cddd" "<0>cddd" # Back Reference "(?:ab(..)cd\1)*" "<0>ab23cd23ab<1>wwcdwwabxxcdyy" "ab(?:c|(d?))(\1)" "<0>ab<1><2>c" "ab(?:c|(d?))(\1)" "<0>ab<1>d<2>d" "ab(?:c|(d?))(\1)" "<0>ab<1><2>e" "ab(?:c|(d?))(\1)" "<0>ab<1><2>"