From c06a6956a40a878fdbe319e4688196e7c990bf7a Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 18 Nov 2004 23:57:34 +0000 Subject: [PATCH] [BZ #544] Update. 2004-11-18 Jakub Jelinek [BZ #544] * posix/regex.h (RE_NO_SUB): New define. * posix/regex_internal.h (OP_DELETED_SUBEXP): New. (re_dfa_t): Add subexp_map. * posix/regcomp.c (struct subexp_optimize): New type. (optimize_subexps): New routine. (re_compile_internal): Call it. (re_compile_pattern): Set preg->no_sub to 1 if RE_NO_SUB. (free_dfa_content): Free subexp_map. (calc_inveclosure, calc_eclosure): Skip OP_DELETED_SUBEXP nodes. * posix/regexec.c (re_search_internal): If subexp_map is not NULL, duplicate registers as needed. * posix/Makefile: Add rules to build and run tst-regex2. * posix/tst-regex2.c: New test. * posix/rxspencer/tests: Fix last two tests (\0 -> \1). Add some new tests for nested subexpressions. --- ChangeLog | 20 ++++ posix/Makefile | 5 +- posix/regcomp.c | 105 ++++++++++++++++- posix/regex.h | 4 + posix/regex_internal.h | 2 + posix/regexec.c | 12 ++ posix/rxspencer/tests | 20 +++- posix/tst-regex2.c | 244 ++++++++++++++++++++++++++++++++++++++++ stdio-common/vfprintf.c | 2 +- 9 files changed, 408 insertions(+), 6 deletions(-) create mode 100644 posix/tst-regex2.c diff --git a/ChangeLog b/ChangeLog index d973d88707..9cadb342fc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,23 @@ +2004-11-18 Jakub Jelinek + + [BZ #544] + * posix/regex.h (RE_NO_SUB): New define. + * posix/regex_internal.h (OP_DELETED_SUBEXP): New. + (re_dfa_t): Add subexp_map. + * posix/regcomp.c (struct subexp_optimize): New type. + (optimize_subexps): New routine. + (re_compile_internal): Call it. + (re_compile_pattern): Set preg->no_sub to 1 if RE_NO_SUB. + (free_dfa_content): Free subexp_map. + (calc_inveclosure, calc_eclosure): Skip OP_DELETED_SUBEXP + nodes. + * posix/regexec.c (re_search_internal): If subexp_map + is not NULL, duplicate registers as needed. 
+ * posix/Makefile: Add rules to build and run tst-regex2. + * posix/tst-regex2.c: New test. + * posix/rxspencer/tests: Fix last two tests (\0 -> \1). + Add some new tests for nested subexpressions. + 2004-11-18 Ulrich Drepper * libio/libio.h (_IO_FLAGS2_FORTIFY): Renamed from diff --git a/posix/Makefile b/posix/Makefile index 744f49f5b3..c89cfc0b41 100644 --- a/posix/Makefile +++ b/posix/Makefile @@ -80,7 +80,7 @@ tests := tstgetopt testfnm runtests runptests \ bug-regex13 bug-regex14 bug-regex15 bug-regex16 \ bug-regex17 bug-regex18 bug-regex19 bug-regex20 \ bug-regex21 bug-regex22 bug-regex23 bug-regex24 \ - tst-nice tst-nanosleep \ + tst-nice tst-nanosleep tst-regex2 \ transbug tst-rxspencer tst-pcre tst-boost \ bug-ga1 tst-vfork1 tst-vfork2 tst-waitid \ tst-getaddrinfo2 bug-glob1 bug-glob2 @@ -160,6 +160,7 @@ tst-fnmatch-ENV = LOCPATH=$(common-objpfx)localedata tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata bug-regex1-ENV = LOCPATH=$(common-objpfx)localedata tst-regex-ENV = LOCPATH=$(common-objpfx)localedata +tst-regex2-ENV = LOCPATH=$(common-objpfx)localedata bug-regex5-ENV = LOCPATH=$(common-objpfx)localedata bug-regex6-ENV = LOCPATH=$(common-objpfx)localedata bug-regex17-ENV = LOCPATH=$(common-objpfx)localedata @@ -244,8 +245,10 @@ $(objpfx)tst-getconf.out: tst-getconf.sh $(objpfx)getconf ifeq (yes,$(build-shared)) $(objpfx)tst-regex: $(common-objpfx)rt/librt.so +$(objpfx)tst-regex2: $(common-objpfx)rt/librt.so else $(objpfx)tst-regex: $(common-objpfx)rt/librt.a +$(objpfx)tst-regex2: $(common-objpfx)rt/librt.a endif $(objpfx)bug-ga2-mem: $(objpfx)bug-ga2.out diff --git a/posix/regcomp.c b/posix/regcomp.c index ba7a1cc5d4..dafad9bd0c 100644 --- a/posix/regcomp.c +++ b/posix/regcomp.c @@ -33,6 +33,14 @@ static reg_errcode_t create_initial_state (re_dfa_t *dfa); #ifdef RE_ENABLE_I18N static void optimize_utf8 (re_dfa_t *dfa); #endif +struct subexp_optimize +{ + re_dfa_t *dfa; + re_token_t *nodes; + int no_sub, re_nsub; +}; +static bin_tree_t 
*optimize_subexps (struct subexp_optimize *so, + bin_tree_t *node, int sidx, int depth); static reg_errcode_t analyze (re_dfa_t *dfa); static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node); static void calc_first (re_dfa_t *dfa, bin_tree_t *node); @@ -238,8 +246,8 @@ re_compile_pattern (pattern, length, bufp) /* And GNU code determines whether or not to get register information by passing null for the REGS argument to re_match, etc., not by - setting no_sub. */ - bufp->no_sub = 0; + setting no_sub, unless RE_NO_SUB is set. */ + bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); /* Match anchors at newline. */ bufp->newline_anchor = 1; @@ -633,6 +641,7 @@ free_dfa_content (re_dfa_t *dfa) if (dfa->sb_char != utf8_sb_map) re_free (dfa->sb_char); #endif + re_free (dfa->subexp_map); #ifdef DEBUG re_free (dfa->re_str); #endif @@ -810,6 +819,17 @@ re_compile_internal (preg, pattern, length, syntax) optimize_utf8 (dfa); #endif + if (preg->re_nsub > 0) + { + struct subexp_optimize so; + + so.dfa = dfa; + so.nodes = dfa->nodes; + so.no_sub = preg->no_sub; + so.re_nsub = preg->re_nsub; + dfa->str_tree = optimize_subexps (&so, dfa->str_tree, -1, 0); + } + /* Analyze the tree and collect information which is necessary to create the dfa. 
*/ err = analyze (dfa); @@ -1121,6 +1141,82 @@ optimize_utf8 (dfa) } #endif +static bin_tree_t * +optimize_subexps (so, node, sidx, depth) + struct subexp_optimize *so; + bin_tree_t *node; + int sidx, depth; +{ + int idx, new_depth, new_sidx; + bin_tree_t *ret; + if (node == NULL) + return NULL; + + new_depth = 0; + new_sidx = sidx; + if ((depth & 1) && node->type == CONCAT + && node->right && node->right->type == 0 + && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP) + { + new_depth = depth + 1; + if (new_depth == 2 + || (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map) + && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx))) + new_sidx = so->nodes[idx].opr.idx; + } + node->left = optimize_subexps (so, node->left, new_sidx, new_depth); + new_depth = (depth & 1) == 0 && node->type == CONCAT + && node->left && node->left->type == 0 + && so->nodes[node->left->node_idx].type == OP_OPEN_SUBEXP + ? depth + 1 : 0; + node->right = optimize_subexps (so, node->right, sidx, new_depth); + + if (node->type != CONCAT) + return node; + if ((depth & 1) == 0 + && node->left + && node->left->type == 0 + && so->nodes[idx = node->left->node_idx].type == OP_OPEN_SUBEXP) + ret = node->right; + else if ((depth & 1) + && node->right + && node->right->type == 0 + && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP) + ret = node->left; + else + return node; + + if (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map) + && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx)) + return node; + + if (!so->no_sub) + { + int i; + + if (depth < 2) + return node; + + if (so->dfa->subexp_map == NULL) + { + so->dfa->subexp_map = re_malloc (int, so->re_nsub); + if (so->dfa->subexp_map == NULL) + return node; + + for (i = 0; i < so->re_nsub; i++) + so->dfa->subexp_map[i] = i; + } + + i = so->nodes[idx].opr.idx; + assert (sidx < i); + so->dfa->subexp_map[i] = sidx; + } + + so->nodes[idx].type = OP_DELETED_SUBEXP; + ret->parent = 
node->parent; + return ret; +} + /* Analyze the structure tree, and calculate "first", "next", "edest", "eclosure", and "inveclosure". */ @@ -1525,6 +1621,8 @@ calc_inveclosure (dfa) int src, idx, dest; for (src = 0; src < dfa->nodes_len; ++src) { + if (dfa->nodes[src].type == OP_DELETED_SUBEXP) + continue; for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) { dest = dfa->eclosures[src].elems[idx]; @@ -1560,6 +1658,9 @@ calc_eclosure (dfa) #ifdef DEBUG assert (dfa->eclosures[node_idx].nelem != -1); #endif + if (dfa->nodes[node_idx].type == OP_DELETED_SUBEXP) + continue; + /* If we have already calculated, skip it. */ if (dfa->eclosures[node_idx].nelem != 0) continue; diff --git a/posix/regex.h b/posix/regex.h index 4279dbcc5b..b2d9a62fec 100644 --- a/posix/regex.h +++ b/posix/regex.h @@ -179,6 +179,10 @@ typedef unsigned long int reg_syntax_t; immediately after an alternation or begin-group operator. */ #define RE_CONTEXT_INVALID_DUP (RE_CARET_ANCHORS_HERE << 1) +/* If this bit is set, then no_sub will be set to 1 during + re_compile_pattern. */ +#define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1) + /* This global variable defines the particular regexp syntax to use (for some interfaces). When a regexp is compiled, the syntax used is stored in the pattern buffer, so changing this does not affect diff --git a/posix/regex_internal.h b/posix/regex_internal.h index 14d95a5b84..a778032d77 100644 --- a/posix/regex_internal.h +++ b/posix/regex_internal.h @@ -189,6 +189,7 @@ typedef enum OP_DUP_PLUS = EPSILON_BIT | 4, OP_DUP_QUESTION = EPSILON_BIT | 5, ANCHOR = EPSILON_BIT | 6, + OP_DELETED_SUBEXP = EPSILON_BIT | 7, /* Tree type, these are used only by tree. 
*/ CONCAT = 16, @@ -644,6 +645,7 @@ struct re_dfa_t int mb_cur_max; bitset word_char; reg_syntax_t syntax; + int *subexp_map; #ifdef DEBUG char* re_str; #endif diff --git a/posix/regexec.c b/posix/regexec.c index a03df2636a..5877adeb55 100644 --- a/posix/regexec.c +++ b/posix/regexec.c @@ -882,6 +882,18 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, pmatch[reg_idx].rm_so += match_first; pmatch[reg_idx].rm_eo += match_first; } + + if (dfa->subexp_map) + for (reg_idx = 0; + reg_idx + 1 < nmatch && reg_idx < preg->re_nsub; + reg_idx++) + if (dfa->subexp_map[reg_idx] != reg_idx) + { + pmatch[reg_idx + 1].rm_so + = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so; + pmatch[reg_idx + 1].rm_eo + = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo; + } } free_return: diff --git a/posix/rxspencer/tests b/posix/rxspencer/tests index 30fff15946..a724252d8c 100644 --- a/posix/rxspencer/tests +++ b/posix/rxspencer/tests @@ -508,5 +508,21 @@ a*a*a*a*a*a*a* & aaaaaa aaaaaa (\b){0} - x @x - \(\b\)\{0,0\} b abc @abc - a(\b){0}c - ac ac - -a(.*)b(\0){0}c - abc abc @bc,- -a(.*)b(\0){0}c - axbc axbc x,- +a(.*)b(\1){0}c - abc abc @bc,- +a(.*)b(\1){0}c - axbc axbc x,- + +a\(\(b*\)\)c\1d b abbcbbd abbcbbd bb,bb +a\(\([bc]\)\)\2d b abcdabbd abbd b,b +a\(\(\(\([bc]\)\)\3\)\)*d b abbccd abbccd cc,cc,c,c +a(b)(c)d - abcd abcd b,c +a(((b)))c - abc abc b,b,b +a(((b|(((c))))))d - abd abd b,b,b,-,-,- +a(((b*|c|e)))d - abbd abbd bb,bb,bb +a((b|c)){0,0}d - ad ad -,- +a((b|c)){0,1}d - abd abd b,b +a((b|c)){0,2}d - abcd abcd c,c +a((b+|((c)*)))+d - abd abd b,b,-,- +a((b+|((c)*)))+d - abcd abcd c,c,c,c +(((\b))){0} - x @x -,-,- +a(((.*)))b((\2)){0}c - abc abc @bc,@bc,@bc,-,- +a(((.*)))b((\1)){0}c - axbc axbc x,x,x,-,- diff --git a/posix/tst-regex2.c b/posix/tst-regex2.c new file mode 100644 index 0000000000..a107f73ba4 --- /dev/null +++ b/posix/tst-regex2.c @@ -0,0 +1,244 @@ +#include <fcntl.h> +#include <locale.h> +#include <regex.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <time.h> +#include <unistd.h> + +#ifdef
_POSIX_CPUTIME +static clockid_t cl; +static int use_clock; +#endif + +static int +do_test (void) +{ +#ifdef _POSIX_CPUTIME + /* See whether we can use the CPU clock. */ + use_clock = clock_getcpuclockid (0, &cl) == 0; +#endif + + static const char *pat[] = { + ".?.?.?.?.?.?.?Log\\.13", + "(.?)(.?)(.?)(.?)(.?)(.?)(.?)Log\\.13", + "((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))" + "((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))" + "((((((((((.?))))))))))Log\\.13" }; + + int fd = open ("../ChangeLog.14", O_RDONLY); + if (fd < 0) + { + printf ("Couldn't open ChangeLog.14: %m\n"); + return 1; + } + + struct stat64 st; + if (fstat64 (fd, &st) < 0) + { + printf ("Couldn't fstat ChangeLog.14: %m\n"); + return 1; + } + + char *buf = malloc (st.st_size + 1); + if (buf == NULL) + { + printf ("Couldn't allocate buffer: %m\n"); + return 1; + } + + if (read (fd, buf, st.st_size) != (ssize_t) st.st_size) + { + puts ("Couldn't read ChangeLog.14"); + return 1; + } + + close (fd); + buf[st.st_size] = '\0'; + + setlocale (LC_ALL, "de_DE.UTF-8"); + + char *string = buf; + size_t len = st.st_size; + +#ifndef WHOLE_FILE_TIMING + /* Don't search the whole file normally, it takes too long. */ + if (len > 500000 + 64) + { + string += 500000; + len -= 500000; + } +#endif + + for (int testno = 0; testno < 4; ++testno) + for (int i = 0; i < sizeof (pat) / sizeof (pat[0]); ++i) + { + printf ("test %d pattern %d", testno, i); + + regex_t rbuf; + struct re_pattern_buffer rpbuf; + int err; + if (testno < 2) + { + err = regcomp (&rbuf, pat[i], + REG_EXTENDED | (testno ? REG_NOSUB : 0)); + if (err != 0) + { + putchar ('\n'); + char errstr[300]; + regerror (err, &rbuf, errstr, sizeof (errstr)); + puts (errstr); + return err; + } + } + else + { + re_set_syntax (RE_SYNTAX_POSIX_EGREP + | (testno == 3 ? 
RE_NO_SUB : 0)); + + memset (&rpbuf, 0, sizeof (rpbuf)); + const char *s = re_compile_pattern (pat[i], strlen (pat[i]), + &rpbuf); + if (s != NULL) + { + printf ("\n%s\n", s); + return 1; + } + + /* Just so that this can be tested with earlier glibc as well. */ + if (testno == 3) + rpbuf.no_sub = 1; + } + +#ifdef _POSIX_CPUTIME + struct timespec start, stop; + if (use_clock) + use_clock = clock_gettime (cl, &start) == 0; +#endif + + if (testno < 2) + { + regmatch_t pmatch[71]; + err = regexec (&rbuf, string, 71, pmatch, 0); + if (err == REG_NOMATCH) + { + puts ("\nregexec failed"); + return 1; + } + + if (testno == 0) + { + if (pmatch[0].rm_eo != pmatch[0].rm_so + 13 + || pmatch[0].rm_eo > len + || pmatch[0].rm_so < len - 100 + || strncmp (string + pmatch[0].rm_so, + " ChangeLog.13 for earlier changes", + sizeof " ChangeLog.13 for earlier changes" - 1) + != 0) + { + puts ("\nregexec without REG_NOSUB did not find the correct match"); + return 1; + } + + if (i > 0) + for (int j = 0, l = 1; j < 7; ++j) + for (int k = 0; k < (i == 1 ? 1 : 10); ++k, ++l) + if (pmatch[l].rm_so != pmatch[0].rm_so + j + || pmatch[l].rm_eo != pmatch[l].rm_so + 1) + { + printf ("\npmatch[%d] incorrect\n", l); + return 1; + } + } + } + else + { + struct re_registers regs; + + memset (&regs, 0, sizeof (regs)); + int match = re_search (&rpbuf, string, len, 0, len, + &regs); + if (match < 0) + { + puts ("\nre_search failed"); + return 1; + } + + if (match + 13 > len + || match < len - 100 + || strncmp (string + match, + " ChangeLog.13 for earlier changes", + sizeof " ChangeLog.13 for earlier changes" - 1) + != 0) + { + puts ("\nre_search did not find the correct match"); + return 1; + } + + if (testno == 2) + { + if (regs.num_regs != 2 + (i == 0 ? 0 : i == 1 ?
7 : 70)) + { + printf ("\nincorrect num_regs %d\n", regs.num_regs); + return 1; + } + + if (regs.start[0] != match || regs.end[0] != match + 13) + { + printf ("\nincorrect regs.{start,end}[0] = { %d, %d}\n", + regs.start[0], regs.end[0]); + return 1; + } + + if (regs.start[regs.num_regs - 1] != -1 + || regs.end[regs.num_regs - 1] != -1) + { + puts ("\nincorrect regs.{start,end}[num_regs - 1]"); + return 1; + } + + if (i > 0) + for (int j = 0, l = 1; j < 7; ++j) + for (int k = 0; k < (i == 1 ? 1 : 10); ++k, ++l) + if (regs.start[l] != match + j + || regs.end[l] != regs.start[l] + 1) + { + printf ("\nregs.{start,end}[%d] incorrect\n", l); + return 1; + } + } + } + +#ifdef _POSIX_CPUTIME + if (use_clock) + use_clock = clock_gettime (cl, &stop) == 0; + if (use_clock) + { + stop.tv_sec -= start.tv_sec; + if (stop.tv_nsec < start.tv_nsec) + { + stop.tv_sec--; + stop.tv_nsec += 1000000000 - start.tv_nsec; + } + else + stop.tv_nsec -= start.tv_nsec; + printf (": %ld.%09lds\n", (long) stop.tv_sec, (long) stop.tv_nsec); + } + else +#endif + putchar ('\n'); + + if (testno < 2) + regfree (&rbuf); + else + regfree (&rpbuf); + } + + return 0; +} + +#define TIMEOUT 20 +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" diff --git a/stdio-common/vfprintf.c b/stdio-common/vfprintf.c index 3f0e1dee5e..832a6ed547 100644 --- a/stdio-common/vfprintf.c +++ b/stdio-common/vfprintf.c @@ -893,7 +893,7 @@ vfprintf (FILE *s, const CHAR_T *format, va_list ap) * sizeof (CHAR_T))); \ } \ if (readonly_format < 0) \ - __libc_fatal ("*** %n is writable segment detected ***\n"); \ + __libc_fatal ("*** %n in writable segment detected ***\n"); \ } \ /* Answer the count of characters written. */ \ if (fspec == NULL) \