diff --git a/ChangeLog b/ChangeLog index b132ab7926..41b32438c0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,16 @@ 2002-02-26 Ulrich Drepper + * posix/Makefile (distribute): Add regcomp.c, regexec.c, + regex_internal.c, and regex_internal.h. + (CFLAGS-regex.c): Replace -DMBS_SUPPORT with -DRE_ENABLE_I18N. + * posix/regex.c: Complete rewrite. + * posix/regexec.c: New file. + * posix/regcomp.c: New file. + * posix/regex_internal.c: New file. + * posix/regex_internal.h: New file. + * posix/regex.h (RE_ICASE): New macro. + Contributed by Isamu Hasegawa . + * stdio-common/vfscanf.c (_IO_vfwscanf): Always use ungetc, never ungetwc. It's a macro. * libio/tst-swscanf.c (do_test): Adjust for now fixed wscanf diff --git a/NEWS b/NEWS index 42ca9c7493..6bd0f4b563 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -GNU C Library NEWS -- history of user-visible changes. 2002-1-28 +GNU C Library NEWS -- history of user-visible changes. 2002-2-26 Copyright (C) 1992-2000, 2001, 2002 Free Software Foundation, Inc. See the end for copying conditions. @@ -25,6 +25,9 @@ Version 2.3 * The malloc functions were completely rewritten by Wolfram Gloger based on Doug Lea's malloc-2.7.0.c. + +* Isamu Hasegawa contributed a completely new and POSIX conforming + implementation of regex. Version 2.2.5 diff --git a/posix/Makefile b/posix/Makefile index 785c227ebb..008154de9a 100644 --- a/posix/Makefile +++ b/posix/Makefile @@ -1,4 +1,4 @@ -# Copyright (C) 1991-1999, 2000, 2001 Free Software Foundation, Inc. +# Copyright (C) 1991-1999, 2000, 2001, 2002 Free Software Foundation, Inc. # This file is part of the GNU C Library. 
# The GNU C Library is free software; you can redistribute it and/or @@ -32,7 +32,8 @@ headers := sys/utsname.h sys/times.h sys/wait.h sys/types.h unistd.h \ distribute := confstr.h TESTS TESTS2C.sed testcases.h \ PTESTS PTESTS2C.sed ptestcases.h \ globtest.c globtest.sh wordexp-tst.sh annexc.c fnmatch_loop.c \ - spawn_int.h tst-getconf.sh + spawn_int.h tst-getconf.sh regcomp.c regexec.c regex_internal.c \ + regex_internal.h routines := \ uname \ @@ -109,7 +110,7 @@ $(objpfx)wordexp-tst.out: wordexp-tst.sh $(objpfx)wordexp-test endif endif -CFLAGS-regex.c = -Wno-strict-prototypes -DMBS_SUPPORT +CFLAGS-regex.c = -Wno-strict-prototypes -DRE_ENABLE_I18N CFLAGS-getaddrinfo.c = -DRESOLVER tstgetopt-ARGS = -a -b -cfoobar --required foobar --optional=bazbug \ --none random --col --color --colour diff --git a/posix/regcomp.c b/posix/regcomp.c new file mode 100644 index 0000000000..12da043062 --- /dev/null +++ b/posix/regcomp.c @@ -0,0 +1,3092 @@ +/* Extended regular expression matching and search library. + Copyright (C) 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Isamu Hasegawa . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/
+
+/* NOTE(review): the <...> operands of the #include directives below were
+   missing from this copy of the patch (the angle-bracket text had been
+   stripped).  The names restored here are inferred from the identifiers
+   this file uses (assert, isalnum, LC_MESSAGES, malloc/abort/wctomb,
+   memset/strlen, _NL_CURRENT, gettext); confirm them against the
+   upstream file.  */
+#include <assert.h>
+#include <ctype.h>
+#include <limits.h>
+#include <locale.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#ifdef _LIBC
+# ifndef _RE_DEFINE_LOCALE_FUNCTIONS
+#  define _RE_DEFINE_LOCALE_FUNCTIONS 1
+#   include <locale/localeinfo.h>
+#   include <locale/elem.h>
+#   include <locale/coll-lookup.h>
+# endif
+#endif
+
+/* This is for other GNU distributions with internationalized messages.  */
+#if HAVE_LIBINTL_H || defined _LIBC
+# include <libintl.h>
+# ifdef _LIBC
+#  undef gettext
+#  define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES)
+# endif
+#else
+# define gettext(msgid) (msgid)
+#endif
+
+#ifndef gettext_noop
+/* This define is so xgettext can find the internationalizable
+   strings.  */
+# define gettext_noop(String) String
+#endif
+
+#include "regex.h"
+#include "regex_internal.h"
+
+static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
+                                          int length, reg_syntax_t syntax);
+static void re_compile_fastmap_iter (regex_t *bufp,
+                                     const re_dfastate_t *init_state,
+                                     char *fastmap);
+static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
+static void init_word_char (re_dfa_t *dfa);
+static void free_charset (re_charset_t *cset);
+static void free_workarea_compile (regex_t *preg);
+static reg_errcode_t create_initial_state (re_dfa_t *dfa);
+static reg_errcode_t analyze (re_dfa_t *dfa);
+static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
+static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
+static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
+static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
+static int duplicate_node (re_dfa_t *dfa, int org_idx,
+                           unsigned int constraint);
+static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
+static re_node_set calc_eclosure_iter (re_dfa_t *dfa, int node, int root);
+static void calc_inveclosure (re_dfa_t *dfa);
+static int fetch_number (re_string_t *input, re_token_t *token,
+                         reg_syntax_t syntax);
+static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
+static int peek_token (re_token_t *token, re_string_t *input,
+                       
reg_syntax_t syntax); +static int peek_token_bracket (re_token_t *token, re_string_t *input, + reg_syntax_t syntax); +static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, + reg_syntax_t syntax, reg_errcode_t *err); +static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, + re_token_t *token, reg_syntax_t syntax, + int nest, reg_errcode_t *err); +static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, + re_token_t *token, reg_syntax_t syntax, + int nest, reg_errcode_t *err); +static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, + re_token_t *token, reg_syntax_t syntax, + int nest, reg_errcode_t *err); +static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, + re_token_t *token, reg_syntax_t syntax, + int nest, reg_errcode_t *err); +static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, + re_dfa_t *dfa, re_token_t *token, + reg_syntax_t syntax, reg_errcode_t *err); +static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, + re_token_t *token, reg_syntax_t syntax, + reg_errcode_t *err); +static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, + re_string_t *regexp, + re_token_t *token, int token_len, + re_dfa_t *dfa, + reg_syntax_t syntax); +static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, + re_string_t *regexp, + re_token_t *token); +static reg_errcode_t build_equiv_class (re_charset_t *mbcset, + re_bitset_ptr_t sbcset, + int *equiv_class_alloc, + const unsigned char *name); +static reg_errcode_t build_charclass (re_charset_t *mbcset, + re_bitset_ptr_t sbcset, + int *char_class_alloc, + const unsigned char *name); +static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err); +static void free_bin_tree (bin_tree_t *tree); +static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right, + re_token_type_t type, int index); +static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); + +/* This table 
gives an error message for each of the error codes listed
+   in regex.h.  Obviously the order here has to be same as there.
+   POSIX doesn't require that we do anything for REG_NOERROR,
+   but why not be nice?  */
+
+const char re_error_msgid[] =
+  {
+#define REG_NOERROR_IDX 0
+    gettext_noop ("Success") /* REG_NOERROR */
+    "\0"
+#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
+    gettext_noop ("No match") /* REG_NOMATCH */
+    "\0"
+#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
+    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
+    "\0"
+#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
+    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
+    "\0"
+#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
+    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
+    "\0"
+#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
+    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
+    "\0"
+#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
+    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
+    "\0"
+#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
+    gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
+    "\0"
+#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
+    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
+    "\0"
+#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
+    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
+    "\0"
+#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
+    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
+    "\0"
+#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
+    gettext_noop ("Invalid range end") /* REG_ERANGE */
+    "\0"
+#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
+    gettext_noop ("Memory exhausted") /* REG_ESPACE */
+    "\0"
+#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
+    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
+    "\0"
+#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
+    gettext_noop ("Premature end of regular expression") /* REG_EEND */
+    "\0"
+#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
+    gettext_noop ("Regular expression too big") /* REG_ESIZE */
+    "\0"
+#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
+    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
+  };
+
+/* Byte offset of each message inside re_error_msgid, indexed by the
+   numeric value of the error code.  */
+const size_t re_error_msgid_idx[] =
+  {
+    REG_NOERROR_IDX,
+    REG_NOMATCH_IDX,
+    REG_BADPAT_IDX,
+    REG_ECOLLATE_IDX,
+    REG_ECTYPE_IDX,
+    REG_EESCAPE_IDX,
+    REG_ESUBREG_IDX,
+    REG_EBRACK_IDX,
+    REG_EPAREN_IDX,
+    REG_EBRACE_IDX,
+    REG_BADBR_IDX,
+    REG_ERANGE_IDX,
+    REG_ESPACE_IDX,
+    REG_BADRPT_IDX,
+    REG_EEND_IDX,
+    REG_ESIZE_IDX,
+    REG_ERPAREN_IDX
+  };
+
+/* Entry points for GNU code.  */
+
+/* re_compile_pattern is the GNU regular expression compiler: it
+   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
+   Returns 0 if the pattern was valid, otherwise an error string.
+
+   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
+   are set in BUFP on entry.  */
+
+const char *
+re_compile_pattern (pattern, length, bufp)
+    const char *pattern;
+    size_t length;
+    struct re_pattern_buffer *bufp;
+{
+  reg_errcode_t ret;
+
+  /* GNU code is written to assume at least RE_NREGS registers will be set
+     (and at least one extra will be -1).  */
+  bufp->regs_allocated = REGS_UNALLOCATED;
+
+  /* And GNU code determines whether or not to get register information
+     by passing null for the REGS argument to re_match, etc., not by
+     setting no_sub.  */
+  bufp->no_sub = 0;
+
+  /* Match anchors at newline.  */
+  bufp->newline_anchor = 1;
+
+  /* re_compile_internal is declared to take `const char *', so PATTERN is
+     passed through unchanged; casting it to `const unsigned char *' would
+     hand the callee an incompatible pointer type.  */
+  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
+
+  if (!ret)
+    return NULL;
+  return gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
+}
+#ifdef _LIBC
+weak_alias (__re_compile_pattern, re_compile_pattern)
+#endif
+
+/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
+   also be assigned to arbitrarily: each pattern buffer stores its own
+   syntax, so it can be changed between regex compilations.  */
+/* This has no initializer because initialized variables in Emacs
+   become read-only after dumping.  */
+reg_syntax_t re_syntax_options;
+
+
+/* Specify the precise syntax of regexps for compilation.  This provides
+   for compatibility for various utilities which historically have
+   different, incompatible syntaxes.
+
+   The argument SYNTAX is a bit mask comprised of the various bits
+   defined in regex.h.  We return the old syntax.  */
+
+reg_syntax_t
+re_set_syntax (syntax)
+    reg_syntax_t syntax;
+{
+  reg_syntax_t ret = re_syntax_options;
+
+  re_syntax_options = syntax;
+  return ret;
+}
+#ifdef _LIBC
+weak_alias (__re_set_syntax, re_set_syntax)
+#endif
+
+/* Fill BUFP->fastmap (SBC_MAX bytes, which must already be allocated)
+   from the initial DFA states stored in BUFP->buffer: the plain initial
+   state first, then the word, newline and buffer-start variants whenever
+   they differ from it.  Marks the fastmap accurate and always
+   returns 0.  */
+
+int
+re_compile_fastmap (bufp)
+    struct re_pattern_buffer *bufp;
+{
+  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
+  char *fastmap = bufp->fastmap;
+
+  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
+  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
+  if (dfa->init_state != dfa->init_state_word)
+    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
+  if (dfa->init_state != dfa->init_state_nl)
+    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
+  if (dfa->init_state != dfa->init_state_begbuf)
+    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
+  bufp->fastmap_accurate = 1;
+  return 0;
+}
+#ifdef _LIBC
+weak_alias (__re_compile_fastmap, re_compile_fastmap)
+#endif
+
+/* Helper function for re_compile_fastmap.
+   Compile fastmap for the initial_state INIT_STATE.  
*/
+
+/* For every node in INIT_STATE, set the FASTMAP entry of each byte that
+   can start a match at that node.  A context node is looked through to
+   the node it wraps.  When a node can match any byte (or the empty
+   string) the whole map is set and the scan stops; for END_OF_RE,
+   BUFP->can_be_null is set as well.  */
+
+static void
+re_compile_fastmap_iter (bufp, init_state, fastmap)
+     regex_t *bufp;
+     const re_dfastate_t *init_state;
+     char *fastmap;
+{
+  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
+  int node_cnt;
+  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
+    {
+      int node = init_state->nodes.elems[node_cnt];
+      re_token_type_t type = dfa->nodes[node].type;
+      if (type == OP_CONTEXT_NODE)
+        {
+          node = dfa->nodes[node].opr.ctx_info->entity;
+          type = dfa->nodes[node].type;
+        }
+
+      if (type == CHARACTER)
+        fastmap[dfa->nodes[node].opr.c] = 1;
+      else if (type == SIMPLE_BRACKET)
+        {
+          int i, j, ch;
+          for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
+            for (j = 0; j < UINT_BITS; ++j, ++ch)
+              /* Use an unsigned one: shifting a signed 1 into the sign
+                 bit is undefined behavior.  */
+              if (dfa->nodes[node].opr.sbcset[i] & (1u << j))
+                fastmap[ch] = 1;
+        }
+      else if (type == COMPLEX_BRACKET)
+        {
+          int i;
+          re_charset_t *cset = dfa->nodes[node].opr.mbcset;
+          if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
+              || cset->nranges || cset->nchar_classes)
+            {
+              if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
+                {
+                  /* In this case we want to catch the bytes which are
+                     the first byte of any collation elements.
+                     e.g. In da_DK, we want to catch 'a' since "aa"
+                     is a valid collation element, and don't catch
+                     'b' since 'b' is the only collation element
+                     which starts from 'b'.  */
+                  int j, ch;
+                  const int32_t *table = (const int32_t *)
+                    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
+                  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
+                    for (j = 0; j < UINT_BITS; ++j, ++ch)
+                      if (table[ch] < 0)
+                        fastmap[ch] = 1;
+                }
+            }
+          for (i = 0; i < cset->nmbchars; ++i)
+            {
+              /* wctomb takes a `char *' and returns -1 for a character
+                 that cannot be converted, in which case BUF holds
+                 nothing usable; only mark the lead byte on success.  */
+              char buf[256];
+              if (wctomb (buf, cset->mbchars[i]) > 0)
+                fastmap[(unsigned char) buf[0]] = 1;
+            }
+        }
+      /* NOTE(review): the COMPLEX_BRACKET test below cannot be true
+         because that type is handled by the branch above; it is kept
+         unchanged.  */
+      else if (type == END_OF_RE || type == COMPLEX_BRACKET
+               || type == OP_PERIOD)
+        {
+          memset (fastmap, '\1', sizeof (char) * SBC_MAX);
+          if (type == END_OF_RE)
+            bufp->can_be_null = 1;
+          return;
+        }
+    }
+}
+
+/* Entry point for POSIX code.  
*/
+/* regcomp takes a regular expression as a string and compiles it.
+
+   PREG is a regex_t *.  We do not expect any fields to be initialized,
+   since POSIX says we shouldn't.  Thus, we set
+
+     `buffer' to the compiled pattern;
+     `used' to the length of the compiled pattern;
+     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
+       REG_EXTENDED bit in CFLAGS is set; otherwise, to
+       RE_SYNTAX_POSIX_BASIC;
+     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
+     `fastmap' to an allocated space for the fastmap;
+     `fastmap_accurate' to zero;
+     `re_nsub' to the number of subexpressions in PATTERN.
+
+   PATTERN is the address of the pattern string.
+
+   CFLAGS is a series of bits which affect compilation.
+
+     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
+     use POSIX basic syntax.
+
+     If REG_NEWLINE is set, then . and [^...] don't match newline.
+     Also, regexec will try a match beginning after every newline.
+
+     If REG_ICASE is set, then we consider upper- and lowercase
+     versions of letters to be equivalent when matching.
+
+     If REG_NOSUB is set, then when PREG is passed to regexec, that
+     routine will report only success or failure, and nothing about the
+     registers.
+
+   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
+   the return codes and their meanings.)  When compilation fails the
+   fastmap allocated here is released again and `fastmap' is set to
+   NULL.  */
+
+int
+regcomp (preg, pattern, cflags)
+    regex_t *preg;
+    const char *pattern;
+    int cflags;
+{
+  reg_errcode_t ret;
+  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
+                         : RE_SYNTAX_POSIX_BASIC);
+
+  preg->buffer = NULL;
+  preg->allocated = 0;
+  preg->used = 0;
+
+  /* Try to allocate space for the fastmap.  */
+  preg->fastmap = re_malloc (char, SBC_MAX);
+  if (preg->fastmap == NULL)
+    return REG_ESPACE;
+
+  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
+
+  /* If REG_NEWLINE is set, newlines are treated differently.  */
+  if (cflags & REG_NEWLINE)
+    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
+      syntax &= ~RE_DOT_NEWLINE;
+      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
+      /* It also changes the matching behavior.  */
+      preg->newline_anchor = 1;
+    }
+  else
+    preg->newline_anchor = 0;
+  preg->no_sub = !!(cflags & REG_NOSUB);
+  preg->translate = NULL;
+
+  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
+
+  /* POSIX doesn't distinguish between an unmatched open-group and an
+     unmatched close-group: both are REG_EPAREN.  */
+  if (ret == REG_ERPAREN)
+    ret = REG_EPAREN;
+
+  /* XXX Why the test for preg->fastmap != NULL?  */
+  if (ret == REG_NOERROR && preg->fastmap != NULL)
+    {
+      /* Compute the fastmap now, since regexec cannot modify the pattern
+         buffer.  */
+      if (re_compile_fastmap (preg) == -2)
+        {
+          /* Some error occurred while computing the fastmap, just forget
+             about it.  */
+          re_free (preg->fastmap);
+          preg->fastmap = NULL;
+        }
+    }
+  else if (ret != REG_NOERROR)
+    {
+      /* A caller is not expected to call regfree after a failed regcomp,
+         so the fastmap allocated above would otherwise be lost.
+         NOTE(review): whatever re_compile_internal left in preg->buffer
+         on failure is not released here.  */
+      re_free (preg->fastmap);
+      preg->fastmap = NULL;
+    }
+
+  return (int) ret;
+}
+#ifdef _LIBC
+weak_alias (__regcomp, regcomp)
+#endif
+
+/* Returns a message corresponding to an error code, ERRCODE, returned
+   from either regcomp or regexec.   We don't use PREG here.
+
+   At most ERRBUF_SIZE bytes, including the terminating null, are stored
+   in ERRBUF (nothing is stored when ERRBUF_SIZE is 0).  The return value
+   is the size needed for the whole message including the null.  */
+
+size_t
+regerror (errcode, preg, errbuf, errbuf_size)
+    int errcode;
+    const regex_t *preg;
+    char *errbuf;
+    size_t errbuf_size;
+{
+  const char *msg;
+  size_t msg_size;
+
+  if (errcode < 0
+      || errcode >= (int) (sizeof (re_error_msgid_idx)
+                           / sizeof (re_error_msgid_idx[0])))
+    /* Only error codes returned by the rest of the code should be passed
+       to this routine.  If we are given anything else, or if other regex
+       code generates an invalid error code, then the program has a bug.
+       Dump core so we can fix it.  */
+    abort ();
+
+  msg = gettext (re_error_msgid + re_error_msgid_idx[errcode]);
+
+  msg_size = strlen (msg) + 1; /* Includes the null.  */
+
+  if (errbuf_size != 0)
+    {
+      if (msg_size > errbuf_size)
+        {
+#if defined HAVE_MEMPCPY || defined _LIBC
+          *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
+#else
+          memcpy (errbuf, msg, errbuf_size - 1);
+          errbuf[errbuf_size - 1] = 0;
+#endif
+        }
+      else
+        memcpy (errbuf, msg, msg_size);
+    }
+
+  return msg_size;
+}
+#ifdef _LIBC
+weak_alias (__regerror, regerror)
+#endif
+
+/* Free dynamically allocated space used by PREG.  Afterwards `buffer'
+   and `fastmap' are NULL and `allocated' is 0, so a second regfree on
+   the same buffer, or a later recompile into it, does not touch freed
+   memory.  */
+
+void
+regfree (preg)
+    regex_t *preg;
+{
+  int i, j;
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
+  if (dfa != NULL)
+    {
+      re_free (dfa->subexps);
+
+      for (i = 0; i < dfa->nodes_len; ++i)
+        {
+          re_token_t *node = dfa->nodes + i;
+          /* Duplicated nodes do not free their bracket data here.  */
+          if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
+            free_charset (node->opr.mbcset);
+          else if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
+            re_free (node->opr.sbcset);
+          else if (node->type == OP_CONTEXT_NODE)
+            {
+              if (dfa->nodes[node->opr.ctx_info->entity].type == OP_BACK_REF)
+                {
+                  if (node->opr.ctx_info->bkref_eclosure != NULL)
+                    re_node_set_free (node->opr.ctx_info->bkref_eclosure);
+                  re_free (node->opr.ctx_info->bkref_eclosure);
+                }
+              re_free (node->opr.ctx_info);
+            }
+        }
+      re_free (dfa->firsts);
+      re_free (dfa->nexts);
+      for (i = 0; i < dfa->nodes_len; ++i)
+        {
+          if (dfa->eclosures != NULL)
+            re_node_set_free (dfa->eclosures + i);
+          if (dfa->inveclosures != NULL)
+            re_node_set_free (dfa->inveclosures + i);
+          if (dfa->edests != NULL)
+            re_node_set_free (dfa->edests + i);
+        }
+      re_free (dfa->edests);
+      re_free (dfa->eclosures);
+      re_free (dfa->inveclosures);
+      re_free (dfa->nodes);
+
+      for (i = 0; i <= dfa->state_hash_mask; ++i)
+        {
+          struct re_state_table_entry *entry = dfa->state_table + i;
+          /* NOTE(review): when `alloc' is 0 only the `entry.state'
+             pointer itself is freed; if it refers to a full state, that
+             state's node sets and transition tables are not released
+             here -- confirm against the code that fills the table.  */
+          if (entry->alloc == 0)
+            re_free (entry->entry.state);
+          else
+            {
+              for (j = 0; j < entry->num; ++j)
+                {
+                  re_dfastate_t *state = entry->entry.array[j];
+                  if (state->entrance_nodes != &state->nodes)
+                    {
+                      re_node_set_free (state->entrance_nodes);
+                      re_free (state->entrance_nodes);
+                    }
+                  re_node_set_free (&state->nodes);
+                  re_free (state->trtable);
+                  re_free (state->trtable_search);
+                  re_free (state);
+                }
+              re_free (entry->entry.array);
+            }
+        }
+      re_free (dfa->state_table);
+
+      if (dfa->word_char != NULL)
+        re_free (dfa->word_char);
+      re_free (dfa);
+    }
+  re_free (preg->fastmap);
+
+  /* Do not leave dangling pointers behind.  */
+  preg->buffer = NULL;
+  preg->allocated = 0;
+  preg->fastmap = NULL;
+}
+#ifdef _LIBC
+weak_alias (__regfree, regfree)
+#endif
+
+/* Entry points compatible with 4.2 BSD regex library.  We don't define
+   them unless specifically requested.  */
+
+#if defined _REGEX_RE_COMP || defined _LIBC
+
+/* BSD has one and only one pattern buffer.  */
+static struct re_pattern_buffer re_comp_buf;
+
+/* Compile S into the single BSD pattern buffer.  A null S only checks
+   that a pattern was compiled before.  Returns NULL on success,
+   otherwise an error message.  */
+
+char *
+# ifdef _LIBC
+/* Make these definitions weak in libc, so POSIX programs can redefine
+   these names if they don't use our functions, and still use
+   regcomp/regexec above without link errors.  */
+weak_function
+# endif
+re_comp (s)
+     const char *s;
+{
+  reg_errcode_t ret;
+
+  if (!s)
+    {
+      if (!re_comp_buf.buffer)
+        return gettext ("No previous regular expression");
+      return 0;
+    }
+
+  if (re_comp_buf.buffer)
+    {
+      /* Release the previously compiled pattern before compiling into
+         the same buffer again, otherwise everything it owns would be
+         overwritten and lost.  The fastmap is detached first so that it
+         survives the regfree and can be reused.  */
+      char *fastmap = re_comp_buf.fastmap;
+      re_comp_buf.fastmap = NULL;
+      regfree (&re_comp_buf);
+      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
+      re_comp_buf.fastmap = fastmap;
+    }
+
+  /* Allocate the fastmap only when there is none yet; testing `buffer'
+     here would allocate a second one after a failed compilation.  */
+  if (re_comp_buf.fastmap == NULL)
+    {
+      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
+      if (re_comp_buf.fastmap == NULL)
+        return (char *) gettext (re_error_msgid
+                                 + re_error_msgid_idx[(int) REG_ESPACE]);
+    }
+
+  /* Since `re_exec' always passes NULL for the `regs' argument, we
+     don't need to initialize the pattern buffer fields which affect it.  */
+
+  /* Match anchors at newlines.  */
+  re_comp_buf.newline_anchor = 1;
+
+  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
+
+  if (!ret)
+    return NULL;
+
+  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
+  return (char *) gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
+}
+#endif /* _REGEX_RE_COMP */
+
+/* Internal entry point.
+   Compile the regular expression PATTERN, whose length is LENGTH.
+   SYNTAX indicate regular expression's syntax.  
*/
+
+/* On return PREG->buffer holds the re_dfa_t built for PATTERN.  If the
+   DFA or the pattern string cannot be set up, everything allocated here
+   is released again and PREG->buffer / PREG->allocated are cleared.
+   NOTE(review): when parsing or analysis fails later on, only the parse
+   tree and the pattern string are released; the partly built DFA stays
+   in PREG->buffer.  */
+
+static reg_errcode_t
+re_compile_internal (preg, pattern, length, syntax)
+     regex_t *preg;
+     const char * pattern;
+     int length;
+     reg_syntax_t syntax;
+{
+  reg_errcode_t err = REG_NOERROR;
+  re_dfa_t *dfa;
+  re_string_t regexp;
+
+  /* Initialize the pattern buffer.  */
+  preg->fastmap_accurate = 0;
+  preg->syntax = syntax;
+  preg->not_bol = preg->not_eol = 0;
+  preg->used = 0;
+  preg->re_nsub = 0;
+
+  /* Initialize the dfa.  */
+  dfa = (re_dfa_t *) preg->buffer;
+  if (dfa == NULL || preg->allocated < sizeof (re_dfa_t))
+    {
+      /* If zero allocated, but buffer is non-null, try to realloc
+         enough space.  This loses if buffer's address is bogus, but
+         that is the user's responsibility.  If ->buffer is NULL this
+         is a simple allocation.  */
+      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
+      if (dfa == NULL)
+        return REG_ESPACE;
+      memset (dfa, '\0', sizeof (re_dfa_t));
+      preg->allocated = sizeof (re_dfa_t);
+    }
+  preg->buffer = (unsigned char *) dfa;
+  preg->used = sizeof (re_dfa_t);
+
+  err = init_dfa (dfa, length);
+  if (err != REG_NOERROR)
+    {
+      re_free (dfa);
+      preg->buffer = NULL;
+      /* Keep `allocated' in step with `buffer'; a stale size would make
+         the next compilation skip the allocation above.  */
+      preg->allocated = 0;
+      return err;
+    }
+
+  if (syntax & RE_ICASE)
+    err = re_string_construct_toupper (&regexp, pattern, length,
+                                       preg->translate);
+  else
+    err = re_string_construct (&regexp, pattern, length, preg->translate);
+
+  if (err != REG_NOERROR)
+    {
+      /* Release what init_dfa allocated before dropping the dfa.  */
+      re_free (dfa->nodes);
+      re_free (dfa->state_table);
+      re_free (dfa->subexps);
+      re_free (dfa);
+      preg->buffer = NULL;
+      preg->allocated = 0;
+      return err;
+    }
+
+  /* Parse the regular expression, and build a structure tree.  */
+  preg->re_nsub = 0;
+  dfa->str_tree = parse (&regexp, preg, syntax, &err);
+  if (dfa->str_tree == NULL)
+    goto re_compile_internal_free_return;
+
+  /* Analyze the tree and collect information which is necessary to
+     create the dfa.  */
+  err = analyze (dfa);
+  if (err != REG_NOERROR)
+    goto re_compile_internal_free_return;
+
+  /* Then create the initial state of the dfa.  */
+  err = create_initial_state (dfa);
+  if (err != REG_NOERROR)
+    goto re_compile_internal_free_return;
+
+ re_compile_internal_free_return:
+  /* Release work areas.  */
+  free_workarea_compile (preg);
+  re_string_destruct (&regexp);
+
+  return err;
+}
+
+/* Initialize DFA.  We use the length of the regular expression PAT_LEN
+   as the initial length of some arrays.  Returns REG_ESPACE, with
+   nothing left allocated, if any of the arrays cannot be obtained.  */
+
+static reg_errcode_t
+init_dfa (dfa, pat_len)
+     re_dfa_t *dfa;
+     int pat_len;
+{
+  int table_size;
+
+  /* Start from a clean structure so that a reused buffer does not keep
+     counters or pointers from an earlier compilation.  */
+  memset (dfa, '\0', sizeof (re_dfa_t));
+
+  dfa->nodes_alloc = pat_len + 1;
+  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
+
+  dfa->states_alloc = pat_len + 1;
+
+  /*  table_size = 2 ^ ceil(log pat_len) */
+  for (table_size = 1; table_size > 0; table_size <<= 1)
+    if (table_size > pat_len)
+      break;
+
+  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
+  dfa->state_hash_mask = table_size - 1;
+
+  dfa->subexps_alloc = 1;
+  dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
+  dfa->word_char = NULL;
+
+  if (dfa->nodes == NULL || dfa->state_table == NULL || dfa->subexps == NULL)
+    {
+      /* Release whichever of the three was obtained; the caller only
+         frees the dfa structure itself.  */
+      re_free (dfa->subexps);
+      re_free (dfa->state_table);
+      re_free (dfa->nodes);
+      dfa->subexps = NULL;
+      dfa->state_table = NULL;
+      dfa->nodes = NULL;
+      return REG_ESPACE;
+    }
+  return REG_NOERROR;
+}
+
+/* Initialize WORD_CHAR table, which indicate which character is
+   "word".  In this case "word" means that it is the word construction
+   character used by some operators like "\<", "\>", etc.
+   DFA->word_char is left NULL if the table cannot be allocated.  */
+
+static void
+init_word_char (dfa)
+     re_dfa_t *dfa;
+{
+  int i, j, ch;
+  dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
+  if (dfa->word_char == NULL)
+    /* NOTE(review): there is no way to report the failure from here;
+       callers need to cope with a NULL table.  */
+    return;
+  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
+    for (j = 0; j < UINT_BITS; ++j, ++ch)
+      if (isalnum (ch) || ch == '_')
+        /* Unsigned one: shifting a signed 1 into the sign bit is
+           undefined behavior.  */
+        dfa->word_char[i] |= 1u << j;
+}
+
+/* Free the work area which are only used while compiling.  */
+
+static void
+free_workarea_compile (preg)
+     regex_t *preg;
+{
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
+  free_bin_tree (dfa->str_tree);
+  dfa->str_tree = NULL;
+}
+
+/* Create initial states for all contexts.  
*/
+
+/* Sets DFA->init_node and the four initial states (plain, word, newline
+   and buffer-start context).  Returns REG_ESPACE if the node set or any
+   of the states cannot be obtained.  */
+
+static reg_errcode_t
+create_initial_state (dfa)
+     re_dfa_t *dfa;
+{
+  int first, i;
+  reg_errcode_t err;
+  re_node_set init_nodes;
+
+  /* Initial states have the epsilon closure of the node which is
+     the first node of the regular expression.  */
+  first = dfa->str_tree->first;
+  dfa->init_node = first;
+  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
+  if (err != REG_NOERROR)
+    return err;
+
+  /* The back-references which are in initial states can epsilon transit,
+     since in this case all of the subexpressions can be null.
+     Then we add epsilon closures of the nodes which are the next nodes of
+     the back-references.  */
+  if (dfa->nbackref > 0)
+    for (i = 0; i < init_nodes.nelem; ++i)
+      {
+        int node_idx = init_nodes.elems[i];
+        re_token_type_t type = dfa->nodes[node_idx].type;
+        if (type == OP_CONTEXT_NODE
+            && (dfa->nodes[dfa->nodes[node_idx].opr.ctx_info->entity].type
+                == OP_BACK_REF))
+          {
+            int prev_nelem = init_nodes.nelem;
+            re_node_set_merge (&init_nodes,
+                           dfa->nodes[node_idx].opr.ctx_info->bkref_eclosure);
+            /* The set grew: scan it again.  */
+            if (prev_nelem < init_nodes.nelem)
+              i = 0;
+          }
+        else if (type == OP_BACK_REF)
+          {
+            int next_idx = dfa->nexts[node_idx];
+            if (!re_node_set_contains (&init_nodes, next_idx))
+              {
+                re_node_set_merge (&init_nodes, dfa->eclosures + next_idx);
+                i = 0;
+              }
+          }
+      }
+
+  /* It must be the first time to invoke acquire_state.  */
+  dfa->init_state = re_acquire_state_context (dfa, &init_nodes, 0);
+  /* Check the result before looking at its fields.  */
+  if (dfa->init_state == NULL)
+    {
+      re_node_set_free (&init_nodes);
+      return REG_ESPACE;
+    }
+  if (dfa->init_state->has_constraint)
+    {
+      dfa->init_state_word = re_acquire_state_context (dfa, &init_nodes,
+                                                       CONTEXT_WORD);
+      dfa->init_state_nl = re_acquire_state_context (dfa, &init_nodes,
+                                                     CONTEXT_NEWLINE);
+      dfa->init_state_begbuf = re_acquire_state_context (dfa, &init_nodes,
+                                                         CONTEXT_NEWLINE
+                                                         | CONTEXT_BEGBUF);
+    }
+  else
+    dfa->init_state_word = dfa->init_state_nl
+      = dfa->init_state_begbuf = dfa->init_state;
+
+  /* The working set is no longer needed, whether or not every state
+     could be created.  */
+  re_node_set_free (&init_nodes);
+  if (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
+      || dfa->init_state_begbuf == NULL)
+    return REG_ESPACE;
+  return REG_NOERROR;
+}
+
+/* Analyze the structure tree, and calculate "first", "next", "edest",
+   "eclosure", and "inveclosure".  If the arrays cannot all be allocated,
+   none of them is kept and REG_ESPACE is returned.  */
+
+static reg_errcode_t
+analyze (dfa)
+     re_dfa_t *dfa;
+{
+  int i;
+  reg_errcode_t ret;
+
+  /* Allocate arrays.  */
+  dfa->firsts = re_malloc (int, dfa->nodes_alloc);
+  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
+  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
+  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
+  dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
+  if (dfa->firsts == NULL || dfa->nexts == NULL || dfa->edests == NULL
+      || dfa->eclosures == NULL || dfa->inveclosures == NULL)
+    {
+      /* Do not leave allocated but uninitialized node-set arrays in the
+         dfa: regfree walks every non-null array and frees its
+         elements.  */
+      re_free (dfa->firsts);
+      re_free (dfa->nexts);
+      re_free (dfa->edests);
+      re_free (dfa->eclosures);
+      re_free (dfa->inveclosures);
+      dfa->firsts = NULL;
+      dfa->nexts = NULL;
+      dfa->edests = NULL;
+      dfa->eclosures = NULL;
+      dfa->inveclosures = NULL;
+      return REG_ESPACE;
+    }
+  /* Initialize them.  */
+  for (i = 0; i < dfa->nodes_len; ++i)
+    {
+      dfa->firsts[i] = -1;
+      dfa->nexts[i] = -1;
+      re_node_set_init_empty (dfa->edests + i);
+      re_node_set_init_empty (dfa->eclosures + i);
+      re_node_set_init_empty (dfa->inveclosures + i);
+    }
+
+  ret = analyze_tree (dfa, dfa->str_tree);
+  if (ret == REG_NOERROR)
+    {
+      ret = calc_eclosure (dfa);
+      if (ret == REG_NOERROR)
+        calc_inveclosure (dfa);
+    }
+  return ret;
+}
+
+/* Helper functions for analyze.
+   This function calculate "first", "next", and "edest" for the subtree
+   whose root is NODE.  
*/
+
+/* The walk is pre-order: NODE itself is handled first, then its left
+   and right subtrees.  A value of -1 in `first' or `next' means "not
+   computed yet"; calc_first and calc_next may already have filled a
+   node in while working on one of its relatives.  */
+
+static reg_errcode_t
+analyze_tree (dfa, node)
+     re_dfa_t *dfa;
+     bin_tree_t *node;
+{
+  reg_errcode_t ret;
+  if (node->first == -1)
+    calc_first (dfa, node);
+  if (node->next == -1)
+    calc_next (dfa, node);
+  if (node->eclosure.nelem == 0)
+    calc_epsdest (dfa, node);
+  /* Calculate "first" etc. for the left child.  */
+  if (node->left != NULL)
+    {
+      ret = analyze_tree (dfa, node->left);
+      if (ret != REG_NOERROR)
+        return ret;
+    }
+  /* Calculate "first" etc. for the right child.  */
+  if (node->right != NULL)
+    {
+      ret = analyze_tree (dfa, node->right);
+      if (ret != REG_NOERROR)
+        return ret;
+    }
+  return REG_NOERROR;
+}
+
+/* Calculate "first" for the node NODE.
+   When NODE->type is 0 the effective type is read from the token
+   DFA->nodes[NODE->node_idx], and the result is also recorded in
+   DFA->firsts for that token.  */
+static void
+calc_first (dfa, node)
+     re_dfa_t *dfa;
+     bin_tree_t *node;
+{
+  int idx, type;
+  idx = node->node_idx;
+  type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
+
+  switch (type)
+    {
+#ifdef DEBUG
+    case OP_OPEN_SUBEXP:
+    case OP_CLOSE_SUBEXP:
+    case OP_OPEN_BRACKET:
+    case OP_CLOSE_BRACKET:
+    case OP_OPEN_DUP_NUM:
+    case OP_CLOSE_DUP_NUM:
+    case OP_NON_MATCH_LIST:
+    case OP_OPEN_COLL_ELEM:
+    case OP_CLOSE_COLL_ELEM:
+    case OP_OPEN_EQUIV_CLASS:
+    case OP_CLOSE_EQUIV_CLASS:
+    case OP_OPEN_CHAR_CLASS:
+    case OP_CLOSE_CHAR_CLASS:
+      /* These must not be appeared here.  */
+      assert (0);
+#endif
+    /* For these types the node is its own first node.  */
+    case END_OF_RE:
+    case CHARACTER:
+    case OP_PERIOD:
+    case OP_DUP_ASTERISK:
+    case OP_DUP_QUESTION:
+    case COMPLEX_BRACKET:
+    case SIMPLE_BRACKET:
+    case OP_BACK_REF:
+    case ANCHOR:
+      node->first = idx;
+      break;
+    /* "first" is taken from the left operand.  */
+    case OP_DUP_PLUS:
+#ifdef DEBUG
+      assert (node->left != NULL);
+#endif
+      if (node->left->first == -1)
+        calc_first (dfa, node->left);
+      node->first = node->left->first;
+      break;
+    case OP_ALT:
+      node->first = idx;
+      break;
+    case SUBEXP:
+      /* A SUBEXP node without a left child takes its "next" as its
+         "first".  */
+      if (node->left == NULL)
+        {
+          if (node->next == -1)
+            calc_next (dfa, node);
+          node->first = node->next;
+          break;
+        }
+      /* else fall through */
+    default:
+#ifdef DEBUG
+      assert (node->left != NULL);
+#endif
+      if (node->left->first == -1)
+        calc_first (dfa, node->left);
+      node->first = node->left->first;
+      break;
+    }
+  if (node->type == 0)
+    dfa->firsts[idx] = node->first;
+}
+
+/* Calculate "next" for the node NODE.
+   The value depends on the parent: the root gets -1; a child of `*' or
+   `+' gets the parent's index; the left operand of a CONCAT gets the
+   "first" of the right operand; anything else inherits the parent's
+   "next".  When NODE->type is 0 the result is also recorded in
+   DFA->nexts.  */
+
+static void
+calc_next (dfa, node)
+     re_dfa_t *dfa;
+     bin_tree_t *node;
+{
+  int idx, type;
+  bin_tree_t *parent = node->parent;
+  if (parent == NULL)
+    {
+      node->next = -1;
+      idx = node->node_idx;
+      if (node->type == 0)
+        dfa->nexts[idx] = node->next;
+      return;
+    }
+
+  /* From here until the switch ends, IDX and TYPE describe the parent.  */
+  idx = parent->node_idx;
+  type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
+
+  switch (type)
+    {
+    case OP_DUP_ASTERISK:
+    case OP_DUP_PLUS:
+      node->next = idx;
+      break;
+    case CONCAT:
+      if (parent->left == node)
+        {
+          if (parent->right->first == -1)
+            calc_first (dfa, parent->right);
+          node->next = parent->right->first;
+          break;
+        }
+      /* else fall through */
+    default:
+      if (parent->next == -1)
+        calc_next (dfa, parent);
+      node->next = parent->next;
+      break;
+    }
+  /* Switch IDX back to NODE before storing the result.  */
+  idx = node->node_idx;
+  if (node->type == 0)
+    dfa->nexts[idx] = node->next;
+}
+
+/* Calculate "edest" for the node NODE.  
*/ + +static void +calc_epsdest (dfa, node) + re_dfa_t *dfa; + bin_tree_t *node; +{ + int idx; + idx = node->node_idx; + if (node->type == 0) + { + if (dfa->nodes[idx].type == OP_DUP_ASTERISK + || dfa->nodes[idx].type == OP_DUP_PLUS + || dfa->nodes[idx].type == OP_DUP_QUESTION) + { + if (node->left->first == -1) + calc_first (dfa, node->left); + if (node->next == -1) + calc_next (dfa, node); + re_node_set_init_2 (dfa->edests + idx, node->left->first, + node->next); + } + else if (dfa->nodes[idx].type == OP_ALT) + { + int left, right; + if (node->left != NULL) + { + if (node->left->first == -1) + calc_first (dfa, node->left); + left = node->left->first; + } + else + { + if (node->next == -1) + calc_next (dfa, node); + left = node->next; + } + if (node->right != NULL) + { + if (node->right->first == -1) + calc_first (dfa, node->right); + right = node->right->first; + } + else + { + if (node->next == -1) + calc_next (dfa, node); + right = node->next; + } + re_node_set_init_2 (dfa->edests + idx, left, right); + } + else if (dfa->nodes[idx].type == ANCHOR) + re_node_set_init_1 (dfa->edests + idx, node->next); + } +} + +static int +duplicate_node (dfa, org_idx, constraint) + re_dfa_t *dfa; + int org_idx; + unsigned int constraint; +{ + re_token_t dup; + int dup_idx; + + dup.type = OP_CONTEXT_NODE; + if (dfa->nodes[org_idx].type == OP_CONTEXT_NODE) + { + if (dfa->nodes[org_idx].constraint == constraint) + return org_idx; + dup.constraint = constraint | + dfa->nodes[org_idx].constraint; + } + else + dup.constraint = constraint; + + /* In case that `entity' points OP_CONTEXT_NODE, + we correct `entity' to real entity in calc_inveclosures(). 
*/ + dup.opr.ctx_info = malloc (sizeof (*dup.opr.ctx_info)); + dup.opr.ctx_info->entity = org_idx; + dup.opr.ctx_info->bkref_eclosure = NULL; + dup_idx = re_dfa_add_node (dfa, dup, 1); + dfa->nodes[dup_idx].duplicated = 1; + + dfa->firsts[dup_idx] = dfa->firsts[org_idx]; + dfa->nexts[dup_idx] = dfa->nexts[org_idx]; + re_node_set_init_copy (dfa->edests + dup_idx, dfa->edests + org_idx); + /* Since we don't duplicate epsilon nodes, epsilon closure have + only itself. */ + re_node_set_init_1 (dfa->eclosures + dup_idx, dup_idx); + re_node_set_init_1 (dfa->inveclosures + dup_idx, dup_idx); + /* Then we must update inveclosure for this node. + We process them at last part of calc_eclosure(), + since we don't complete to calculate them here. */ + + return dup_idx; +} + +static void +calc_inveclosure (dfa) + re_dfa_t *dfa; +{ + int src, idx, dest, entity; + for (src = 0; src < dfa->nodes_len; ++src) + { + for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) + { + dest = dfa->eclosures[src].elems[idx]; + re_node_set_insert (dfa->inveclosures + dest, src); + } + + entity = src; + while (dfa->nodes[entity].type == OP_CONTEXT_NODE) + { + entity = dfa->nodes[entity].opr.ctx_info->entity; + re_node_set_merge (dfa->inveclosures + src, + dfa->inveclosures + entity); + dfa->nodes[src].opr.ctx_info->entity = entity; + } + } +} + +/* Calculate "eclosure" for all the node in DFA. */ + +static reg_errcode_t +calc_eclosure (dfa) + re_dfa_t *dfa; +{ + int idx, node_idx, max, incomplete = 0; +#ifdef DEBUG + assert (dfa->nodes_len > 0); +#endif + /* For each nodes, calculate epsilon closure. */ + for (node_idx = 0, max = dfa->nodes_len; ; ++node_idx) + { + re_node_set eclosure_elem; + if (node_idx == max) + { + if (!incomplete) + break; + incomplete = 0; + node_idx = 0; + } + +#ifdef DEBUG + assert (dfa->nodes[node_idx].type != OP_CONTEXT_NODE); + assert (dfa->eclosures[node_idx].nelem != -1); +#endif + /* If we have already calculated, skip it. 
*/ + if (dfa->eclosures[node_idx].nelem != 0) + continue; + /* Calculate epsilon closure of `node_idx'. */ + eclosure_elem = calc_eclosure_iter (dfa, node_idx, 1); + + if (dfa->eclosures[node_idx].nelem == 0) + { + incomplete = 1; + re_node_set_free (&eclosure_elem); + } + } + + /* for duplicated nodes. */ + for (idx = max; idx < dfa->nodes_len; ++idx) + { + int entity, i, constraint; + re_node_set *bkref_eclosure; + entity = dfa->nodes[idx].opr.ctx_info->entity; + re_node_set_merge (dfa->inveclosures + idx, dfa->inveclosures + entity); + if (dfa->nodes[entity].type != OP_BACK_REF) + continue; + + /* If the node is backreference, duplicate the epsilon closure of + the next node. Since it may epsilon transit. */ + /* Note: duplicate_node() may realloc dfa->eclosures, etc. */ + bkref_eclosure = re_malloc (re_node_set, 1); + if (bkref_eclosure == NULL) + return REG_ESPACE; + re_node_set_init_empty (bkref_eclosure); + constraint = dfa->nodes[idx].constraint; + for (i = 0; i < dfa->eclosures[dfa->nexts[idx]].nelem; ++i) + { + int dest_node_idx = dfa->eclosures[dfa->nexts[idx]].elems[i]; + if (!IS_EPSILON_NODE (dfa->nodes[dest_node_idx].type)) + dest_node_idx = duplicate_node (dfa, dest_node_idx, constraint); + re_node_set_insert (bkref_eclosure, dest_node_idx); + } + dfa->nodes[idx].opr.ctx_info->bkref_eclosure = bkref_eclosure; + } + + return REG_NOERROR; +} + +/* Calculate epsilon closure of NODE. */ + +static re_node_set +calc_eclosure_iter (dfa, node, root) + re_dfa_t *dfa; + int node, root; +{ + unsigned int constraint; + int i, max, incomplete = 0; + re_node_set eclosure; + re_node_set_alloc (&eclosure, 1); + + /* This indicates that we are calculating this node now. + We reference this value to avoid infinite loop. */ + dfa->eclosures[node].nelem = -1; + + constraint = ((dfa->nodes[node].type == ANCHOR) + ? dfa->nodes[node].opr.ctx_type : 0); + + /* Expand each epsilon destination nodes. 
*/ + if (dfa->edests[node].nelem != 0) + for (i = 0; i < dfa->edests[node].nelem; ++i) + { + re_node_set eclosure_elem; + int edest = dfa->edests[node].elems[i]; + /* If calculating the epsilon closure of `edest' is in progress, + return intermediate result. */ + if (dfa->eclosures[edest].nelem == -1) + { + incomplete = 1; + continue; + } + /* If we haven't calculated the epsilon closure of `edest' yet, + calculate now. Otherwise use calculated epsilon closure. */ + if (dfa->eclosures[edest].nelem == 0) + eclosure_elem = calc_eclosure_iter (dfa, edest, 0); + else + eclosure_elem = dfa->eclosures[edest]; + /* Merge the epsilon closure of `edest'. */ + re_node_set_merge (&eclosure, &eclosure_elem); + /* If the epsilon closure of `edest' is incomplete, + the epsilon closure of this node is also incomplete. */ + if (dfa->eclosures[edest].nelem == 0) + { + incomplete = 1; + re_node_set_free (&eclosure_elem); + } + } + + /* If the current node has constraints, duplicate all non-epsilon nodes. + Since they must inherit the constraints. */ + if (constraint) + for (i = 0, max = eclosure.nelem; i < max; ++i) + { + int dest = eclosure.elems[i]; + if (!IS_EPSILON_NODE (dfa->nodes[dest].type)) + { + int dup_dest = duplicate_node (dfa, dest, constraint); + if (dest != dup_dest) + { + re_node_set_remove_at (&eclosure, i--); + re_node_set_insert (&eclosure, dup_dest); + --max; + } + } + } + + /* Epsilon closures include itself. */ + re_node_set_insert (&eclosure, node); + if (incomplete && !root) + dfa->eclosures[node].nelem = 0; + else + dfa->eclosures[node] = eclosure; + return eclosure; +} + +/* Functions for token which are used in the parser. */ + +/* Fetch a token from INPUT. + We must not use this function inside bracket expressions. 
*/ + +static re_token_t +fetch_token (input, syntax) + re_string_t *input; + reg_syntax_t syntax; +{ + re_token_t token; + int consumed_byte; + consumed_byte = peek_token (&token, input, syntax); + re_string_skip_bytes (input, consumed_byte); + return token; +} + +/* Peek a token from INPUT, and return the length of the token. + We must not use this function inside bracket expressions. */ + +static int +peek_token (token, input, syntax) + re_token_t *token; + re_string_t *input; + reg_syntax_t syntax; +{ + unsigned char c; + + if (re_string_eoi (input)) + { + token->type = END_OF_RE; + return 0; + } + + c = re_string_peek_byte (input, 0); + token->opr.c = c; + +#ifdef RE_ENABLE_I18N + token->mb_partial = 0; + if (MB_CUR_MAX > 1 && + !re_string_first_byte (input, re_string_cur_idx (input))) + { + token->type = CHARACTER; + token->mb_partial = 1; + return 1; + } +#endif + if (c == '\\') + { + unsigned char c2; + if (re_string_cur_idx (input) + 1 >= re_string_length (input)) + { + token->type = BACK_SLASH; + return 1; + } + + c2 = re_string_peek_byte_case (input, 1); + token->opr.c = c2; + token->type = CHARACTER; + switch (c2) + { + case '|': + if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) + token->type = OP_ALT; + break; + case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + if (!(syntax & RE_NO_BK_REFS)) + { + token->type = OP_BACK_REF; + token->opr.idx = c2 - '0'; + } + break; + case '<': + if (!(syntax & RE_NO_GNU_OPS)) + { + token->type = ANCHOR; + token->opr.idx = WORD_FIRST; + } + break; + case '>': + if (!(syntax & RE_NO_GNU_OPS)) + { + token->type = ANCHOR; + token->opr.idx = WORD_LAST; + } + break; + case 'b': + if (!(syntax & RE_NO_GNU_OPS)) + { + token->type = ANCHOR; + token->opr.idx = WORD_DELIM; + } + break; + case 'B': + if (!(syntax & RE_NO_GNU_OPS)) + { + token->type = ANCHOR; + token->opr.idx = INSIDE_WORD; + } + break; + case 'w': + if (!(syntax & RE_NO_GNU_OPS)) + token->type = OP_WORD; + 
break; + case 'W': + if (!(syntax & RE_NO_GNU_OPS)) + token->type = OP_NOTWORD; + break; + case '`': + if (!(syntax & RE_NO_GNU_OPS)) + { + token->type = ANCHOR; + token->opr.idx = BUF_FIRST; + } + break; + case '\'': + if (!(syntax & RE_NO_GNU_OPS)) + { + token->type = ANCHOR; + token->opr.idx = BUF_LAST; + } + break; + case '(': + if (!(syntax & RE_NO_BK_PARENS)) + token->type = OP_OPEN_SUBEXP; + break; + case ')': + if (!(syntax & RE_NO_BK_PARENS)) + token->type = OP_CLOSE_SUBEXP; + break; + case '+': + if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) + token->type = OP_DUP_PLUS; + break; + case '?': + if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) + token->type = OP_DUP_QUESTION; + break; + case '{': + if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) + token->type = OP_OPEN_DUP_NUM; + break; + case '}': + if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) + token->type = OP_CLOSE_DUP_NUM; + break; + default: + break; + } + return 2; + } + + token->type = CHARACTER; + switch (c) + { + case '\n': + if (syntax & RE_NEWLINE_ALT) + token->type = OP_ALT; + break; + case '|': + if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) + token->type = OP_ALT; + break; + case '*': + token->type = OP_DUP_ASTERISK; + break; + case '+': + if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) + token->type = OP_DUP_PLUS; + break; + case '?': + if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) + token->type = OP_DUP_QUESTION; + break; + case '{': + if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) + token->type = OP_OPEN_DUP_NUM; + break; + case '}': + if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) + token->type = OP_CLOSE_DUP_NUM; + break; + case '(': + if (syntax & RE_NO_BK_PARENS) + token->type = OP_OPEN_SUBEXP; + break; + case ')': + if (syntax & RE_NO_BK_PARENS) + token->type = OP_CLOSE_SUBEXP; + break; + case '[': + token->type = OP_OPEN_BRACKET; + break; + case '.': + token->type 
= OP_PERIOD; + break; + case '^': + if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && + re_string_cur_idx (input) != 0) + { + char prev = re_string_peek_byte (input, -1); + if (prev != '|' && prev != '(' && + (!(syntax & RE_NEWLINE_ALT) || prev != '\n')) + break; + } + token->type = ANCHOR; + token->opr.idx = LINE_FIRST; + break; + case '$': + if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && + re_string_cur_idx (input) + 1 != re_string_length (input)) + { + re_token_t next; + re_string_skip_bytes (input, 1); + peek_token (&next, input, syntax); + re_string_skip_bytes (input, -1); + if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) + break; + } + token->type = ANCHOR; + token->opr.idx = LINE_LAST; + break; + default: + break; + } + return 1; +} + +/* Peek a token from INPUT, and return the length of the token. + We must not use this function out of bracket expressions. */ + +static int +peek_token_bracket (token, input, syntax) + re_token_t *token; + re_string_t *input; + reg_syntax_t syntax; +{ + unsigned char c; + if (re_string_eoi (input)) + { + token->type = END_OF_RE; + return 0; + } + c = re_string_peek_byte (input, 0); + token->opr.c = c; + +#ifdef RE_ENABLE_I18N + if (MB_CUR_MAX > 1 && + !re_string_first_byte (input, re_string_cur_idx (input))) + { + token->type = CHARACTER; + return 1; + } +#endif /* RE_ENABLE_I18N */ + + if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)) + { + /* In this case, '\' escape a character. */ + unsigned char c2; + c2 = re_string_peek_byte (input, 1); + token->opr.c = c2; + token->type = CHARACTER; + return 1; + } + if (c == '[') /* '[' is a special char in a bracket exps. 
*/ + { + unsigned char c2; + int token_len; + c2 = re_string_peek_byte (input, 1); + token->opr.c = c2; + token_len = 2; + switch (c2) + { + case '.': + token->type = OP_OPEN_COLL_ELEM; + break; + case '=': + token->type = OP_OPEN_EQUIV_CLASS; + break; + case ':': + if (syntax & RE_CHAR_CLASSES) + { + token->type = OP_OPEN_CHAR_CLASS; + break; + } + /* else fall through. */ + default: + token->type = CHARACTER; + token->opr.c = c; + token_len = 1; + break; + } + return token_len; + } + switch (c) + { + case '-': + token->type = OP_CHARSET_RANGE; + break; + case ']': + token->type = OP_CLOSE_BRACKET; + break; + case '^': + token->type = OP_NON_MATCH_LIST; + break; + default: + token->type = CHARACTER; + } + return 1; +} + +/* Functions for parser. */ + +/* Entry point of the parser. + Parse the regular expression REGEXP and return the structure tree. + If an error is occured, ERR is set by error code, and return NULL. + This function build the following tree, from regular expression : + CAT + / \ + / \ + EOR + + CAT means concatenation. + EOR means end of regular expression. */ + +static bin_tree_t * +parse (regexp, preg, syntax, err) + re_string_t *regexp; + regex_t *preg; + reg_syntax_t syntax; + reg_errcode_t *err; +{ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + bin_tree_t *tree, *eor, *root; + re_token_t current_token; + int new_idx; + current_token = fetch_token (regexp, syntax); + tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); + if (*err != REG_NOERROR && tree == NULL) + return NULL; + new_idx = re_dfa_add_node (dfa, current_token, 0); + eor = create_tree (NULL, NULL, 0, new_idx); + if (tree != NULL) + root = create_tree (tree, eor, CONCAT, 0); + else + root = eor; + if (new_idx == -1 || eor == NULL || root == NULL) + return *err = REG_ESPACE, NULL; + return root; +} + +/* This function build the following tree, from regular expression + |: + ALT + / \ + / \ + + + ALT means alternative, which represents the operator `|'. 
*/ + +static bin_tree_t * +parse_reg_exp (regexp, preg, token, syntax, nest, err) + re_string_t *regexp; + regex_t *preg; + re_token_t *token; + reg_syntax_t syntax; + int nest; + reg_errcode_t *err; +{ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + bin_tree_t *tree, *branch = NULL; + int new_idx; + tree = parse_branch (regexp, preg, token, syntax, nest, err); + if (*err != REG_NOERROR && tree == NULL) + return NULL; + + while (token->type == OP_ALT) + { + re_token_t alt_token = *token; + new_idx = re_dfa_add_node (dfa, alt_token, 0); + *token = fetch_token (regexp, syntax); + if (token->type != OP_ALT && token->type != END_OF_RE + && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) + { + branch = parse_branch (regexp, preg, token, syntax, nest, err); + if (*err != REG_NOERROR && branch == NULL) + { + free_bin_tree (tree); + return NULL; + } + } + tree = create_tree (tree, branch, 0, new_idx); + if (new_idx == -1 || tree == NULL) + return *err = REG_ESPACE, NULL; + } + return tree; +} + +/* This function build the following tree, from regular expression + : + CAT + / \ + / \ + + + CAT means concatenation. */ + +static bin_tree_t * +parse_branch (regexp, preg, token, syntax, nest, err) + re_string_t *regexp; + regex_t *preg; + re_token_t *token; + reg_syntax_t syntax; + int nest; + reg_errcode_t *err; +{ + bin_tree_t *tree, *exp; + tree = parse_expression (regexp, preg, token, syntax, nest, err); + if (*err != REG_NOERROR && tree == NULL) + return NULL; + + while (token->type != OP_ALT && token->type != END_OF_RE + && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) + { + exp = parse_expression (regexp, preg, token, syntax, nest, err); + if (*err != REG_NOERROR && exp == NULL) + { + free_bin_tree (tree); + return NULL; + } + if (tree != NULL && exp != NULL) + { + tree = create_tree (tree, exp, CONCAT, 0); + if (tree == NULL) + return *err = REG_ESPACE, NULL; + } + else if (tree == NULL) + tree = exp; + /* Otherwise exp == NULL, we don't need to create new tree. 
*/ + } + return tree; +} + +/* This function build the following tree, from regular expression a*: + * + | + a +*/ + +static bin_tree_t * +parse_expression (regexp, preg, token, syntax, nest, err) + re_string_t *regexp; + regex_t *preg; + re_token_t *token; + reg_syntax_t syntax; + int nest; + reg_errcode_t *err; +{ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + bin_tree_t *tree; + int new_idx; + switch (token->type) + { + case CHARACTER: + new_idx = re_dfa_add_node (dfa, *token, 0); + tree = create_tree (NULL, NULL, 0, new_idx); + if (new_idx == -1 || tree == NULL) + return *err = REG_ESPACE, NULL; +#ifdef RE_ENABLE_I18N + if (MB_CUR_MAX > 1) + { + while (!re_string_eoi (regexp) + && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) + { + bin_tree_t *mbc_remain; + *token = fetch_token (regexp, syntax); + new_idx = re_dfa_add_node (dfa, *token, 0); + mbc_remain = create_tree (NULL, NULL, 0, new_idx); + tree = create_tree (tree, mbc_remain, CONCAT, 0); + if (new_idx == -1 || mbc_remain == NULL || tree == NULL) + return *err = REG_ESPACE, NULL; + } + } +#endif + break; + case OP_OPEN_SUBEXP: + tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); + if (*err != REG_NOERROR && tree == NULL) + return NULL; + break; + case OP_OPEN_BRACKET: + tree = parse_bracket_exp (regexp, dfa, token, syntax, err); + if (*err != REG_NOERROR && tree == NULL) + return NULL; + break; + case OP_BACK_REF: + if (preg->re_nsub < token->opr.idx + || dfa->subexps[token->opr.idx - 1].end == -1) + { + *err = REG_ESUBREG; + return NULL; + } + new_idx = re_dfa_add_node (dfa, *token, 0); + tree = create_tree (NULL, NULL, 0, new_idx); + if (new_idx == -1 || tree == NULL) + return *err = REG_ESPACE, NULL; + ++dfa->nbackref; + dfa->has_mb_node = 1; + break; + case OP_DUP_ASTERISK: + case OP_DUP_PLUS: + case OP_DUP_QUESTION: + case OP_OPEN_DUP_NUM: + if (syntax & RE_CONTEXT_INVALID_OPS) + return *err = REG_BADRPT, NULL; + else if (syntax & RE_CONTEXT_INDEP_OPS) + { + *token = 
fetch_token (regexp, syntax); + return parse_expression (regexp, preg, token, syntax, nest, err); + } + /* else fall through */ + case OP_CLOSE_SUBEXP: + if ((token->type == OP_CLOSE_SUBEXP) && + !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) + return *err = REG_ERPAREN, NULL; + /* else fall through */ + case OP_CLOSE_DUP_NUM: + /* We treat it as a normal character. */ + + /* Then we can these characters as normal characters. */ + token->type = CHARACTER; + new_idx = re_dfa_add_node (dfa, *token, 0); + tree = create_tree (NULL, NULL, 0, new_idx); + if (new_idx == -1 || tree == NULL) + return *err = REG_ESPACE, NULL; + break; + case ANCHOR: + if (dfa->word_char == NULL) + init_word_char (dfa); + if (token->opr.ctx_type == WORD_DELIM) + { + bin_tree_t *tree_first, *tree_last; + int idx_first, idx_last; + token->opr.ctx_type = WORD_FIRST; + idx_first = re_dfa_add_node (dfa, *token, 0); + tree_first = create_tree (NULL, NULL, 0, idx_first); + token->opr.ctx_type = WORD_LAST; + idx_last = re_dfa_add_node (dfa, *token, 0); + tree_last = create_tree (NULL, NULL, 0, idx_last); + token->type = OP_ALT; + new_idx = re_dfa_add_node (dfa, *token, 0); + tree = create_tree (tree_first, tree_last, 0, new_idx); + if (idx_first == -1 || idx_last == -1 || new_idx == -1 + || tree_first == NULL || tree_last == NULL || tree == NULL) + return *err = REG_ESPACE, NULL; + } + else + { + new_idx = re_dfa_add_node (dfa, *token, 0); + tree = create_tree (NULL, NULL, 0, new_idx); + if (new_idx == -1 || tree == NULL) + return *err = REG_ESPACE, NULL; + } + /* We must return here, since ANCHORs can't be followed + by repetition operators. + eg. RE"^*" is invalid or "", + it must not be "". 
*/ + *token = fetch_token (regexp, syntax); + return tree; + case OP_PERIOD: + new_idx = re_dfa_add_node (dfa, *token, 0); + tree = create_tree (NULL, NULL, 0, new_idx); + if (new_idx == -1 || tree == NULL) + return *err = REG_ESPACE, NULL; + if (MB_CUR_MAX > 1) + dfa->has_mb_node = 1; + break; + case OP_WORD: + tree = build_word_op (dfa, 0, err); + if (*err != REG_NOERROR && tree == NULL) + return NULL; + break; + case OP_NOTWORD: + tree = build_word_op (dfa, 1, err); + if (*err != REG_NOERROR && tree == NULL) + return NULL; + break; + case OP_ALT: + case END_OF_RE: + return NULL; + case BACK_SLASH: + *err = REG_EESCAPE; + return NULL; + default: + /* Must not happen? */ +#ifdef DEBUG + assert (0); +#endif + return NULL; + } + *token = fetch_token (regexp, syntax); + + while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS + || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) + { + tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); + if (*err != REG_NOERROR && tree == NULL) + return *err = REG_ESPACE, NULL; + } + + return tree; +} + +/* This function build the following tree, from regular expression + (): + SUBEXP + | + +*/ + +static bin_tree_t * +parse_sub_exp (regexp, preg, token, syntax, nest, err) + re_string_t *regexp; + regex_t *preg; + re_token_t *token; + reg_syntax_t syntax; + int nest; + reg_errcode_t *err; +{ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + bin_tree_t *tree; + size_t cur_nsub; + cur_nsub = preg->re_nsub++; + if (dfa->subexps_alloc < preg->re_nsub) + { + re_subexp_t *new_array; + dfa->subexps_alloc *= 2; + new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc); + if (new_array == NULL) + { + dfa->subexps_alloc /= 2; + *err = REG_ESPACE; + return NULL; + } + dfa->subexps = new_array; + } + dfa->subexps[cur_nsub].start = dfa->nodes_len; + dfa->subexps[cur_nsub].end = -1; + *token = fetch_token (regexp, syntax); + + /* The subexpression may be a null string. 
*/ + if (token->type == OP_CLOSE_SUBEXP) + { + tree = create_tree (NULL, NULL, SUBEXP, 0); + if (tree == NULL) + return *err = REG_ESPACE, NULL; + dfa->subexps[cur_nsub].end = dfa->nodes_len; + } + else + { + tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); + if (*err != REG_NOERROR && tree == NULL) + return NULL; + dfa->subexps[cur_nsub].end = dfa->nodes_len; + if (token->type != OP_CLOSE_SUBEXP) + { + free_bin_tree (tree); + *err = REG_BADPAT; + return NULL; + } + tree = create_tree (tree, NULL, SUBEXP, 0); + } + return tree; +} + +/* This function parse repetition operators like "*", "+", "{1,3}" etc. */ + +static bin_tree_t * +parse_dup_op (dup_elem, regexp, dfa, token, syntax, err) + bin_tree_t *dup_elem; + re_string_t *regexp; + re_dfa_t *dfa; + re_token_t *token; + reg_syntax_t syntax; + reg_errcode_t *err; +{ + re_token_t dup_token; + bin_tree_t *tree = dup_elem, *work_tree; + int new_idx, start_idx = re_string_cur_idx (regexp); + re_token_t start_token = *token; + if (token->type == OP_OPEN_DUP_NUM) + { + int i, end, start = fetch_number (regexp, token, syntax); + bin_tree_t *elem; + if (start == -1) + start = 0; /* We treat "{,m}" as "{0,m}". */ + if (start != -2 && token->type == OP_CLOSE_DUP_NUM) + { + if (start == 0) + { + /* We treat "{0}" as null string. */ + *token = fetch_token (regexp, syntax); + free_bin_tree (dup_elem); + return NULL; + } + end = start; /* We treat "{n}" as "{n,n}". */ + } + else if (start == -2 || token->type != CHARACTER || token->opr.c != ',') + /* Invalid sequence. */ + goto parse_dup_op_invalid_interval; + else + { + end = fetch_number (regexp, token, syntax); + if (end == -2 || token->type != OP_CLOSE_DUP_NUM) + /* Invalid sequence. */ + goto parse_dup_op_invalid_interval; + } + /* Extract "{n,m}" to "...{0,}". 
*/ + elem = tree; + for (i = 0; i < start; ++i) + if (i != 0) + { + work_tree = duplicate_tree (elem, dfa); + tree = create_tree (tree, work_tree, CONCAT, 0); + if (work_tree == NULL || tree == NULL) + goto parse_dup_op_espace; + } + + if (end == -1) + { + /* We treat "{0,}" as "*". */ + dup_token.type = OP_DUP_ASTERISK; + if (start > 0) + { + elem = duplicate_tree (elem, dfa); + new_idx = re_dfa_add_node (dfa, dup_token, 0); + work_tree = create_tree (elem, NULL, 0, new_idx); + tree = create_tree (tree, work_tree, CONCAT, 0); + if (elem == NULL || new_idx == -1 || work_tree == NULL + || tree == NULL) + goto parse_dup_op_espace; + } + else + { + new_idx = re_dfa_add_node (dfa, dup_token, 0); + tree = create_tree (elem, NULL, 0, new_idx); + if (new_idx == -1 || tree == NULL) + goto parse_dup_op_espace; + } + } + else if (end - start > 0) + { + /* Then extract "{0,m}" to "??...?". */ + dup_token.type = OP_DUP_QUESTION; + if (start > 0) + { + elem = duplicate_tree (elem, dfa); + new_idx = re_dfa_add_node (dfa, dup_token, 0); + elem = create_tree (elem, NULL, 0, new_idx); + tree = create_tree (tree, elem, CONCAT, 0); + if (elem == NULL || new_idx == -1 || tree == NULL) + goto parse_dup_op_espace; + } + else + { + new_idx = re_dfa_add_node (dfa, dup_token, 0); + tree = elem = create_tree (elem, NULL, 0, new_idx); + if (new_idx == -1 || tree == NULL) + goto parse_dup_op_espace; + } + for (i = 1; i < end - start; ++i) + { + work_tree = duplicate_tree (elem, dfa); + tree = create_tree (tree, work_tree, CONCAT, 0); + if (work_tree == NULL || tree == NULL) + return *err = REG_ESPACE, NULL; + } + } + } + else + { + new_idx = re_dfa_add_node (dfa, *token, 0); + tree = create_tree (tree, NULL, 0, new_idx); + if (new_idx == -1 || tree == NULL) + return *err = REG_ESPACE, NULL; + } + *token = fetch_token (regexp, syntax); + return tree; + + parse_dup_op_espace: + free_bin_tree (tree); + *err = REG_ESPACE; + return NULL; + + parse_dup_op_invalid_interval: + if (!(syntax & 
RE_INVALID_INTERVAL_ORD)) + { + *err = REG_EBRACE; + return NULL; + } + re_string_set_index (regexp, start_idx); + *token = start_token; + token->type = CHARACTER; + return dup_elem; +} + +/* Size of the names for collating symbol/equivalence_class/character_class. + I'm not sure, but maybe enough. */ +#define BRACKET_NAME_BUF_SIZE 32 + +static inline void * +extend_array_for_cset (array, num, alloc, type_size) + void *array; + int num, *alloc, type_size; +{ + void *new_array = array; + if (*alloc == num) + { + if (*alloc == 0) + { + new_array = malloc (type_size); + *alloc = 1; + } + else + { + new_array = realloc (array, type_size * num * 2); + *alloc = 2 * num; + } + } + return new_array; +} + +/* This function parse bracket expression like "[abc]", "[a-c]", + "[[.a-a.]]" etc. */ + +static bin_tree_t * +parse_bracket_exp (regexp, dfa, token, syntax, err) + re_string_t *regexp; + re_dfa_t *dfa; + re_token_t *token; + reg_syntax_t syntax; + reg_errcode_t *err; +{ +#ifdef _LIBC + const unsigned char *collseqmb, *collseqwc; + uint32_t nrules; + int32_t table_size; + const int32_t *symb_table; + const unsigned char *extra; + + /* Local function for parse_bracket_exp. + Seek the collating symbol entry correspondings to NAME. + Return the index of the symbol in the SYMB_TABLE. */ + + static inline int32_t + seek_collating_symbol_entry (name, name_len) + unsigned char *name; + size_t name_len; + { + int32_t hash = elem_hash (name, name_len); + int32_t elem = hash % table_size; + int32_t second = hash % (table_size - 2); + while (symb_table[2 * elem] != 0) + { + /* First compare the hashing value. */ + if (symb_table[2 * elem] == hash + /* Compare the length of the name. */ + && name_len == extra[symb_table[2 * elem + 1]] + /* Compare the name. */ + && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], + name_len) == 0) + { + /* Yep, this is the entry. */ + break; + } + + /* Next entry. 
*/ + elem += second; + } + return elem; + } + + /* Local function for parse_bracket_exp. + Look up the collation sequence value of BR_ELEM. + Return the value if succeeded, UINT_MAX otherwise. */ + + static inline unsigned int + lookup_collation_sequence_value (br_elem) + bracket_elem_t *br_elem; + { + if (br_elem->type == SB_CHAR) + { + /* + if (MB_CUR_MAX == 1) + */ + if (nrules == 0) + return collseqmb[br_elem->opr.ch]; + else + { + wint_t wc = __btowc (br_elem->opr.ch); + return collseq_table_lookup (collseqwc, wc); + } + } + else if (br_elem->type == MB_CHAR) + { + return collseq_table_lookup (collseqwc, br_elem->opr.wch); + } + else if (br_elem->type == COLL_SYM) + { + if (nrules != 0) + { + int32_t elem, idx; + elem = seek_collating_symbol_entry (br_elem->opr.name, + strlen (br_elem->opr.name)); + if (symb_table[2 * elem] != 0) + { + /* We found the entry. */ + idx = symb_table[2 * elem + 1]; + /* Skip the name of collating element name. */ + idx += 1 + extra[idx]; + /* Skip the byte sequence of the collating element. */ + idx += 1 + extra[idx]; + /* Adjust for the alignment. */ + idx = (idx + 3) & ~3; + /* Skip the multibyte collation sequence value. */ + idx += sizeof (unsigned int); + /* Skip the wide char sequence of the collating element. */ + idx += sizeof (unsigned int) * + (1 + *(unsigned int *) (extra + idx)); + /* Return the collation sequence value. */ + return *(unsigned int *) (extra + idx); + } + else if (symb_table[2 * elem] == 0 && + strlen (br_elem->opr.name) == 1) + { + /* No valid character. Match it as a single byte + character. */ + return collseqmb[br_elem->opr.name[0]]; + } + } + else if (strlen (br_elem->opr.name) == 1) + return collseqmb[br_elem->opr.name[0]]; + } + return UINT_MAX; + } + + /* Local function for parse_bracket_exp. + Build the range expression which starts from START_ELEM, and ends + at END_ELEM. The result are written to MBCSET and SBCSET. 
+ RANGE_ALLOC is the allocated size of mbcset->range_starts, and + mbcset->range_ends, is a pointer argument sinse we may + update it. */ + + static inline reg_errcode_t + build_range_exp (mbcset, sbcset, range_alloc, start_elem, end_elem) + re_charset_t *mbcset; + re_bitset_ptr_t sbcset; + int *range_alloc; + bracket_elem_t *start_elem, *end_elem; + { + unsigned int ch; + uint32_t start_collseq; + uint32_t end_collseq; + + /* Check the space of the arrays. */ + if (*range_alloc == mbcset->nranges) + { + /* There are not enough space, need realloc. */ + uint32_t *new_array_start; + uint32_t *new_array_end; + int new_nranges; + + /* XXX If mbcset->range_starts and mbcset->range_ends are NULL + if *range_alloc == 0 then we do not need the if. */ + if (*range_alloc == 0) + { + new_nranges = 1; + new_array_start = re_malloc (uint32_t, 1); + new_array_end = re_malloc (uint32_t, 1); + } + else + { + new_nranges = 2 * mbcset->nranges; + new_array_start = re_realloc (mbcset->range_starts, uint32_t, + new_nranges); + new_array_end = re_realloc (mbcset->range_ends, uint32_t, + new_nranges); + } + if (new_array_start == NULL || new_array_end == NULL) + return REG_ESPACE; + + mbcset->range_starts = new_array_start; + mbcset->range_ends = new_array_end; + *range_alloc = new_nranges; + } + + if (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS + || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS) + return REG_ERANGE; + + start_collseq = lookup_collation_sequence_value (start_elem); + end_collseq = lookup_collation_sequence_value (end_elem); + /* Check start/end collation sequence values. */ + if (start_collseq == UINT_MAX || end_collseq == UINT_MAX) + return REG_ECOLLATE; + if ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq) + return REG_ERANGE; + + /* Got valid collation sequence values, add them as a new entry. 
*/ + mbcset->range_starts[mbcset->nranges] = start_collseq; + mbcset->range_ends[mbcset->nranges++] = end_collseq; + + /* Build the table for single byte characters. */ + for (ch = 0; ch <= SBC_MAX; ch++) + { + uint32_t ch_collseq; + /* + if (MB_CUR_MAX == 1) + */ + if (nrules == 0) + ch_collseq = collseqmb[ch]; + else + ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch)); + if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) + bitset_set (sbcset, ch); + } + return REG_NOERROR; + } +#endif + + /* Local function for parse_bracket_exp. + Build the collating element which is represented by NAME. + The result are written to MBCSET and SBCSET. + COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a + pointer argument sinse we may update it. */ + + static inline reg_errcode_t + build_collating_symbol (mbcset, sbcset, coll_sym_alloc, name) + re_charset_t *mbcset; + re_bitset_ptr_t sbcset; + int *coll_sym_alloc; + unsigned char *name; + { +#ifdef _LIBC + int32_t elem, idx; + if (nrules != 0) + { + elem = seek_collating_symbol_entry (name, strlen (name)); + if (symb_table[2 * elem] != 0) + { + /* We found the entry. */ + idx = symb_table[2 * elem + 1]; + /* Skip the name of collating element name. */ + idx += 1 + extra[idx]; + } + else if (symb_table[2 * elem] == 0 && strlen (name) == 1) + { + /* No valid character, treat it as a normal + character. */ + bitset_set (sbcset, name[0]); + return REG_NOERROR; + } + else + return REG_ECOLLATE; + + /* Got valid collation sequence, add it as a new entry. */ + /* Check the space of the arrays. 
*/ + mbcset->coll_syms = extend_array_for_cset (mbcset->coll_syms, + mbcset->ncoll_syms, + coll_sym_alloc, + sizeof (int32_t)); + if (mbcset->coll_syms == NULL) + return REG_ESPACE; + + mbcset->coll_syms[mbcset->ncoll_syms++] = idx; + return REG_NOERROR; + } + else +#endif + { + if (strlen (name) != 1) + return REG_ECOLLATE; + else + { + bitset_set (sbcset, name[0]); + return REG_NOERROR; + } + } + } + re_token_t br_token; + re_bitset_ptr_t sbcset; + re_charset_t *mbcset; + bin_tree_t *work_tree; + int token_len, new_idx; + int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; + int equiv_class_alloc = 0, char_class_alloc = 0; +#ifdef _LIBC + collseqmb = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); + nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); + if (nrules) + { + /* + if (MB_CUR_MAX > 1) + */ + collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); + table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); + symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_SYMB_TABLEMB); + extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_SYMB_EXTRAMB); + } +#endif + sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); + mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); + if (sbcset == NULL || mbcset == NULL) + { + *err = REG_ESPACE; + return NULL; + } + + token_len = peek_token_bracket (token, regexp, syntax); + if (token->type == END_OF_RE) + { + re_free (sbcset); + free_charset (mbcset); + *err = REG_BADPAT; + return NULL; + } + if (token->type == OP_NON_MATCH_LIST) + { + int i; + mbcset->non_match = 1; + if (syntax & RE_HAT_LISTS_NOT_NEWLINE) + bitset_set (sbcset, '\0'); + re_string_skip_bytes (regexp, token_len); /* Skip a token. 
*/ + token_len = peek_token_bracket (token, regexp, syntax); + if (token->type == END_OF_RE) + { + re_free (sbcset); + free_charset (mbcset); + *err = REG_BADPAT; + return NULL; + } + if (MB_CUR_MAX > 1) + for (i = 0; i < SBC_MAX; ++i) + if (__btowc (i) == WEOF) + bitset_set (sbcset, i); + } + + /* We treat the first ']' as a normal character. */ + if (token->type == OP_CLOSE_BRACKET) + token->type = CHARACTER; + + while (1) + { + bracket_elem_t start_elem, end_elem; + unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; + unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; + reg_errcode_t ret; + int token_len2 = 0, is_range_exp = 0; + re_token_t token2; + + start_elem.opr.name = start_name_buf; + ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, + syntax); + if (ret != REG_NOERROR) + goto parse_bracket_exp_espace; + + token_len = peek_token_bracket (token, regexp, syntax); + if (token->type == END_OF_RE) + { + re_free (sbcset); + free_charset (mbcset); + *err = REG_BADPAT; + return NULL; + } + if (token->type == OP_CHARSET_RANGE) + { + re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ + token_len2 = peek_token_bracket (&token2, regexp, syntax); + if (token->type == END_OF_RE) + { + re_free (sbcset); + free_charset (mbcset); + *err = REG_BADPAT; + return NULL; + } + if (token2.type == OP_CLOSE_BRACKET) + { + /* We treat the last '-' as a normal character. 
*/ + re_string_skip_bytes (regexp, -token_len); + token->type = CHARACTER; + } + else + is_range_exp = 1; + } + + if (is_range_exp == 1) + { + end_elem.opr.name = end_name_buf; + ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, + dfa, syntax); + if (ret != REG_NOERROR) + goto parse_bracket_exp_espace; + + token_len = peek_token_bracket (token, regexp, syntax); + if (token->type == END_OF_RE) + { + re_free (sbcset); + free_charset (mbcset); + *err = REG_BADPAT; + return NULL; + } + *err = build_range_exp (mbcset, sbcset, &range_alloc, &start_elem, + &end_elem); + if (*err != REG_NOERROR) + { + re_free (sbcset); + free_charset (mbcset); + return NULL; + } + } + else + { + switch (start_elem.type) + { + case SB_CHAR: + bitset_set (sbcset, start_elem.opr.ch); + break; + case MB_CHAR: + mbcset->mbchars = extend_array_for_cset (mbcset->mbchars, + mbcset->nmbchars, + &mbchar_alloc, + sizeof (wchar_t)); + if (mbcset->mbchars == NULL) + goto parse_bracket_exp_espace; + mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; + break; + case EQUIV_CLASS: + *err = build_equiv_class (mbcset, sbcset, &equiv_class_alloc, + start_elem.opr.name); + if (*err != REG_NOERROR) + { + re_free (sbcset); + free_charset (mbcset); + return NULL; + } + break; + case COLL_SYM: + *err = build_collating_symbol (mbcset, sbcset, &coll_sym_alloc, + start_elem.opr.name); + if (*err != REG_NOERROR) + { + re_free (sbcset); + free_charset (mbcset); + return NULL; + } + break; + case CHAR_CLASS: + ret = build_charclass (mbcset, sbcset, &char_class_alloc, + start_elem.opr.name); + if (ret != REG_NOERROR) + goto parse_bracket_exp_espace; + break; + default: + assert (0); + break; + } + } + if (token->type == OP_CLOSE_BRACKET) + break; + } + + re_string_skip_bytes (regexp, token_len); /* Skip a token. */ + + /* If it is non-matching list. */ + if (mbcset->non_match) + bitset_not (sbcset); + + /* Build a tree for simple bracket. 
*/ + br_token.type = SIMPLE_BRACKET; + br_token.opr.sbcset = sbcset; + new_idx = re_dfa_add_node (dfa, br_token, 0); + work_tree = create_tree (NULL, NULL, 0, new_idx); + if (new_idx == -1 || work_tree == NULL) + goto parse_bracket_exp_espace; + + if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes + || mbcset->nranges || (mbcset->nchar_classes && MB_CUR_MAX > 1)) + { + re_token_t alt_token; + bin_tree_t *mbc_tree; + /* Build a tree for complex bracket. */ + br_token.type = COMPLEX_BRACKET; + br_token.opr.mbcset = mbcset; + dfa->has_mb_node = 1; + new_idx = re_dfa_add_node (dfa, br_token, 0); + mbc_tree = create_tree (NULL, NULL, 0, new_idx); + if (new_idx == -1 || mbc_tree == NULL) + goto parse_bracket_exp_espace; + /* Then join them by ALT node. */ + alt_token.type = OP_ALT; + new_idx = re_dfa_add_node (dfa, alt_token, 0); + work_tree = create_tree (work_tree, mbc_tree, 0, new_idx); + if (new_idx != -1 && mbc_tree != NULL) + return work_tree; + } + else + { + free_charset (mbcset); + return work_tree; + } + + parse_bracket_exp_espace: + free_charset (mbcset); + *err = REG_ESPACE; + return NULL; +} + +static reg_errcode_t +parse_bracket_element (elem, regexp, token, token_len, dfa, syntax) + bracket_elem_t *elem; + re_string_t *regexp; + re_token_t *token; + int token_len; + re_dfa_t *dfa; + reg_syntax_t syntax; +{ +#ifdef RE_ENABLE_I18N + int cur_char_size; + cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); + if (cur_char_size > 1) + { + elem->type = MB_CHAR; + elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); + re_string_skip_bytes (regexp, cur_char_size); + return REG_NOERROR; + } +#endif /* RE_ENABLE_I18N */ + re_string_skip_bytes (regexp, token_len); /* Skip a token. 
*/ + if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS + || token->type == OP_OPEN_EQUIV_CLASS) + return parse_bracket_symbol (elem, regexp, token); + elem->type = SB_CHAR; + elem->opr.ch = token->opr.c; + return REG_NOERROR; +} + +static reg_errcode_t +parse_bracket_symbol (elem, regexp, token) + bracket_elem_t *elem; + re_string_t *regexp; + re_token_t *token; +{ + unsigned char ch, delim = token->opr.c; + int i = 0; + for (;; i++) + { +#ifdef DEBUG + assert (i < BRACKET_NAME_BUF_SIZE); +#endif + if (token->type == OP_OPEN_CHAR_CLASS) + ch = re_string_fetch_byte_case (regexp); + else + ch = re_string_fetch_byte (regexp); + if (ch == delim && re_string_peek_byte (regexp, 0) == ']') + break; + elem->opr.name[i] = ch; + } + re_string_skip_bytes (regexp, 1); + elem->opr.name[i] = '\0'; + switch (token->type) + { + case OP_OPEN_COLL_ELEM: + elem->type = COLL_SYM; + break; + case OP_OPEN_EQUIV_CLASS: + elem->type = EQUIV_CLASS; + break; + case OP_OPEN_CHAR_CLASS: + elem->type = CHAR_CLASS; + break; + default: + break; + } + return REG_NOERROR; +} + + /* Helper function for parse_bracket_exp. + Build the equivalence class which is represented by NAME. + The result are written to MBCSET and SBCSET. + EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, + is a pointer argument sinse we may update it. */ + +static reg_errcode_t +build_equiv_class (mbcset, sbcset, equiv_class_alloc, name) + re_charset_t *mbcset; + re_bitset_ptr_t sbcset; + int *equiv_class_alloc; + const unsigned char *name; +{ +#ifdef _LIBC + uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); + if (nrules != 0) + { + const int32_t *table, *indirect; + const unsigned char *weights, *extra, *cp; + unsigned char char_buf[2]; + int32_t idx1, idx2; + unsigned int ch; + size_t len; + /* This #include defines a local function! */ +# include + /* Calculate the index for equivalence class. 
*/ + cp = name; + table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); + weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_WEIGHTMB); + extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_EXTRAMB); + indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_INDIRECTMB); + idx1 = findidx (&cp); + if (idx1 == 0 || cp < name + strlen (name)) + /* This isn't a valid character. */ + return REG_ECOLLATE; + + /* Build single byte matcing table for this equivalence class. */ + char_buf[1] = '\0'; + len = weights[idx1]; + for (ch = 0; ch < SBC_MAX; ++ch) + { + char_buf[0] = ch; + cp = char_buf; + idx2 = findidx (&cp); +/* + idx2 = table[ch]; +*/ + if (idx2 == 0) + /* This isn't a valid character. */ + continue; + if (len == weights[idx2]) + { + int cnt = 0; + while (cnt <= len && + weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) + ++cnt; + + if (cnt > len) + bitset_set (sbcset, ch); + } + } + /* Check the space of the arrays, and extend if we need. */ + mbcset->equiv_classes = extend_array_for_cset (mbcset->equiv_classes, + mbcset->nequiv_classes, + equiv_class_alloc, + sizeof (int32_t)); + if (mbcset->equiv_classes == NULL) + return REG_ESPACE; + + mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; + } + else +#endif + { + if (strlen (name) != 1) + return REG_ECOLLATE; + bitset_set (sbcset, name[0]); + } + return REG_NOERROR; +} + + /* Helper function for parse_bracket_exp. + Build the character class which is represented by NAME. + The result are written to MBCSET and SBCSET. + CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, + is a pointer argument sinse we may update it. */ + +static reg_errcode_t +build_charclass (mbcset, sbcset, char_class_alloc, name) + re_charset_t *mbcset; + re_bitset_ptr_t sbcset; + int *char_class_alloc; + const unsigned char *name; +{ + int i; + + /* Check the space of the arrays. 
*/ + mbcset->char_classes = extend_array_for_cset (mbcset->char_classes, + mbcset->nchar_classes, + char_class_alloc, + sizeof (wctype_t)); + if (mbcset->char_classes == NULL) + return REG_ESPACE; + + mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); + +#define BUILD_CHARCLASS_LOOP(ctype_func)\ + for (i = 0; i < SBC_MAX; ++i) \ + { \ + if (ctype_func (i)) \ + bitset_set (sbcset, i); \ + } + + if (strcmp (name, "alnum") == 0) + BUILD_CHARCLASS_LOOP (isalnum) + else if (strcmp (name, "cntrl") == 0) + BUILD_CHARCLASS_LOOP (iscntrl) + else if (strcmp (name, "lower") == 0) + BUILD_CHARCLASS_LOOP (islower) + else if (strcmp (name, "space") == 0) + BUILD_CHARCLASS_LOOP (isspace) + else if (strcmp (name, "alpha") == 0) + BUILD_CHARCLASS_LOOP (isalpha) + else if (strcmp (name, "digit") == 0) + BUILD_CHARCLASS_LOOP (isdigit) + else if (strcmp (name, "print") == 0) + BUILD_CHARCLASS_LOOP (isprint) + else if (strcmp (name, "upper") == 0) + BUILD_CHARCLASS_LOOP (isupper) + else if (strcmp (name, "blank") == 0) + BUILD_CHARCLASS_LOOP (isblank) + else if (strcmp (name, "graph") == 0) + BUILD_CHARCLASS_LOOP (isgraph) + else if (strcmp (name, "punct") == 0) + BUILD_CHARCLASS_LOOP (ispunct) + else if (strcmp (name, "xdigit") == 0) + BUILD_CHARCLASS_LOOP (isxdigit) + else + return REG_ECTYPE; + + return REG_NOERROR; +} + +static bin_tree_t * +build_word_op (dfa, not, err) + re_dfa_t *dfa; + int not; + reg_errcode_t *err; +{ + re_bitset_ptr_t sbcset; + re_charset_t *mbcset; + reg_errcode_t ret; + re_token_t br_token; + bin_tree_t *tree; + int new_idx, alloc = 0; + + sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); + mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); + if (sbcset == NULL || mbcset == NULL) + { + *err = REG_ESPACE; + return NULL; + } + + if (not) + { + int i; + mbcset->non_match = 1; + /* + if (syntax & RE_HAT_LISTS_NOT_NEWLINE) + bitset_set(cset->sbcset, '\0'); + */ + if (MB_CUR_MAX > 1) + for (i = 0; i < SBC_MAX; ++i) + 
if (__btowc (i) == WEOF) + bitset_set (sbcset, i); + } + + ret = build_charclass (mbcset, sbcset, &alloc, "alpha"); + if (ret != REG_NOERROR) + { + re_free (sbcset); + free_charset (mbcset); + *err = REG_ESPACE; + return NULL; + } + + /* If it is non-matching list. */ + if (mbcset->non_match) + bitset_not (sbcset); + + /* Build a tree for simple bracket. */ + br_token.type = SIMPLE_BRACKET; + br_token.opr.sbcset = sbcset; + new_idx = re_dfa_add_node (dfa, br_token, 0); + tree = create_tree (NULL, NULL, 0, new_idx); + if (new_idx == -1 || tree == NULL) + goto build_word_op_espace; + + if (MB_CUR_MAX > 1) + { + re_token_t alt_token; + bin_tree_t *mbc_tree; + /* Build a tree for complex bracket. */ + br_token.type = COMPLEX_BRACKET; + br_token.opr.mbcset = mbcset; + dfa->has_mb_node = 1; + new_idx = re_dfa_add_node (dfa, br_token, 0); + mbc_tree = create_tree (NULL, NULL, 0, new_idx); + if (new_idx == -1 || mbc_tree == NULL) + goto build_word_op_espace; + /* Then join them by ALT node. */ + alt_token.type = OP_ALT; + new_idx = re_dfa_add_node (dfa, alt_token, 0); + tree = create_tree (tree, mbc_tree, 0, new_idx); + if (new_idx != -1 && mbc_tree != NULL) + return tree; + } + else + { + free_charset (mbcset); + return tree; + } + build_word_op_espace: + re_free (sbcset); + free_charset (mbcset); + *err = REG_ESPACE; + return NULL; +} + +/* This is intended for the expressions like "a{1,3}". + Fetch a number from `input', and return the number. + Return -1, if the number field is empty like "{,1}". + Return -2, If an error is occured. */ + +static int +fetch_number (input, token, syntax) + re_string_t *input; + re_token_t *token; + reg_syntax_t syntax; +{ + int num = -1; + unsigned char c; + while (1) + { + *token = fetch_token (input, syntax); + c = token->opr.c; + if (token->type == OP_CLOSE_DUP_NUM || c == ',') + break; + if (token->type != CHARACTER || c < '0' || '9' < c) + return -2; + num = (num == -1) ? 
c - '0' : num * 10 + c - '0'; + } + if (num > RE_DUP_MAX) + return -2; + return num; +} + +static void +free_charset (re_charset_t *cset) +{ + re_free (cset->mbchars); + re_free (cset->coll_syms); + re_free (cset->equiv_classes); + re_free (cset->range_starts); + re_free (cset->range_ends); + re_free (cset->char_classes); + re_free (cset); +} + +/* Functions for binary tree operation. */ + +/* Create a node of tree. + Note: This function automatically free left and right if malloc fails. */ + +static bin_tree_t * +create_tree (left, right, type, index) + bin_tree_t *left; + bin_tree_t *right; + re_token_type_t type; + int index; +{ + bin_tree_t *tree; + tree = re_malloc (bin_tree_t, 1); + if (tree == NULL) + { + free_bin_tree (left); + free_bin_tree (right); + return NULL; + } + tree->parent = NULL; + tree->left = left; + tree->right = right; + tree->type = type; + tree->node_idx = index; + tree->first = -1; + tree->next = -1; + re_node_set_init_empty (&tree->eclosure); + + if (left != NULL) + left->parent = tree; + if (right != NULL) + right->parent = tree; + return tree; +} + +/* Free the sub tree pointed by TREE. */ + +static void +free_bin_tree (tree) + bin_tree_t *tree; +{ + if (tree == NULL) + return; + /*re_node_set_free (&tree->eclosure);*/ + free_bin_tree (tree->left); + free_bin_tree (tree->right); + re_free (tree); +} + +/* Duplicate the node SRC, and return new node. */ + +static bin_tree_t * +duplicate_tree (src, dfa) + const bin_tree_t *src; + re_dfa_t *dfa; +{ + bin_tree_t *left = NULL, *right = NULL, *new_tree; + int new_node_idx; + /* Since node indies must be according to Post-order of the tree, + we must duplicate the left at first. */ + if (src->left != NULL) + { + left = duplicate_tree (src->left, dfa); + if (left == NULL) + return NULL; + } + + /* Secondaly, duplicate the right. 
*/ + if (src->right != NULL) + { + right = duplicate_tree (src->right, dfa); + if (right == NULL) + { + free_bin_tree (left); + return NULL; + } + } + + /* At last, duplicate itself. */ + if (src->type == NON_TYPE) + { + new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0); + dfa->nodes[new_node_idx].duplicated = 1; + if (new_node_idx == -1) + { + free_bin_tree (left); + free_bin_tree (right); + return NULL; + } + } + else + new_node_idx = src->type; + + new_tree = create_tree (left, right, src->type, new_node_idx); + if (new_tree == NULL) + { + free_bin_tree (left); + free_bin_tree (right); + } + return new_tree; +} diff --git a/posix/regex.h b/posix/regex.h index 8b8bb9d5a0..d1e4b6841a 100644 --- a/posix/regex.h +++ b/posix/regex.h @@ -1,8 +1,8 @@ /* Definitions for data structures and routines for the regular - expression library, version 0.12. - Copyright (C) 1985,1989-93,1995-98,2000,2001 Free Software Foundation, Inc. - This file is part of the GNU C Library. Its master source is NOT part of - the C library, however. The master source lives in /gd/gnu/lib. + expression library. + Copyright (C) 1985,1989-93,1995-98,2000,2001,2002 + Free Software Foundation, Inc. + This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -164,6 +164,10 @@ typedef unsigned long int reg_syntax_t; treated as 'a\{1'. */ #define RE_INVALID_INTERVAL_ORD (RE_DEBUG << 1) +/* If this bit is set, then ignore case when matching. + If not set, then case is significant. */ +#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1) + /* This global variable defines the particular regexp syntax to use (for some interfaces). 
When a regexp is compiled, the syntax used is stored in the pattern buffer, so changing this does not affect diff --git a/posix/regex_internal.c b/posix/regex_internal.c new file mode 100644 index 0000000000..63bed420cd --- /dev/null +++ b/posix/regex_internal.c @@ -0,0 +1,1095 @@ +/* Extended regular expression matching and search library. + Copyright (C) 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Isamu Hasegawa . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _LIBC +# ifndef _RE_DEFINE_LOCALE_FUNCTIONS +# define _RE_DEFINE_LOCALE_FUNCTIONS 1 +# include +# include +# include +# endif +#endif + +/* This is for other GNU distributions with internationalized messages. */ +#if HAVE_LIBINTL_H || defined _LIBC +# include +# ifdef _LIBC +# undef gettext +# define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES) +# endif +#else +# define gettext(msgid) (msgid) +#endif + +#ifndef gettext_noop +/* This define is so xgettext can find the internationalizable + strings. 
*/ +# define gettext_noop(String) String +#endif + +#include "regex.h" +#include "regex_internal.h" + +static void re_string_construct_common (const unsigned char *str, + int len, re_string_t *pstr); +#ifdef RE_ENABLE_I18N +static reg_errcode_t build_wcs_buffer (re_string_t *pstr); +static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr); +#endif /* RE_ENABLE_I18N */ +static reg_errcode_t build_upper_buffer (re_string_t *pstr); +static reg_errcode_t re_string_translate_buffer (re_string_t *pstr, + RE_TRANSLATE_TYPE trans); +static re_dfastate_t *create_newstate_common (re_dfa_t *dfa, + const re_node_set *nodes, + unsigned int hash); +static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa, + const re_node_set *nodes, + unsigned int hash); +static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa, + const re_node_set *nodes, + unsigned int context, + unsigned int hash); +static unsigned int inline calc_state_hash (const re_node_set *nodes, + unsigned int context); + +/* Functions for string operation. */ + +/* Construct string object. */ +static reg_errcode_t +re_string_construct (pstr, str, len, trans) + re_string_t *pstr; + const unsigned char *str; + int len; + RE_TRANSLATE_TYPE trans; +{ + reg_errcode_t ret; + re_string_construct_common (str, len, pstr); +#ifdef RE_ENABLE_I18N + if (MB_CUR_MAX >1 && pstr->len > 0) + { + ret = build_wcs_buffer (pstr); + if (ret != REG_NOERROR) + return ret; + } +#endif /* RE_ENABLE_I18N */ + pstr->mbs_case = str; + if (trans != NULL) + { + ret = re_string_translate_buffer (pstr, trans); + if (ret != REG_NOERROR) + return ret; + } + return REG_NOERROR; +} + +/* Construct string object. We use this function instead of + re_string_construct for case insensitive mode. */ + +static reg_errcode_t +re_string_construct_toupper (pstr, str, len, trans) + re_string_t *pstr; + const unsigned char *str; + int len; + RE_TRANSLATE_TYPE trans; +{ + reg_errcode_t ret; + /* Set case sensitive buffer. 
*/ + re_string_construct_common (str, len, pstr); +#ifdef RE_ENABLE_I18N + if (MB_CUR_MAX >1) + { + if (pstr->len > 0) + { + ret = build_wcs_upper_buffer (pstr); + if (ret != REG_NOERROR) + return ret; + } + } + else +#endif /* RE_ENABLE_I18N */ + { + if (pstr->len > 0) + { + ret = build_upper_buffer (pstr); + if (ret != REG_NOERROR) + return ret; + } + } + pstr->mbs_case = str; + if (trans != NULL) + { + ret = re_string_translate_buffer (pstr, trans); + if (ret != REG_NOERROR) + return ret; + } + return REG_NOERROR; +} + +/* Helper functions for re_string_construct_*. */ +static void +re_string_construct_common (str, len, pstr) + const unsigned char *str; + int len; + re_string_t *pstr; +{ + pstr->mbs = str; + pstr->cur_idx = 0; + pstr->len = len; +#ifdef RE_ENABLE_I18N + pstr->wcs = NULL; +#endif + pstr->mbs_case = NULL; + pstr->mbs_alloc = 0; + pstr->mbs_case_alloc = 0; +} + +#ifdef RE_ENABLE_I18N + +/* Build wide character buffer for `pstr'. + If the byte sequence of the string are: + (0), (1), (0), (1), + Then wide character buffer will be: + , WEOF , , WEOF , + We use WEOF for padding, they indicate that the position isn't + a first byte of a multibyte character. */ + +static reg_errcode_t +build_wcs_buffer (pstr) + re_string_t *pstr; +{ + mbstate_t state, prev_st; + wchar_t wc; + int char_idx, char_len, mbclen; + + pstr->wcs = re_malloc (wchar_t, pstr->len + 1); + if (pstr->wcs == NULL) + return REG_ESPACE; + + memset (&state, '\0', sizeof (mbstate_t)); + char_len = pstr->len; + for (char_idx = 0; char_idx < char_len ;) + { + int next_idx, remain_len = char_len - char_idx; + prev_st = state; + mbclen = mbrtowc (&wc, pstr->mbs + char_idx, remain_len, &state); + if (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0) + /* We treat these cases as a singlebyte character. */ + { + mbclen = 1; + wc = (wchar_t) pstr->mbs[char_idx++]; + state = prev_st; + } + /* Write wide character and padding. 
*/ + pstr->wcs[char_idx++] = wc; + for (next_idx = char_idx + mbclen - 1; char_idx < next_idx ;) + pstr->wcs[char_idx++] = WEOF; + } + return REG_NOERROR; +} + +static reg_errcode_t +build_wcs_upper_buffer (pstr) + re_string_t *pstr; +{ + mbstate_t state, prev_st; + wchar_t wc; + unsigned char *mbs_upper; + int char_idx, char_len, mbclen; + + pstr->wcs = re_malloc (wchar_t, pstr->len + 1); + mbs_upper = re_malloc (unsigned char, pstr->len + 1); + if (pstr->wcs == NULL || mbs_upper == NULL) + { + pstr->wcs = NULL; + return REG_ESPACE; + } + + memset (&state, '\0', sizeof (mbstate_t)); + char_len = pstr->len; + for (char_idx = 0 ; char_idx < char_len ; char_idx += mbclen) + { + int byte_idx, remain_len = char_len - char_idx; + prev_st = state; + mbclen = mbrtowc (&wc, pstr->mbs + char_idx, remain_len, &state); + if (mbclen == 1) + { + pstr->wcs[char_idx] = wc; + if (islower (pstr->mbs[char_idx])) + mbs_upper[char_idx] = toupper (pstr->mbs[char_idx]); + else + mbs_upper[char_idx] = pstr->mbs[char_idx]; + } + else if (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0) + /* We treat these cases as a singlebyte character. 
*/ + { + mbclen = 1; + pstr->wcs[char_idx] = (wchar_t) pstr->mbs[char_idx]; + mbs_upper[char_idx] = pstr->mbs[char_idx]; + state = prev_st; + } + else /* mbclen > 1 */ + { + pstr->wcs[char_idx] = wc; + if (iswlower (wc)) + wcrtomb (mbs_upper + char_idx, towupper (wc), &prev_st); + else + memcpy (mbs_upper + char_idx, pstr->mbs + char_idx, mbclen); + for (byte_idx = 1 ; byte_idx < mbclen ; byte_idx++) + pstr->wcs[char_idx + byte_idx] = WEOF; + } + } + pstr->mbs = mbs_upper; + pstr->mbs_alloc = 1; + return REG_NOERROR; +} +#endif /* RE_ENABLE_I18N */ + +static reg_errcode_t +build_upper_buffer (pstr) + re_string_t *pstr; +{ + unsigned char *mbs_upper; + int char_idx, char_len; + + mbs_upper = re_malloc (unsigned char, pstr->len + 1); + if (mbs_upper == NULL) + return REG_ESPACE; + + char_len = pstr->len; + for (char_idx = 0 ; char_idx < char_len ; char_idx ++) + { + if (islower (pstr->mbs[char_idx])) + mbs_upper[char_idx] = toupper (pstr->mbs[char_idx]); + else + mbs_upper[char_idx] = pstr->mbs[char_idx]; + } + pstr->mbs = mbs_upper; + pstr->mbs_alloc = 1; + return REG_NOERROR; +} + +/* Apply TRANS to the buffer in PSTR. We assume that wide char buffer + is already constructed if MB_CUR_MAX > 1. 
*/ + +static reg_errcode_t +re_string_translate_buffer (pstr, trans) + re_string_t *pstr; + RE_TRANSLATE_TYPE trans; +{ + int buf_idx; + unsigned char *transed_buf, *transed_case_buf; +#ifdef DEBUG + assert (trans != NULL); +#endif + if (pstr->mbs_alloc) + { + transed_buf = (unsigned char *) pstr->mbs; + transed_case_buf = re_malloc (unsigned char, pstr->len + 1); + if (transed_case_buf == NULL) + return REG_ESPACE; + pstr->mbs_case_alloc = 1; + } + else + { + transed_buf = re_malloc (unsigned char, pstr->len + 1); + if (transed_buf == NULL) + return REG_ESPACE; + transed_case_buf = NULL; + pstr->mbs_alloc = 1; + } + for (buf_idx = 0 ; buf_idx < pstr->len ; buf_idx++) + { +#ifdef RE_ENABLE_I18N + if (MB_CUR_MAX > 1 && !re_string_is_single_byte_char (pstr, buf_idx)) + transed_buf[buf_idx] = pstr->mbs[buf_idx]; + else +#endif + transed_buf[buf_idx] = trans[pstr->mbs[buf_idx]]; + if (transed_case_buf) + { +#ifdef RE_ENABLE_I18N + if (MB_CUR_MAX > 1 && !re_string_is_single_byte_char (pstr, buf_idx)) + transed_case_buf[buf_idx] = pstr->mbs_case[buf_idx]; + else +#endif + transed_case_buf[buf_idx] = trans[pstr->mbs_case[buf_idx]]; + } + } + if (pstr->mbs_case_alloc == 1) + { + pstr->mbs = transed_buf; + pstr->mbs_case = transed_case_buf; + } + else + { + pstr->mbs = transed_buf; + pstr->mbs_case = transed_buf; + } + return REG_NOERROR; +} + +static void +re_string_destruct (pstr) + re_string_t *pstr; +{ +#ifdef RE_ENABLE_I18N + re_free (pstr->wcs); +#endif /* RE_ENABLE_I18N */ + if (pstr->mbs_alloc) + re_free ((void *) pstr->mbs); + if (pstr->mbs_case_alloc) + re_free ((void *) pstr->mbs_case); +} + +/* Return the context at IDX in INPUT. 
*/
+static unsigned int
+re_string_context_at (input, idx, eflags, newline_anchor)
+     const re_string_t *input;
+     int idx, eflags, newline_anchor;
+{
+  int c;
+  /* Positions before the first byte or exactly at the end have no byte
+     to classify; report the buffer boundary instead.  */
+  if (idx < 0 || idx == input->len)
+    {
+      unsigned int context = 0;
+      if (idx < 0)
+        context = CONTEXT_BEGBUF;
+      else if (idx == input->len)
+        context = CONTEXT_ENDBUF;
+
+      /* A buffer boundary also counts as a line boundary unless the
+         matching REG_NOTBOL / REG_NOTEOL flag is set in EFLAGS.  */
+      if ((idx < 0 && !(eflags & REG_NOTBOL))
+          || (idx == input->len && !(eflags & REG_NOTEOL)))
+        return CONTEXT_NEWLINE | context;
+      else
+        return context;
+    }
+  c = re_string_byte_at (input, idx);
+  if (IS_WORD_CHAR (c))
+    return CONTEXT_WORD;
+  return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0;
+}
+
+/* Functions for set operation.  */
+
+/* Initialize SET as an empty set with room for SIZE elements.
+   Return REG_ESPACE if the allocation fails, REG_NOERROR otherwise.  */
+
+static reg_errcode_t
+re_node_set_alloc (set, size)
+     re_node_set *set;
+     int size;
+{
+  set->alloc = size;
+  set->nelem = 0;
+  set->elems = re_malloc (int, size);
+  if (set->elems == NULL)
+    return REG_ESPACE;
+  return REG_NOERROR;
+}
+
+/* Initialize SET as the one-element set { ELEM }.
+   Return REG_ESPACE if the allocation fails, REG_NOERROR otherwise.  */
+
+static reg_errcode_t
+re_node_set_init_1 (set, elem)
+     re_node_set *set;
+     int elem;
+{
+  set->alloc = 1;
+  set->nelem = 1;
+  set->elems = re_malloc (int, 1);
+  if (set->elems == NULL)
+    return REG_ESPACE;
+  set->elems[0] = elem;
+  return REG_NOERROR;
+}
+
+/* Initialize SET from ELEM1 and ELEM2, stored in ascending order.
+   Equal arguments yield a one-element set.
+   Return REG_ESPACE if the allocation fails, REG_NOERROR otherwise.  */
+
+static reg_errcode_t
+re_node_set_init_2 (set, elem1, elem2)
+     re_node_set *set;
+     int elem1, elem2;
+{
+  set->alloc = 2;
+  set->elems = re_malloc (int, 2);
+  if (set->elems == NULL)
+    return REG_ESPACE;
+  if (elem1 == elem2)
+    {
+      set->nelem = 1;
+      set->elems[0] = elem1;
+    }
+  else
+    {
+      set->nelem = 2;
+      if (elem1 < elem2)
+        {
+          set->elems[0] = elem1;
+          set->elems[1] = elem2;
+        }
+      else
+        {
+          set->elems[0] = elem2;
+          set->elems[1] = elem1;
+        }
+    }
+  return REG_NOERROR;
+}
+
+/* Initialize DEST as a copy of SRC.  An empty SRC yields a zeroed DEST.
+   Return REG_ESPACE if the allocation fails (DEST is then left as a
+   consistent empty set), REG_NOERROR otherwise.  */
+
+static reg_errcode_t
+re_node_set_init_copy (dest, src)
+     re_node_set *dest;
+     const re_node_set *src;
+{
+  dest->nelem = src->nelem;
+  if (src->nelem > 0)
+    {
+      dest->alloc = dest->nelem;
+      dest->elems = re_malloc (int, dest->alloc);
+      if (dest->elems == NULL)
+        {
+          /* Do not leave NELEM > 0 with a NULL array behind: callers
+             that ignore the return value would dereference it.  */
+          dest->alloc = dest->nelem = 0;
+          return REG_ESPACE;
+        }
+      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
+    }
+  else
+    re_node_set_init_empty (dest);
+  return REG_NOERROR;
+}
+
+/* Store the intersection of the sorted sets SRC1 and SRC2 in DEST,
+   replacing its previous contents and growing its array if needed.
+   Return REG_ESPACE if the allocation fails, REG_NOERROR otherwise.  */
+
+static reg_errcode_t
+re_node_set_intersect (dest, src1, src2)
+     re_node_set *dest;
+     const re_node_set *src1, *src2;
+{
+  int i1, i2, id;
+  if (src1->nelem > 0 && src2->nelem > 0)
+    {
+      if (src1->nelem + src2->nelem > dest->alloc)
+        {
+          int *new_array;
+          int new_alloc = src1->nelem + src2->nelem;
+          if (dest->alloc == 0)
+            new_array = re_malloc (int, new_alloc);
+          else
+            new_array = re_realloc (dest->elems, int, new_alloc);
+          if (new_array == NULL)
+            return REG_ESPACE;
+          /* Record the new capacity only once the memory exists.  */
+          dest->elems = new_array;
+          dest->alloc = new_alloc;
+        }
+    }
+  else
+    {
+      dest->nelem = 0;
+      return REG_NOERROR;
+    }
+
+  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
+    {
+      if (src1->elems[i1] > src2->elems[i2])
+        {
+          ++i2;
+          continue;
+        }
+      if (src1->elems[i1] == src2->elems[i2])
+        dest->elems[id++] = src2->elems[i2++];
+      ++i1;
+    }
+  dest->nelem = id;
+  return REG_NOERROR;
+}
+
+/* Add every element common to the sorted sets SRC1 and SRC2 to DEST,
+   keeping DEST sorted and free of duplicates.
+   Return REG_ESPACE if the allocation fails, REG_NOERROR otherwise.  */
+
+static reg_errcode_t
+re_node_set_add_intersect (dest, src1, src2)
+     re_node_set *dest;
+     const re_node_set *src1, *src2;
+{
+  int i1, i2, id;
+  if (src1->nelem > 0 && src2->nelem > 0)
+    {
+      if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
+        {
+          int *new_array;
+          int new_alloc = src1->nelem + src2->nelem + dest->nelem;
+          /* Both branches request exactly NEW_ALLOC ints, so the
+             recorded capacity always matches the real one.  */
+          if (dest->alloc == 0)
+            new_array = re_malloc (int, new_alloc);
+          else
+            new_array = re_realloc (dest->elems, int, new_alloc);
+          if (new_array == NULL)
+            return REG_ESPACE;
+          dest->elems = new_array;
+          dest->alloc = new_alloc;
+        }
+    }
+  else
+    return REG_NOERROR;
+
+  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
+    {
+      if (src1->elems[i1] > src2->elems[i2])
+        {
+          ++i2;
+          continue;
+        }
+      if (src1->elems[i1] == src2->elems[i2])
+        {
+          /* Advance ID to the slot in DEST where the common element
+             belongs.  */
+          while (id < dest->nelem && dest->elems[id] < src2->elems[i2])
+            ++id;
+          if (id < dest->nelem && dest->elems[id] == src2->elems[i2])
+            ++id;
+          else
+            {
+              memmove (dest->elems + id + 1, dest->elems + id,
+                       sizeof (int) * (dest->nelem - id));
+              dest->elems[id++] = src2->elems[i2++];
+              ++dest->nelem;
+            }
+        }
+      ++i1;
+    }
+  return REG_NOERROR;
+}
+
+/* Initialize DEST as the union of the sorted sets SRC1 and SRC2.
+   Either source may be NULL or empty.
+   Return REG_ESPACE if the allocation fails, REG_NOERROR otherwise.  */
+
+static reg_errcode_t
+re_node_set_init_union (dest, src1, src2)
+     re_node_set *dest;
+     const re_node_set *src1, *src2;
+{
+  int i1, i2, id;
+  if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
+    {
+      dest->alloc = src1->nelem + src2->nelem;
+      dest->elems = re_malloc (int, dest->alloc);
+      if (dest->elems == NULL)
+        return REG_ESPACE;
+    }
+  else
+    {
+      if (src1 != NULL && src1->nelem > 0)
+        return re_node_set_init_copy (dest, src1);
+      else if (src2 != NULL && src2->nelem > 0)
+        return re_node_set_init_copy (dest, src2);
+      else
+        re_node_set_init_empty (dest);
+      return REG_NOERROR;
+    }
+  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
+    {
+      if (src1->elems[i1] > src2->elems[i2])
+        {
+          dest->elems[id++] = src2->elems[i2++];
+          continue;
+        }
+      if (src1->elems[i1] == src2->elems[i2])
+        ++i2;
+      dest->elems[id++] = src1->elems[i1++];
+    }
+  if (i1 < src1->nelem)
+    {
+      memcpy (dest->elems + id, src1->elems + i1,
+              (src1->nelem - i1) * sizeof (int));
+      id += src1->nelem - i1;
+    }
+  else if (i2 < src2->nelem)
+    {
+      memcpy (dest->elems + id, src2->elems + i2,
+              (src2->nelem - i2) * sizeof (int));
+      id += src2->nelem - i2;
+    }
+  dest->nelem = id;
+  return REG_NOERROR;
+}
+
+/* Merge the sorted set SRC into the sorted set DEST in place.
+   A NULL or empty SRC, or a NULL DEST, is a no-op.
+   Return REG_ESPACE if DEST cannot be grown (DEST is then unchanged),
+   REG_NOERROR otherwise.  */
+
+static reg_errcode_t
+re_node_set_merge (dest, src)
+     re_node_set *dest;
+     const re_node_set *src;
+{
+  int si, di;
+  if (src == NULL || src->nelem == 0)
+    return REG_NOERROR;
+  else if (dest == NULL)
+    /* DEST is passed by value, so a set allocated here could never be
+       handed back to the caller; it would only leak.  There is nothing
+       to merge into, so report success without allocating.  */
+    return REG_NOERROR;
+  if (dest->alloc < src->nelem + dest->nelem)
+    {
+      int new_alloc = 2 * (src->nelem + dest->alloc);
+      int *new_array = re_realloc (dest->elems, int, new_alloc);
+      /* Keep the old array and capacity when realloc fails.  */
+      if (new_array == NULL)
+        return REG_ESPACE;
+      dest->elems = new_array;
+      dest->alloc = new_alloc;
+    }
+
+  for (si = 0, di = 0 ; si < src->nelem && di < dest->nelem ;)
+    {
+      int cp_from, ncp, mid, right, src_elem = src->elems[si];
+      /* Binary search the spot we will add the new element.  */
+      right = dest->nelem;
+      while (di < right)
+        {
+          mid = (di + right) / 2;
+          if (dest->elems[mid] < src_elem)
+            di = mid + 1;
+          else
+            right = mid;
+        }
+      if (di >= dest->nelem)
+        break;
+
+      if (dest->elems[di] == src_elem)
+        {
+          /* Skip since DEST already has the element.  */
+          ++di;
+          ++si;
+          continue;
+        }
+
+      /* Skip the src elements which are less than dest->elems[di].  */
+      cp_from = si;
+      while (si < src->nelem && src->elems[si] < dest->elems[di])
+        ++si;
+      /* Copy these src elements.  */
+      ncp = si - cp_from;
+      memmove (dest->elems + di + ncp, dest->elems + di,
+               sizeof (int) * (dest->nelem - di));
+      memcpy (dest->elems + di, src->elems + cp_from,
+              sizeof (int) * ncp);
+      /* Update counters.  */
+      di += ncp;
+      dest->nelem += ncp;
+    }
+
+  /* Copy remaining src elements.  */
+  if (si < src->nelem)
+    {
+      memcpy (dest->elems + di, src->elems + si,
+              sizeof (int) * (src->nelem - si));
+      dest->nelem += src->nelem - si;
+    }
+  return REG_NOERROR;
+}
+
+/* Insert the new element ELEM to the re_node_set* SET.
+   return 0 if SET already has ELEM,
+   return -1 if an error occurred, return 1 otherwise.  */
+
+static int
+re_node_set_insert (set, elem)
+     re_node_set *set;
+     int elem;
+{
+  int idx, right, mid;
+  /* In case of the set is empty.  */
+  if (set->elems == NULL || set->alloc == 0)
+    {
+      if (re_node_set_init_1 (set, elem) == REG_NOERROR)
+        return 1;
+      else
+        return -1;
+    }
+
+  /* Binary search the spot we will add the new element.  */
+  idx = 0;
+  right = set->nelem;
+  while (idx < right)
+    {
+      mid = (idx + right) / 2;
+      if (set->elems[mid] < elem)
+        idx = mid + 1;
+      else
+        right = mid;
+    }
+
+  /* IDX is the first slot whose value is >= ELEM; honour the documented
+     contract and keep the set free of duplicates.  */
+  if (idx < set->nelem && set->elems[idx] == elem)
+    return 0;
+
+  /* Realloc if we need.  Growing in place avoids leaking the old array,
+     and the capacity is only updated once the memory exists.  */
+  if (set->alloc < set->nelem + 1)
+    {
+      int new_alloc = set->alloc * 2;
+      int *new_array = re_realloc (set->elems, int, new_alloc);
+      if (new_array == NULL)
+        return -1;
+      set->elems = new_array;
+      set->alloc = new_alloc;
+    }
+
+  /* Move the elements which follows the new element.  */
+  if (set->nelem - idx > 0)
+    memmove (set->elems + idx + 1, set->elems + idx,
+             sizeof (int) * (set->nelem - idx));
+  /* Insert the new element.  */
+  set->elems[idx] = elem;
+  ++set->nelem;
+  return 1;
+}
+
+/* Compare two node sets SET1 and SET2.
+   return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
+
+static int
+re_node_set_compare (set1, set2)
+     const re_node_set *set1, *set2;
+{
+  int i;
+  if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
+    return 0;
+  for (i = 0 ; i < set1->nelem ; i++)
+    if (set1->elems[i] != set2->elems[i])
+      return 0;
+  return 1;
+}
+
+/* Return 1 if SET contains the element ELEM, return 0 otherwise.  */
+
+static int
+re_node_set_contains (set, elem)
+     const re_node_set *set;
+     int elem;
+{
+  int idx, right, mid;
+  if (set->nelem <= 0)
+    return 0;
+
+  /* Binary search the element.  */
+  idx = 0;
+  right = set->nelem - 1;
+  while (idx < right)
+    {
+      mid = (idx + right) / 2;
+      if (set->elems[mid] < elem)
+        idx = mid + 1;
+      else
+        right = mid;
+    }
+  return set->elems[idx] == elem;
+}
+
+/* Remove the element at index IDX from SET.  An out-of-range IDX is
+   ignored.  */
+
+static void
+re_node_set_remove_at (set, idx)
+     re_node_set *set;
+     int idx;
+{
+  if (idx < 0 || idx >= set->nelem)
+    return;
+  if (idx < set->nelem - 1)
+    memmove (set->elems + idx, set->elems + idx + 1,
+             sizeof (int) * (set->nelem - idx - 1));
+  --set->nelem;
+}
+
+
+/* Add the token TOKEN to dfa->nodes, and return the index of the token.
+   Or return -1, if an error occurred.
*/
+
+static int
+re_dfa_add_node (dfa, token, mode)
+     re_dfa_t *dfa;
+     re_token_t token;
+     int mode;
+{
+  if (dfa->nodes_len >= dfa->nodes_alloc)
+    {
+      re_token_t *new_array;
+      /* Doubling a capacity of 0 would never grow the array.  */
+      int new_alloc = dfa->nodes_alloc > 0 ? dfa->nodes_alloc * 2 : 1;
+      new_array = re_realloc (dfa->nodes, re_token_t, new_alloc);
+      if (new_array == NULL)
+        return -1;
+      dfa->nodes = new_array;
+      if (mode)
+        {
+          int *new_firsts, *new_nexts;
+          re_node_set *new_edests, *new_eclosures, *new_inveclosures;
+
+          /* Store each array back as soon as its realloc succeeds.
+             realloc may move a block, so keeping the old pointer after
+             a later failure would leave DFA pointing at freed memory.  */
+          new_firsts = re_realloc (dfa->firsts, int, new_alloc);
+          if (new_firsts != NULL)
+            dfa->firsts = new_firsts;
+          new_nexts = re_realloc (dfa->nexts, int, new_alloc);
+          if (new_nexts != NULL)
+            dfa->nexts = new_nexts;
+          new_edests = re_realloc (dfa->edests, re_node_set, new_alloc);
+          if (new_edests != NULL)
+            dfa->edests = new_edests;
+          new_eclosures = re_realloc (dfa->eclosures, re_node_set,
+                                      new_alloc);
+          if (new_eclosures != NULL)
+            dfa->eclosures = new_eclosures;
+          new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
+                                         new_alloc);
+          if (new_inveclosures != NULL)
+            dfa->inveclosures = new_inveclosures;
+          if (new_firsts == NULL || new_nexts == NULL || new_edests == NULL
+              || new_eclosures == NULL || new_inveclosures == NULL)
+            return -1;
+        }
+      /* Publish the new capacity only after every array has grown, so a
+         failed call is simply retried by the next one.  */
+      dfa->nodes_alloc = new_alloc;
+    }
+  dfa->nodes[dfa->nodes_len] = token;
+  dfa->nodes[dfa->nodes_len].duplicated = 0;
+  return dfa->nodes_len++;
+}
+
+/* Return the hash of the node set NODES under CONTEXT: the element
+   count plus CONTEXT plus the sum of all elements.  */
+
+static inline unsigned int
+calc_state_hash (nodes, context)
+     const re_node_set *nodes;
+     unsigned int context;
+{
+  unsigned int hash = nodes->nelem + context;
+  int i;
+  for (i = 0 ; i < nodes->nelem ; i++)
+    hash += nodes->elems[i];
+  return hash;
+}
+
+/* Search for the state whose node_set is equivalent to NODES.
+   Return the pointer to the state, if we found it in the DFA.
+   Otherwise create the new one and return it.
*/
+
+static re_dfastate_t *
+re_acquire_state (dfa, nodes)
+     re_dfa_t *dfa;
+     const re_node_set *nodes;
+{
+  unsigned int hash;
+  struct re_state_table_entry *spot;
+  int i;
+  if (nodes->nelem == 0)
+    return NULL;
+  hash = calc_state_hash (nodes, 0);
+  spot = dfa->state_table + (hash & dfa->state_hash_mask);
+
+  if (spot->alloc == 0)
+    {
+      /* Currently there are only one state in this spot.  */
+      if (spot->entry.state != NULL && hash == spot->entry.state->hash
+          && re_node_set_compare (&spot->entry.state->nodes, nodes))
+        return spot->entry.state;
+    }
+  else
+    for (i = 0 ; i < spot->num ; i++)
+      {
+        re_dfastate_t *state = spot->entry.array[i];
+        if (hash != state->hash)
+          continue;
+        if (re_node_set_compare (&state->nodes, nodes))
+          return state;
+      }
+
+  /* There are no appropriate state in the dfa, create the new one.  */
+  return create_ci_newstate (dfa, nodes, hash);
+}
+
+/* Search for the state whose node_set is equivalent to NODES and
+   whose context is equivalent to CONTEXT.
+   Return the pointer to the state, if we found it in the DFA.
+   Otherwise create the new one and return it.  */
+
+static re_dfastate_t *
+re_acquire_state_context (dfa, nodes, context)
+     re_dfa_t *dfa;
+     const re_node_set *nodes;
+     unsigned int context;
+{
+  unsigned int hash;
+  struct re_state_table_entry *spot;
+  int i;
+  if (nodes->nelem == 0)
+    return NULL;
+  hash = calc_state_hash (nodes, context);
+  spot = dfa->state_table + (hash & dfa->state_hash_mask);
+
+  if (spot->alloc == 0)
+    {
+      /* Currently there are only one state in this spot.  */
+      if (spot->entry.state != NULL && hash == spot->entry.state->hash
+          && re_node_set_compare (&spot->entry.state->nodes, nodes)
+          && spot->entry.state->context == context)
+        return spot->entry.state;
+    }
+  else
+    for (i = 0 ; i < spot->num ; i++)
+      {
+        re_dfastate_t *state = spot->entry.array[i];
+        if (hash != state->hash)
+          continue;
+        if (re_node_set_compare (state->entrance_nodes, nodes)
+            && state->context == context)
+          return state;
+      }
+  /* There are no appropriate state in `dfa', create the new one.  */
+  return create_cd_newstate (dfa, nodes, context, hash);
+}
+
+/* Allocate a zeroed state whose node set is a copy of NODES and whose
+   hash is HASH.  Return NULL if memory is exhausted.  */
+
+static re_dfastate_t *
+create_newstate_common (dfa, nodes, hash)
+     re_dfa_t *dfa;
+     const re_node_set *nodes;
+     unsigned int hash;
+{
+  re_dfastate_t *newstate;
+  newstate = (re_dfastate_t *) calloc (1, sizeof (re_dfastate_t));
+  if (newstate == NULL)
+    return NULL;
+  if (re_node_set_init_copy (&newstate->nodes, nodes) != REG_NOERROR)
+    {
+      re_free (newstate);
+      return NULL;
+    }
+  newstate->trtable = NULL;
+  newstate->trtable_search = NULL;
+  newstate->hash = hash;
+  return newstate;
+}
+
+/* Append NEWSTATE to the hash bucket selected by HASH, growing the
+   bucket when it is full.  If the bucket cannot be grown the state is
+   left unregistered and the bucket is unchanged.  */
+
+static void
+register_state (dfa, newstate, hash)
+     re_dfa_t *dfa;
+     re_dfastate_t *newstate;
+     unsigned int hash;
+{
+  struct re_state_table_entry *spot;
+  spot = dfa->state_table + (hash & dfa->state_hash_mask);
+
+  if (spot->alloc <= spot->num)
+    {
+      re_dfastate_t **new_array;
+      int new_alloc;
+
+      /* XXX Is spot->entry.array == NULL if spot->alloc == 0?  If yes
+         the if can go away and only realloc is needed.  */
+      if (spot->alloc == 0)
+        {
+          new_alloc = 4;
+          new_array = re_malloc (re_dfastate_t *, new_alloc);
+          if (new_array == NULL)
+            /* XXX return value */
+            return;
+          new_array[0] = spot->entry.state;
+        }
+      else
+        {
+          new_alloc = 2 * spot->num;
+          new_array = re_realloc (spot->entry.array, re_dfastate_t *,
+                                  new_alloc);
+          /* Keep the old array: overwriting it with NULL would lose
+             every registered state and crash on the store below.  */
+          if (new_array == NULL)
+            /* XXX return value */
+            return;
+        }
+      spot->entry.array = new_array;
+      spot->alloc = new_alloc;
+    }
+  spot->entry.array[spot->num++] = newstate;
+}
+
+/* Create and register the context-independent state for NODES.
+   Return NULL if memory is exhausted.  */
+
+static re_dfastate_t *
+create_ci_newstate (dfa, nodes, hash)
+     re_dfa_t *dfa;
+     const re_node_set *nodes;
+     unsigned int hash;
+{
+  int i;
+  re_dfastate_t *newstate;
+  newstate = create_newstate_common (dfa, nodes, hash);
+  if (newstate == NULL)
+    return NULL;
+  newstate->entrance_nodes = &newstate->nodes;
+
+  for (i = 0 ; i < nodes->nelem ; i++)
+    {
+      re_token_t *node = dfa->nodes + nodes->elems[i];
+      re_token_type_t type = node->type;
+      if (type == CHARACTER)
+        continue;
+
+      /* If the state has the halt node, the state is a halt state.  */
+      else if (type == END_OF_RE)
+        newstate->halt = 1;
+      else if (type == COMPLEX_BRACKET
+               || (type == OP_PERIOD && MB_CUR_MAX > 1))
+        newstate->accept_mb = 1;
+      else if (type == OP_BACK_REF)
+        newstate->has_backref = 1;
+      /* Both operands must be compared with TYPE; a bare
+         OP_CONTEXT_NODE is a non-zero constant and made this branch
+         catch every remaining node type.  */
+      else if (type == ANCHOR || type == OP_CONTEXT_NODE)
+        {
+          newstate->has_constraint = 1;
+          if (type == OP_CONTEXT_NODE
+              && dfa->nodes[node->opr.ctx_info->entity].type == END_OF_RE)
+            newstate->halt = 1;
+        }
+    }
+
+  register_state (dfa, newstate, hash);
+  return newstate;
+}
+
+/* Create and register the state for NODES entered under CONTEXT.
+   Nodes whose constraint CONTEXT does not satisfy are dropped from the
+   state's node set; the full set is kept in `entrance_nodes'.
+   Return NULL if memory is exhausted.  */
+
+static re_dfastate_t *
+create_cd_newstate (dfa, nodes, context, hash)
+     re_dfa_t *dfa;
+     const re_node_set *nodes;
+     unsigned int context, hash;
+{
+  int i, nctx_nodes = 0;
+  re_dfastate_t *newstate;
+
+  newstate = create_newstate_common (dfa, nodes, hash);
+  if (newstate == NULL)
+    return NULL;
+  newstate->context = context;
+  newstate->entrance_nodes = &newstate->nodes;
+
+  for (i = 0 ; i < nodes->nelem ; i++)
+    {
+      unsigned int constraint = 0;
+      re_token_t *node = dfa->nodes + nodes->elems[i];
+      re_token_type_t type = node->type;
+      if (type == CHARACTER)
+        continue;
+
+      /* If the state has the halt node, the state is a halt state.  */
+      else if (type == END_OF_RE)
+        newstate->halt = 1;
+      else if (type == COMPLEX_BRACKET
+               || (type == OP_PERIOD && MB_CUR_MAX > 1))
+        newstate->accept_mb = 1;
+      else if (type == OP_BACK_REF)
+        newstate->has_backref = 1;
+      else if (type == ANCHOR)
+        constraint = node->opr.ctx_type;
+      else if (type == OP_CONTEXT_NODE)
+        {
+          re_token_type_t ctype = dfa->nodes[node->opr.ctx_info->entity].type;
+          constraint = node->constraint;
+          if (ctype == END_OF_RE)
+            newstate->halt = 1;
+          else if (ctype == OP_BACK_REF)
+            newstate->has_backref = 1;
+          /* Test the wrapped entity's type: TYPE is OP_CONTEXT_NODE
+             here, so comparing it with OP_PERIOD could never match.  */
+          else if (ctype == COMPLEX_BRACKET
+                   || (ctype == OP_PERIOD && MB_CUR_MAX > 1))
+            newstate->accept_mb = 1;
+        }
+
+      if (constraint)
+        {
+          if (newstate->entrance_nodes == &newstate->nodes)
+            {
+              /* First constrained node: keep the unfiltered set apart
+                 before nodes start being removed.  */
+              newstate->entrance_nodes = re_malloc (re_node_set, 1);
+              if (newstate->entrance_nodes == NULL)
+                {
+                  re_node_set_free (&newstate->nodes);
+                  re_free (newstate);
+                  return NULL;
+                }
+              if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
+                  != REG_NOERROR)
+                {
+                  re_free (newstate->entrance_nodes);
+                  re_node_set_free (&newstate->nodes);
+                  re_free (newstate);
+                  return NULL;
+                }
+              nctx_nodes = 0;
+              newstate->has_constraint = 1;
+            }
+
+          if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
+            {
+              /* Earlier removals shifted the remaining nodes down.  */
+              re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
+              ++nctx_nodes;
+            }
+        }
+    }
+  register_state (dfa, newstate, hash);
+  return newstate;
+}
diff --git a/posix/regex_internal.h b/posix/regex_internal.h
new file mode 100644
index 0000000000..35f9f4a868
--- /dev/null
+++ b/posix/regex_internal.h
@@ -0,0 +1,542 @@
+/* Extended regular expression matching and search library.
+   Copyright (C) 2002 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Isamu Hasegawa .
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef _REGEX_INTERNAL_H +#define _REGEX_INTERNAL_H 1 + +/* Number of bits in a byte. */ +#define BYTE_BITS 8 +/* Number of single byte character. */ +#define SBC_MAX 256 + +#define COLL_ELEM_LEN_MAX 8 + +/* The character which represents newline. */ +#define NEWLINE_CHAR '\n' + +/* Rename to standard API for using out of glibc. */ +#ifndef _LIBC +# define __wctype wctype +# define __iswctype iswctype +# define __btowc btowc +# define __mempcpy memcpy +#endif + +extern const char re_error_msgid[]; +extern const size_t re_error_msgid_idx[]; + +/* Number of bits in an unsinged int. */ +#define UINT_BITS (sizeof (unsigned int) * BYTE_BITS) +/* Number of unsigned int in an bit_set. 
*/
+#define BITSET_UINTS ((SBC_MAX + UINT_BITS - 1) / UINT_BITS)
+typedef unsigned int bitset[BITSET_UINTS];
+typedef unsigned int *re_bitset_ptr_t;
+
+/* Single-bit operations on a bitset.  The arguments are parenthesized
+   so that expressions such as `c + 1' index correctly, and the mask is
+   built from an unsigned 1 so that the top bit of a word can be
+   addressed without shifting into the sign bit of an int.  */
+#define bitset_set(set,i) \
+  ((set)[(i) / UINT_BITS] |= 1u << ((i) % UINT_BITS))
+#define bitset_clear(set,i) \
+  ((set)[(i) / UINT_BITS] &= ~(1u << ((i) % UINT_BITS)))
+#define bitset_contain(set,i) \
+  ((set)[(i) / UINT_BITS] & (1u << ((i) % UINT_BITS)))
+#define bitset_empty(set) memset (set, 0, sizeof (unsigned int) * BITSET_UINTS)
+#define bitset_set_all(set) \
+  memset (set, 255, sizeof (unsigned int) * BITSET_UINTS)
+#define bitset_copy(dest,src) \
+  memcpy (dest, src, sizeof (unsigned int) * BITSET_UINTS)
+static inline void bitset_not (bitset set);
+static inline void bitset_merge (bitset dest, const bitset src);
+static inline void bitset_not_merge (bitset dest, const bitset src);
+
+#define PREV_WORD_CONSTRAINT 0x0001
+#define PREV_NOTWORD_CONSTRAINT 0x0002
+#define NEXT_WORD_CONSTRAINT 0x0004
+#define NEXT_NOTWORD_CONSTRAINT 0x0008
+#define PREV_NEWLINE_CONSTRAINT 0x0010
+#define NEXT_NEWLINE_CONSTRAINT 0x0020
+#define PREV_BEGBUF_CONSTRAINT 0x0040
+#define NEXT_ENDBUF_CONSTRAINT 0x0080
+#define DUMMY_CONSTRAINT 0x0100
+
+typedef enum
+{
+  INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
+  WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
+  WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
+  LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
+  LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
+  BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
+  BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
+  WORD_DELIM = DUMMY_CONSTRAINT
+} re_context_type;
+
+typedef struct
+{
+  int alloc;
+  int nelem;
+  int *elems;
+} re_node_set;
+
+typedef enum
+{
+  NON_TYPE = 0,
+
+  /* Token type, these are used only by token.
*/ + OP_OPEN_SUBEXP, + OP_CLOSE_SUBEXP, + OP_OPEN_BRACKET, + OP_CLOSE_BRACKET, + OP_CHARSET_RANGE, + OP_OPEN_DUP_NUM, + OP_CLOSE_DUP_NUM, + OP_NON_MATCH_LIST, + OP_OPEN_COLL_ELEM, + OP_CLOSE_COLL_ELEM, + OP_OPEN_EQUIV_CLASS, + OP_CLOSE_EQUIV_CLASS, + OP_OPEN_CHAR_CLASS, + OP_CLOSE_CHAR_CLASS, + OP_WORD, + OP_NOTWORD, + BACK_SLASH, + + /* Tree type, these are used only by tree. */ + CONCAT, + ALT, + SUBEXP, + SIMPLE_BRACKET, + COMPLEX_BRACKET, + + /* Node type, These are used by token, node, tree. */ + OP_PERIOD, + CHARACTER, + END_OF_RE, + OP_ALT, + OP_DUP_ASTERISK, + OP_DUP_PLUS, + OP_DUP_QUESTION, + OP_BACK_REF, + ANCHOR, + OP_CONTEXT_NODE, + + /* Dummy marker. */ + END_OF_RE_TOKEN_T +} re_token_type_t; + +typedef struct +{ + /* If this character set is the non-matching list. */ + unsigned int non_match : 1; + + /* Multibyte characters. */ + wchar_t *mbchars; + int nmbchars; + + /* Collating symbols. */ +#ifdef _LIBC + int32_t *coll_syms; +#endif + int ncoll_syms; + + /* Equivalence classes. */ +#ifdef _LIBC + int32_t *equiv_classes; +#endif + int nequiv_classes; + + /* Range expressions. */ +#ifdef _LIBC + uint32_t *range_starts; + uint32_t *range_ends; +#endif + int nranges; + + /* Character classes. 
*/ + wctype_t *char_classes; + int nchar_classes; +} re_charset_t; + +typedef struct +{ + re_token_type_t type; + union + { + unsigned char c; /* for CHARACTER */ + re_bitset_ptr_t sbcset; /* for SIMPLE_BRACKET */ + re_charset_t *mbcset; /* for COMPLEX_BRACKET */ + int idx; /* for BACK_REF */ + re_context_type ctx_type; /* for ANCHOR */ + struct + { + int entity; /* for OP_CONTEXT_NODE, index of the entity */ + re_node_set *bkref_eclosure; + } *ctx_info; + } opr; + unsigned int constraint : 10; /* context constraint */ + unsigned int duplicated : 1; +#ifdef RE_ENABLE_I18N + unsigned int mb_partial : 1; +#endif +} re_token_t; + +#define IS_EPSILON_NODE(type) \ + ((type) == OP_ALT || (type) == OP_DUP_ASTERISK || (type) == OP_DUP_PLUS || \ + (type) == OP_DUP_QUESTION || (type) == ANCHOR) + +#define ACCEPT_MB_NODE(type) \ + ((type) == COMPLEX_BRACKET || (type) == OP_PERIOD) + +struct re_string_t +{ + /* Store the multibyte string. In case of "case insensitive mode" like + REG_ICASE, upper cases of the string are stored. */ + const unsigned char *mbs; + /* Store the case sensitive multibyte string. In case of + "case insensitive mode", the original string are stored, + otherwise MBS_CASE points the same address that MBS points. */ + const unsigned char *mbs_case; + int cur_idx; + int len; +#ifdef RE_ENABLE_I18N + /* Store the wide character string which is corresponding to MBS. */ + wchar_t *wcs; +#endif + /* 1 if mbs is allocated by regex library. */ + unsigned int mbs_alloc : 1; + /* 1 if mbs_case is allocated by regex library. 
*/ + unsigned int mbs_case_alloc : 1; +}; +typedef struct re_string_t re_string_t; + +static reg_errcode_t re_string_construct (re_string_t *pstr, + const unsigned char *str, int len, + RE_TRANSLATE_TYPE trans); +static reg_errcode_t re_string_construct_toupper (re_string_t *pstr, + const unsigned char *str, + int len, + RE_TRANSLATE_TYPE trans); +static void re_string_destruct (re_string_t *pstr); +#ifdef RE_ENABLE_I18N +static int re_string_elem_size_at (const re_string_t *pstr, int idx); +static inline int re_string_char_size_at (const re_string_t *pstr, int idx); +static inline wint_t re_string_wchar_at (const re_string_t *pstr, int idx); +#endif /* RE_ENABLE_I18N */ +static unsigned int re_string_context_at (const re_string_t *input, int idx, + int eflags, int newline_anchor); +#define re_string_peek_byte(pstr, offset) \ + ((pstr)->mbs[(pstr)->cur_idx + offset]) +#define re_string_peek_byte_case(pstr, offset) \ + ((pstr)->mbs_case[(pstr)->cur_idx + offset]) +#define re_string_fetch_byte(pstr) \ + ((pstr)->mbs[(pstr)->cur_idx++]) +#define re_string_fetch_byte_case(pstr) \ + ((pstr)->mbs_case[(pstr)->cur_idx++]) +#define re_string_first_byte(pstr, idx) \ + ((idx) == (pstr)->len || (pstr)->wcs[idx] != WEOF) +#define re_string_is_single_byte_char(pstr, idx) \ + ((pstr)->wcs[idx] != WEOF && ((pstr)->len == (idx) \ + || (pstr)->wcs[(idx) + 1] != WEOF)) +#define re_string_eoi(pstr) ((pstr)->len == (pstr)->cur_idx) +#define re_string_cur_idx(pstr) ((pstr)->cur_idx) +#define re_string_get_buffer(pstr) ((pstr)->mbs) +#define re_string_length(pstr) ((pstr)->len) +#define re_string_byte_at(pstr,idx) \ + ((pstr)->mbs[idx]) +#define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx)) +#define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx)) + +#define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t))) +#define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t))) +#define re_free(p) free (p) + +struct bin_tree_t +{ + struct bin_tree_t *parent; + struct 
bin_tree_t *left; + struct bin_tree_t *right; + + /* `node_idx' is the index in dfa->nodes, if `type' == 0. + Otherwise `type' indicate the type of this node. */ + re_token_type_t type; + int node_idx; + + int first; + int next; + re_node_set eclosure; +}; +typedef struct bin_tree_t bin_tree_t; + +struct re_backref_cache_entry +{ + int node; + int from; + int to; + int flag; +}; + +typedef struct +{ + int eflags; + int match_first; + int match_last; + int state_log_top; + /* Back reference cache. */ + int nbkref_ents; + int abkref_ents; + struct re_backref_cache_entry *bkref_ents; + int max_bkref_len; +} re_match_context_t; + + +#define CONTEXT_WORD 1 +#define CONTEXT_NEWLINE (CONTEXT_WORD << 1) +#define CONTEXT_BEGBUF (CONTEXT_NEWLINE << 1) +#define CONTEXT_ENDBUF (CONTEXT_BEGBUF << 1) + +#define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD) +#define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE) +#define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF) +#define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF) +#define IS_ORDINARY_CONTEXT(c) ((c) == 0) + +#define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_') +#define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR) + +#define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \ + ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \ + || ((constraint & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \ + || ((constraint & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context))\ + || ((constraint & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context))) + +#define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \ + ((((constraint) & NEXT_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \ + || (((constraint) & NEXT_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \ + || (((constraint) & NEXT_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \ + || (((constraint) & NEXT_ENDBUF_CONSTRAINT) && !IS_ENDBUF_CONTEXT (context))) + +struct re_dfastate_t +{ + unsigned int hash; + re_node_set nodes; + re_node_set 
*entrance_nodes;
+  struct re_dfastate_t **trtable;
+  struct re_dfastate_t **trtable_search;
+  /* If this state is a special state.
+     A state is a special state if the state is the halt state, or
+     a anchor.  */
+  /* CONTEXT holds a mask of the CONTEXT_* flags, the largest of which
+     (CONTEXT_ENDBUF) is 8, so four bits are required.  A narrower field
+     truncates CONTEXT_BEGBUF / CONTEXT_ENDBUF and the stored value then
+     never compares equal to the context a state is looked up with.  */
+  unsigned int context : 4;
+  unsigned int halt : 1;
+  /* If this state can accept `multi byte'.
+     Note that we refer to multibyte characters, and multi character
+     collating elements as `multi byte'.  */
+  unsigned int accept_mb : 1;
+  /* If this state has backreference node(s).  */
+  unsigned int has_backref : 1;
+  unsigned int has_constraint : 1;
+};
+typedef struct re_dfastate_t re_dfastate_t;
+
+typedef struct
+{
+  /* start <= node < end */
+  int start;
+  int end;
+} re_subexp_t;
+
+struct re_state_table_entry
+{
+  int num;
+  int alloc;
+  union
+  {
+    re_dfastate_t *state;
+    re_dfastate_t **array;
+  } entry;
+};
+
+struct re_dfa_t
+{
+  re_bitset_ptr_t word_char;
+
+  /* number of subexpressions `re_nsub' is in regex_t.  */
+  int subexps_alloc;
+  re_subexp_t *subexps;
+
+  re_token_t *nodes;
+  int nodes_alloc;
+  int nodes_len;
+  bin_tree_t *str_tree;
+  int *firsts;
+  int *nexts;
+  re_node_set *edests;
+  re_node_set *eclosures;
+  re_node_set *inveclosures;
+  struct re_state_table_entry *state_table;
+  unsigned int state_hash_mask;
+  re_dfastate_t *init_state;
+  re_dfastate_t *init_state_word;
+  re_dfastate_t *init_state_nl;
+  re_dfastate_t *init_state_begbuf;
+  int states_alloc;
+  int init_node;
+  int nbackref; /* The number of backreference in this dfa.  */
+  /* If this dfa has "multibyte node", which is a backreference or
+     a node which can accept multibyte character or multi character
+     collating element.
*/ + unsigned int has_mb_node : 1; +}; +typedef struct re_dfa_t re_dfa_t; + +static reg_errcode_t re_node_set_alloc (re_node_set *set, int size); +static reg_errcode_t re_node_set_init_1 (re_node_set *set, int elem); +static reg_errcode_t re_node_set_init_2 (re_node_set *set, int elem1, + int elem2); +#define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set)) +static reg_errcode_t re_node_set_init_copy (re_node_set *dest, + const re_node_set *src); +static reg_errcode_t re_node_set_intersect (re_node_set *dest, + const re_node_set *src1, + const re_node_set *src2); +static reg_errcode_t re_node_set_add_intersect (re_node_set *dest, + const re_node_set *src1, + const re_node_set *src2); +static reg_errcode_t re_node_set_init_union (re_node_set *dest, + const re_node_set *src1, + const re_node_set *src2); +static reg_errcode_t re_node_set_merge (re_node_set *dest, + const re_node_set *src); +static int re_node_set_insert (re_node_set *set, int elem); +static int re_node_set_compare (const re_node_set *set1, + const re_node_set *set2); +static int re_node_set_contains (const re_node_set *set, int elem); +static void re_node_set_remove_at (re_node_set *set, int idx); +#define re_node_set_empty(p) ((p)->nelem = 0) +#define re_node_set_free(set) re_free ((set)->elems) +static int re_dfa_add_node (re_dfa_t *dfa, re_token_t token, int mode); +static re_dfastate_t *re_acquire_state (re_dfa_t *dfa, + const re_node_set *nodes); +static re_dfastate_t *re_acquire_state_context (re_dfa_t *dfa, + const re_node_set *nodes, + unsigned int context); + + +typedef enum +{ + SB_CHAR, + MB_CHAR, + EQUIV_CLASS, + COLL_SYM, + CHAR_CLASS +} bracket_elem_type; + +typedef struct +{ + bracket_elem_type type; + union + { + unsigned char ch; + unsigned char *name; + wchar_t wch; + } opr; +} bracket_elem_t; + + +/* Inline functions for bitset operation. 
*/ +static inline void +bitset_not (set) + bitset set; +{ + int bitset_i; + for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i) + set[bitset_i] = ~set[bitset_i]; +} + +static inline void +bitset_merge (dest, src) + bitset dest; + const bitset src; +{ + int bitset_i; + for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i) + dest[bitset_i] |= src[bitset_i]; +} + +static inline void +bitset_not_merge (dest, src) + bitset dest; + const bitset src; +{ + int i; + for (i = 0; i < BITSET_UINTS; ++i) + dest[i] |= ~src[i]; +} + +#ifdef RE_ENABLE_I18N +/* Inline functions for re_string. */ +static inline int +re_string_char_size_at (pstr, idx) + const re_string_t *pstr; + int idx; +{ + int byte_idx; + if (MB_CUR_MAX == 1) + return 1; + for (byte_idx = 1; idx + byte_idx < pstr->len; ++byte_idx) + if (pstr->wcs[idx + byte_idx] != WEOF) + break; + return byte_idx; +} + +static inline wint_t +re_string_wchar_at (pstr, idx) + const re_string_t *pstr; + int idx; +{ + if (MB_CUR_MAX == 1) + return (wint_t) pstr->mbs[idx]; + return (wint_t) pstr->wcs[idx]; +} + +static int +re_string_elem_size_at (pstr, idx) + const re_string_t *pstr; + int idx; +{ +#ifdef _LIBC + const unsigned char *p; + const char *extra; + const int32_t *table, *indirect; + int32_t tmp; +# include + uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); + + if (nrules != 0) + { + table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); + extra = (const char *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); + indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_INDIRECTMB); + p = pstr->mbs + idx; + tmp = findidx (&p); + return p - (const unsigned char *) pstr->mbs - idx; + } + else +#endif /* _LIBC */ + return 1; +} +#endif /* RE_ENABLE_I18N */ + +#endif /* _REGEX_INTERNAL_H */ diff --git a/posix/regexec.c b/posix/regexec.c new file mode 100644 index 0000000000..cf8f304b48 --- /dev/null +++ b/posix/regexec.c @@ -0,0 +1,2076 @@ +/* Extended regular expression 
matching and search library. + Copyright (C) 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Isamu Hasegawa . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef _LIBC +# ifndef _RE_DEFINE_LOCALE_FUNCTIONS +# define _RE_DEFINE_LOCALE_FUNCTIONS 1 +# include +# include +# include +# endif +#endif + +#include "regex.h" +#include "regex_internal.h" + +static void match_ctx_init (re_match_context_t *cache, int eflags, int n); +static void match_ctx_free (re_match_context_t *cache); +static void match_ctx_add_entry (re_match_context_t *cache, int node, int from, + int to); +static int re_search_internal (const regex_t *preg, const char *string, + int length, int start, int range, size_t nmatch, + regmatch_t pmatch[], int eflags); +static inline re_dfastate_t *acquire_init_state_context (const regex_t *preg, + const re_string_t *input, int idx, int eflags); +static int check_matching (const regex_t *preg, re_string_t *input, + re_match_context_t *mctx, re_dfastate_t **state_log, + int start_idx, int fl_search, int fl_longest_match); +static int check_halt_node_context (const re_dfa_t *dfa, int node, + unsigned int context); +static int check_halt_state_context (const 
regex_t *preg, + const re_dfastate_t *state, + const re_string_t *input, int idx, + int eflags); +static int proceed_next_node (const regex_t *preg, + re_dfastate_t **state_log, + const re_match_context_t *mctx, + const re_string_t *input, + int *pidx, int node, re_node_set *eps_via_nodes); +static void set_regs (const regex_t *preg, re_dfastate_t **state_log, + const re_match_context_t *mctx, const re_string_t *input, + size_t nmatch, regmatch_t *pmatch, int last); +static int sift_states_iter_mb (const regex_t *preg, re_dfastate_t **state_log, + const re_match_context_t *mctx, + const re_string_t *input, int node_idx, + int str_idx, int max_str_idx); +static int sift_states_iter_bkref (const re_dfa_t *dfa, + re_dfastate_t **state_log, + struct re_backref_cache_entry *mctx_entry, + int node_idx, int idx, int match_first, + int match_last); +static void sift_states_backward (const regex_t *preg, + re_dfastate_t **state_log, + const re_match_context_t *mctx, + const re_string_t *input, int last_node); +static void add_epsilon_backreference (const re_dfa_t *dfa, + const re_match_context_t *mctx, + const re_node_set *plog, int idx, + re_node_set *state_buf); +static re_dfastate_t *transit_state (const regex_t *preg, re_dfastate_t *state, + re_string_t *input, int fl_search, + re_dfastate_t **state_log, + re_match_context_t *mctx); +static re_dfastate_t *transit_state_sb (const regex_t *preg, + re_dfastate_t *pstate, + re_string_t *input, int fl_search, + re_match_context_t *mctx); +static void transit_state_mb (const regex_t *preg, re_dfastate_t *pstate, + const re_string_t *input, + re_dfastate_t **state_log, + re_match_context_t *mctx); +static void transit_state_bkref (const regex_t *preg, re_dfastate_t *pstate, + const re_string_t *input, + re_dfastate_t **state_log, + re_match_context_t *mctx); +static void transit_state_bkref_loop (const regex_t *preg, + const re_string_t *input, + re_node_set *nodes, + re_dfastate_t **work_state_log, + re_dfastate_t 
**state_log, + re_match_context_t *mctx); +static re_dfastate_t **build_trtable (const regex_t *dfa, + const re_dfastate_t *state, + int fl_search); +static int check_node_accept_bytes (const regex_t *preg, int node_idx, + const re_string_t *input, int idx); +static unsigned int find_collation_sequence_value (const unsigned char *mbs, + size_t name_len); +static int group_nodes_into_DFAstates (const regex_t *dfa, + const re_dfastate_t *state, + re_node_set *states_node, + bitset *states_ch); +static int check_node_accept (const regex_t *preg, const re_token_t *node, + const re_string_t *input, int idx, int eflags); + +/* Entry point for POSIX code. */ + +/* regexec searches for a given pattern, specified by PREG, in the + string STRING. + + If NMATCH is zero or REG_NOSUB was set in the cflags argument to + `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at + least NMATCH elements, and we set them to the offsets of the + corresponding matched substrings. + + EFLAGS specifies `execution flags' which affect matching: if + REG_NOTBOL is set, then ^ does not match at the beginning of the + string; if REG_NOTEOL is set, then $ does not match at the end. + + We return 0 if we find a match and REG_NOMATCH if not. */ + +int +regexec (preg, string, nmatch, pmatch, eflags) + const regex_t *preg; + const char *string; + size_t nmatch; + regmatch_t pmatch[]; + int eflags; +{ + int length = strlen (string); + if (preg->no_sub) + return re_search_internal (preg, string, length, 0, length, 0, + NULL, eflags); + else + return re_search_internal (preg, string, length, 0, length, nmatch, + pmatch, eflags); +} +#ifdef _LIBC +weak_alias (__regexec, regexec) +#endif + +/* Entry points for GNU code. */ + +/* re_match is like re_match_2 except it takes only a single string. 
*/ + +int +re_match (buffer, string, length, start, regs) + struct re_pattern_buffer *buffer; + const char *string; + int length, start; + struct re_registers *regs; +{ + int i, nregs, result, rval, eflags = 0; + regmatch_t *pmatch; + + eflags |= (buffer->not_bol) ? REG_NOTBOL : 0; + eflags |= (buffer->not_eol) ? REG_NOTEOL : 0; + + /* We need at least 1 register. */ + nregs = ((regs == NULL) ? 1 + : ((regs->num_regs > buffer->re_nsub) ? buffer->re_nsub + 1 + : regs->num_regs + 1)); + pmatch = re_malloc (regmatch_t, nregs); + if (pmatch == NULL) + return -2; + result = re_search_internal (buffer, string, length, start, 0, + nregs, pmatch, eflags); + + /* If caller wants register contents data back, do it. */ + if (regs && !buffer->no_sub) + { + /* Have the register data arrays been allocated? */ + if (buffer->regs_allocated == REGS_UNALLOCATED) + { /* No. So allocate them with malloc. We need one + extra element beyond `num_regs' for the `-1' marker + GNU code uses. */ + regs->num_regs = ((RE_NREGS > buffer->re_nsub + 1) ? RE_NREGS + : buffer->re_nsub + 1); + regs->start = re_malloc (regoff_t, regs->num_regs); + regs->end = re_malloc (regoff_t, regs->num_regs); + if (regs->start == NULL || regs->end == NULL) + { + re_free (pmatch); + return -2; + } + buffer->regs_allocated = REGS_REALLOCATE; + } + else if (buffer->regs_allocated == REGS_REALLOCATE) + { /* Yes. If we need more elements than were already + allocated, reallocate them. If we need fewer, just + leave it alone. */ + if (regs->num_regs < buffer->re_nsub + 1) + { + regs->num_regs = buffer->re_nsub + 1; + regs->start = re_realloc (regs->start, regoff_t, regs->num_regs); + regs->end = re_realloc (regs->end, regoff_t, regs->num_regs); + if (regs->start == NULL || regs->end == NULL) + { + re_free (pmatch); + return -2; + } + } + } + else + { + /* These braces fend off a "empty body in an else-statement" + warning under GCC when assert expands to nothing. 
*/ + assert (buffer->regs_allocated == REGS_FIXED); + } + } + + /* Restore registers. */ + if (regs != NULL) + { + for (i = 0; i <= nregs; ++i) + { + regs->start[i] = pmatch[i].rm_so; + regs->end[i] = pmatch[i].rm_eo; + } + for ( ; i < regs->num_regs; ++i) + { + regs->start[i] = -1; + regs->end[i] = -1; + } + } + /* Return value is -1 if not match, the length of mathing otherwise. */ + rval = (result) ? -1 : pmatch[0].rm_eo - pmatch[0].rm_so; + re_free (pmatch); + return rval; +} +#ifdef _LIBC +weak_alias (__re_match, re_match) +#endif + +/* re_match_2 matches the compiled pattern in BUFP against the + the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 + and SIZE2, respectively). We start matching at POS, and stop + matching at STOP. + + If REGS is non-null and the `no_sub' field of BUFP is nonzero, we + store offsets for the substring each group matched in REGS. See the + documentation for exactly how many groups we fill. + + We return -1 if no match, -2 if an internal error. + Otherwise, we return the length of the matched substring. */ + +int +re_match_2 (buffer, string1, length1, string2, length2, start, regs, stop) + struct re_pattern_buffer *buffer; + const char *string1, *string2; + int length1, length2, start, stop; + struct re_registers *regs; +{ + int len, ret; + char *str = re_malloc (char, length1 + length2); + if (str == NULL) + return -2; + memcpy (str, string1, length1); + memcpy (str + length1, string2, length2); + len = (length1 + length2 < stop) ? length1 + length2 : stop; + ret = re_match (buffer, str, len, start, regs); + re_free (str); + return ret; +} +#ifdef _LIBC +weak_alias (__re_match_2, re_match_2) +#endif + +/* Like re_search_2, below, but only one string is specified, and + doesn't let you say where to stop matching. 
*/ + +int +re_search (bufp, string, size, startpos, range, regs) + struct re_pattern_buffer *bufp; + const char *string; + int size, startpos, range; + struct re_registers *regs; +{ + int i, nregs, result, real_range, rval, eflags = 0; + regmatch_t *pmatch; + + eflags |= (bufp->not_bol) ? REG_NOTBOL : 0; + eflags |= (bufp->not_eol) ? REG_NOTEOL : 0; + + /* Check for out-of-range. */ + if (startpos < 0 || startpos > size) + return -1; + + /* We need at least 1 register. */ + nregs = ((regs == NULL) ? 1 + : ((regs->num_regs > bufp->re_nsub) ? bufp->re_nsub + 1 + : regs->num_regs + 1)); + pmatch = re_malloc (regmatch_t, nregs); + + /* Correct range if we need. */ + real_range = ((startpos + range > size) ? size - startpos + : ((startpos + range < 0) ? -startpos : range)); + + /* Compile fastmap if we haven't yet. */ + if (bufp->fastmap != NULL && !bufp->fastmap_accurate) + re_compile_fastmap (bufp); + + result = re_search_internal (bufp, string, size, startpos, real_range, + nregs, pmatch, eflags); + + /* If caller wants register contents data back, do it. */ + if (regs && !bufp->no_sub) + { + /* Have the register data arrays been allocated? */ + if (bufp->regs_allocated == REGS_UNALLOCATED) + { /* No. So allocate them with malloc. We need one + extra element beyond `num_regs' for the `-1' marker + GNU code uses. */ + regs->num_regs = ((RE_NREGS > bufp->re_nsub + 1) ? RE_NREGS + : bufp->re_nsub + 1); + regs->start = re_malloc (regoff_t, regs->num_regs); + regs->end = re_malloc (regoff_t, regs->num_regs); + if (regs->start == NULL || regs->end == NULL) + { + re_free (pmatch); + return -2; + } + bufp->regs_allocated = REGS_REALLOCATE; + } + else if (bufp->regs_allocated == REGS_REALLOCATE) + { /* Yes. If we need more elements than were already + allocated, reallocate them. If we need fewer, just + leave it alone. 
*/ + if (regs->num_regs < bufp->re_nsub + 1) + { + regs->num_regs = bufp->re_nsub + 1; + regs->start = re_realloc (regs->start, regoff_t, regs->num_regs); + regs->end = re_realloc (regs->end, regoff_t, regs->num_regs); + if (regs->start == NULL || regs->end == NULL) + { + re_free (pmatch); + return -2; + } + } + } + else + { + /* These braces fend off a "empty body in an else-statement" + warning under GCC when assert expands to nothing. */ + assert (bufp->regs_allocated == REGS_FIXED); + } + } + + /* Restore registers. */ + if (regs != NULL) + { + for (i = 0; i <= bufp->re_nsub; ++i) + { + regs->start[i] = pmatch[i].rm_so; + regs->end[i] = pmatch[i].rm_eo; + } + for ( ; i < regs->num_regs; ++i) + { + regs->start[i] = -1; + regs->end[i] = -1; + } + } + /* Return value is -1 if not match, the position where the mathing starts + otherwise. */ + rval = (result) ? -1 : pmatch[0].rm_so; + re_free (pmatch); + return rval; +} +#ifdef _LIBC +weak_alias (__re_search, re_search) +#endif + +/* Using the compiled pattern in BUFP, first tries to match the virtual + concatenation of STRING1 and STRING2, starting first at index + STARTPOS, then at STARTPOS + 1, and so on. + + STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. + + RANGE is how far to scan while trying to match. RANGE = 0 means try + only at STARTPOS; in general, the last start tried is STARTPOS + + RANGE. + + In REGS, return the indices of the virtual concatenation of STRING1 + and STRING2 that matched the entire BUFP->buffer and its contained + subexpressions. + + Do not consider matching one past the index STOP in the virtual + concatenation of STRING1 and STRING2. + + We return either the position in the strings at which the match was + found, -1 if no match, or -2 if error. 
*/ + +int +re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, + stop) + struct re_pattern_buffer *bufp; + const char *string1, *string2; + int length1, length2, start, range, stop; + struct re_registers *regs; +{ + int len, ret; + char *str = re_malloc (char, length1 + length2); + memcpy (str, string1, length1); + memcpy (str + length1, string2, length2); + len = (length1 + length2 < stop) ? length1 + length2 : stop; + ret = re_search (bufp, str, len, start, range, regs); + re_free (str); + return ret; +} +#ifdef _LIBC +weak_alias (__re_search_2, re_search_2) +#endif + +/* Set REGS to hold NUM_REGS registers, storing them in STARTS and + ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use + this memory for recording register information. STARTS and ENDS + must be allocated using the malloc library routine, and must each + be at least NUM_REGS * sizeof (regoff_t) bytes long. + + If NUM_REGS == 0, then subsequent matches should allocate their own + register data. + + Unless this function is called, the first search or match using + PATTERN_BUFFER will allocate its own register data, without + freeing the old data. */ + +void +re_set_registers (bufp, regs, num_regs, starts, ends) + struct re_pattern_buffer *bufp; + struct re_registers *regs; + unsigned num_regs; + regoff_t *starts, *ends; +{ + if (num_regs) + { + bufp->regs_allocated = REGS_REALLOCATE; + regs->num_regs = num_regs; + regs->start = starts; + regs->end = ends; + } + else + { + bufp->regs_allocated = REGS_UNALLOCATED; + regs->num_regs = 0; + regs->start = regs->end = (regoff_t *) 0; + } +} +#ifdef _LIBC +weak_alias (__re_set_registers, re_set_registers) +#endif + +/* Entry points compatible with 4.2 BSD regex library. We don't define + them unless specifically requested. 
*/ + +#if defined _REGEX_RE_COMP || defined _LIBC +int +# ifdef _LIBC +weak_function +# endif +re_exec (s) + const char *s; +{ + return 0 == regexec (&re_comp_buf, s, 0, NULL, 0); +} +#endif /* _REGEX_RE_COMP */ + +static re_node_set empty_set; + +/* Internal entry point. */ + +/* Searches for a compiled pattern PREG in the string STRING, whose + length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same + mingings with regexec. START, and RANGE have the same meanings + with re_search. + Return 0 if we find a match and REG_NOMATCH if not. + Note: We assume front end functions already check ranges. + (START + RANGE >= 0 && START + RANGE <= LENGTH) */ + +static int +re_search_internal (preg, string, length, start, range, nmatch, pmatch, eflags) + const regex_t *preg; + const char *string; + int length, start, range, eflags; + size_t nmatch; + regmatch_t pmatch[]; +{ + re_dfa_t *dfa = (re_dfa_t *)preg->buffer; + re_string_t input; + re_dfastate_t **state_log; + int fl_longest_match, match_first, match_last = -1; + re_match_context_t mctx; + char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate) + ? preg->fastmap : NULL); + + /* Check if the DFA haven't been compiled. */ + if (preg->used == 0 || dfa->init_state == NULL + || dfa->init_state_word == NULL || dfa->init_state_nl == NULL + || dfa->init_state_begbuf == NULL) + return 1; + + re_node_set_init_empty (&empty_set); + + /* We must check the longest matching, if nmatch > 0. */ + fl_longest_match = (nmatch != 0); + + /* We will log all the DFA states through which the dfa pass, + if nmatch > 1, or this dfa has "multibyte node", which is a + back-reference or a node which can accept multibyte character or + multi character collating element. 
*/ + if (nmatch > 1 || dfa->has_mb_node) + state_log = re_malloc (re_dfastate_t *, length + 1); + else + state_log = NULL; + + if (preg->syntax & RE_ICASE) + re_string_construct_toupper (&input, string, length, preg->translate); + else + re_string_construct (&input, string, length, preg->translate); + + match_ctx_init (&mctx, eflags, dfa->nbackref * 2); + +#ifdef DEBUG + /* We assume front-end functions already check them. */ + assert (start + range >= 0 && start + range <= length); +#endif + + /* Check incrementally whether of not the input string match. */ + for (match_first = start; ;) + { + if ((match_first < length + && (fastmap == NULL + || fastmap[re_string_byte_at (&input, match_first)])) + || preg->can_be_null) + { +#ifdef RE_ENABLE_I18N + if (MB_CUR_MAX == 1 || re_string_first_byte (&input, match_first)) +#endif + { + /* We assume that the matching starts from `match_first'. */ + re_string_set_index (&input, match_first); + mctx.match_first = mctx.state_log_top = match_first; + mctx.nbkref_ents = mctx.max_bkref_len = 0; + match_last = check_matching (preg, &input, &mctx, state_log, + match_first, 0, fl_longest_match); + if (match_last != -1) + break; + } + } + /* Update counter. */ + if (range < 0) + { + --match_first; + if (match_first < start + range) + break; + } + else + { + ++match_first; + if (match_first > start + range) + break; + } + } + + /* Set pmatch[] if we need. */ + if (match_last != -1 && nmatch > 0) + { + int reg_idx; + + /* Initialize registers. */ + for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) + pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; + + /* Set the points where matching start/end. */ + pmatch[0].rm_so = mctx.match_first; + mctx.match_last = pmatch[0].rm_eo = match_last; + + if (!preg->no_sub && nmatch > 1) + { + /* We need the ranges of all the subexpressions. 
*/ + int halt_node; + re_dfastate_t *pstate = state_log[match_last]; +#ifdef DEBUG + assert (state_log != NULL); +#endif + halt_node = check_halt_state_context (preg, pstate, &input, + match_last, eflags); + sift_states_backward (preg, state_log, &mctx, &input, halt_node); + set_regs (preg, state_log, &mctx, &input, nmatch, pmatch, halt_node); + } + } + + re_free (state_log); + if (dfa->nbackref) + match_ctx_free (&mctx); + re_string_destruct (&input); + return match_last == -1; +} + +/* Acquire an initial state. + We must select appropriate initial state depending on the context, + since initial states may have constraints like "\<", "^", etc.. */ + +static inline re_dfastate_t * +acquire_init_state_context (preg, input, idx, eflags) + const regex_t *preg; + const re_string_t *input; + int idx, eflags; +{ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + + if (dfa->init_state->has_constraint) + { + unsigned int context; + context = re_string_context_at (input, idx - 1, eflags, + preg->newline_anchor); + if (IS_WORD_CONTEXT (context)) + return dfa->init_state_word; + else if (IS_ORDINARY_CONTEXT (context)) + return dfa->init_state; + else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context)) + return dfa->init_state_begbuf; + else if (IS_NEWLINE_CONTEXT (context)) + return dfa->init_state_nl; + else if (IS_BEGBUF_CONTEXT (context)) + /* It is relatively rare case, then calculate on demand. */ + return re_acquire_state_context (dfa, dfa->init_state->entrance_nodes, + context); + else + /* Must not happen? */ + return dfa->init_state; + } + else + return dfa->init_state; +} + +/* Check whether the regular expression match input string INPUT or not, + and return the index where the matching end, or return -1 if not match. + FL_SEARCH means we must search where the matching starts, + FL_LONGEST_MATCH means we want the POSIX longest matching. 
*/ + +static int +check_matching (preg, input, mctx, state_log, start_idx, fl_search, + fl_longest_match) + const regex_t *preg; + re_string_t *input; + re_match_context_t *mctx; + re_dfastate_t **state_log; + int start_idx, fl_search, fl_longest_match; +{ + int match = 0, match_last = -1; + re_dfastate_t *cur_state; + + cur_state = acquire_init_state_context (preg, input, start_idx, + mctx->eflags); + if (state_log != NULL) + state_log[start_idx] = cur_state; + /* If the RE accepts NULL string. */ + if (cur_state->halt) + { + if (!cur_state->has_constraint + || check_halt_state_context (preg, cur_state, input, start_idx, + mctx->eflags)) + { + if (!fl_longest_match) + return start_idx; + else + { + match_last = start_idx; + match = 1; + } + } + } + + while (!re_string_eoi (input)) + { + cur_state = transit_state (preg, cur_state, input, fl_search && !match, + state_log, mctx); + if (cur_state == NULL) /* Reached at the invalid state. */ + { + int cur_str_idx = re_string_cur_idx (input); + if (fl_search && !match) + { + /* Restart from initial state, since we are searching + the point from where matching start. */ +#ifdef RE_ENABLE_I18N + if (MB_CUR_MAX == 1 || re_string_first_byte (input, cur_str_idx)) +#endif /* RE_ENABLE_I18N */ + cur_state = acquire_init_state_context (preg, input, + cur_str_idx, + mctx->eflags); + if (state_log != NULL) + state_log[cur_str_idx] = cur_state; + } + else if (!fl_longest_match && match) + break; + else /* (fl_longest_match && match) || (!fl_search && !match) */ + { + if (state_log == NULL) + break; + else + { + int max = mctx->state_log_top; + for (; cur_str_idx <= max; ++cur_str_idx) + if (state_log[cur_str_idx] != NULL) + break; + if (cur_str_idx > max) + break; + } + } + } + + if (cur_state != NULL && cur_state->halt) + { + /* Reached at a halt state. + Check the halt state can satisfy the current context. 
*/ + if (!cur_state->has_constraint + || check_halt_state_context (preg, cur_state, input, + re_string_cur_idx (input), + mctx->eflags)) + { + /* We found an appropriate halt state. */ + match_last = re_string_cur_idx (input); + match = 1; + if (!fl_longest_match) + break; + } + } + } + return match_last; +} + +/* Check NODE match the current context. */ + +static int check_halt_node_context (dfa, node, context) + const re_dfa_t *dfa; + int node; + unsigned int context; +{ + int entity; + re_token_type_t type = dfa->nodes[node].type; + if (type == END_OF_RE) + return 1; + if (type != OP_CONTEXT_NODE) + return 0; + entity = dfa->nodes[node].opr.ctx_info->entity; + if (dfa->nodes[entity].type != END_OF_RE + || NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[node].constraint, context)) + return 0; + return 1; +} + +/* Check the halt state STATE match the current context. + Return 0 if not match, if the node, STATE has, is a halt node and + match the context, return the node. */ + +static int +check_halt_state_context (preg, state, input, idx, eflags) + const regex_t *preg; + const re_dfastate_t *state; + const re_string_t *input; + int idx, eflags; +{ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + int i; + unsigned int context; +#ifdef DEBUG + assert (state->halt); +#endif + context = re_string_context_at (input, idx, eflags, preg->newline_anchor); + for (i = 0; i < state->nodes.nelem; ++i) + if (check_halt_node_context (dfa, state->nodes.elems[i], context)) + return state->nodes.elems[i]; + return 0; +} + +/* Compute the next node to which "NFA" transit from NODE. + Return the destination node, and update EPS_VIA_NODES. + ("NFA" is a NFA corresponding to the DFA. 
*/ + +static int +proceed_next_node (preg, state_log, mctx, input, pidx, node, eps_via_nodes) + const regex_t *preg; + re_dfastate_t **state_log; + const re_match_context_t *mctx; + const re_string_t *input; + int *pidx, node; + re_node_set *eps_via_nodes; +{ + re_dfa_t *dfa = (re_dfa_t *)preg->buffer; + int i, dest_node = -1; + if (IS_EPSILON_NODE (dfa->nodes[node].type)) + { + re_node_set_insert (eps_via_nodes, node); + for (i = 0; i < state_log[*pidx]->nodes.nelem; ++i) + { + int candidate = state_log[*pidx]->nodes.elems[i]; + if (!re_node_set_contains (dfa->edests + node, candidate) + && !(dfa->nodes[candidate].type == OP_CONTEXT_NODE + && re_node_set_contains (dfa->edests + node, + dfa->nodes[candidate].opr.ctx_info->entity))) + continue; + dest_node = candidate; + /* In order to avoid infinite loop like "(a*)*". */ + if (!re_node_set_contains (eps_via_nodes, dest_node)) + break; + } +#ifdef DEBUG + assert (dest_node != -1); +#endif + return dest_node; + } + else + { + int naccepted = 0, entity = node; + re_token_type_t type = dfa->nodes[node].type; + if (type == OP_CONTEXT_NODE) + { + entity = dfa->nodes[node].opr.ctx_info->entity; + type = dfa->nodes[entity].type; + } + + if (ACCEPT_MB_NODE (type)) + naccepted = check_node_accept_bytes (preg, entity, input, *pidx); + else if (type == OP_BACK_REF) + { + for (i = 0; i < mctx->nbkref_ents; ++i) + { + if (mctx->bkref_ents[i].node == node + && mctx->bkref_ents[i].from == *pidx) + naccepted = mctx->bkref_ents[i].to - *pidx; + } + if (naccepted == 0) + { + re_node_set_insert (eps_via_nodes, node); + dest_node = dfa->nexts[node]; + if (re_node_set_contains (&state_log[*pidx]->nodes, dest_node)) + return dest_node; + for (i = 0; i < state_log[*pidx]->nodes.nelem; ++i) + { + dest_node = state_log[*pidx]->nodes.elems[i]; + if ((dfa->nodes[dest_node].type == OP_CONTEXT_NODE + && (dfa->nexts[node] + == dfa->nodes[dest_node].opr.ctx_info->entity))) + return dest_node; + } + } + } + + if (naccepted != 0 + || 
check_node_accept (preg, dfa->nodes + node, input, *pidx, + mctx->eflags)) + { + dest_node = dfa->nexts[node]; + *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted; +#ifdef DEBUG + assert (state_log[*pidx] != NULL); +#endif + re_node_set_empty (eps_via_nodes); + return dest_node; + } + } + /* Must not reach here. */ +#ifdef DEBUG + assert (0); +#endif + return 0; +} + +/* Set the positions where the subexpressions are starts/ends to registers + PMATCH. + Note: We assume that pmatch[0] is already set, and + pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */ + +static void +set_regs (preg, state_log, mctx, input, nmatch, pmatch, last_node) + const regex_t *preg; + re_dfastate_t **state_log; + const re_match_context_t *mctx; + const re_string_t *input; + size_t nmatch; + regmatch_t *pmatch; + int last_node; +{ + re_dfa_t *dfa = (re_dfa_t *)preg->buffer; + int idx, cur_node, node_entity, real_nmatch; + re_node_set eps_via_nodes; + int i; +#ifdef DEBUG + assert (nmatch > 1); + assert (state_log != NULL); +#endif + cur_node = dfa->init_node; + real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1; + re_node_set_init_empty (&eps_via_nodes); + for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;) + { + node_entity = ((dfa->nodes[cur_node].type == OP_CONTEXT_NODE) + ? dfa->nodes[cur_node].opr.ctx_info->entity : cur_node); + for (i = 1; i < real_nmatch; ++i) + { + if (dfa->subexps[i - 1].start == dfa->subexps[i - 1].end) + { + /* In case of the null subexpression like '()'. */ + if (dfa->subexps[i - 1].start == node_entity) + { + pmatch[i].rm_so = idx; + pmatch[i].rm_eo = idx; + } + } + else if (dfa->subexps[i - 1].start <= node_entity + && node_entity < dfa->subexps[i - 1].end) + { + if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo != -1) + /* We are at the first node of this sub expression. 
*/ + { + pmatch[i].rm_so = idx; + pmatch[i].rm_eo = -1; + } + } + else + { + if (pmatch[i].rm_so != -1 && pmatch[i].rm_eo == -1) + /* We are at the last node of this sub expression. */ + pmatch[i].rm_eo = idx; + } + } + if (idx == pmatch[0].rm_eo && cur_node == last_node) + break; + + /* Proceed to next node. */ + cur_node = proceed_next_node (preg, state_log, mctx, input, &idx, + cur_node, &eps_via_nodes); + } + re_node_set_free (&eps_via_nodes); + return; +} + +#define NUMBER_OF_STATE 1 + +/* This function checks the STATE_LOG from the MCTX->match_last + to MCTX->match_first and sift the nodes in each states according to + the following rules. Updated state_log will be wrote to STATE_LOG. + + Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if... + 1. When STR_IDX == MATCH_LAST(the last index in the state_log): + If `a' isn't the LAST_NODE and `a' can't epsilon transit to + the LAST_NODE, we throw away the node `a'. + 2. When MATCH_FIRST <= STR_IDX < MATCH_LAST and `a' accepts + string `s' and transit to `b': + i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw + away the node `a'. + ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is + throwed away, we throw away the node `a'. + 3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b': + i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the + node `a'. + ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away, + we throw away the node `a'. 
*/ + +#define STATE_NODE_CONTAINS(state,node) \ + ((state) != NULL && re_node_set_contains (&(state)->nodes, node)) + +static void +sift_states_backward (preg, state_log, mctx, input, last_node) + const regex_t *preg; + re_dfastate_t **state_log; + const re_match_context_t *mctx; + const re_string_t *input; + int last_node; +{ + re_dfa_t *dfa = (re_dfa_t *)preg->buffer; + re_node_set state_buf; + int str_idx = mctx->match_last; + re_node_set *plog; /* Points the state_log[str_idx]->nodes */ + +#ifdef DEBUG + assert (state_log != NULL && state_log[str_idx] != NULL); +#endif + re_node_set_alloc (&state_buf, NUMBER_OF_STATE); + plog = &state_log[str_idx]->nodes; + + /* Build sifted state_log[str_idx]. It has the nodes which can epsilon + transit to the last_node and the last_node itself. */ + re_node_set_intersect (&state_buf, plog, dfa->inveclosures + last_node); + + if (state_log[str_idx] != NULL && state_log[str_idx]->has_backref) + add_epsilon_backreference (dfa, mctx, plog, str_idx, &state_buf); + + /* Update state log. */ + state_log[str_idx] = re_acquire_state (dfa, &state_buf); + + /* Then check each states in the state_log. */ + while (str_idx > mctx->match_first) + { + int i, j; + /* Update counters. */ + re_node_set_empty (&state_buf); + --str_idx; + plog = ((state_log[str_idx] == NULL) ? &empty_set + : &state_log[str_idx]->nodes); + + /* Then build the next sifted state. + We build the next sifted state on `state_buf', and update + `state_log[str_idx]' with `state_buf'. + Note: + `state_buf' is the sifted state from `state_log[str_idx + 1]'. + `plog' points the node_set of the old `state_log[str_idx]'. */ + for (i = 0; i < plog->nelem; i++) + { + int prev_node = plog->elems[i]; + int entity = prev_node; + int naccepted = 0; + re_token_type_t type = dfa->nodes[prev_node].type; + if (type == OP_CONTEXT_NODE) + { + entity = dfa->nodes[prev_node].opr.ctx_info->entity; + type = dfa->nodes[entity].type; + } + + /* If the node may accept `multi byte'. 
*/ + if (ACCEPT_MB_NODE (type)) + naccepted = sift_states_iter_mb (preg, state_log, mctx, input, + entity, str_idx, + mctx->match_last); + + /* If the node is a back reference. */ + else if (type == OP_BACK_REF) + for (j = 0; j < mctx->nbkref_ents; ++j) + { + naccepted = sift_states_iter_bkref (dfa, state_log, + mctx->bkref_ents + j, + prev_node, str_idx, + mctx->match_first, + mctx->match_last); + if (naccepted) + break; + } + + if (!naccepted + && check_node_accept (preg, dfa->nodes + prev_node, input, + str_idx, mctx->eflags) + && STATE_NODE_CONTAINS (state_log[str_idx + 1], + dfa->nexts[prev_node])) + naccepted = 1; + + if (naccepted == 0) + continue; + + /* `prev_node' may point the entity of the OP_CONTEXT_NODE, + then we use plog->elems[i] instead. */ + re_node_set_add_intersect (&state_buf, plog, + dfa->inveclosures + prev_node); + } + if (state_log[str_idx] != NULL && state_log[str_idx]->has_backref) + add_epsilon_backreference (dfa, mctx, plog, str_idx, &state_buf); + + /* Update state_log. */ + state_log[str_idx] = re_acquire_state (dfa, &state_buf); + } + + re_node_set_free (&state_buf); +} + +/* Helper functions. */ + +static inline void +clean_state_log_if_need (state_log, mctx, next_state_log_idx) + re_dfastate_t **state_log; + re_match_context_t *mctx; + int next_state_log_idx; +{ + int top = mctx->state_log_top; + if (top < next_state_log_idx) + { + memset (state_log + top + 1, '\0', + sizeof (re_dfastate_t *) * (next_state_log_idx - top)); + mctx->state_log_top = next_state_log_idx; + } +} + +static int +sift_states_iter_mb (preg, state_log, mctx, input, node_idx, str_idx, + max_str_idx) + const regex_t *preg; + re_dfastate_t **state_log; + const re_match_context_t *mctx; + const re_string_t *input; + int node_idx, str_idx, max_str_idx; +{ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + int naccepted; + /* Check the node can accept `multi byte'. 
*/ + naccepted = check_node_accept_bytes (preg, node_idx, input, str_idx); + if (naccepted > 0 && str_idx + naccepted <= max_str_idx && + !STATE_NODE_CONTAINS (state_log[str_idx + naccepted], + dfa->nexts[node_idx])) + /* The node can't accept the `multi byte', or the + destination was already throwed away, then the node + could't accept the current input `multi byte'. */ + naccepted = 0; + /* Otherwise, it is sure that the node could accept + `naccepted' bytes input. */ + return naccepted; +} + +static int +sift_states_iter_bkref (dfa, state_log, mctx_entry, node_idx, idx, match_first, + match_last) + const re_dfa_t *dfa; + re_dfastate_t **state_log; + struct re_backref_cache_entry *mctx_entry; + int node_idx, idx, match_first, match_last; +{ + int naccepted = 0; + int from_idx, to_idx; + from_idx = mctx_entry->from; + to_idx = mctx_entry->to; + if (mctx_entry->node == node_idx + && from_idx == idx && to_idx <= match_last + && STATE_NODE_CONTAINS (state_log[to_idx], dfa->nexts[node_idx])) + naccepted = to_idx - from_idx; + return naccepted; +} + +static void +add_epsilon_backreference (dfa, mctx, plog, idx, state_buf) + const re_dfa_t *dfa; + const re_match_context_t *mctx; + const re_node_set *plog; + int idx; + re_node_set *state_buf; +{ + int i, j; + for (i = 0; i < plog->nelem; ++i) + { + int node_idx = plog->elems[i]; + re_token_type_t type = dfa->nodes[node_idx].type; + if (type == OP_CONTEXT_NODE) + type = dfa->nodes[dfa->nodes[node_idx].opr.ctx_info->entity].type; + + if (type == OP_BACK_REF && + !re_node_set_contains (state_buf, node_idx)) + { + for (j = 0; j < mctx->nbkref_ents; ++j) + { + struct re_backref_cache_entry *entry; + entry = mctx->bkref_ents + j; + if (entry->from == entry->to && entry->from == idx) + break; + } + if (j < mctx->nbkref_ents || idx == mctx->match_first) + { + re_node_set_add_intersect (state_buf, plog, + dfa->inveclosures + node_idx); + i = 0; + } + } + } +} + +/* Functions for state transition. 
*/ + +/* Return the next state to which the current state STATE will transit by + accepting the current input byte, and update STATE_LOG if necessary. + If STATE can accept a multibyte char/collating element/back reference + update the destination of STATE_LOG. */ + +static re_dfastate_t * +transit_state (preg, state, input, fl_search, state_log, mctx) + const regex_t *preg; + re_dfastate_t *state, **state_log; + re_string_t *input; + int fl_search; + re_match_context_t *mctx; +{ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_dfastate_t **trtable, *next_state; + unsigned char ch; + + if (state == NULL) + { + next_state = state; + re_string_skip_bytes (input, 1); + } + else + { + /* If the current state can accept multibyte. */ + if (state->accept_mb) + transit_state_mb (preg, state, input, state_log, mctx); + + /* Then decide the next state with the single byte. */ + if (1) + { + /* Use transition table */ + ch = re_string_fetch_byte (input); + trtable = fl_search ? state->trtable_search : state->trtable; + if (trtable == NULL) + { + trtable = build_trtable (preg, state, fl_search); + if (fl_search) + state->trtable_search = trtable; + else + state->trtable = trtable; + } + next_state = trtable[ch]; + } + else + { + /* don't use transition table */ + next_state = transit_state_sb (preg, state, input, fl_search, mctx); + } + } + + /* Update the state_log if we need. */ + if (state_log != NULL) + { + int cur_idx = re_string_cur_idx (input); + if (cur_idx > mctx->state_log_top) + { + state_log[cur_idx] = next_state; + mctx->state_log_top = cur_idx; + } + else if (state_log[cur_idx] == 0) + { + state_log[cur_idx] = next_state; + } + else + { + re_dfastate_t *pstate; + unsigned int context; + re_node_set next_nodes, *log_nodes, *table_nodes = NULL; + /* If (state_log[cur_idx] != 0), it implies that cur_idx is + the destination of a multibyte char/collating element/ + back reference. 
Then the next state is the union set of + these destinations and the results of the transition table. */ + pstate = state_log[cur_idx]; + log_nodes = pstate->entrance_nodes; + if (next_state != NULL) + { + table_nodes = next_state->entrance_nodes; + re_node_set_init_union (&next_nodes, table_nodes, log_nodes); + } + else + next_nodes = *log_nodes; + /* Note: We already add the nodes of the initial state, + then we don't need to add them here. */ + + context = re_string_context_at (input, re_string_cur_idx (input) - 1, + mctx->eflags, preg->newline_anchor); + next_state = state_log[cur_idx] + = re_acquire_state_context (dfa, &next_nodes, context); + if (table_nodes != NULL) + re_node_set_free (&next_nodes); + } + /* If the next state has back references. */ + if (next_state != NULL && next_state->has_backref) + { + transit_state_bkref (preg, next_state, input, state_log, mctx); + next_state = state_log[cur_idx]; + } + } + return next_state; +} + +/* Helper functions for transit_state. */ + +/* Return the next state to which the current state STATE will transit by + accepting the current input byte. 
*/ + +static re_dfastate_t * +transit_state_sb (preg, state, input, fl_search, mctx) + const regex_t *preg; + re_dfastate_t *state; + re_string_t *input; + int fl_search; + re_match_context_t *mctx; +{ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + re_node_set next_nodes; + re_dfastate_t *next_state; + int node_cnt, cur_str_idx = re_string_cur_idx (input); + unsigned int context; + + re_node_set_alloc (&next_nodes, state->nodes.nelem + 1); + for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt) + { + int cur_node = state->nodes.elems[node_cnt]; + if (check_node_accept (preg, dfa->nodes + cur_node, input, + cur_str_idx, mctx->eflags)) + re_node_set_merge (&next_nodes, + dfa->eclosures + dfa->nexts[cur_node]); + } + if (fl_search) + { +#ifdef RE_ENABLE_I18N + int not_initial = 0; + if (MB_CUR_MAX > 1) + for (node_cnt = 0; node_cnt < next_nodes.nelem; ++node_cnt) + if (dfa->nodes[next_nodes.elems[node_cnt]].type == CHARACTER) + { + not_initial = dfa->nodes[next_nodes.elems[node_cnt]].mb_partial; + break; + } + if (!not_initial) +#endif + re_node_set_merge (&next_nodes, dfa->init_state->entrance_nodes); + } + context = re_string_context_at (input, cur_str_idx, mctx->eflags, + preg->newline_anchor); + next_state = re_acquire_state_context (dfa, &next_nodes, context); + re_node_set_free (&next_nodes); + re_string_skip_bytes (input, 1); + return next_state; +} + +static void +transit_state_mb (preg, pstate, input, state_log, mctx) + const regex_t *preg; + re_dfastate_t *pstate, **state_log; + const re_string_t *input; + re_match_context_t *mctx; +{ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + int i; + + for (i = 0; i < pstate->nodes.nelem; ++i) + { + re_node_set dest_nodes, *new_nodes; + int cur_node_idx = pstate->nodes.elems[i]; + int naccepted = 0, dest_idx; + unsigned int context; + re_dfastate_t *dest_state; + + if (dfa->nodes[cur_node_idx].type == OP_CONTEXT_NODE) + { + context = re_string_context_at (input, re_string_cur_idx (input), + mctx->eflags, 
preg->newline_anchor); + if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint, + context)) + continue; + cur_node_idx = dfa->nodes[cur_node_idx].opr.ctx_info->entity; + } + + /* How many bytes the node can accepts? */ + if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type)) + naccepted = check_node_accept_bytes (preg, cur_node_idx, input, + re_string_cur_idx (input)); + if (naccepted == 0) + continue; + + /* The node can accepts `naccepted' bytes. */ + dest_idx = re_string_cur_idx (input) + naccepted; + clean_state_log_if_need (state_log, mctx, dest_idx); +#ifdef DEBUG + assert (dfa->nexts[cur_node_idx] != -1); +#endif + /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE, + then we use pstate->nodes.elems[i] instead. */ + new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]]; + + dest_state = state_log[dest_idx]; + if (dest_state == NULL) + dest_nodes = *new_nodes; + else + re_node_set_init_union (&dest_nodes, dest_state->entrance_nodes, + new_nodes); + context = re_string_context_at (input, dest_idx - 1, mctx->eflags, + preg->newline_anchor); + state_log[dest_idx] = re_acquire_state_context (dfa, &dest_nodes, context); + if (dest_state != NULL) + re_node_set_free (&dest_nodes); + } +} + +static void +transit_state_bkref (preg, pstate, input, state_log, mctx) + const regex_t *preg; + re_dfastate_t *pstate, **state_log; + const re_string_t *input; + re_match_context_t *mctx; +{ + re_dfastate_t **work_state_log; + +#ifdef DEBUG + assert (mctx->match_first != -1); +#endif + work_state_log = re_malloc (re_dfastate_t *, re_string_cur_idx (input) + 1); + + transit_state_bkref_loop (preg, input, &pstate->nodes, work_state_log, + state_log, mctx); + + re_free (work_state_log); +} + +/* Caller must allocate `work_state_log'. 
*/ + +static void +transit_state_bkref_loop (preg, input, nodes, work_state_log, state_log, mctx) + const regex_t *preg; + const re_string_t *input; + re_node_set *nodes; + re_dfastate_t **work_state_log, **state_log; + re_match_context_t *mctx; +{ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + int i, j; + regmatch_t *cur_regs = re_malloc (regmatch_t, preg->re_nsub + 1); + int cur_str_idx = re_string_cur_idx (input); + + for (i = 0; i < nodes->nelem; ++i) + { + int dest_str_idx, subexp_idx, prev_nelem, subexp_len; + int node_idx = nodes->elems[i]; + unsigned int context; + re_token_t *node = dfa->nodes + node_idx; + re_dfastate_t *dest_state; + re_node_set *new_dest_nodes; + + /* Check whether `node' is a backreference or not. */ + if (node->type == OP_BACK_REF) + subexp_idx = node->opr.idx; + else if (node->type == OP_CONTEXT_NODE && + dfa->nodes[node->opr.ctx_info->entity].type == OP_BACK_REF) + { + context = re_string_context_at (input, cur_str_idx, mctx->eflags, + preg->newline_anchor); + if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) + continue; + subexp_idx = dfa->nodes[node->opr.ctx_info->entity].opr.idx; + } + else + continue; + + /* `node' is a backreference. + At first, set registers to check the backreference. */ + cur_regs[0].rm_so = mctx->match_first; + cur_regs[0].rm_eo = cur_str_idx; + memcpy (work_state_log + mctx->match_first, + state_log + mctx->match_first, + sizeof (re_dfastate_t *) + * (cur_str_idx - mctx->match_first + 1)); + mctx->match_last = cur_str_idx; + sift_states_backward (preg, work_state_log, mctx, input, node_idx); + if (!STATE_NODE_CONTAINS (work_state_log[mctx->match_first], + dfa->init_node)) + continue; + for (j = 1; j <= preg->re_nsub; ++j) + cur_regs[j].rm_so = cur_regs[j].rm_eo = -1; + set_regs (preg, work_state_log, mctx, input, + subexp_idx + 1, cur_regs, node_idx); + + /* Then check that the backreference can match the input string. 
*/ + subexp_len = cur_regs[subexp_idx].rm_eo - cur_regs[subexp_idx].rm_so; + if (subexp_len < 0 + || (strncmp ((re_string_get_buffer (input) + + cur_regs[subexp_idx].rm_so), + re_string_get_buffer (input) + cur_str_idx, subexp_len) + != 0)) + continue; + + /* Successfully matched, add a new cache entry. */ + dest_str_idx = cur_str_idx + subexp_len; + match_ctx_add_entry (mctx, node_idx, cur_str_idx, dest_str_idx); + clean_state_log_if_need (state_log, mctx, dest_str_idx); + + /* And add the epsilon closures (which is `new_dest_nodes') of + the backreference to appropriate state_log. */ +#ifdef DEBUG + assert (dfa->nexts[node_idx] != -1); +#endif + if (node->type == OP_CONTEXT_NODE && subexp_len == 0) + new_dest_nodes = dfa->nodes[node_idx].opr.ctx_info->bkref_eclosure; + else + new_dest_nodes = dfa->eclosures + dfa->nexts[node_idx]; + context = (IS_WORD_CHAR (re_string_byte_at (input, dest_str_idx - 1)) + ? CONTEXT_WORD : 0); + dest_state = state_log[dest_str_idx]; + + prev_nelem = ((state_log[cur_str_idx] == NULL) ? 0 + : state_log[cur_str_idx]->nodes.nelem); + /* Add `new_dest_node' to state_log. */ + if (dest_state == NULL) + state_log[dest_str_idx] = re_acquire_state_context (dfa, + new_dest_nodes, + context); + else + { + re_node_set dest_nodes; + re_node_set_init_union (&dest_nodes, dest_state->entrance_nodes, + new_dest_nodes); + state_log[dest_str_idx] = re_acquire_state_context (dfa, &dest_nodes, + context); + re_node_set_free (&dest_nodes); + } + + /* We need to check recursively if the backreference can epsilon + transit. */ + if (subexp_len == 0 && state_log[cur_str_idx]->nodes.nelem > prev_nelem) + transit_state_bkref_loop (preg, input, new_dest_nodes, work_state_log, + state_log, mctx); + } + re_free (cur_regs); +} + +/* Build transition table for the state. 
*/ + +static re_dfastate_t ** +build_trtable (preg, state, fl_search) + const regex_t *preg; + const re_dfastate_t *state; + int fl_search; +{ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + int i, j, k, ch; + int ndests; /* Number of the destination states from `state'. */ + re_dfastate_t **trtable, **dest_states, **dest_states_word, **dest_states_nl; + re_node_set follows, *dests_node; + bitset *dests_ch; + bitset acceptable; + + /* We build DFA states which corresponds to the destination nodes + from `state'. `dests_node[i]' represents the nodes which i-th + destination state contains, and `dests_ch[i]' represents the + characters which i-th destination state accepts. */ + dests_node = re_malloc (re_node_set, SBC_MAX); + dests_ch = re_malloc (bitset, SBC_MAX); + + /* Initialize transiton table. */ + trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); + + /* At first, group all nodes belonging to `state' into several + destinations. */ + ndests = group_nodes_into_DFAstates (preg, state, dests_node, dests_ch); + if (ndests == 0) + { + re_free (dests_node); + re_free (dests_ch); + return trtable; + } + + dest_states = re_malloc (re_dfastate_t *, ndests); + dest_states_word = re_malloc (re_dfastate_t *, ndests); + dest_states_nl = re_malloc (re_dfastate_t *, ndests); + bitset_empty (acceptable); + + re_node_set_alloc (&follows, ndests + 1); + /* Then build the states for all destinations. */ + for (i = 0; i < ndests; ++i) + { + int next_node; + re_node_set_empty (&follows); + /* Merge the follows of this destination states. */ + for (j = 0; j < dests_node[i].nelem; ++j) + { + next_node = dfa->nexts[dests_node[i].elems[j]]; + if (next_node != -1) + { + re_node_set_merge (&follows, dfa->eclosures + next_node); + } + } + /* If search flag is set, merge the initial state. 
*/ + if (fl_search) + { +#ifdef RE_ENABLE_I18N + int not_initial = 0; + for (j = 0; j < follows.nelem; ++j) + if (dfa->nodes[follows.elems[j]].type == CHARACTER) + { + not_initial = dfa->nodes[follows.elems[j]].mb_partial; + break; + } + if (!not_initial) +#endif + re_node_set_merge (&follows, dfa->init_state->entrance_nodes); + } + dest_states[i] = re_acquire_state_context (dfa, &follows, 0); + /* If the new state has context constraint, + build appropriate states for these contexts. */ + if (dest_states[i]->has_constraint) + { + dest_states_word[i] = re_acquire_state_context (dfa, &follows, + CONTEXT_WORD); + dest_states_nl[i] = re_acquire_state_context (dfa, &follows, + CONTEXT_NEWLINE); + } + else + { + dest_states_word[i] = dest_states[i]; + dest_states_nl[i] = dest_states[i]; + } + bitset_merge (acceptable, dests_ch[i]); + } + + /* Update the transition table. */ + for (i = 0, ch = 0; i < BITSET_UINTS; ++i) + for (j = 0; j < UINT_BITS; ++j, ++ch) + if ((acceptable[i] >> j) & 1) + { + if (IS_WORD_CHAR (ch)) + { + for (k = 0; k < ndests; ++k) + if ((dests_ch[k][i] >> j) & 1) + trtable[ch] = dest_states_word[k]; + } + else /* not WORD_CHAR */ + { + for (k = 0; k < ndests; ++k) + if ((dests_ch[k][i] >> j) & 1) + trtable[ch] = dest_states[k]; + } + } + /* new line */ + for (k = 0; k < ndests; ++k) + if (bitset_contain (acceptable, NEWLINE_CHAR)) + trtable[NEWLINE_CHAR] = dest_states_nl[k]; + + re_free (dest_states_nl); + re_free (dest_states_word); + re_free (dest_states); + + re_node_set_free (&follows); + for (i = 0; i < ndests; ++i) + re_node_set_free (dests_node + i); + + re_free (dests_ch); + re_free (dests_node); + + return trtable; +} + +/* Group all nodes belonging to STATE into several destinations. + Then for all destinations, set the nodes belonging to the destination + to DESTS_NODE[i] and set the characters accepted by the destination + to DEST_CH[i]. This function return the number of destinations. 
*/
+
+/* PREG supplies the compiled pattern (its DFA and syntax bits).
+   DESTS_NODE and DESTS_CH are output arrays provided by the caller;
+   entries 0 .. (return value - 1) are filled in.  The character sets of
+   existing groups are split on overlap, so each new node either joins a
+   group whose characters it fully accepts or causes that group to be
+   divided.
+   NOTE(review): no bound on `ndests' is checked in this function; the
+   visible caller allocates SBC_MAX entries for both arrays -- confirm
+   that the number of groups can never exceed that.  */
+
+static int
+group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
+     const regex_t *preg;
+     const re_dfastate_t *state;
+     re_node_set *dests_node;
+     bitset *dests_ch;
+{
+  const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
+  int i, j, k;
+  int ndests; /* Number of the destinations from `state'.  */
+  bitset accepts; /* Characters a node can accept.  */
+  const re_node_set *cur_nodes = &state->nodes;
+  bitset_empty (accepts);
+  ndests = 0;
+
+  /* For all the nodes belonging to `state',  */
+  for (i = 0; i < cur_nodes->nelem; ++i)
+    {
+      unsigned int constraint = 0;
+      re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
+      re_token_type_t type = node->type;
+
+      /* For a context node remember its constraint and continue with
+         the node it wraps.  */
+      if (type == OP_CONTEXT_NODE)
+        {
+          constraint = node->constraint;
+          node = dfa->nodes + node->opr.ctx_info->entity;
+          type = node->type;
+        }
+
+      /* Enumerate all single byte character this node can accept.  */
+      if (type == CHARACTER)
+        bitset_set (accepts, node->opr.c);
+      else if (type == SIMPLE_BRACKET)
+        {
+          bitset_merge (accepts, node->opr.sbcset);
+        }
+      else if (type == OP_PERIOD)
+        {
+          /* `.' accepts everything except what the syntax bits exclude.  */
+          bitset_set_all (accepts);
+          if (!(preg->syntax & RE_DOT_NEWLINE))
+            bitset_clear (accepts, '\n');
+          if (preg->syntax & RE_DOT_NOT_NULL)
+            bitset_clear (accepts, '\0');
+        }
+      else
+        /* Any other node type consumes no single byte here.  */
+        continue;
+
+      /* Check the `accepts' and sift out the characters which do not
+         match in the context.
+         NOTE(review): after the word / not-word filtering `accepts' may
+         be empty, in which case the code below appears to create a
+         group with an empty character set -- confirm this is harmless.  */
+      if (constraint)
+        {
+          if (constraint & NEXT_WORD_CONSTRAINT)
+            for (j = 0; j < BITSET_UINTS; ++j)
+              accepts[j] &= dfa->word_char[j];
+          else if (constraint & NEXT_NOTWORD_CONSTRAINT)
+            for (j = 0; j < BITSET_UINTS; ++j)
+              accepts[j] &= ~dfa->word_char[j];
+          else if (constraint & NEXT_NEWLINE_CONSTRAINT)
+            {
+              /* Only the newline character can satisfy this constraint.  */
+              int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
+              bitset_empty (accepts);
+              if (accepts_newline)
+                bitset_set (accepts, NEWLINE_CHAR);
+              else
+                continue;
+            }
+        }
+
+      /* Then divide `accepts' into DFA states, or create a new
+         state.  */
+      for (j = 0; j < ndests; ++j)
+        {
+          bitset intersec; /* Intersection sets, see below.  */
+          bitset remains;
+          /* Flags, see below.  */
+          int has_intersec, not_subset, not_consumed;
+
+          /* Optimization, skip if this state doesn't accept the character.  */
+          if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
+            continue;
+
+          /* Enumerate the intersection set of this state and `accepts'.  */
+          has_intersec = 0;
+          for (k = 0; k < BITSET_UINTS; ++k)
+            has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
+          /* And skip if the intersection set is empty.  */
+          if (!has_intersec)
+            continue;
+
+          /* Then check if this state is a subset of `accepts'.
+             `remains' collects the characters of group J that the node
+             does not accept; `accepts' is reduced to the characters not
+             yet assigned to any group.  */
+          not_subset = not_consumed = 0;
+          for (k = 0; k < BITSET_UINTS; ++k)
+            {
+              not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
+              not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
+            }
+
+          /* If this state isn't a subset of `accepts', create a
+             new group state, which has the `remains'.  Group J keeps
+             only the intersection; the new group starts as a copy of
+             J's node set.  */
+          if (not_subset)
+            {
+              bitset_copy (dests_ch[ndests], remains);
+              bitset_copy (dests_ch[j], intersec);
+              re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
+              ++ndests;
+            }
+
+          /* Put the position in the current group.  */
+          re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
+
+          /* If all characters are consumed, go to next node.  */
+          if (!not_consumed)
+            break;
+        }
+      /* Some characters remain, create a new group.  Emptying `accepts'
+         afterwards leaves it clear for the next node.  */
+      if (j == ndests)
+        {
+          bitset_copy (dests_ch[ndests], accepts);
+          re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
+          ++ndests;
+          bitset_empty (accepts);
+        }
+    }
+  return ndests;
+}
+
+/* Check how many bytes the node `dfa->nodes[node_idx]' accepts. 
*/
+
+/* Only input elements longer than one byte are considered: when both the
+   collating element size and the character size at STR_IDX are at most
+   one, the result is 0.  An OP_PERIOD node accepts the whole character
+   unless the syntax bits exclude the newline or NUL byte found there.
+   A COMPLEX_BRACKET node is tested (only when built with _LIBC and the
+   locale has collation rules) against, in this order, its collating
+   symbols, range expressions, equivalence classes, multibyte characters
+   and character classes; the result is inverted for a non-matching
+   bracket.  Every other case yields 0.  */
+
+static int
+check_node_accept_bytes (preg, node_idx, input, str_idx)
+     const regex_t *preg;
+     int node_idx, str_idx;
+     const re_string_t *input;
+{
+  const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
+  const re_token_t *node = dfa->nodes + node_idx;
+  int elem_len = re_string_elem_size_at (input, str_idx);
+  int char_len = re_string_char_size_at (input, str_idx);
+  int i, j;
+#ifdef _LIBC
+  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+#endif /* _LIBC */
+  if (elem_len <= 1 && char_len <= 1)
+    return 0;
+  if (node->type == OP_PERIOD)
+    {
+      if ((!(preg->syntax & RE_DOT_NEWLINE) &&
+           re_string_byte_at (input, str_idx) == '\n') ||
+          ((preg->syntax & RE_DOT_NOT_NULL) &&
+           re_string_byte_at (input, str_idx) == '\0'))
+        return 0;
+      return char_len;
+    }
+  else if (node->type == COMPLEX_BRACKET)
+    {
+      const re_charset_t *cset = node->opr.mbcset;
+      const unsigned char *pin = re_string_get_buffer (input) + str_idx;
+#ifdef _LIBC
+      if (nrules != 0)
+        {
+          int match_len = 0;
+          unsigned int in_collseq = 0;
+          const int32_t *table, *indirect;
+          const unsigned char *weights, *extra, *collseqwc;
+          int32_t idx;
+          wchar_t wc = 0;
+          /* This #include defines a local function!  */
+          /* NOTE(review): the header name is missing from the directive
+             below; it looks as if an angle-bracketed operand was stripped
+             from this copy of the patch.  The function used further down
+             is `findidx' -- restore the operand from the upstream file.  */
+# include
+
+          /* match with collating_symbol?  */
+          if (cset->ncoll_syms)
+            extra = (const unsigned char *)
+              _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
+          for (i = 0; i < cset->ncoll_syms; ++i)
+            {
+              /* Each symbol is stored as a length byte followed by that
+                 many bytes.  */
+              const unsigned char *coll_sym = extra + cset->coll_syms[i];
+              /* Compare the length of input collating element and
+                 the length of current collating element.  */
+              if (*coll_sym != elem_len)
+                continue;
+              /* Compare each byte.  */
+              for (j = 0; j < *coll_sym; j++)
+                if (pin[j] != coll_sym[1 + j])
+                  break;
+              if (j == *coll_sym)
+                {
+                  /* Match if every byte is equal.  */
+                  match_len = j;
+                  goto check_node_accept_bytes_match;
+                }
+            }
+
+          /* The wide character is only needed by the tests below.  */
+          if (cset->nranges || cset->nchar_classes || cset->nmbchars)
+            wc = re_string_wchar_at (input, str_idx);
+
+          if (cset->nranges)
+            {
+              /* Look the collation sequence value up by wide character
+                 when the element is no longer than the character,
+                 otherwise by the element's byte sequence.  */
+              if (elem_len <= char_len)
+                {
+                  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
+                  in_collseq = collseq_table_lookup (collseqwc, wc);
+                }
+              else
+                in_collseq = find_collation_sequence_value (pin, elem_len);
+            }
+          /* match with range expression?  */
+          for (i = 0; i < cset->nranges; ++i)
+            if (cset->range_starts[i] <= in_collseq
+                && in_collseq <= cset->range_ends[i])
+              {
+                match_len = elem_len;
+                goto check_node_accept_bytes_match;
+              }
+
+          /* match with equivalence_class?  */
+          if (cset->nequiv_classes)
+            {
+              const unsigned char *cp = pin;
+              /* These locals are set up for the `findidx' call below.  */
+              table = (const int32_t *)
+                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
+              weights = (const unsigned char *)
+                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
+              extra = (const unsigned char *)
+                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
+              indirect = (const int32_t *)
+                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
+              idx = findidx (&cp);
+              if (idx > 0)
+                for (i = 0; i < cset->nequiv_classes; ++i)
+                  {
+                    int32_t equiv_class_idx = cset->equiv_classes[i];
+                    size_t weight_len = weights[idx];
+                    /* Equal weight lengths first, then equal weights.  */
+                    if (weight_len == weights[equiv_class_idx])
+                      {
+                        int cnt = 0;
+                        while (cnt <= weight_len
+                               && (weights[equiv_class_idx + 1 + cnt]
+                                   == weights[idx + 1 + cnt]))
+                          ++cnt;
+                        if (cnt > weight_len)
+                          {
+                            match_len = elem_len;
+                            goto check_node_accept_bytes_match;
+                          }
+                      }
+                  }
+            }
+
+          /* match with multibyte character?  */
+          for (i = 0; i < cset->nmbchars; ++i)
+            if (wc == cset->mbchars[i])
+              {
+                match_len = char_len;
+                goto check_node_accept_bytes_match;
+              }
+
+          /* match with character_class?  */
+          for (i = 0; i < cset->nchar_classes; ++i)
+            {
+              wctype_t wt = cset->char_classes[i];
+              if (__iswctype (wc, wt))
+                {
+                  match_len = char_len;
+                  goto check_node_accept_bytes_match;
+                }
+            }
+
+          /* Reached by a jump on the first successful test, or by
+             falling through with `match_len' still 0.  */
+        check_node_accept_bytes_match:
+          if (!cset->non_match)
+            return match_len;
+          else
+            {
+              /* Non-matching bracket: a hit means rejection, a miss
+                 accepts the whole collating element.  */
+              if (match_len > 0)
+                return 0;
+              else
+                return re_string_elem_size_at (input, str_idx);
+            }
+        }
+#endif
+    }
+  return 0;
+}
+
+#ifdef _LIBC
+/* Return the collation sequence value of the MBS_LEN bytes at MBS.
+   Without collation rules only a single byte can be looked up (through
+   the single-byte collation sequence table); any other length yields
+   UINT_MAX.  With collation rules the collating-element table of the
+   locale is walked entry by entry until an element with the same byte
+   sequence is found.
+   NOTE(review): the walk has no end condition other than a hit, so it
+   presumably relies on callers passing only sequences that are present
+   in the table -- confirm, or bound the loop by the table size.  */
+static unsigned int
+find_collation_sequence_value (mbs, mbs_len)
+     const unsigned char *mbs;
+     size_t mbs_len;
+{
+  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+  if (nrules == 0)
+    {
+      if (mbs_len == 1)
+        {
+          /* No valid character.  Match it as a single byte character.  */
+          const unsigned char *collseq = (const unsigned char *)
+            _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
+          return collseq[mbs[0]];
+        }
+      return UINT_MAX;
+    }
+  else
+    {
+      int32_t idx;
+      const unsigned char *extra = (const unsigned char *)
+        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
+
+      for (idx = 0; ;)
+        {
+          int mbs_cnt, found = 0;
+          int32_t elem_mbs_len;
+          /* Skip the name of collating element name.  */
+          idx = idx + extra[idx] + 1;
+          elem_mbs_len = extra[idx++];
+          if (mbs_len == elem_mbs_len)
+            {
+              for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
+                if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
+                  break;
+              if (mbs_cnt == elem_mbs_len)
+                /* Found the entry.  */
+                found = 1;
+            }
+          /* Skip the byte sequence of the collating element.  */
+          idx += elem_mbs_len;
+          /* Adjust for the alignment (round up to a multiple of 4).  */
+          idx = (idx + 3) & ~3;
+          /* Skip the collation sequence value.  */
+          idx += sizeof (uint32_t);
+          /* Skip the wide char sequence of the collating element.  */
+          idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
+          /* If we found the entry, return the sequence value.  */
+          if (found)
+            return *(uint32_t *) (extra + idx);
+          /* Skip the collation sequence value.  */
+          idx += sizeof (uint32_t);
+        }
+    }
+}
+#endif
+
+/* Check whether the node accepts the byte which is IDX-th
+   byte of the INPUT.  For a context node the constraint is checked
+   against the context at IDX (computed with EFLAGS) first, and the test
+   is then made on the wrapped node.  Only CHARACTER, SIMPLE_BRACKET and
+   OP_PERIOD nodes can accept a byte; the result is non-zero on
+   acceptance and 0 otherwise.  */
+
+static int
+check_node_accept (preg, node, input, idx, eflags)
+     const regex_t *preg;
+     const re_token_t *node;
+     const re_string_t *input;
+     int idx, eflags;
+{
+  const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
+  const re_token_t *cur_node;
+  unsigned char ch;
+  if (node->type == OP_CONTEXT_NODE)
+    {
+      /* The node has constraints.  Check whether the current context
+         satisfies the constraints.  */
+      unsigned int context = re_string_context_at (input, idx, eflags,
+                                                    preg->newline_anchor);
+      if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
+        return 0;
+      cur_node = dfa->nodes + node->opr.ctx_info->entity;
+    }
+  else
+    cur_node = node;
+
+  ch = re_string_byte_at (input, idx);
+  if (cur_node->type == CHARACTER)
+    return cur_node->opr.c == ch;
+  else if (cur_node->type == SIMPLE_BRACKET)
+    return bitset_contain (cur_node->opr.sbcset, ch);
+  else if (cur_node->type == OP_PERIOD)
+    /* `.' rejects newline / NUL only when the syntax bits say so.  */
+    return !((ch == '\n' && !(preg->syntax & RE_DOT_NEWLINE))
+             || (ch == '\0' && (preg->syntax & RE_DOT_NOT_NULL)));
+  else
+    return 0;
+}
+
+/* Functions for matching context.  */
+
+/* Initialize MCTX: store EFLAGS, mark both match boundaries as unset
+   (-1) and set up an empty back reference cache with room for N entries
+   (no buffer at all when N is not positive).
+   NOTE(review): the allocation result is not checked, and the recorded
+   capacity is N even when N is zero or negative.  */
+
+static void
+match_ctx_init (mctx, eflags, n)
+     re_match_context_t *mctx;
+     int eflags;
+     int n;
+{
+  mctx->eflags = eflags;
+  mctx->match_first = mctx->match_last = -1;
+  if (n > 0)
+    mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
+  else
+    mctx->bkref_ents = NULL;
+  mctx->nbkref_ents = 0;
+  mctx->abkref_ents = n;
+  mctx->max_bkref_len = 0;
+}
+
+/* Release the back reference cache owned by MCTX.  MCTX itself is not
+   freed.  */
+
+static void
+match_ctx_free (mctx)
+     re_match_context_t *mctx;
+{
+  re_free (mctx->bkref_ents);
+}
+
+/* Add a new backreference entry to the cache. 
*/
+
+/* Record in MCTX that the back reference node NODE matched the input
+   from index FROM up to index TO, and keep `max_bkref_len' at the
+   length of the longest entry seen so far.  The cache doubles in size
+   when full (starting from one entry if it had no capacity yet) and the
+   newly added slots are zeroed.  The function cannot report failure: if
+   the cache cannot be enlarged the old cache is left intact and the new
+   entry is dropped.  */
+
+static void
+match_ctx_add_entry (mctx, node, from, to)
+     re_match_context_t *mctx;
+     int node, from, to;
+{
+  if (mctx->nbkref_ents >= mctx->abkref_ents)
+    {
+      struct re_backref_cache_entry *new_ents;
+      /* Doubling a capacity of zero would never make room.  */
+      int new_size = (mctx->abkref_ents > 0) ? mctx->abkref_ents * 2 : 1;
+
+      /* Keep the old pointer until the reallocation has succeeded, so a
+         failure neither leaks the cache nor leaves a NULL buffer.  */
+      new_ents = re_realloc (mctx->bkref_ents,
+                             struct re_backref_cache_entry,
+                             new_size);
+      if (new_ents == NULL)
+        return;
+      memset (new_ents + mctx->nbkref_ents, '\0',
+              sizeof (struct re_backref_cache_entry)
+              * (new_size - mctx->nbkref_ents));
+      mctx->bkref_ents = new_ents;
+      mctx->abkref_ents = new_size;
+    }
+  mctx->bkref_ents[mctx->nbkref_ents].node = node;
+  mctx->bkref_ents[mctx->nbkref_ents].from = from;
+  mctx->bkref_ents[mctx->nbkref_ents++].to = to;
+  if (mctx->max_bkref_len < to - from)
+    mctx->max_bkref_len = to - from;
+}