From c202c2c50523dd6721e9e2a9c80c1dc018f373bc Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Tue, 10 Sep 2002 18:40:35 +0000 Subject: [PATCH] Update. 2002-09-10 Isamu Hasegawa * posix/regexec.c (build_trtable): Fix the destination of newline to prevent wrong states from overwriting. Append break statements to optimization. 2002-09-10 Isamu Hasegawa * posix/regcomp.c: Wrap #include wchar.h and wctype.h in #if. (build_range_exp): Add castings to strlen invocations. (build_collating_symbol): Restore the type of characters from "char" to "unsigned char", and supplement castings. (build_collating_symbol): Likewise. (build_equiv_class): Likewise. (build_charclass): Likewise. (seek_collating_symbol_entry): Likewise. (parse_bracket_exp): Likewise. (build_word_op): Supplement a casting. * posix/regex_internal.c: Wrap #include wchar.h and wctype.h in #if. (re_string_allocate): Fix castings. (re_string_construct): Likewise. (re_string_construct_common): Likewise. (re_string_realloc_buffers): Likewise. (build_wcs_buffer): Likewise. (build_wcs_upper_buffer): Likewise. (re_string_skip_chars): Likewise. (re_string_reconstruct): Likewise. * posix/regex_internal.h: Restore the type of characters in re_string_t and bracket_elem_t from "char" to "unsigned char". (re_string_elem_size_at): Fix castings. * posix/regexec.c: Wrap #include wchar.h and wctype.h in #if. (transit_state_bkref_loop): Restore the type of characters from "char" to "unsigned char", and append a cast to "char*" pointer in array subscript. (check_node_accept_bytes): Likewise. (find_collation_sequence_value): Likewise. 
--- ChangeLog | 37 +++++++++++++++++++++ localedata/ChangeLog | 5 +++ localedata/locales/bg_BG | 8 ++--- posix/regex_internal.c | 42 +++++++++++++----------- posix/regex_internal.h | 15 +++++---- posix/regexec.c | 69 +++++++++++++++++++++++++++++----------- 6 files changed, 129 insertions(+), 47 deletions(-) diff --git a/ChangeLog b/ChangeLog index 412c302f04..43e8f46503 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,40 @@ +2002-09-10 Isamu Hasegawa + + * posix/regexec.c (build_trtable): Fix the destination of + newline to prevent wrong states from overwriting. + Append break statements to optimization. + +2002-09-10 Isamu Hasegawa + + * posix/regcomp.c: Wrap #include wchar.h and wctype.h in #if. + (build_range_exp): Add castings to strlen invocations. + (build_collating_symbol): Restore the type of characters from "char" + to "unsigned char", and supplement castings. + (build_collating_symbol): Likewise. + (build_equiv_class): Likewise. + (build_charclass): Likewise. + (seek_collating_symbol_entry): Likewise. + (parse_bracket_exp): Likewise. + (build_word_op): Supplement a casting. + * posix/regex_internal.c: Wrap #include wchar.h and wctype.h in #if. + (re_string_allocate): Fix castings. + (re_string_construct): Likewise. + (re_string_construct_common): Likewise. + (re_string_realloc_buffers): Likewise. + (build_wcs_buffer): Likewise. + (build_wcs_upper_buffer): Likewise. + (re_string_skip_chars): Likewise. + (re_string_reconstruct): Likewise. + * posix/regex_internal.h: Restore the type of characters in + re_string_t and bracket_elem_t from "char" to "unsigned char". + (re_string_elem_size_at): Fix castings. + * posix/regexec.c: Wrap #include wchar.h and wctype.h in #if. + (transit_state_bkref_loop): Restore the type of characters from + "char" to "unsigned char", and append a cast to "char*" pointer in + array subscript. + (check_node_accept_bytes): Likewise. + (find_collation_sequence_value): Likewise. 
+ 2002-09-10 Hartvig Ekner * sysdeps/mips/memcpy.S: New file. diff --git a/localedata/ChangeLog b/localedata/ChangeLog index f9d40bd137..d31e094f32 100644 --- a/localedata/ChangeLog +++ b/localedata/ChangeLog @@ -1,3 +1,8 @@ +2002-09-10 Ulrich Drepper + + * localedata/locales/bg_BG: Update LC_IDENTIFICATION info. + Patch by Yanko Kaneti . + 2002-09-01 Roland McGrath * tst-ctype.c (main): Use nl_langinfo instead of __ctype_b global. diff --git a/localedata/locales/bg_BG b/localedata/locales/bg_BG index 31cd3e5dc7..2736174c4a 100644 --- a/localedata/locales/bg_BG +++ b/localedata/locales/bg_BG @@ -21,10 +21,10 @@ contact "Delyan Toshev" email "delyant@yahoo.com" tel "" fax "" -language "bg" -territory "BG" -revision "2.0" -date "2001-11-16" +language "Bulgarian" +territory "Bulgaria" +revision "2.0.1" +date "2002-09-10" category "bg_BG:2000";LC_IDENTIFICATION category "bg_BG:2000";LC_CTYPE diff --git a/posix/regex_internal.c b/posix/regex_internal.c index c4400a8c23..116543a6da 100644 --- a/posix/regex_internal.c +++ b/posix/regex_internal.c @@ -24,8 +24,13 @@ #include #include #include -#include <wchar.h> -#include <wctype.h> + +#if defined HAVE_WCHAR_H || defined _LIBC +# include <wchar.h> +#endif /* HAVE_WCHAR_H || _LIBC */ +#if defined HAVE_WCTYPE_H || defined _LIBC +# include <wctype.h> +#endif /* HAVE_WCTYPE_H || _LIBC */ #ifdef _LIBC # ifndef _RE_DEFINE_LOCALE_FUNCTIONS @@ -99,7 +104,8 @@ re_string_allocate (pstr, str, len, init_len, trans, icase) if (BE (ret != REG_NOERROR, 0)) return ret; - pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case : (char *) str); + pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case + : (unsigned char *) str); pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case; pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr) || MB_CUR_MAX > 1) ? pstr->valid_len : len; @@ -127,7 +133,8 @@ re_string_construct (pstr, str, len, trans, icase) if (BE (ret != REG_NOERROR, 0)) return ret; } - pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ?
pstr->mbs_case : (char *) str); + pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case + : (unsigned char *) str); pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case; if (icase) @@ -176,13 +183,13 @@ re_string_realloc_buffers (pstr, new_buf_len) #endif /* RE_ENABLE_I18N */ if (MBS_ALLOCATED (pstr)) { - pstr->mbs = re_realloc (pstr->mbs, char, new_buf_len); + pstr->mbs = re_realloc (pstr->mbs, unsigned char, new_buf_len); if (BE (pstr->mbs == NULL, 0)) return REG_ESPACE; } if (MBS_CASE_ALLOCATED (pstr)) { - pstr->mbs_case = re_realloc (pstr->mbs_case, char, new_buf_len); + pstr->mbs_case = re_realloc (pstr->mbs_case, unsigned char, new_buf_len); if (BE (pstr->mbs_case == NULL, 0)) return REG_ESPACE; if (!MBS_ALLOCATED (pstr)) @@ -202,7 +209,7 @@ re_string_construct_common (str, len, pstr, trans, icase) int icase; { memset (pstr, '\0', sizeof (re_string_t)); - pstr->raw_mbs = str; + pstr->raw_mbs = (const unsigned char *) str; pstr->len = len; pstr->trans = trans; pstr->icase = icase ? 1 : 0; @@ -235,8 +242,8 @@ build_wcs_buffer (pstr) wchar_t wc; remain_len = end_idx - byte_idx; prev_st = pstr->cur_state; - mbclen = mbrtowc (&wc, pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, - remain_len, &pstr->cur_state); + mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx + + byte_idx), remain_len, &pstr->cur_state); if (BE (mbclen == (size_t) -2, 0)) { /* The buffer doesn't have enough space, finish to build. */ @@ -254,9 +261,8 @@ build_wcs_buffer (pstr) /* Apply the translateion if we need. */ if (pstr->trans != NULL && mbclen == 1) { - int ch = *((unsigned char *) pstr->raw_mbs + pstr->raw_mbs_idx - + byte_idx); - pstr->mbs_case[byte_idx] = pstr->trans[ch]; + int ch = pstr->trans[pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]]; + pstr->mbs_case[byte_idx] = ch; } /* Write wide character and padding. 
*/ pstr->wcs[byte_idx++] = wc; @@ -284,8 +290,8 @@ build_wcs_upper_buffer (pstr) wchar_t wc; remain_len = end_idx - byte_idx; prev_st = pstr->cur_state; - mbclen = mbrtowc (&wc, pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, - remain_len, &pstr->cur_state); + mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx + + byte_idx), remain_len, &pstr->cur_state); if (BE (mbclen == (size_t) -2, 0)) { /* The buffer doesn't have enough space, finish to build. */ @@ -310,7 +316,7 @@ build_wcs_upper_buffer (pstr) else /* mbclen > 1 */ { if (iswlower (wc)) - wcrtomb (pstr->mbs + byte_idx, towupper (wc), &prev_st); + wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), &prev_st); else memcpy (pstr->mbs + byte_idx, pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); @@ -340,7 +346,7 @@ re_string_skip_chars (pstr, new_raw_idx) { int remain_len = pstr->len - rawbuf_idx; prev_st = pstr->cur_state; - mbclen = mbrlen (pstr->raw_mbs + rawbuf_idx, remain_len, + mbclen = mbrlen ((const char *) pstr->raw_mbs + rawbuf_idx, remain_len, &pstr->cur_state); if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) { @@ -420,9 +426,9 @@ re_string_reconstruct (pstr, idx, eflags, newline) pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF : CONTEXT_NEWLINE | CONTEXT_BEGBUF); if (!MBS_CASE_ALLOCATED (pstr)) - pstr->mbs_case = (char *) pstr->raw_mbs; + pstr->mbs_case = (unsigned char *) pstr->raw_mbs; if (!MBS_ALLOCATED (pstr) && !MBS_CASE_ALLOCATED (pstr)) - pstr->mbs = (char *) pstr->raw_mbs; + pstr->mbs = (unsigned char *) pstr->raw_mbs; offset = idx; } diff --git a/posix/regex_internal.h b/posix/regex_internal.h index 574bf652b8..9f1f9826f2 100644 --- a/posix/regex_internal.h +++ b/posix/regex_internal.h @@ -228,15 +228,15 @@ struct re_string_t { /* Indicate the raw buffer which is the original string passed as an argument of regexec(), re_search(), etc.. */ - const char *raw_mbs; + const unsigned char *raw_mbs; /* Store the multibyte string. 
In case of "case insensitive mode" like REG_ICASE, upper cases of the string are stored, otherwise MBS points the same address that RAW_MBS points. */ - char *mbs; + unsigned char *mbs; /* Store the case sensitive multibyte string. In case of "case insensitive mode", the original string are stored, otherwise MBS_CASE points the same address that MBS points. */ - char *mbs_case; + unsigned char *mbs_case; #ifdef RE_ENABLE_I18N /* Store the wide character string which is corresponding to MBS. */ wint_t *wcs; @@ -512,7 +512,7 @@ typedef struct union { unsigned char ch; - char *name; + unsigned char *name; wchar_t wch; } opr; } bracket_elem_t; @@ -580,7 +580,7 @@ re_string_elem_size_at (pstr, idx) int idx; { #ifdef _LIBC - const char *extra, *p; + const unsigned char *p, *extra; const int32_t *table, *indirect; int32_t tmp; # include @@ -589,11 +589,12 @@ re_string_elem_size_at (pstr, idx) if (nrules != 0) { table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); - extra = (const char *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); + extra = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); p = pstr->mbs + idx; - tmp = findidx ((const unsigned char **) &p); + tmp = findidx (&p); return p - pstr->mbs - idx; } else diff --git a/posix/regexec.c b/posix/regexec.c index 4a9c64a191..142127883d 100644 --- a/posix/regexec.c +++ b/posix/regexec.c @@ -23,8 +23,13 @@ #include #include #include -#include <wchar.h> -#include <wctype.h> + +#if defined HAVE_WCHAR_H || defined _LIBC +# include <wchar.h> +#endif /* HAVE_WCHAR_H || _LIBC */ +#if defined HAVE_WCTYPE_H || defined _LIBC +# include <wctype.h> +#endif /* HAVE_WCTYPE_H || _LIBC */ #ifdef _LIBC # ifndef _RE_DEFINE_LOCALE_FUNCTIONS @@ -123,7 +128,7 @@ static re_dfastate_t **build_trtable (const regex_t *dfa, static int check_node_accept_bytes (const regex_t *preg, int node_idx, const re_string_t *input, int idx); # ifdef _LIBC -static unsigned int
find_collation_sequence_value (const char *mbs, +static unsigned int find_collation_sequence_value (const unsigned char *mbs, size_t name_len); # endif /* _LIBC */ #endif /* RE_ENABLE_I18N */ @@ -1674,7 +1679,7 @@ transit_state_bkref_loop (preg, nodes, work_state_log, mctx) if (BE (err != REG_NOERROR, 0)) return err; } - buf = re_string_get_buffer (mctx->input); + buf = (char *) re_string_get_buffer (mctx->input); if (strncmp (buf + cur_regs[subexp_idx].rm_so, buf + cur_str_idx, subexp_len) != 0) continue; @@ -1855,27 +1860,51 @@ build_trtable (preg, state, fl_search) } /* Update the transition table. */ + /* For all characters ch...: */ for (i = 0, ch = 0; i < BITSET_UINTS; ++i) for (j = 0; j < UINT_BITS; ++j, ++ch) if ((acceptable[i] >> j) & 1) { + /* The current state accepts the character ch. */ if (IS_WORD_CHAR (ch)) { for (k = 0; k < ndests; ++k) if ((dests_ch[k][i] >> j) & 1) - trtable[ch] = dest_states_word[k]; + { + /* k-th destination accepts the word character ch. */ + trtable[ch] = dest_states_word[k]; + /* There must be only one destination which accepts + character ch. See group_nodes_into_DFAstates. */ + break; + } } else /* not WORD_CHAR */ { for (k = 0; k < ndests; ++k) if ((dests_ch[k][i] >> j) & 1) - trtable[ch] = dest_states[k]; + { + /* k-th destination accepts the non-word character ch. */ + trtable[ch] = dest_states[k]; + /* There must be only one destination which accepts + character ch. See group_nodes_into_DFAstates. */ + break; + } } } /* new line */ - for (k = 0; k < ndests; ++k) - if (bitset_contain (acceptable, NEWLINE_CHAR)) - trtable[NEWLINE_CHAR] = dest_states_nl[k]; + if (bitset_contain (acceptable, NEWLINE_CHAR)) + { + /* The current state accepts newline character. */ + for (k = 0; k < ndests; ++k) + if (bitset_contain (dests_ch[k], NEWLINE_CHAR)) + { + /* k-th destination accepts newline character. */ + trtable[NEWLINE_CHAR] = dest_states_nl[k]; + /* There must be only one destination which accepts + newline. 
See group_nodes_into_DFAstates. */ + break; + } + } re_free (dest_states_nl); re_free (dest_states_word); @@ -2069,7 +2098,7 @@ check_node_accept_bytes (preg, node_idx, input, str_idx) { const re_charset_t *cset = node->opr.mbcset; # ifdef _LIBC - const char *pin = re_string_get_buffer (input) + str_idx; + const unsigned char *pin = re_string_get_buffer (input) + str_idx; # endif /* _LIBC */ int match_len = 0; wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars) @@ -2098,17 +2127,19 @@ check_node_accept_bytes (preg, node_idx, input, str_idx) { unsigned int in_collseq = 0; const int32_t *table, *indirect; - const char *weights, *extra, *collseqwc; + const unsigned char *weights, *extra; + const char *collseqwc; int32_t idx; /* This #include defines a local function! */ # include /* match with collating_symbol? */ if (cset->ncoll_syms) - extra = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); + extra = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); for (i = 0; i < cset->ncoll_syms; ++i) { - const char *coll_sym = extra + cset->coll_syms[i]; + const unsigned char *coll_sym = extra + cset->coll_syms[i]; /* Compare the length of input collating element and the length of current collating element. */ if (*coll_sym != elem_len) @@ -2147,11 +2178,13 @@ check_node_accept_bytes (preg, node_idx, input, str_idx) /* match with equivalence_class? 
*/ if (cset->nequiv_classes) { - const unsigned char *cp = (const unsigned char *) pin; + const unsigned char *cp = pin; table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); - weights = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); - extra = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); + weights = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); + extra = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); idx = findidx (&cp); @@ -2215,7 +2248,7 @@ check_node_accept_bytes (preg, node_idx, input, str_idx) # ifdef _LIBC static unsigned int find_collation_sequence_value (mbs, mbs_len) - const char *mbs; + const unsigned char *mbs; size_t mbs_len; { uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); @@ -2226,7 +2259,7 @@ find_collation_sequence_value (mbs, mbs_len) /* No valid character. Match it as a single byte character. */ const unsigned char *collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); - return collseq[*(unsigned char *) mbs]; + return collseq[mbs[0]]; } return UINT_MAX; }