Update.

* posix/regex_internal.h: Add forward declaration of re_dfa_t. Replace last two parameters of re_string_allocate and re_string_construct with pointer to DFA. (re_dfa_t): Add map_notascii field. * posix/regcomp.c (re_compile_internal): Add call of re_string_construct. (init_dfa): Initialize map_notascii. * posix/regex_internal.c: Adjust definitions of re_string_allocate and re_string_construct. Pass DFA to re_string_construct. Adjust definition. Initialize map_notascii field. (build_wcs_upper_buffer): If map_notascii is zero use simplified method to map ASCII values to upper case. * posix/regex.c: Include localeinfo.h. * posix/regexec.c: Adjust call of re_string_allocate. * locale/langinfo.h: Add _NL_CTYPE_MAP_TO_NONASCII. * locale/localeinfo.h (LIMAGIC): Change value. * locale/categories.def: Add entry for _NL_CTYPE_MAP_TO_NONASCII. * locale/C-ctype.c: Likewise. * locale/programs/ld-ctype.c: Compute whether any mapping maps from ASCII to non-ASCII value. Write out that value.
2024-12-22 19:00:07 +00:00 · 2003-11-16 07:14:28 +00:00 · 2003-11-16 07:14:28 +00:00 · f0c7c524bb
commit f0c7c524bb
parent 2def87644d
13 changed files with 201 additions and 87 deletions
--- a/23
+++ b/23
@ -1,5 +1,28 @@
 2003-11-15  Ulrich Drepper  <drepper@redhat.com>
 	* posix/regex_internal.h: Add forward declaration of re_dfa_t.
 	Replace last two parameters of re_string_allocate and
 	re_string_construct with pointer to DFA.
 	(re_dfa_t): Add map_notascii field.
 	* posix/regcomp.c (re_compile_internal): Add call of
 	re_string_construct.
 	(init_dfa): Initialize map_notascii.
 	* posix/regex_internal.c: Adjust definitions of re_string_allocate
 	and re_string_construct.
 	Pass DFA to re_string_construct.  Adjust definition.  Initialize
 	map_notascii field.
 	(build_wcs_upper_buffer): If map_notascii is zero use simplified
 	method to map ASCII values to upper case.
 	* posix/regex.c: Include localeinfo.h.
 	* posix/regexec.c: Adjust call of re_string_allocate.
 	* locale/langinfo.h: Add _NL_CTYPE_MAP_TO_NONASCII.
 	* locale/localeinfo.h (LIMAGIC): Change value.
 	* locale/categories.def: Add entry for _NL_CTYPE_MAP_TO_NONASCII.
 	* locale/C-ctype.c: Likewise.
 	* locale/programs/ld-ctype.c: Compute whether any mapping maps from
 	ASCII to non-ASCII value.  Write out that value.
 	* wcsmbs/mbsinit.c: Undef mbsinit and __mbsinit.
 	* include/wchar.h: Provide inline versions of mbsinit and __mbsinit.
--- a/locale/C-ctype.c
+++ b/locale/C-ctype.c
@ -1,4 +1,4 @@
-/* Copyright (C) 1995-1999, 2000, 2001, 2002 Free Software Foundation, Inc.
+/* Copyright (C) 1995-2002, 2003 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1995.
@ -528,7 +528,7 @@ _nl_C_LC_CTYPE_width attribute_hidden =
 };
 /* Number of fields with fixed meanings, starting at 0.  */
-#define NR_FIXED 70
+#define NR_FIXED 71
 /* Number of class fields, starting at CLASS_OFFSET.  */
 #define NR_CLASSES 12
 /* Number of map fields, starting at MAP_OFFSET.  */
@ -665,6 +665,8 @@ const struct locale_data _nl_C_LC_CTYPE attribute_hidden =
    { .word = 0 },
    /* _NL_CTYPE_TRANSLIT_IGNORE */
    { .wstr = NULL },
    /* _NL_CTYPE_MAP_TO_NONASCII */
    { .word = 0 },
    /* NR_CLASSES wctype_tables */
    { .string = (const char *) _nl_C_LC_CTYPE_class_upper.header },
    { .string = (const char *) _nl_C_LC_CTYPE_class_lower.header },
--- a/locale/categories.def
+++ b/locale/categories.def
@ -1,5 +1,5 @@
 /* Definition of all available locale categories and their items.  -*- C -*-
-   Copyright (C) 1995-2001, 2002 Free Software Foundation, Inc.
+   Copyright (C) 1995-2001, 2002, 2003 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
@ -133,6 +133,7 @@ DEFINE_CATEGORY
  DEFINE_ELEMENT (_NL_CTYPE_TRANSLIT_DEFAULT_MISSING, "ctype-translit-default-missing", std, wstring)
  DEFINE_ELEMENT (_NL_CTYPE_TRANSLIT_IGNORE_LEN, "ctype-translit-ignore-len", std, word)
  DEFINE_ELEMENT (_NL_CTYPE_TRANSLIT_IGNORE, "ctype-translit-ignore", std, string)
  DEFINE_ELEMENT (_NL_CTYPE_MAP_TO_NONASCII, "map-to-nonascii", std, word)
  ), _nl_postload_ctype)
--- a/locale/langinfo.h
+++ b/locale/langinfo.h
@ -1,5 +1,5 @@
 /* Access to locale-dependent parameters.
-   Copyright (C) 1995-99,2000,01,02 Free Software Foundation, Inc.
+   Copyright (C) 1995-2002, 2003 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
@ -334,6 +334,7 @@ enum
  _NL_CTYPE_TRANSLIT_DEFAULT_MISSING,
  _NL_CTYPE_TRANSLIT_IGNORE_LEN,
  _NL_CTYPE_TRANSLIT_IGNORE,
  _NL_CTYPE_MAP_TO_NONASCII,
  _NL_CTYPE_EXTRA_MAP_1,
  _NL_CTYPE_EXTRA_MAP_2,
  _NL_CTYPE_EXTRA_MAP_3,
--- a/locale/localeinfo.h
+++ b/locale/localeinfo.h
@ -1,5 +1,5 @@
 /* Declarations for internal libc locale interfaces
-   Copyright (C) 1995-2001, 2002 Free Software Foundation, Inc.
+   Copyright (C) 1995-2001, 2002, 2003 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
@ -31,7 +31,7 @@
 #include <intl/loadinfo.h>	/* For loaded_l10nfile definition.  */
 /* Magic number at the beginning of a locale data file for CATEGORY.  */
-#define	LIMAGIC(category)	((unsigned int) (0x20000828 ^ (category)))
+#define	LIMAGIC(category)	((unsigned int) (0x20031115 ^ (category)))
 /* Two special weight constants for the collation data.  */
 #define IGNORE_CHAR	2
--- a/locale/programs/ld-ctype.c
+++ b/locale/programs/ld-ctype.c
@ -1,4 +1,4 @@
-/* Copyright (C) 1995-1999, 2000, 2001, 2002 Free Software Foundation, Inc.
+/* Copyright (C) 1995-2002, 2003 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
@ -181,6 +181,8 @@ struct locale_ctype_t
  const char *default_missing_file;
  size_t default_missing_lineno;
  uint32_t to_nonascii;
  /* The arrays for the binary representation.  */
  char_class_t *ctype_b;
  char_class32_t *ctype32_b;
@ -1035,6 +1037,10 @@ ctype_output (struct localedef_t *locale, const struct charmap_t *charmap,
 	    idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
 	    break;
 	  CTYPE_DATA (_NL_CTYPE_MAP_TO_NONASCII,
 		      &ctype->to_nonascii, sizeof (uint32_t));
 	  case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
 	    iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
 	    iov[2 + elem + offset].iov_len = sizeof (uint32_t);
@ -2706,6 +2712,14 @@ with character code range values one must use the absolute ellipsis `...'"));
 	      if (!ignore_content)
 		{
 		  /* Check whether the mapping converts from an ASCII value
 		     to a non-ASCII value.  */
 		  if (from_seq != NULL && from_seq->nbytes == 1
 		      && isascii (from_seq->bytes[0])
 		      && to_seq != NULL && (to_seq->nbytes != 1
 					    || !isascii (to_seq->bytes[0])))
 		    ctype->to_nonascii = 1;
 		  if (mapidx < 2 && from_seq != NULL && to_seq != NULL
 		      && from_seq->nbytes == 1 && to_seq->nbytes == 1)
 		    /* We can use this value.  */
--- a/localedata/ChangeLog
+++ b/localedata/ChangeLog
@ -1,3 +1,7 @@
 2003-11-15  Ulrich Drepper  <drepper@redhat.com>
 	* Makefile (tst-leaks-ENV): Add LOCPATH.
 2003-11-11  Jakub Jelinek  <jakub@redhat.com>
 	* Makefile (LOCALES): Add tr_TR.UTF-8.
--- a/localedata/Makefile
+++ b/localedata/Makefile
@ -287,6 +287,7 @@ tst-setlocale-ENV = LOCPATH=$(common-objpfx)localedata LC_ALL=ja_JP.EUC-JP
 bug-iconv-trans-ENV = LOCPATH=$(common-objpfx)localedata
-tst-leaks-ENV = MALLOC_TRACE=$(objpfx)tst-leaks.mtrace
+tst-leaks-ENV = MALLOC_TRACE=$(objpfx)tst-leaks.mtrace \
 		LOCPATH=$(common-objpfx)localedata
 $(objpfx)mtrace-tst-leaks: $(objpfx)tst-leaks.out
 	$(common-objpfx)malloc/mtrace $(objpfx)tst-leaks.mtrace > $@
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@ -748,8 +748,7 @@ re_compile_internal (preg, pattern, length, syntax)
 #endif
  err = re_string_construct (&regexp, pattern, length, preg->translate,
-			     syntax & RE_ICASE, dfa->mb_cur_max,
+			     syntax & RE_ICASE, dfa);
 			     dfa->is_utf8);
  if (BE (err != REG_NOERROR, 0))
    {
      re_free (dfa);
@ -828,6 +827,8 @@ init_dfa (dfa, pat_len)
  if (dfa->mb_cur_max > 1
      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
    dfa->is_utf8 = 1;
  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 		       != 0);
 #endif
  if (BE (dfa->nodes == NULL || dfa->state_table == NULL
--- a/posix/regex.c
+++ b/posix/regex.c
@ -39,6 +39,8 @@
 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
 	__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
 # include "../locale/localeinfo.h"
 #endif
 /* POSIX says that <sys/types.h> must be included (by the caller) before
--- a/posix/regex_internal.c
+++ b/posix/regex_internal.c
@ -21,7 +21,7 @@
 static void re_string_construct_common (const char *str, int len,
 					re_string_t *pstr,
 					RE_TRANSLATE_TYPE trans, int icase,
-					int mb_cur_max, int is_utf8);
+					const re_dfa_t *dfa);
 #ifdef RE_ENABLE_I18N
 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
 				 wint_t *last_wc);
@ -47,17 +47,16 @@ static unsigned int inline calc_state_hash (const re_node_set *nodes,
   re_string_reconstruct before using the object.  */
 static reg_errcode_t
-re_string_allocate (pstr, str, len, init_len, trans, icase,
+re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
 		    mb_cur_max, is_utf8)
     re_string_t *pstr;
     const char *str;
-     int len, init_len, icase, mb_cur_max, is_utf8;
+     int len, init_len, icase;
     RE_TRANSLATE_TYPE trans;
     const re_dfa_t *dfa;
 {
  reg_errcode_t ret;
  int init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
-  re_string_construct_common (str, len, pstr, trans, icase,
+  re_string_construct_common (str, len, pstr, trans, icase, dfa);
 			      mb_cur_max, is_utf8);
  pstr->stop = pstr->len;
  ret = re_string_realloc_buffers (pstr, init_buf_len);
@ -68,22 +67,22 @@ re_string_allocate (pstr, str, len, init_len, trans, icase,
 		    : (unsigned char *) str);
  pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
  pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr)
-		     || mb_cur_max > 1) ? pstr->valid_len : len;
+		     || dfa->mb_cur_max > 1) ? pstr->valid_len : len;
  return REG_NOERROR;
 }
 /* This function allocate the buffers, and initialize them.  */
 static reg_errcode_t
-re_string_construct (pstr, str, len, trans, icase, mb_cur_max, is_utf8)
+re_string_construct (pstr, str, len, trans, icase, dfa)
     re_string_t *pstr;
     const char *str;
-     int len, icase, mb_cur_max, is_utf8;
+     int len, icase;
     RE_TRANSLATE_TYPE trans;
     const re_dfa_t *dfa;
 {
  reg_errcode_t ret;
-  re_string_construct_common (str, len, pstr, trans, icase,
+  re_string_construct_common (str, len, pstr, trans, icase, dfa);
 			      mb_cur_max, is_utf8);
  pstr->stop = pstr->len;
  /* Set 0 so that this function can initialize whole buffers.  */
  pstr->valid_len = 0;
@ -101,7 +100,7 @@ re_string_construct (pstr, str, len, trans, icase, mb_cur_max, is_utf8)
  if (icase)
    {
 #ifdef RE_ENABLE_I18N
-      if (mb_cur_max > 1)
+      if (dfa->mb_cur_max > 1)
 	build_wcs_upper_buffer (pstr);
      else
 #endif /* RE_ENABLE_I18N  */
@ -110,7 +109,7 @@ re_string_construct (pstr, str, len, trans, icase, mb_cur_max, is_utf8)
  else
    {
 #ifdef RE_ENABLE_I18N
-      if (mb_cur_max > 1)
+      if (dfa->mb_cur_max > 1)
 	build_wcs_buffer (pstr);
      else
 #endif /* RE_ENABLE_I18N  */
@ -167,20 +166,22 @@ re_string_realloc_buffers (pstr, new_buf_len)
 static void
-re_string_construct_common (str, len, pstr, trans, icase, mb_cur_max, is_utf8)
+re_string_construct_common (str, len, pstr, trans, icase, dfa)
     const char *str;
     int len;
     re_string_t *pstr;
     RE_TRANSLATE_TYPE trans;
-     int icase, mb_cur_max, is_utf8;
+     int icase;
     const re_dfa_t *dfa;
 {
  memset (pstr, '\0', sizeof (re_string_t));
  pstr->raw_mbs = (const unsigned char *) str;
  pstr->len = len;
  pstr->trans = trans;
  pstr->icase = icase ? 1 : 0;
-  pstr->mb_cur_max = mb_cur_max;
+  pstr->mb_cur_max = dfa->mb_cur_max;
-  pstr->is_utf8 = is_utf8;
+  pstr->is_utf8 = dfa->is_utf8;
  pstr->map_notascii = dfa->map_notascii;
 }
 #ifdef RE_ENABLE_I18N
@ -253,46 +254,109 @@ build_wcs_upper_buffer (pstr)
  /* Build the buffers from pstr->valid_len to either pstr->len or
     pstr->bufs_len.  */
  end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
 #ifdef _LIBC
  /* The following optimization assumes that the wchar_t encoding is
     always ISO 10646.  */
  if (! pstr->map_notascii && pstr->trans == NULL)
    for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
      if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 	  && mbsinit (&pstr->cur_state))
 	{
 	  /* In case of a singlebyte character.  */
 	  pstr->mbs[byte_idx]
 	    = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 	  /* The next step uses the assumption that wchar_t is encoded
 	     with ISO 10646: all ASCII values can be converted like this.  */
 	  pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 	  ++byte_idx;
 	}
      else
 	{
 	  wchar_t wc;
 	  remain_len = end_idx - byte_idx;
 	  prev_st = pstr->cur_state;
-      mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
+	  mbclen = mbrtowc (&wc,
 			    ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 			     + byte_idx), remain_len, &pstr->cur_state);
-      if (BE (mbclen == (size_t) -2, 0))
+	  if (BE (mbclen > 1, 1))
 	    {
 	      if (iswlower (wc))
 		wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc),
 			 &prev_st);
 	      else
 		memcpy (pstr->mbs + byte_idx,
 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 	      pstr->wcs[byte_idx++] = towupper (wc);
 	      /* Write paddings.  */
 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 		pstr->wcs[byte_idx++] = WEOF;
 	    }
 	  else if (mbclen == (size_t) -1 || mbclen == 0)
 	    {
 	      /* In case of a singlebyte character.  */
 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 	      /* Apply the translation if we need.  */
 	      if (BE (pstr->trans != NULL, 0) && mbclen == 1)
 		{
 		  ch = pstr->trans[ch];
 		  pstr->mbs_case[byte_idx] = ch;
 		}
 	      pstr->wcs[byte_idx] = towupper (wc);
 	      pstr->mbs[byte_idx++] = toupper (ch);
 	      if (BE (mbclen == (size_t) -1, 0))
 		pstr->cur_state = prev_st;
 	    }
 	  else
 	    {
 	      /* The buffer doesn't have enough space, finish to build.  */
 	      pstr->cur_state = prev_st;
 	      break;
 	    }
-      else if (mbclen == 1 || mbclen == (size_t) -1 || mbclen == 0)
+	}
  else
 #endif
    for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
      {
 	wchar_t wc;
 	remain_len = end_idx - byte_idx;
 	prev_st = pstr->cur_state;
 	mbclen = mbrtowc (&wc,
 			  ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 			   + byte_idx), remain_len, &pstr->cur_state);
 	if (mbclen == 1 || mbclen == (size_t) -1 || mbclen == 0)
 	  {
 	    /* In case of a singlebyte character.  */
 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 	    /* Apply the translation if we need.  */
-	  if (pstr->trans != NULL && mbclen == 1)
+	    if (BE (pstr->trans != NULL, 0) && mbclen == 1)
 	      {
 		ch = pstr->trans[ch];
 		pstr->mbs_case[byte_idx] = ch;
 	      }
-	  pstr->wcs[byte_idx] = iswlower (wc) ? towupper (wc) : wc;
+	    pstr->wcs[byte_idx] = towupper (wc);
-	  pstr->mbs[byte_idx++] = islower (ch) ? toupper (ch) : ch;
+	    pstr->mbs[byte_idx++] = toupper (ch);
 	    if (BE (mbclen == (size_t) -1, 0))
 	      pstr->cur_state = prev_st;
 	  }
-      else /* mbclen > 1 */
+	else if (BE (mbclen != (size_t) -2, 1))
 	  {
 	    if (iswlower (wc))
 	      wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), &prev_st);
 	    else
 	      memcpy (pstr->mbs + byte_idx,
 		      pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
-	  pstr->wcs[byte_idx++] = iswlower (wc) ? towupper (wc) : wc;
+	    pstr->wcs[byte_idx++] = towupper (wc);
 	    /* Write paddings.  */
 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 	      pstr->wcs[byte_idx++] = WEOF;
 	  }
 	else
 	  {
 	    /* The buffer doesn't have enough space, finish to build.  */
 	    pstr->cur_state = prev_st;
 	    break;
 	  }
      }
  pstr->valid_len = byte_idx;
 }
--- a/posix/regex_internal.h
+++ b/posix/regex_internal.h
@ -335,6 +335,7 @@ struct re_string_t
  /* 1 if REG_ICASE.  */
  unsigned int icase : 1;
  unsigned int is_utf8 : 1;
  unsigned int map_notascii : 1;
  int mb_cur_max;
 };
 typedef struct re_string_t re_string_t;
@ -345,15 +346,16 @@ typedef struct re_string_t re_string_t;
 #define MBS_CASE_ALLOCATED(pstr) (pstr->trans != NULL)
 struct re_dfa_t;
 typedef struct re_dfa_t re_dfa_t;
 #ifndef RE_NO_INTERNAL_PROTOTYPES
 static reg_errcode_t re_string_allocate (re_string_t *pstr, const char *str,
 					 int len, int init_len,
 					 RE_TRANSLATE_TYPE trans, int icase,
-					 int mb_cur_max, int is_utf8);
+					 const re_dfa_t *dfa);
 static reg_errcode_t re_string_construct (re_string_t *pstr, const char *str,
 					  int len, RE_TRANSLATE_TYPE trans,
-					  int icase, int mb_cur_max,
+					  int icase, const re_dfa_t *dfa);
 					  int is_utf8);
 static reg_errcode_t re_string_reconstruct (re_string_t *pstr, int idx,
 					    int eflags, int newline);
 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
@ -610,9 +612,9 @@ struct re_dfa_t
     collating element.  */
  unsigned int has_mb_node : 1;
  unsigned int is_utf8 : 1;
  unsigned int map_notascii : 1;
  int mb_cur_max;
 };
 typedef struct re_dfa_t re_dfa_t;
 #ifndef RE_NO_INTERNAL_PROTOTYPES
 static reg_errcode_t re_node_set_alloc (re_node_set *set, int size);
--- a/posix/regexec.c
+++ b/posix/regexec.c
@ -605,8 +605,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
  fl_longest_match = (nmatch != 0 || dfa->nbackref);
  err = re_string_allocate (&input, string, length, dfa->nodes_len + 1,
-			    preg->translate, preg->syntax & RE_ICASE,
+			    preg->translate, preg->syntax & RE_ICASE, dfa);
 			    dfa->mb_cur_max, dfa->is_utf8);
  if (BE (err != REG_NOERROR, 0))
    goto free_return;
  input.stop = stop;