Improve strcoll with strdiff.

This patch improves the strcoll hot path by finding the first byte at which the two strings differ. In the likely case, that is enough to determine the comparison result.
2024-11-22 04:50:07 +00:00 · 2015-05-12 11:37:52 +02:00 · 2015-05-12 11:37:52 +02:00 · f13c2a8dff
commit f13c2a8dff
parent 34cb304e5a
8 changed files with 71 additions and 2 deletions
--- a/11
+++ b/11
@ -1,3 +1,14 @@
 2015-05-12  Leonhard Holz <leonhard.holz@web.de>
 	* locale/categories.def: Define _NL_COLLATE_ENCODING_TYPE.
 	* locale/langinfo.h: Add _NL_COLLATE_ENCODING_TYPE to attribute list.
 	* locale/localeinfo.h: Add enum collation_encoding_type.
 	* locale/C-collate.c: Set _NL_COLLATE_ENCODING_TYPE to 8bit.
 	* programs/ld-collate.c (collate_output): Add encoding type info.
 	* string/strcoll_l.c (STRDIFF): New function.
 	* (STRCOLL): Use STRDIFF to skip over equal prefix.
 	* wcsmbs/wcscoll_l.c: Define STRDIFF.
 2015-05-11  Joseph Myers  <joseph@codesourcery.com>
 	[BZ #18397]
--- a/locale/C-collate.c
+++ b/locale/C-collate.c
@ -144,6 +144,8 @@ const struct __locale_data _nl_C_LC_COLLATE attribute_hidden =
    /* _NL_COLLATE_COLLSEQWC */
    { .string = (const char *) collseqwc },
    /* _NL_COLLATE_CODESET */
-    { .string = _nl_C_codeset }
+    { .string = _nl_C_codeset },
    /* _NL_COLLATE_ENCODING_TYPE */
    { .word = __cet_8bit }
  }
 };
--- a/locale/categories.def
+++ b/locale/categories.def
@ -58,6 +58,7 @@ DEFINE_CATEGORY
  DEFINE_ELEMENT (_NL_COLLATE_COLLSEQMB,        "collate-collseqmb",        std, wstring)
  DEFINE_ELEMENT (_NL_COLLATE_COLLSEQWC,        "collate-collseqwc",        std, wstring)
  DEFINE_ELEMENT (_NL_COLLATE_CODESET,		"collate-codeset",	    std, string)
  DEFINE_ELEMENT (_NL_COLLATE_ENCODING_TYPE,   "collate-encoding-type",    std, word)
  ), NO_POSTLOAD)
--- a/locale/langinfo.h
+++ b/locale/langinfo.h
@ -255,6 +255,7 @@ enum
  _NL_COLLATE_COLLSEQMB,
  _NL_COLLATE_COLLSEQWC,
  _NL_COLLATE_CODESET,
  _NL_COLLATE_ENCODING_TYPE,
  _NL_NUM_LC_COLLATE,
  /* LC_CTYPE category: character classification.
--- a/locale/localeinfo.h
+++ b/locale/localeinfo.h
@ -110,6 +110,14 @@ enum coll_sort_rule
  sort_mask
 };
 /* Kind of character encoding a collation locale was compiled for.
    The value is stored as a word in the _NL_COLLATE_ENCODING_TYPE
    element of the LC_COLLATE data, so the numeric values must stay
    stable.  */
 enum collation_encoding_type
 {
  /* Any encoding that is neither single-byte nor UTF-8.  */
  __cet_other = 0,
  /* Single-byte encoding: every character occupies exactly one byte.  */
  __cet_8bit = 1,
  /* UTF-8 multibyte encoding.  */
  __cet_utf8 = 2
 };
 /* We can map the types of the entries into a few categories.  */
 enum value_type
 {
--- a/locale/programs/ld-collate.c
+++ b/locale/programs/ld-collate.c
@ -32,6 +32,7 @@
 #include "linereader.h"
 #include "locfile.h"
 #include "elem-hash.h"
 #include "../localeinfo.h"
 /* Uncomment the following line in the production version.  */
 /* #define NDEBUG 1 */
@ -2130,6 +2131,8 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
 	  /* The words have to be handled specially.  */
 	  if (idx == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
 	    add_locale_uint32 (&file, 0);
 	  else if (idx == _NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE))
 	    add_locale_uint32 (&file, __cet_other);
 	  else
 	    add_locale_empty (&file);
 	}
@ -2493,6 +2496,12 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
  add_locale_raw_data (&file, collate->mbseqorder, 256);
  add_locale_collseq_table (&file, &collate->wcseqorder);
  add_locale_string (&file, charmap->code_set_name);
  if (strcmp (charmap->code_set_name, "UTF-8") == 0)
    add_locale_uint32 (&file, __cet_utf8);
  else if (charmap->mb_cur_max == 1)
    add_locale_uint32 (&file, __cet_8bit);
  else
    add_locale_uint32 (&file, __cet_other);
  write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file);
  obstack_free (&weightpool, NULL);
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@ -29,6 +29,7 @@
 # define STRING_TYPE char
 # define USTRING_TYPE unsigned char
 # define STRCOLL __strcoll_l
 # define STRDIFF __strdiff
 # define STRCMP strcmp
 # define WEIGHT_H "../locale/weight.h"
 # define SUFFIX	MB
@ -41,6 +42,20 @@
 #include "../locale/localeinfo.h"
 #include WEIGHT_H
 /* Bit 7 is clear in 7-bit (ASCII) bytes and set in every byte that
    belongs to a UTF-8 multibyte sequence.  */
 #define MASK_UTF8_7BIT  (1 << 7)
 /* Both top bits are set only in the first byte of a UTF-8 multibyte
    sequence; continuation bytes have the form 10xxxxxx.  */
 #define MASK_UTF8_START (3 << 6)
 /* Return the length of the common prefix of S and T, i.e. the index of
    the first element at which the two strings differ.  Scanning stops
    at the terminator of S, so for equal strings the result is the
    length of S.  */
 size_t
 STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
 {
  size_t common = 0;

  /* If T ends first, its terminator compares unequal to the non-null
     element of S, so T is never read past its end.  */
  while (s[common] != '\0' && s[common] == t[common])
    common++;

  return common;
 }
 /* Track status while looking for sequences in a string.  */
 typedef struct
 {
@ -255,9 +270,29 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
  const USTRING_TYPE *extra;
  const int32_t *indirect;
  /* In case there is no locale specific sort order (C / POSIX).  */
  if (nrules == 0)
    return STRCMP (s1, s2);
  /* Fast forward to the position of the first difference.  Needs to be
     encoding aware as the byte-by-byte comparison can stop in the middle
     of a char sequence for multibyte encodings like UTF-8.  */
  uint_fast32_t encoding =
    current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
  if (encoding != __cet_other)
    {
      size_t diff = STRDIFF (s1, s2);
      if (diff > 0)
 	{
 	  if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0)
 	    do
 	      diff--;
 	    while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START);
 	  s1 += diff;
 	  s2 += diff;
 	}
    }
  /* Catch empty strings.  */
  if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
    return (*s1 != '\0') - (*s2 != '\0');
@ -321,7 +356,8 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
 		     byte-level comparison to ensure that we don't waste time
 		     going through multiple passes for totally equal strings
 		     before proceeding to subsequent passes.  */
-		  if (pass == 0 && STRCMP (s1, s2) == 0)
+		  if (pass == 0 && encoding == __cet_other &&
 		      STRCMP (s1, s2) == 0)
 		    return result;
 		  else
 		    break;
--- a/wcsmbs/wcscoll_l.c
+++ b/wcsmbs/wcscoll_l.c
@ -23,6 +23,7 @@
 #define STRING_TYPE wchar_t
 #define USTRING_TYPE wint_t
 #define STRCOLL __wcscoll_l
 #define STRDIFF __wcsdiff
 #define STRCMP wcscmp
 #define WEIGHT_H "../locale/weightwc.h"
 #define SUFFIX	WC