Improve strcoll with strdiff.

This patch improves the strcoll hot path by first finding the first byte
at which the two strings differ. In the likely case, comparing from that
position onward is enough to determine the comparison result.
This commit is contained in:
Leonhard Holz 2015-05-12 11:37:52 +02:00 committed by Ondřej Bílka
parent 34cb304e5a
commit f13c2a8dff
8 changed files with 71 additions and 2 deletions

View File

@ -1,3 +1,14 @@
2015-05-12 Leonhard Holz <leonhard.holz@web.de>
* locale/categories.def: Define _NL_COLLATE_ENCODING_TYPE.
* locale/langinfo.h: Add _NL_COLLATE_ENCODING_TYPE to attribute list.
* locale/localeinfo.h: Add enum collation_encoding_type.
* locale/C-collate.c: Set _NL_COLLATE_ENCODING_TYPE to 8bit.
* programs/ld-collate.c (collate_output): Add encoding type info.
* string/strcoll_l.c (STRDIFF): New function.
* (STRCOLL): Use STRDIFF to skip over equal prefix.
* wcsmbs/wcscoll_l.c: Define STRDIFF.
2015-05-11 Joseph Myers <joseph@codesourcery.com> 2015-05-11 Joseph Myers <joseph@codesourcery.com>
[BZ #18397] [BZ #18397]

View File

@ -144,6 +144,8 @@ const struct __locale_data _nl_C_LC_COLLATE attribute_hidden =
/* _NL_COLLATE_COLLSEQWC */ /* _NL_COLLATE_COLLSEQWC */
{ .string = (const char *) collseqwc }, { .string = (const char *) collseqwc },
/* _NL_COLLATE_CODESET */ /* _NL_COLLATE_CODESET */
{ .string = _nl_C_codeset } { .string = _nl_C_codeset },
/* _NL_COLLATE_ENCODING_TYPE */
{ .word = __cet_8bit }
} }
}; };

View File

@ -58,6 +58,7 @@ DEFINE_CATEGORY
DEFINE_ELEMENT (_NL_COLLATE_COLLSEQMB, "collate-collseqmb", std, wstring) DEFINE_ELEMENT (_NL_COLLATE_COLLSEQMB, "collate-collseqmb", std, wstring)
DEFINE_ELEMENT (_NL_COLLATE_COLLSEQWC, "collate-collseqwc", std, wstring) DEFINE_ELEMENT (_NL_COLLATE_COLLSEQWC, "collate-collseqwc", std, wstring)
DEFINE_ELEMENT (_NL_COLLATE_CODESET, "collate-codeset", std, string) DEFINE_ELEMENT (_NL_COLLATE_CODESET, "collate-codeset", std, string)
DEFINE_ELEMENT (_NL_COLLATE_ENCODING_TYPE, "collate-encoding-type", std, word)
), NO_POSTLOAD) ), NO_POSTLOAD)

View File

@ -255,6 +255,7 @@ enum
_NL_COLLATE_COLLSEQMB, _NL_COLLATE_COLLSEQMB,
_NL_COLLATE_COLLSEQWC, _NL_COLLATE_COLLSEQWC,
_NL_COLLATE_CODESET, _NL_COLLATE_CODESET,
_NL_COLLATE_ENCODING_TYPE,
_NL_NUM_LC_COLLATE, _NL_NUM_LC_COLLATE,
/* LC_CTYPE category: character classification. /* LC_CTYPE category: character classification.

View File

@ -110,6 +110,14 @@ enum coll_sort_rule
sort_mask sort_mask
}; };
/* Collation encoding type, stored in the _NL_COLLATE_ENCODING_TYPE word
   of the LC_COLLATE data.  localedef writes __cet_utf8 when the charmap's
   code set name is "UTF-8", __cet_8bit when the charmap's mb_cur_max is 1,
   and __cet_other in every remaining case.  */
enum collation_encoding_type
{
  __cet_other = 0,	/* Anything that is neither 8-bit nor UTF-8.  */
  __cet_8bit = 1,	/* Single-byte code set.  */
  __cet_utf8 = 2	/* Code set named "UTF-8".  */
};
/* We can map the types of the entries into a few categories. */ /* We can map the types of the entries into a few categories. */
enum value_type enum value_type
{ {

View File

@ -32,6 +32,7 @@
#include "linereader.h" #include "linereader.h"
#include "locfile.h" #include "locfile.h"
#include "elem-hash.h" #include "elem-hash.h"
#include "../localeinfo.h"
/* Uncomment the following line in the production version. */ /* Uncomment the following line in the production version. */
/* #define NDEBUG 1 */ /* #define NDEBUG 1 */
@ -2130,6 +2131,8 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
/* The words have to be handled specially. */ /* The words have to be handled specially. */
if (idx == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB)) if (idx == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
add_locale_uint32 (&file, 0); add_locale_uint32 (&file, 0);
else if (idx == _NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE))
add_locale_uint32 (&file, __cet_other);
else else
add_locale_empty (&file); add_locale_empty (&file);
} }
@ -2493,6 +2496,12 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
add_locale_raw_data (&file, collate->mbseqorder, 256); add_locale_raw_data (&file, collate->mbseqorder, 256);
add_locale_collseq_table (&file, &collate->wcseqorder); add_locale_collseq_table (&file, &collate->wcseqorder);
add_locale_string (&file, charmap->code_set_name); add_locale_string (&file, charmap->code_set_name);
if (strcmp (charmap->code_set_name, "UTF-8") == 0)
add_locale_uint32 (&file, __cet_utf8);
else if (charmap->mb_cur_max == 1)
add_locale_uint32 (&file, __cet_8bit);
else
add_locale_uint32 (&file, __cet_other);
write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file); write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file);
obstack_free (&weightpool, NULL); obstack_free (&weightpool, NULL);

View File

@ -29,6 +29,7 @@
# define STRING_TYPE char # define STRING_TYPE char
# define USTRING_TYPE unsigned char # define USTRING_TYPE unsigned char
# define STRCOLL __strcoll_l # define STRCOLL __strcoll_l
# define STRDIFF __strdiff
# define STRCMP strcmp # define STRCMP strcmp
# define WEIGHT_H "../locale/weight.h" # define WEIGHT_H "../locale/weight.h"
# define SUFFIX MB # define SUFFIX MB
@ -41,6 +42,20 @@
#include "../locale/localeinfo.h" #include "../locale/localeinfo.h"
#include WEIGHT_H #include WEIGHT_H
/* Bit 7 of a byte.  STRCOLL tests it on the byte at the first difference
   to decide whether it has to step back inside a UTF-8 string.  */
#define MASK_UTF8_7BIT 0x80
/* The two most significant bits of a byte.  STRCOLL steps back until it
   reaches a byte with both bits set or the start of the string.  */
#define MASK_UTF8_START 0xc0
/* Return the length of the common prefix of S and T, i.e. the index of
   the first element at which they differ.  Scanning stops at the
   terminator of S, so for equal strings the result is the length of S.  */
size_t
STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
{
  size_t prefix_len = 0;

  /* T's terminator needs no separate test: if T ends first, its NUL
     differs from the non-NUL element of S and ends the loop.  */
  while (s[prefix_len] != '\0' && s[prefix_len] == t[prefix_len])
    ++prefix_len;

  return prefix_len;
}
/* Track status while looking for sequences in a string. */ /* Track status while looking for sequences in a string. */
typedef struct typedef struct
{ {
@ -255,9 +270,29 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
const USTRING_TYPE *extra; const USTRING_TYPE *extra;
const int32_t *indirect; const int32_t *indirect;
/* In case there is no locale specific sort order (C / POSIX). */
if (nrules == 0) if (nrules == 0)
return STRCMP (s1, s2); return STRCMP (s1, s2);
/* Fast forward to the position of the first difference. Needs to be
encoding aware as the byte-by-byte comparison can stop in the middle
of a char sequence for multibyte encodings like UTF-8. */
uint_fast32_t encoding =
current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
if (encoding != __cet_other)
{
size_t diff = STRDIFF (s1, s2);
if (diff > 0)
{
if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0)
do
diff--;
while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START);
s1 += diff;
s2 += diff;
}
}
/* Catch empty strings. */ /* Catch empty strings. */
if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0')) if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
return (*s1 != '\0') - (*s2 != '\0'); return (*s1 != '\0') - (*s2 != '\0');
@ -321,7 +356,8 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
byte-level comparison to ensure that we don't waste time byte-level comparison to ensure that we don't waste time
going through multiple passes for totally equal strings going through multiple passes for totally equal strings
before proceeding to subsequent passes. */ before proceeding to subsequent passes. */
if (pass == 0 && STRCMP (s1, s2) == 0) if (pass == 0 && encoding == __cet_other &&
STRCMP (s1, s2) == 0)
return result; return result;
else else
break; break;

View File

@ -23,6 +23,7 @@
#define STRING_TYPE wchar_t #define STRING_TYPE wchar_t
#define USTRING_TYPE wint_t #define USTRING_TYPE wint_t
#define STRCOLL __wcscoll_l #define STRCOLL __wcscoll_l
#define STRDIFF __wcsdiff
#define STRCMP wcscmp #define STRCMP wcscmp
#define WEIGHT_H "../locale/weightwc.h" #define WEIGHT_H "../locale/weightwc.h"
#define SUFFIX WC #define SUFFIX WC