Speed up string search for two-byte strings

Searches for the low byte of the first pattern character with memchr,
which is significantly faster than a naive character-by-character compare.

Performance difference with bench (http://hastebin.com/xuxexataso.js):

old                             new

single character                single character
Κ found at 922                  Κ found at 922
3324                            616
㎡ found at 13217               ㎡ found at 13217
42366                           4931
က found at 4096                 က found at 4096
13369                           9836
＀ found at 65280                ＀ found at 65280
207472                          36149
ᆬ found at 65445                ᆬ found at 65445
209344                          36666
  found at 8197                   found at 8197
26731                           11757
倂 found at 20482               倂 found at 20482
66071                           17193

linear search                   linear search
ΚΛ found at 922                 ΚΛ found at 922
4112                            504
㎡㎢ found at 13217             ㎡㎢ found at 13217
55105                           5119
ᆬᆭ found at 65445               ᆬᆭ found at 65445
268016                          35496

linear + bmh search             linear + bmh search
ΚΛΜΝΞΟΠΡ found at 922           ΚΛΜΝΞΟΠΡ found at 922
2897                            522
ᆬᆭᄃᄄᄅᆰᆱᆲ found at 65445         ᆬᆭᄃᄄᄅᆰᆱᆲ found at 65445
167687                          158465

Review URL: https://codereview.chromium.org/1303033012

Cr-Commit-Position: refs/heads/master@{#30587}
This commit is contained in:
karl 2015-09-04 05:37:39 -07:00 committed by Commit bot
parent 082730a440
commit fced280f37
3 changed files with 49 additions and 37 deletions

View File

@ -67,6 +67,7 @@ Johan Bergström <johan@bergstroem.nu>
Jonathan Liu <net147@gmail.com>
JunHo Seo <sejunho@gmail.com>
Kang-Hao (Kenny) Lu <kennyluck@csail.mit.edu>
Karl Skomski <karl@skomski.com>
Luis Reis <luis.m.reis@gmail.com>
Luke Zarko <lukezarko@gmail.com>
Maciej Małecki <me@mmalecki.com>

View File

@ -190,6 +190,38 @@ class StringSearch : private StringSearchBase {
};
// Finds the first occurrence of pattern[0] in |subject| at or after |index|
// at which the complete pattern still fits into the subject.
//
// Returns the character index of that occurrence, or -1 if there is none.
// Only the first pattern character is compared here; callers verify the
// remaining pattern characters themselves, starting at the returned index.
// Because they read subject[result + j] for every j < pattern.length(), a
// hit so close to the end that the pattern no longer fits must never be
// reported -- hence the search is bounded by |max_n| rather than by
// subject.length().
//
// The scan runs memchr over the raw bytes of the subject looking for the low
// byte of the wanted character and then confirms the full character at the
// position the byte belongs to.
template <typename PatternChar, typename SubjectChar>
int FindFirstCharacter(Vector<const PatternChar> pattern,
                       Vector<const SubjectChar> subject, int index) {
  const PatternChar pattern_first_char = pattern[0];
  // One past the last start position at which the whole pattern fits.
  const int max_n = subject.length() - pattern.length() + 1;
  // Nothing to search; also keeps the byte count passed to memchr below from
  // going negative (and wrapping around as size_t).
  if (index >= max_n) return -1;

  const uint8_t search_low_byte =
      static_cast<uint8_t>(pattern_first_char & 0xFF);
  const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
  // All address arithmetic is done on byte pointers so that no misaligned
  // SubjectChar pointer is ever formed.
  const uint8_t* const subject_bytes =
      reinterpret_cast<const uint8_t*>(subject.start());

  int pos = index;
  do {
    const void* hit =
        memchr(subject_bytes + static_cast<size_t>(pos) * sizeof(SubjectChar),
               search_low_byte,
               static_cast<size_t>(max_n - pos) * sizeof(SubjectChar));
    if (hit == NULL) return -1;
    // Round the byte offset down to the character containing the hit. The
    // low byte of a character sits at its first byte on little-endian hosts
    // but at its last byte on big-endian ones, so the hit must not be
    // rejected merely because its address is unaligned. If the byte turns
    // out to be the "wrong" half of the character, the full comparison below
    // fails and the scan resumes with the next character.
    const size_t byte_offset =
        static_cast<size_t>(static_cast<const uint8_t*>(hit) - subject_bytes);
    pos = static_cast<int>(byte_offset / sizeof(SubjectChar));
    if (subject[pos] == search_char) return pos;
  } while (++pos < max_n);

  return -1;
}
//---------------------------------------------------------------------
// Single Character Pattern Search Strategy
//---------------------------------------------------------------------
@ -201,26 +233,15 @@ int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
int index) {
DCHECK_EQ(1, search->pattern_.length());
PatternChar pattern_first_char = search->pattern_[0];
int i = index;
if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
memchr(subject.start() + i,
pattern_first_char,
subject.length() - i));
if (pos == NULL) return -1;
return static_cast<int>(pos - subject.start());
return FindFirstCharacter(search->pattern_, subject, index);
} else {
if (sizeof(PatternChar) > sizeof(SubjectChar)) {
if (exceedsOneByte(pattern_first_char)) {
return -1;
}
}
SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
int n = subject.length();
while (i < n) {
if (subject[i++] == search_char) return i - 1;
}
return -1;
return FindFirstCharacter(search->pattern_, subject, index);
}
}
@ -254,20 +275,12 @@ int StringSearch<PatternChar, SubjectChar>::LinearSearch(
Vector<const PatternChar> pattern = search->pattern_;
DCHECK(pattern.length() > 1);
int pattern_length = pattern.length();
PatternChar pattern_first_char = pattern[0];
int i = index;
int n = subject.length() - pattern_length;
while (i <= n) {
if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
memchr(subject.start() + i,
pattern_first_char,
n - i + 1));
if (pos == NULL) return -1;
i = static_cast<int>(pos - subject.start()) + 1;
} else {
if (subject[i++] != pattern_first_char) continue;
}
i = FindFirstCharacter(pattern, subject, i);
if (i == -1) return -1;
i++;
// Loop extracted to separate function to allow using return to do
// a deeper break.
if (CharCompare(pattern.start() + 1,
@ -505,22 +518,11 @@ int StringSearch<PatternChar, SubjectChar>::InitialSearch(
// We know our pattern is at least 2 characters, we cache the first so
// the common case of the first character not matching is faster.
PatternChar pattern_first_char = pattern[0];
for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
badness++;
if (badness <= 0) {
if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
memchr(subject.start() + i,
pattern_first_char,
n - i + 1));
if (pos == NULL) {
return -1;
}
i = static_cast<int>(pos - subject.start());
} else {
if (subject[i] != pattern_first_char) continue;
}
i = FindFirstCharacter(pattern, subject, i);
if (i == -1) return -1;
int j = 1;
do {
if (pattern[j] != subject[i + j]) {

View File

@ -77,6 +77,15 @@ assertEquals(-1, twoByteString.indexOf("\u0391\u03a3\u0395"),
//single char pattern
assertEquals(4, twoByteString.indexOf("\u0395"));
// test string with alignment traps
var alignmentString = "\u1122\u2211\u2222\uFF00\u00FF\u00FF";
assertEquals(2, alignmentString.indexOf("\u2222"));
assertEquals(4, alignmentString.indexOf("\u00FF\u00FF"));
var longAlignmentString = "\uFF00" + "\u00FF".repeat(10);
assertEquals(1,
longAlignmentString.indexOf("\u00FF".repeat(10)));
// Test complex string indexOf algorithms. Only trigger for long strings.
// Long string that isn't a simple repeat of a shorter string.