Speedup stringsearch for two byte strings
Uses the lower byte with memchr, which is significantly faster than a naive compare.

Performance difference with bench (http://hastebin.com/xuxexataso.js). Both the old and the new implementation report the same match position; only the timings differ:

| Search strategy     | Pattern          | Found at | Old    | New    |
|---------------------|------------------|----------|--------|--------|
| single character    | Κ                | 922      | 3324   | 616    |
| single character    | ㎡               | 13217    | 42366  | 4931   |
| single character    | က                | 4096     | 13369  | 9836   |
| single character    | (U+FF00)         | 65280    | 207472 | 36149  |
| single character    | ᆬ                | 65445    | 209344 | 36666  |
| single character    | (U+2005)         | 8197     | 26731  | 11757  |
| single character    | 倂               | 20482    | 66071  | 17193  |
| linear search       | ΚΛ               | 922      | 4112   | 504    |
| linear search       | ㎡㎢             | 13217    | 55105  | 5119   |
| linear search       | ᆬᆭ               | 65445    | 268016 | 35496  |
| linear + bmh search | ΚΛΜΝΞΟΠΡ         | 922      | 2897   | 522    |
| linear + bmh search | ᆬᆭᄃᄄᄅᆰᆱᆲ         | 65445    | 167687 | 158465 |

Review URL: https://codereview.chromium.org/1303033012

Cr-Commit-Position: refs/heads/master@{#30587}
This commit is contained in:
parent
082730a440
commit
fced280f37
1
AUTHORS
1
AUTHORS
@ -67,6 +67,7 @@ Johan Bergström <johan@bergstroem.nu>
|
||||
Jonathan Liu <net147@gmail.com>
|
||||
JunHo Seo <sejunho@gmail.com>
|
||||
Kang-Hao (Kenny) Lu <kennyluck@csail.mit.edu>
|
||||
Karl Skomski <karl@skomski.com>
|
||||
Luis Reis <luis.m.reis@gmail.com>
|
||||
Luke Zarko <lukezarko@gmail.com>
|
||||
Maciej Małecki <me@mmalecki.com>
|
||||
|
@ -190,6 +190,38 @@ class StringSearch : private StringSearchBase {
|
||||
};
|
||||
|
||||
|
||||
template <typename PatternChar, typename SubjectChar>
|
||||
int FindFirstCharacter(Vector<const PatternChar> pattern,
|
||||
Vector<const SubjectChar> subject, int index) {
|
||||
PatternChar pattern_first_char = pattern[0];
|
||||
|
||||
if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
|
||||
const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(memchr(
|
||||
subject.start() + index, pattern_first_char, subject.length() - index));
|
||||
if (char_pos == NULL) return -1;
|
||||
return static_cast<int>(char_pos - subject.start());
|
||||
} else {
|
||||
const uint8_t search_low_byte =
|
||||
static_cast<uint8_t>(pattern_first_char & 0xFF);
|
||||
const SubjectChar search_char =
|
||||
static_cast<SubjectChar>(pattern_first_char);
|
||||
int pos = index;
|
||||
do {
|
||||
const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
|
||||
memchr(subject.start() + pos, search_low_byte,
|
||||
(subject.length() - pos) * sizeof(SubjectChar)));
|
||||
if (char_pos == NULL) return -1;
|
||||
pos = static_cast<int>(char_pos - subject.start());
|
||||
if (IsAligned(reinterpret_cast<uintptr_t>(char_pos),
|
||||
sizeof(SubjectChar))) {
|
||||
if (subject[pos] == search_char) return pos;
|
||||
}
|
||||
} while (++pos < subject.length());
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// Single Character Pattern Search Strategy
|
||||
//---------------------------------------------------------------------
|
||||
@ -201,26 +233,15 @@ int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
|
||||
int index) {
|
||||
DCHECK_EQ(1, search->pattern_.length());
|
||||
PatternChar pattern_first_char = search->pattern_[0];
|
||||
int i = index;
|
||||
if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
|
||||
const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
|
||||
memchr(subject.start() + i,
|
||||
pattern_first_char,
|
||||
subject.length() - i));
|
||||
if (pos == NULL) return -1;
|
||||
return static_cast<int>(pos - subject.start());
|
||||
return FindFirstCharacter(search->pattern_, subject, index);
|
||||
} else {
|
||||
if (sizeof(PatternChar) > sizeof(SubjectChar)) {
|
||||
if (exceedsOneByte(pattern_first_char)) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
|
||||
int n = subject.length();
|
||||
while (i < n) {
|
||||
if (subject[i++] == search_char) return i - 1;
|
||||
}
|
||||
return -1;
|
||||
return FindFirstCharacter(search->pattern_, subject, index);
|
||||
}
|
||||
}
|
||||
|
||||
@ -254,20 +275,12 @@ int StringSearch<PatternChar, SubjectChar>::LinearSearch(
|
||||
Vector<const PatternChar> pattern = search->pattern_;
|
||||
DCHECK(pattern.length() > 1);
|
||||
int pattern_length = pattern.length();
|
||||
PatternChar pattern_first_char = pattern[0];
|
||||
int i = index;
|
||||
int n = subject.length() - pattern_length;
|
||||
while (i <= n) {
|
||||
if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
|
||||
const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
|
||||
memchr(subject.start() + i,
|
||||
pattern_first_char,
|
||||
n - i + 1));
|
||||
if (pos == NULL) return -1;
|
||||
i = static_cast<int>(pos - subject.start()) + 1;
|
||||
} else {
|
||||
if (subject[i++] != pattern_first_char) continue;
|
||||
}
|
||||
i = FindFirstCharacter(pattern, subject, i);
|
||||
if (i == -1) return -1;
|
||||
i++;
|
||||
// Loop extracted to separate function to allow using return to do
|
||||
// a deeper break.
|
||||
if (CharCompare(pattern.start() + 1,
|
||||
@ -505,22 +518,11 @@ int StringSearch<PatternChar, SubjectChar>::InitialSearch(
|
||||
|
||||
// We know our pattern is at least 2 characters, we cache the first so
|
||||
// the common case of the first character not matching is faster.
|
||||
PatternChar pattern_first_char = pattern[0];
|
||||
for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
|
||||
badness++;
|
||||
if (badness <= 0) {
|
||||
if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
|
||||
const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
|
||||
memchr(subject.start() + i,
|
||||
pattern_first_char,
|
||||
n - i + 1));
|
||||
if (pos == NULL) {
|
||||
return -1;
|
||||
}
|
||||
i = static_cast<int>(pos - subject.start());
|
||||
} else {
|
||||
if (subject[i] != pattern_first_char) continue;
|
||||
}
|
||||
i = FindFirstCharacter(pattern, subject, i);
|
||||
if (i == -1) return -1;
|
||||
int j = 1;
|
||||
do {
|
||||
if (pattern[j] != subject[i + j]) {
|
||||
|
@ -77,6 +77,15 @@ assertEquals(-1, twoByteString.indexOf("\u0391\u03a3\u0395"),
|
||||
//single char pattern
|
||||
assertEquals(4, twoByteString.indexOf("\u0395"));
|
||||
|
||||
// test string with alignment traps
|
||||
var alignmentString = "\u1122\u2211\u2222\uFF00\u00FF\u00FF";
|
||||
assertEquals(2, alignmentString.indexOf("\u2222"));
|
||||
assertEquals(4, alignmentString.indexOf("\u00FF\u00FF"));
|
||||
|
||||
var longAlignmentString = "\uFF00" + "\u00FF".repeat(10);
|
||||
assertEquals(1,
|
||||
longAlignmentString.indexOf("\u00FF".repeat(10)));
|
||||
|
||||
// Test complex string indexOf algorithms. Only trigger for long strings.
|
||||
|
||||
// Long string that isn't a simple repeat of a shorter string.
|
||||
|
Loading…
Reference in New Issue
Block a user