Faster ascii string case conversion.

Review URL: http://codereview.chromium.org/4189001

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@5713 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
vitalyr@chromium.org 2010-10-26 18:14:45 +00:00
parent 3d8e94863d
commit e197c930e1
3 changed files with 158 additions and 25 deletions

View File

@ -201,9 +201,11 @@ const int kIntptrSize = sizeof(intptr_t); // NOLINT
#if V8_HOST_ARCH_64_BIT #if V8_HOST_ARCH_64_BIT
const int kPointerSizeLog2 = 3; const int kPointerSizeLog2 = 3;
const intptr_t kIntptrSignBit = V8_INT64_C(0x8000000000000000); const intptr_t kIntptrSignBit = V8_INT64_C(0x8000000000000000);
const uintptr_t kUintptrAllBitsSet = V8_UINT64_C(0xFFFFFFFFFFFFFFFF);
#else #else
const int kPointerSizeLog2 = 2; const int kPointerSizeLog2 = 2;
const intptr_t kIntptrSignBit = 0x80000000; const intptr_t kIntptrSignBit = 0x80000000;
const uintptr_t kUintptrAllBitsSet = 0xFFFFFFFFu;
#endif #endif
// Mask for the sign bit in a smi. // Mask for the sign bit in a smi.

View File

@ -4718,39 +4718,127 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(
namespace { namespace {
// A word whose every byte is 0x01 (the all-ones word divided by 0xFF).
// Multiplying it by a byte-sized value replicates that value into every
// byte of the word.
static const uintptr_t kOneInEveryByte = kUintptrAllBitsSet / 0xFF;
// Builds a per-byte range mask for the word |w|: the high bit of a result
// byte is set iff the byte of |w| at the same position lies strictly
// between |m| and |n| (m < byte < n). Every other bit of the result is 0.
// Preconditions (checked by the assertions below): every byte of |w| is at
// most 0x7F, and the boundaries satisfy 0 < m < n < 0x7F.
// The function only pays off when it is inlined and the boundaries are
// statically known.
static inline uintptr_t AsciiRangeMask(uintptr_t w, char m, char n) {
  // Input bytes must be ascii, i.e. have their high bit clear.
  ASSERT((w & (kOneInEveryByte * 0x7F)) == w);
  // Strict inequalities only: the edge cases admit a simpler computation.
  ASSERT(0 < m && m < n && n < 0x7F);
  const uintptr_t high_bit_in_every_byte = kOneInEveryByte * 0x80;
  // Per byte, (0x7F + n) - b has its high bit set exactly when b < n; under
  // the preconditions no byte borrows from its neighbour.
  const uintptr_t below_n = kOneInEveryByte * (0x7F + n) - w;
  // Per byte, b + (0x7F - m) has its high bit set exactly when b > m; under
  // the preconditions no byte carries into its neighbour.
  const uintptr_t above_m = w + kOneInEveryByte * (0x7F - m);
  return below_n & above_m & high_bit_in_every_byte;
}
// Direction of an ascii case conversion.
enum AsciiCaseConversion {
  ASCII_TO_LOWER = 0,  // 'A'..'Z' are mapped to 'a'..'z'.
  ASCII_TO_UPPER = 1   // 'a'..'z' are mapped to 'A'..'Z'.
};
// Converts the case of ascii strings, a machine word at a time where the
// host supports unaligned reads and a byte at a time otherwise.
// |dir| selects the direction of the conversion.
template <AsciiCaseConversion dir>
struct FastAsciiConverter {
  // Copies |length| bytes from |src| to |dst|, switching the case of every
  // letter that |dir| applies to. Returns true iff at least one byte was
  // changed. The word-at-a-time path requires every input byte to be ascii
  // (see AsciiRangeMask).
  static bool Convert(char* dst, char* src, int length) {
#ifdef DEBUG
    char* saved_dst = dst;
    char* saved_src = src;
#endif
    // We rely on the distance between upper and lower case letters
    // being a known power of 2.
    ASSERT('a' - 'A' == (1 << 5));
    // Boundaries for the range of input characters that require conversion.
    const char lo = (dir == ASCII_TO_LOWER) ? 'A' - 1 : 'a' - 1;
    const char hi = (dir == ASCII_TO_LOWER) ? 'Z' + 1 : 'z' + 1;
    bool changed = false;
    char* const limit = src + length;
#ifdef V8_HOST_CAN_READ_UNALIGNED
    // Compare the number of remaining bytes against the word size instead
    // of computing "limit - sizeof(uintptr_t)": for inputs shorter than a
    // word the latter would point before the start of the buffer, which is
    // undefined pointer arithmetic.
    const int kWordSize = static_cast<int>(sizeof(uintptr_t));
    // Process the prefix of the input that requires no conversion one
    // (machine) word at a time.
    while (limit - src >= kWordSize) {
      uintptr_t w = *reinterpret_cast<uintptr_t*>(src);
      if (AsciiRangeMask(w, lo, hi) != 0) {
        changed = true;
        break;
      }
      *reinterpret_cast<uintptr_t*>(dst) = w;
      src += kWordSize;
      dst += kWordSize;
    }
    // Process the remainder of the input performing conversion when
    // required one word at a time. This loop only runs after the loop
    // above has found a word that needs conversion, so |changed| is
    // already true here.
    while (limit - src >= kWordSize) {
      uintptr_t w = *reinterpret_cast<uintptr_t*>(src);
      uintptr_t m = AsciiRangeMask(w, lo, hi);
      // The mask has high (7th) bit set in every byte that needs
      // conversion and we know that the distance between cases is
      // 1 << 5.
      *reinterpret_cast<uintptr_t*>(dst) = w ^ (m >> 2);
      src += kWordSize;
      dst += kWordSize;
    }
#endif
    // Process the last few bytes of the input (or the whole input if
    // unaligned access is not supported).
    while (src < limit) {
      char c = *src;
      if (lo < c && c < hi) {
        c ^= (1 << 5);
        changed = true;
      }
      *dst = c;
      ++src;
      ++dst;
    }
#ifdef DEBUG
    CheckConvert(saved_dst, saved_src, length, changed);
#endif
    return changed;
  }

#ifdef DEBUG
  // Debug-only verification of a finished conversion: every byte of |dst|
  // that differs from |src| must be the result of converting a letter in
  // the direction |dir|, and |changed| must be true iff some byte differs.
  static void CheckConvert(char* dst, char* src, int length, bool changed) {
    bool expected_changed = false;
    for (int i = 0; i < length; i++) {
      if (dst[i] == src[i]) continue;
      expected_changed = true;
      if (dir == ASCII_TO_LOWER) {
        ASSERT('A' <= src[i] && src[i] <= 'Z');
        ASSERT(dst[i] == src[i] + ('a' - 'A'));
      } else {
        ASSERT(dir == ASCII_TO_UPPER);
        ASSERT('a' <= src[i] && src[i] <= 'z');
        ASSERT(dst[i] == src[i] - ('a' - 'A'));
      }
    }
    ASSERT(expected_changed == changed);
  }
#endif
};
struct ToLowerTraits { struct ToLowerTraits {
typedef unibrow::ToLowercase UnibrowConverter; typedef unibrow::ToLowercase UnibrowConverter;
static bool ConvertAscii(char* dst, char* src, int length) { typedef FastAsciiConverter<ASCII_TO_LOWER> AsciiConverter;
bool changed = false;
for (int i = 0; i < length; ++i) {
char c = src[i];
if ('A' <= c && c <= 'Z') {
c += ('a' - 'A');
changed = true;
}
dst[i] = c;
}
return changed;
}
}; };
struct ToUpperTraits { struct ToUpperTraits {
typedef unibrow::ToUppercase UnibrowConverter; typedef unibrow::ToUppercase UnibrowConverter;
static bool ConvertAscii(char* dst, char* src, int length) { typedef FastAsciiConverter<ASCII_TO_UPPER> AsciiConverter;
bool changed = false;
for (int i = 0; i < length; ++i) {
char c = src[i];
if ('a' <= c && c <= 'z') {
c -= ('a' - 'A');
changed = true;
}
dst[i] = c;
}
return changed;
}
}; };
} // namespace } // namespace
@ -4780,7 +4868,7 @@ MUST_USE_RESULT static MaybeObject* ConvertCase(
if (!maybe_o->ToObject(&o)) return maybe_o; if (!maybe_o->ToObject(&o)) return maybe_o;
} }
SeqAsciiString* result = SeqAsciiString::cast(o); SeqAsciiString* result = SeqAsciiString::cast(o);
bool has_changed_character = ConvertTraits::ConvertAscii( bool has_changed_character = ConvertTraits::AsciiConverter::Convert(
result->GetChars(), SeqAsciiString::cast(s)->GetChars(), length); result->GetChars(), SeqAsciiString::cast(s)->GetChars(), length);
return has_changed_character ? result : s; return has_changed_character ? result : s;
} }

View File

@ -25,4 +25,47 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Flags: --random-seed=17
assertEquals("ΚΟΣΜΟΣ ΚΟΣΜΟΣ".toLowerCase(), "κοσμος κοσμος"); assertEquals("ΚΟΣΜΟΣ ΚΟΣΜΟΣ".toLowerCase(), "κοσμος κοσμος");
// Char codes that delimit the upper-case and lower-case ascii letters.
var A_CODE = "A".charCodeAt(0),
    Z_CODE = "Z".charCodeAt(0),
    a_CODE = "a".charCodeAt(0),
    z_CODE = "z".charCodeAt(0);
// Reference implementation of lower-casing: maps the char code of an
// upper-case ascii letter to the code of its lower-case counterpart and
// returns every other char code unchanged.
function charCodeToLower(charCode) {
  var isUpperCaseLetter = A_CODE <= charCode && charCode <= Z_CODE;
  return isUpperCaseLetter ? charCode + a_CODE - A_CODE : charCode;
}
// Reference implementation of upper-casing: maps the char code of a
// lower-case ascii letter to the code of its upper-case counterpart and
// returns every other char code unchanged.
function charCodeToUpper(charCode) {
  var isLowerCaseLetter = a_CODE <= charCode && charCode <= z_CODE;
  return isLowerCaseLetter ? charCode - (a_CODE - A_CODE) : charCode;
}
// Builds a random string of |length| characters with char codes in
// [0, 0x7f] and checks toLowerCase() and toUpperCase() against the
// reference conversions charCodeToLower() and charCodeToUpper().
function test(length) {
  var input = "";
  var expectedLower = "";
  var expectedUpper = "";
  for (var pos = 0; pos < length; ++pos) {
    var code = Math.round(0x7f * Math.random());
    input += String.fromCharCode(code);
    expectedLower += String.fromCharCode(charCodeToLower(code));
    expectedUpper += String.fromCharCode(charCodeToUpper(code));
  }
  assertEquals(expectedLower, input.toLowerCase());
  assertEquals(expectedUpper, input.toUpperCase());
}
// Run test() on string lengths i + j for every power of two i from 1 to
// 128 and every offset j from 0 to 7. Each length is tested three times,
// with a fresh random string each time.
for (var i = 1; i <= 128; i <<= 1) {
  for (var j = 0; j < 8; j++) {
    for (var k = 0; k < 3; k++) {
      test(i + j);
    }
  }
}