From e197c930e11270fdef153a94e66df95c6020d58e Mon Sep 17 00:00:00 2001
From: "vitalyr@chromium.org"
Date: Tue, 26 Oct 2010 18:14:45 +0000
Subject: [PATCH] Faster ascii string case conversion.

Review URL: http://codereview.chromium.org/4189001

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@5713 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---
 src/globals.h               |   2 +
 src/runtime.cc              | 138 +++++++++++++++++++++++++++++-------
 test/mjsunit/string-case.js |  43 +++++++++++
 3 files changed, 158 insertions(+), 25 deletions(-)

diff --git a/src/globals.h b/src/globals.h
index 6f9db3b3d2..c218f80dc1 100644
--- a/src/globals.h
+++ b/src/globals.h
@@ -201,9 +201,11 @@ const int kIntptrSize = sizeof(intptr_t); // NOLINT
 #if V8_HOST_ARCH_64_BIT
 const int kPointerSizeLog2 = 3;
 const intptr_t kIntptrSignBit = V8_INT64_C(0x8000000000000000);
+const uintptr_t kUintptrAllBitsSet = V8_UINT64_C(0xFFFFFFFFFFFFFFFF);
 #else
 const int kPointerSizeLog2 = 2;
 const intptr_t kIntptrSignBit = 0x80000000;
+const uintptr_t kUintptrAllBitsSet = 0xFFFFFFFFu;
 #endif
 
 // Mask for the sign bit in a smi.
diff --git a/src/runtime.cc b/src/runtime.cc
index a4696ea808..f701c03113 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -4718,39 +4718,127 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(
 
 namespace {
 
+static const uintptr_t kOneInEveryByte = kUintptrAllBitsSet / 0xFF;
+
+
+// Given a word and two range boundaries returns a word with high bit
+// set in every byte iff the corresponding input byte was strictly in
+// the range (m, n). All the other bits in the result are cleared.
+// This function is only useful when it can be inlined and the
+// boundaries are statically known.
+// Requires: all bytes in the input word must be ascii (at most 0x7F)
+// and the boundaries must satisfy 0 < m < n < 0x7F.
+static inline uintptr_t AsciiRangeMask(uintptr_t w, char m, char n) {
+  // Every byte in an ascii string is less than or equal to 0x7F.
+  ASSERT((w & (kOneInEveryByte * 0x7F)) == w);
+  // Use strict inequalities since in edge cases the function could be
+  // further simplified.
+  ASSERT(0 < m && m < n && n < 0x7F);
+  // Has high bit set in every w byte less than n.
+  uintptr_t tmp1 = kOneInEveryByte * (0x7F + n) - w;
+  // Has high bit set in every w byte greater than m.
+  uintptr_t tmp2 = w + kOneInEveryByte * (0x7F - m);
+  return (tmp1 & tmp2 & (kOneInEveryByte * 0x80));
+}
+
+
+enum AsciiCaseConversion {
+  ASCII_TO_LOWER,
+  ASCII_TO_UPPER
+};
+
+
+template <AsciiCaseConversion dir>
+struct FastAsciiConverter {
+  static bool Convert(char* dst, char* src, int length) {
+#ifdef DEBUG
+    char* saved_dst = dst;
+    char* saved_src = src;
+#endif
+    // We rely on the distance between upper and lower case letters
+    // being a known power of 2.
+    ASSERT('a' - 'A' == (1 << 5));
+    // Boundaries for the range of input characters that require conversion.
+    const char lo = (dir == ASCII_TO_LOWER) ? 'A' - 1 : 'a' - 1;
+    const char hi = (dir == ASCII_TO_LOWER) ? 'Z' + 1 : 'z' + 1;
+    bool changed = false;
+    char* const limit = src + length;
+#ifdef V8_HOST_CAN_READ_UNALIGNED
+    // Process the prefix of the input that requires no conversion one
+    // (machine) word at a time.
+    while (src <= limit - sizeof(uintptr_t)) {
+      uintptr_t w = *reinterpret_cast<uintptr_t*>(src);
+      if (AsciiRangeMask(w, lo, hi) != 0) {
+        changed = true;
+        break;
+      }
+      *reinterpret_cast<uintptr_t*>(dst) = w;
+      src += sizeof(uintptr_t);
+      dst += sizeof(uintptr_t);
+    }
+    // Process the remainder of the input performing conversion when
+    // required one word at a time.
+    while (src <= limit - sizeof(uintptr_t)) {
+      uintptr_t w = *reinterpret_cast<uintptr_t*>(src);
+      uintptr_t m = AsciiRangeMask(w, lo, hi);
+      // The mask has high (7th) bit set in every byte that needs
+      // conversion and we know that the distance between cases is
+      // 1 << 5.
+      *reinterpret_cast<uintptr_t*>(dst) = w ^ (m >> 2);
+      src += sizeof(uintptr_t);
+      dst += sizeof(uintptr_t);
+    }
+#endif
+    // Process the last few bytes of the input (or the whole input if
+    // unaligned access is not supported).
+    while (src < limit) {
+      char c = *src;
+      if (lo < c && c < hi) {
+        c ^= (1 << 5);
+        changed = true;
+      }
+      *dst = c;
+      ++src;
+      ++dst;
+    }
+#ifdef DEBUG
+    CheckConvert(saved_dst, saved_src, length, changed);
+#endif
+    return changed;
+  }
+
+#ifdef DEBUG
+  static void CheckConvert(char* dst, char* src, int length, bool changed) {
+    bool expected_changed = false;
+    for (int i = 0; i < length; i++) {
+      if (dst[i] == src[i]) continue;
+      expected_changed = true;
+      if (dir == ASCII_TO_LOWER) {
+        ASSERT('A' <= src[i] && src[i] <= 'Z');
+        ASSERT(dst[i] == src[i] + ('a' - 'A'));
+      } else {
+        ASSERT(dir == ASCII_TO_UPPER);
+        ASSERT('a' <= src[i] && src[i] <= 'z');
+        ASSERT(dst[i] == src[i] - ('a' - 'A'));
+      }
+    }
+    ASSERT(expected_changed == changed);
+  }
+#endif
+};
+
+
 struct ToLowerTraits {
   typedef unibrow::ToLowercase UnibrowConverter;
 
-  static bool ConvertAscii(char* dst, char* src, int length) {
-    bool changed = false;
-    for (int i = 0; i < length; ++i) {
-      char c = src[i];
-      if ('A' <= c && c <= 'Z') {
-        c += ('a' - 'A');
-        changed = true;
-      }
-      dst[i] = c;
-    }
-    return changed;
-  }
+  typedef FastAsciiConverter<ASCII_TO_LOWER> AsciiConverter;
 };
 
 
 struct ToUpperTraits {
   typedef unibrow::ToUppercase UnibrowConverter;
 
-  static bool ConvertAscii(char* dst, char* src, int length) {
-    bool changed = false;
-    for (int i = 0; i < length; ++i) {
-      char c = src[i];
-      if ('a' <= c && c <= 'z') {
-        c -= ('a' - 'A');
-        changed = true;
-      }
-      dst[i] = c;
-    }
-    return changed;
-  }
+  typedef FastAsciiConverter<ASCII_TO_UPPER> AsciiConverter;
 };
 
 }  // namespace
@@ -4780,7 +4868,7 @@ MUST_USE_RESULT static MaybeObject* ConvertCase(
       if (!maybe_o->ToObject(&o)) return maybe_o;
     }
     SeqAsciiString* result = SeqAsciiString::cast(o);
-    bool has_changed_character = ConvertTraits::ConvertAscii(
+    bool has_changed_character = ConvertTraits::AsciiConverter::Convert(
         result->GetChars(), SeqAsciiString::cast(s)->GetChars(), length);
     return has_changed_character ? result : s;
   }
diff --git a/test/mjsunit/string-case.js b/test/mjsunit/string-case.js
index 13dcd3ea81..283e703fc3 100644
--- a/test/mjsunit/string-case.js
+++ b/test/mjsunit/string-case.js
@@ -25,4 +25,47 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+// Flags: --random-seed=17
+
 assertEquals("ΚΟΣΜΟΣ ΚΟΣΜΟΣ".toLowerCase(), "κοσμος κοσμος");
+
+var A_CODE = "A".charCodeAt(0);
+var Z_CODE = "Z".charCodeAt(0);
+var a_CODE = "a".charCodeAt(0);
+var z_CODE = "z".charCodeAt(0);
+
+function charCodeToLower(charCode) {
+  if (A_CODE <= charCode && charCode <= Z_CODE) {
+    return charCode + a_CODE - A_CODE;
+  }
+  return charCode;
+}
+
+function charCodeToUpper(charCode) {
+  if (a_CODE <= charCode && charCode <= z_CODE) {
+    return charCode - (a_CODE - A_CODE);
+  }
+  return charCode;
+}
+
+function test(length) {
+  var str = "";
+  var strLower = "";
+  var strUpper = "";
+  for (var i = 0; i < length; i++) {
+    var c = Math.round(0x7f * Math.random());
+    str += String.fromCharCode(c);
+    strLower += String.fromCharCode(charCodeToLower(c));
+    strUpper += String.fromCharCode(charCodeToUpper(c));
+  }
+  assertEquals(strLower, str.toLowerCase());
+  assertEquals(strUpper, str.toUpperCase());
+}
+
+for (var i = 1; i <= 128; i <<= 1) {
+  for (var j = 0; j < 8; j++) {
+    for (var k = 0; k < 3; k++) {
+      test(i + j);
+    }
+  }
+}