Faster ascii string case conversion.
Review URL: http://codereview.chromium.org/4189001 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@5713 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
parent
3d8e94863d
commit
e197c930e1
@ -201,9 +201,11 @@ const int kIntptrSize = sizeof(intptr_t); // NOLINT
|
|||||||
#if V8_HOST_ARCH_64_BIT
|
#if V8_HOST_ARCH_64_BIT
|
||||||
const int kPointerSizeLog2 = 3;
|
const int kPointerSizeLog2 = 3;
|
||||||
const intptr_t kIntptrSignBit = V8_INT64_C(0x8000000000000000);
|
const intptr_t kIntptrSignBit = V8_INT64_C(0x8000000000000000);
|
||||||
|
const uintptr_t kUintptrAllBitsSet = V8_UINT64_C(0xFFFFFFFFFFFFFFFF);
|
||||||
#else
|
#else
|
||||||
const int kPointerSizeLog2 = 2;
|
const int kPointerSizeLog2 = 2;
|
||||||
const intptr_t kIntptrSignBit = 0x80000000;
|
const intptr_t kIntptrSignBit = 0x80000000;
|
||||||
|
const uintptr_t kUintptrAllBitsSet = 0xFFFFFFFFu;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Mask for the sign bit in a smi.
|
// Mask for the sign bit in a smi.
|
||||||
|
138
src/runtime.cc
138
src/runtime.cc
@ -4718,39 +4718,127 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
// A machine word with the lowest bit set in every byte (0x0101...01):
// the all-bits-set word divided by 0xFF.  Multiplying it by a byte value
// replicates that value into every byte of the word.
static const uintptr_t kOneInEveryByte = kUintptrAllBitsSet / 0xFF;
|
||||||
|
|
||||||
|
|
||||||
|
// Given a word and two range boundaries returns a word with high bit
|
||||||
|
// set in every byte iff the corresponding input byte was strictly in
|
||||||
|
// the range (m, n). All the other bits in the result are cleared.
|
||||||
|
// This function is only useful when it can be inlined and the
|
||||||
|
// boundaries are statically known.
|
||||||
|
// Requires: all bytes in the input word and the boundaries must be
|
||||||
|
// ascii (less than 0x7F).
|
||||||
|
static inline uintptr_t AsciiRangeMask(uintptr_t w, char m, char n) {
|
||||||
|
// Every byte in an ascii string is less than or equal to 0x7F.
|
||||||
|
ASSERT((w & (kOneInEveryByte * 0x7F)) == w);
|
||||||
|
// Use strict inequalities since in edge cases the function could be
|
||||||
|
// further simplified.
|
||||||
|
ASSERT(0 < m && m < n && n < 0x7F);
|
||||||
|
// Has high bit set in every w byte less than n.
|
||||||
|
uintptr_t tmp1 = kOneInEveryByte * (0x7F + n) - w;
|
||||||
|
// Has high bit set in every w byte greater than m.
|
||||||
|
uintptr_t tmp2 = w + kOneInEveryByte * (0x7F - m);
|
||||||
|
return (tmp1 & tmp2 & (kOneInEveryByte * 0x80));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Direction of an ascii case conversion; used as a compile-time selector.
enum AsciiCaseConversion {
  ASCII_TO_LOWER = 0,  // Convert 'A'..'Z' to 'a'..'z'.
  ASCII_TO_UPPER = 1   // Convert 'a'..'z' to 'A'..'Z'.
};
|
||||||
|
|
||||||
|
|
||||||
|
// Case conversion of ascii character data, specialized at compile time on
// the conversion direction |dir|.
template <AsciiCaseConversion dir>
struct FastAsciiConverter {
  // Copies |length| characters from |src| to |dst|, flipping the case of
  // every letter that |dir| says must be converted.  Returns true iff at
  // least one character was converted.
  // Requires: every input character is ascii (at most 0x7F).
  static bool Convert(char* dst, char* src, int length) {
#ifdef DEBUG
    char* saved_dst = dst;
    char* saved_src = src;
#endif
    // We rely on the distance between upper and lower case letters
    // being a known power of 2.
    ASSERT('a' - 'A' == (1 << 5));
    // Boundaries for the range of input characters that require conversion.
    const char lo = (dir == ASCII_TO_LOWER) ? 'A' - 1 : 'a' - 1;
    const char hi = (dir == ASCII_TO_LOWER) ? 'Z' + 1 : 'z' + 1;
    bool changed = false;
    char* const limit = src + length;
#ifdef V8_HOST_CAN_READ_UNALIGNED
    // Loop on the number of remaining bytes instead of comparing against
    // limit - sizeof(uintptr_t): for inputs shorter than a word that
    // expression would form a pointer before the start of the buffer.
    const int kWordSize = static_cast<int>(sizeof(uintptr_t));
    // Process the prefix of the input that requires no conversion one
    // (machine) word at a time.
    while (limit - src >= kWordSize) {
      uintptr_t w = *reinterpret_cast<uintptr_t*>(src);
      if (AsciiRangeMask(w, lo, hi) != 0) {
        // Leave this word to the converting loop below.
        changed = true;
        break;
      }
      *reinterpret_cast<uintptr_t*>(dst) = w;
      src += kWordSize;
      dst += kWordSize;
    }
    // Process the remainder of the input performing conversion when
    // required one word at a time.
    while (limit - src >= kWordSize) {
      uintptr_t w = *reinterpret_cast<uintptr_t*>(src);
      uintptr_t m = AsciiRangeMask(w, lo, hi);
      // The mask has high (7th) bit set in every byte that needs
      // conversion and we know that the distance between cases is
      // 1 << 5, so shifting the mask right by 2 yields the bit to flip.
      *reinterpret_cast<uintptr_t*>(dst) = w ^ (m >> 2);
      src += kWordSize;
      dst += kWordSize;
    }
#endif
    // Process the last few bytes of the input (or the whole input if
    // unaligned access is not supported).
    while (src < limit) {
      char c = *src;
      if (lo < c && c < hi) {
        c ^= (1 << 5);
        changed = true;
      }
      *dst = c;
      ++src;
      ++dst;
    }
#ifdef DEBUG
    CheckConvert(saved_dst, saved_src, length, changed);
#endif
    return changed;
  }

#ifdef DEBUG
  // Debug-only check of a finished conversion: every character that
  // differs between |src| and |dst| must be a letter converted in the
  // direction |dir|, and |changed| must be true iff some character differs.
  static void CheckConvert(char* dst, char* src, int length, bool changed) {
    bool expected_changed = false;
    for (int i = 0; i < length; i++) {
      if (dst[i] == src[i]) continue;
      expected_changed = true;
      if (dir == ASCII_TO_LOWER) {
        ASSERT('A' <= src[i] && src[i] <= 'Z');
        ASSERT(dst[i] == src[i] + ('a' - 'A'));
      } else {
        ASSERT(dir == ASCII_TO_UPPER);
        ASSERT('a' <= src[i] && src[i] <= 'z');
        ASSERT(dst[i] == src[i] - ('a' - 'A'));
      }
    }
    ASSERT(expected_changed == changed);
  }
#endif
};
|
||||||
|
|
||||||
|
|
||||||
struct ToLowerTraits {
|
struct ToLowerTraits {
|
||||||
typedef unibrow::ToLowercase UnibrowConverter;
|
typedef unibrow::ToLowercase UnibrowConverter;
|
||||||
|
|
||||||
static bool ConvertAscii(char* dst, char* src, int length) {
|
typedef FastAsciiConverter<ASCII_TO_LOWER> AsciiConverter;
|
||||||
bool changed = false;
|
|
||||||
for (int i = 0; i < length; ++i) {
|
|
||||||
char c = src[i];
|
|
||||||
if ('A' <= c && c <= 'Z') {
|
|
||||||
c += ('a' - 'A');
|
|
||||||
changed = true;
|
|
||||||
}
|
|
||||||
dst[i] = c;
|
|
||||||
}
|
|
||||||
return changed;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
struct ToUpperTraits {
|
struct ToUpperTraits {
|
||||||
typedef unibrow::ToUppercase UnibrowConverter;
|
typedef unibrow::ToUppercase UnibrowConverter;
|
||||||
|
|
||||||
static bool ConvertAscii(char* dst, char* src, int length) {
|
typedef FastAsciiConverter<ASCII_TO_UPPER> AsciiConverter;
|
||||||
bool changed = false;
|
|
||||||
for (int i = 0; i < length; ++i) {
|
|
||||||
char c = src[i];
|
|
||||||
if ('a' <= c && c <= 'z') {
|
|
||||||
c -= ('a' - 'A');
|
|
||||||
changed = true;
|
|
||||||
}
|
|
||||||
dst[i] = c;
|
|
||||||
}
|
|
||||||
return changed;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
@ -4780,7 +4868,7 @@ MUST_USE_RESULT static MaybeObject* ConvertCase(
|
|||||||
if (!maybe_o->ToObject(&o)) return maybe_o;
|
if (!maybe_o->ToObject(&o)) return maybe_o;
|
||||||
}
|
}
|
||||||
SeqAsciiString* result = SeqAsciiString::cast(o);
|
SeqAsciiString* result = SeqAsciiString::cast(o);
|
||||||
bool has_changed_character = ConvertTraits::ConvertAscii(
|
bool has_changed_character = ConvertTraits::AsciiConverter::Convert(
|
||||||
result->GetChars(), SeqAsciiString::cast(s)->GetChars(), length);
|
result->GetChars(), SeqAsciiString::cast(s)->GetChars(), length);
|
||||||
return has_changed_character ? result : s;
|
return has_changed_character ? result : s;
|
||||||
}
|
}
|
||||||
|
@ -25,4 +25,47 @@
|
|||||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
// Flags: --random-seed=17
|
||||||
|
|
||||||
assertEquals("ΚΟΣΜΟΣ ΚΟΣΜΟΣ".toLowerCase(), "κοσμος κοσμος");
|
assertEquals("ΚΟΣΜΟΣ ΚΟΣΜΟΣ".toLowerCase(), "κοσμος κοσμος");
|
||||||
|
|
||||||
|
// Char codes delimiting the ascii upper-case and lower-case letter ranges.
var A_CODE = "A".charCodeAt(0),
    Z_CODE = "Z".charCodeAt(0),
    a_CODE = "a".charCodeAt(0),
    z_CODE = "z".charCodeAt(0);
|
||||||
|
|
||||||
|
// Reference implementation: maps the char code of an ascii upper-case
// letter to its lower-case counterpart; any other code is returned as is.
function charCodeToLower(charCode) {
  var isUpperCaseLetter = charCode >= A_CODE && charCode <= Z_CODE;
  return isUpperCaseLetter ? charCode + a_CODE - A_CODE : charCode;
}
|
||||||
|
|
||||||
|
// Reference implementation: maps the char code of an ascii lower-case
// letter to its upper-case counterpart; any other code is returned as is.
function charCodeToUpper(charCode) {
  var isLowerCaseLetter = charCode >= a_CODE && charCode <= z_CODE;
  return isLowerCaseLetter ? charCode - (a_CODE - A_CODE) : charCode;
}
|
||||||
|
|
||||||
|
// Builds a random ascii string of the given length together with its
// expected lower- and upper-case forms (computed char by char with the
// reference helpers), then checks String.prototype.toLowerCase and
// toUpperCase against them.
function test(length) {
  var input = "";
  var expectedLower = "";
  var expectedUpper = "";
  for (var pos = 0; pos < length; pos += 1) {
    // Random char code in the ascii range [0, 0x7f].
    var code = Math.round(0x7f * Math.random());
    input += String.fromCharCode(code);
    expectedLower += String.fromCharCode(charCodeToLower(code));
    expectedUpper += String.fromCharCode(charCodeToUpper(code));
  }
  assertEquals(expectedLower, input.toLowerCase());
  assertEquals(expectedUpper, input.toUpperCase());
}
|
||||||
|
|
||||||
|
// Exercise string lengths at and just above every power of two from 1 to
// 128 (i + j with j in [0, 8)), three random strings per length.
// A stray ';' after the for header used to turn the loop body into an
// empty statement, so the block below ran only once, with i == 256.
for (var i = 1; i <= 128; i <<= 1) {
  for (var j = 0; j < 8; j++) {
    for (var k = 0; k < 3; k++) {
      test(i + j);
    }
  }
}
|
||||||
|
Loading…
Reference in New Issue
Block a user