Optimize case conversion with icu_case_mapping
Use FastAsciiConvert (as used by Unibrow) for i18n-aware case conversion with --icu_case_mapping. Move FastAsciiConvert to src/string-case.cc so that it can be used by both runtime-{string,i18n}. Add more tests.

BUG=v8:4477,v8:4476
TEST=intl/general/case*
Review-Url: https://codereview.chromium.org/2533983006
Cr-Commit-Position: refs/heads/master@{#41821}
This commit is contained in:
parent
4c640be19b
commit
af38272dd9
2
BUILD.gn
2
BUILD.gn
@ -1691,6 +1691,8 @@ v8_source_set("v8_base") {
|
||||
"src/startup-data-util.h",
|
||||
"src/string-builder.cc",
|
||||
"src/string-builder.h",
|
||||
"src/string-case.cc",
|
||||
"src/string-case.h",
|
||||
"src/string-search.h",
|
||||
"src/string-stream.cc",
|
||||
"src/string-stream.h",
|
||||
|
@ -2121,27 +2121,16 @@ OverrideFunction(GlobalString.prototype, 'normalize', function() {
|
||||
);
|
||||
|
||||
function ToLowerCaseI18N() {
|
||||
if (!IS_UNDEFINED(new.target)) {
|
||||
throw %make_type_error(kOrdinaryFunctionCalledAsConstructor);
|
||||
}
|
||||
CHECK_OBJECT_COERCIBLE(this, "String.prototype.toLowerCase");
|
||||
var s = TO_STRING(this);
|
||||
return %StringToLowerCaseI18N(s);
|
||||
return %StringToLowerCaseI18N(TO_STRING(this));
|
||||
}
|
||||
|
||||
function ToUpperCaseI18N() {
|
||||
if (!IS_UNDEFINED(new.target)) {
|
||||
throw %make_type_error(kOrdinaryFunctionCalledAsConstructor);
|
||||
}
|
||||
CHECK_OBJECT_COERCIBLE(this, "String.prototype.toUpperCase");
|
||||
var s = TO_STRING(this);
|
||||
return %StringToUpperCaseI18N(s);
|
||||
return %StringToUpperCaseI18N(TO_STRING(this));
|
||||
}
|
||||
|
||||
function ToLocaleLowerCaseI18N(locales) {
|
||||
if (!IS_UNDEFINED(new.target)) {
|
||||
throw %make_type_error(kOrdinaryFunctionCalledAsConstructor);
|
||||
}
|
||||
CHECK_OBJECT_COERCIBLE(this, "String.prototype.toLocaleLowerCase");
|
||||
return LocaleConvertCase(TO_STRING(this), locales, false);
|
||||
}
|
||||
@ -2149,9 +2138,6 @@ function ToLocaleLowerCaseI18N(locales) {
|
||||
%FunctionSetLength(ToLocaleLowerCaseI18N, 0);
|
||||
|
||||
function ToLocaleUpperCaseI18N(locales) {
|
||||
if (!IS_UNDEFINED(new.target)) {
|
||||
throw %make_type_error(kOrdinaryFunctionCalledAsConstructor);
|
||||
}
|
||||
CHECK_OBJECT_COERCIBLE(this, "String.prototype.toLocaleUpperCase");
|
||||
return LocaleConvertCase(TO_STRING(this), locales, true);
|
||||
}
|
||||
|
@ -8,13 +8,15 @@
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "src/api.h"
|
||||
#include "src/api-natives.h"
|
||||
#include "src/api.h"
|
||||
#include "src/arguments.h"
|
||||
#include "src/factory.h"
|
||||
#include "src/i18n.h"
|
||||
#include "src/isolate-inl.h"
|
||||
#include "src/messages.h"
|
||||
#include "src/string-case.h"
|
||||
#include "src/utils.h"
|
||||
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/calendar.h"
|
||||
@ -1041,15 +1043,14 @@ bool ToUpperFastASCII(const Vector<const Char>& src,
|
||||
const uint16_t sharp_s = 0xDF;
|
||||
|
||||
template <typename Char>
|
||||
bool ToUpperOneByte(const Vector<const Char>& src,
|
||||
Handle<SeqOneByteString> result, int* sharp_s_count) {
|
||||
bool ToUpperOneByte(const Vector<const Char>& src, uint8_t* dest,
|
||||
int* sharp_s_count) {
|
||||
// Still pretty-fast path for the input with non-ASCII Latin-1 characters.
|
||||
|
||||
// There are two special cases.
|
||||
// 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.
|
||||
// 2. Lower case sharp-S converts to "SS" (two characters)
|
||||
*sharp_s_count = 0;
|
||||
int32_t index = 0;
|
||||
for (auto it = src.begin(); it != src.end(); ++it) {
|
||||
uint16_t ch = static_cast<uint16_t>(*it);
|
||||
if (V8_UNLIKELY(ch == sharp_s)) {
|
||||
@ -1061,7 +1062,7 @@ bool ToUpperOneByte(const Vector<const Char>& src,
|
||||
// need to take the 16-bit path.
|
||||
return false;
|
||||
}
|
||||
result->SeqOneByteStringSet(index++, ToLatin1Upper(ch));
|
||||
*dest++ = ToLatin1Upper(ch);
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -1082,6 +1083,16 @@ void ToUpperWithSharpS(const Vector<const Char>& src,
|
||||
}
|
||||
}
|
||||
|
||||
inline int FindFirstUpperOrNonAscii(Handle<String> s, int length) {
|
||||
for (int index = 0; index < length; ++index) {
|
||||
uint16_t ch = s->Get(index);
|
||||
if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
|
||||
return index;
|
||||
}
|
||||
}
|
||||
return length;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {
|
||||
@ -1091,60 +1102,65 @@ RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {
|
||||
|
||||
int length = s->length();
|
||||
s = String::Flatten(s);
|
||||
// First scan the string for uppercase and non-ASCII characters:
|
||||
if (s->HasOnlyOneByteChars()) {
|
||||
int first_index_to_lower = length;
|
||||
for (int index = 0; index < length; ++index) {
|
||||
// Blink specializes this path for one-byte strings, so it
|
||||
// does not need to do a generic get, but can do the equivalent
|
||||
// of SeqOneByteStringGet.
|
||||
uint16_t ch = s->Get(index);
|
||||
if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
|
||||
first_index_to_lower = index;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Nothing to do if the string is all ASCII with no uppercase.
|
||||
if (first_index_to_lower == length) return *s;
|
||||
|
||||
// We depend here on the invariant that the length of a Latin1
|
||||
// string is invariant under ToLowerCase, and the result always
|
||||
// fits in the Latin1 range in the *root locale*. It does not hold
|
||||
// for ToUpperCase even in the root locale.
|
||||
Handle<SeqOneByteString> result;
|
||||
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
|
||||
isolate, result, isolate->factory()->NewRawOneByteString(length));
|
||||
|
||||
DisallowHeapAllocation no_gc;
|
||||
String::FlatContent flat = s->GetFlatContent();
|
||||
if (flat.IsOneByte()) {
|
||||
const uint8_t* src = flat.ToOneByteVector().start();
|
||||
CopyChars(result->GetChars(), src,
|
||||
static_cast<size_t>(first_index_to_lower));
|
||||
for (int index = first_index_to_lower; index < length; ++index) {
|
||||
uint16_t ch = static_cast<uint16_t>(src[index]);
|
||||
result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
|
||||
}
|
||||
} else {
|
||||
const uint16_t* src = flat.ToUC16Vector().start();
|
||||
CopyChars(result->GetChars(), src,
|
||||
static_cast<size_t>(first_index_to_lower));
|
||||
for (int index = first_index_to_lower; index < length; ++index) {
|
||||
uint16_t ch = src[index];
|
||||
result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
|
||||
}
|
||||
}
|
||||
|
||||
return *result;
|
||||
if (!s->HasOnlyOneByteChars()) {
|
||||
// Use a slower implementation for strings with characters beyond U+00FF.
|
||||
return LocaleConvertCase(s, isolate, false, "");
|
||||
}
|
||||
|
||||
// Blink had an additional case here for ASCII 2-byte strings, but
|
||||
// that is subsumed by the above code (assuming there isn't a false
|
||||
// negative for HasOnlyOneByteChars).
|
||||
// We depend here on the invariant that the length of a Latin1
|
||||
// string is invariant under ToLowerCase, and the result always
|
||||
// fits in the Latin1 range in the *root locale*. It does not hold
|
||||
// for ToUpperCase even in the root locale.
|
||||
|
||||
// Do a slower implementation for cases that include non-ASCII characters.
|
||||
return LocaleConvertCase(s, isolate, false, "");
|
||||
// Scan the string for uppercase and non-ASCII characters for strings
|
||||
// shorter than a machine-word without any memory allocation overhead.
|
||||
// TODO(jshin): Apply this to a longer input by breaking FastAsciiConvert()
|
||||
// into two parts, one for scanning the prefix with no change and the other for
|
||||
// handling ASCII-only characters.
|
||||
int index_to_first_unprocessed = length;
|
||||
const bool is_short = length < static_cast<int>(sizeof(uintptr_t));
|
||||
if (is_short) {
|
||||
index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);
|
||||
// Nothing to do if the string is all ASCII with no uppercase.
|
||||
if (index_to_first_unprocessed == length) return *s;
|
||||
}
|
||||
|
||||
Handle<SeqOneByteString> result =
|
||||
isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
|
||||
|
||||
DisallowHeapAllocation no_gc;
|
||||
String::FlatContent flat = s->GetFlatContent();
|
||||
uint8_t* dest = result->GetChars();
|
||||
if (flat.IsOneByte()) {
|
||||
const uint8_t* src = flat.ToOneByteVector().start();
|
||||
bool has_changed_character = false;
|
||||
index_to_first_unprocessed = FastAsciiConvert<true>(
|
||||
reinterpret_cast<char*>(dest), reinterpret_cast<const char*>(src),
|
||||
length, &has_changed_character);
|
||||
// If not ASCII, we keep the result up to index_to_first_unprocessed and
|
||||
// process the rest.
|
||||
if (index_to_first_unprocessed == length)
|
||||
return has_changed_character ? *result : *s;
|
||||
|
||||
for (int index = index_to_first_unprocessed; index < length; ++index) {
|
||||
dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
|
||||
}
|
||||
} else {
|
||||
if (index_to_first_unprocessed == length) {
|
||||
DCHECK(!is_short);
|
||||
index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);
|
||||
}
|
||||
// Nothing to do if the string is all ASCII with no uppercase.
|
||||
if (index_to_first_unprocessed == length) return *s;
|
||||
const uint16_t* src = flat.ToUC16Vector().start();
|
||||
CopyChars(dest, src, index_to_first_unprocessed);
|
||||
for (int index = index_to_first_unprocessed; index < length; ++index) {
|
||||
dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
|
||||
}
|
||||
}
|
||||
|
||||
return *result;
|
||||
}
|
||||
|
||||
RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {
|
||||
@ -1152,35 +1168,38 @@ RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {
|
||||
DCHECK_EQ(args.length(), 1);
|
||||
CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
|
||||
|
||||
// This function could be optimized for no-op cases the way its lowercase
|
||||
// counterpart is, but in empirical testing, few actual calls to upper()
|
||||
// are no-ops. So, it wouldn't be worth the extra time for pre-scanning.
|
||||
|
||||
int32_t length = s->length();
|
||||
s = String::Flatten(s);
|
||||
|
||||
if (s->HasOnlyOneByteChars()) {
|
||||
Handle<SeqOneByteString> result;
|
||||
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
|
||||
isolate, result, isolate->factory()->NewRawOneByteString(length));
|
||||
Handle<SeqOneByteString> result =
|
||||
isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
|
||||
|
||||
int sharp_s_count;
|
||||
bool is_result_single_byte;
|
||||
{
|
||||
DisallowHeapAllocation no_gc;
|
||||
String::FlatContent flat = s->GetFlatContent();
|
||||
// If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII
|
||||
// could be removed because ToUpperOneByte is pretty fast now (it
|
||||
// does not call ICU API any more.).
|
||||
uint8_t* dest = result->GetChars();
|
||||
if (flat.IsOneByte()) {
|
||||
Vector<const uint8_t> src = flat.ToOneByteVector();
|
||||
if (ToUpperFastASCII(src, result)) return *result;
|
||||
is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
|
||||
bool has_changed_character = false;
|
||||
int index_to_first_unprocessed =
|
||||
FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()),
|
||||
reinterpret_cast<const char*>(src.start()),
|
||||
length, &has_changed_character);
|
||||
if (index_to_first_unprocessed == length)
|
||||
return has_changed_character ? *result : *s;
|
||||
// If not ASCII, we keep the result up to index_to_first_unprocessed and
|
||||
// process the rest.
|
||||
is_result_single_byte =
|
||||
ToUpperOneByte(src.SubVector(index_to_first_unprocessed, length),
|
||||
dest + index_to_first_unprocessed, &sharp_s_count);
|
||||
} else {
|
||||
DCHECK(flat.IsTwoByte());
|
||||
Vector<const uint16_t> src = flat.ToUC16Vector();
|
||||
if (ToUpperFastASCII(src, result)) return *result;
|
||||
is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
|
||||
is_result_single_byte = ToUpperOneByte(src, dest, &sharp_s_count);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include "src/arguments.h"
|
||||
#include "src/regexp/jsregexp-inl.h"
|
||||
#include "src/string-builder.h"
|
||||
#include "src/string-case.h"
|
||||
#include "src/string-search.h"
|
||||
|
||||
namespace v8 {
|
||||
@ -694,122 +695,6 @@ MUST_USE_RESULT static Object* ConvertCaseHelper(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static const uintptr_t kOneInEveryByte = kUintptrAllBitsSet / 0xFF;
|
||||
static const uintptr_t kAsciiMask = kOneInEveryByte << 7;
|
||||
|
||||
// Given a word and two range boundaries returns a word with high bit
|
||||
// set in every byte iff the corresponding input byte was strictly in
|
||||
// the range (m, n). All the other bits in the result are cleared.
|
||||
// This function is only useful when it can be inlined and the
|
||||
// boundaries are statically known.
|
||||
// Requires: all bytes in the input word and the boundaries must be
|
||||
// ASCII (less than 0x7F).
|
||||
static inline uintptr_t AsciiRangeMask(uintptr_t w, char m, char n) {
|
||||
// Use strict inequalities since in edge cases the function could be
|
||||
// further simplified.
|
||||
DCHECK(0 < m && m < n);
|
||||
// Has high bit set in every w byte less than n.
|
||||
uintptr_t tmp1 = kOneInEveryByte * (0x7F + n) - w;
|
||||
// Has high bit set in every w byte greater than m.
|
||||
uintptr_t tmp2 = w + kOneInEveryByte * (0x7F - m);
|
||||
return (tmp1 & tmp2 & (kOneInEveryByte * 0x80));
|
||||
}
|
||||
|
||||
|
||||
#ifdef DEBUG
|
||||
static bool CheckFastAsciiConvert(char* dst, const char* src, int length,
|
||||
bool changed, bool is_to_lower) {
|
||||
bool expected_changed = false;
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (dst[i] == src[i]) continue;
|
||||
expected_changed = true;
|
||||
if (is_to_lower) {
|
||||
DCHECK('A' <= src[i] && src[i] <= 'Z');
|
||||
DCHECK(dst[i] == src[i] + ('a' - 'A'));
|
||||
} else {
|
||||
DCHECK('a' <= src[i] && src[i] <= 'z');
|
||||
DCHECK(dst[i] == src[i] - ('a' - 'A'));
|
||||
}
|
||||
}
|
||||
return (expected_changed == changed);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
template <class Converter>
|
||||
static bool FastAsciiConvert(char* dst, const char* src, int length,
|
||||
bool* changed_out) {
|
||||
#ifdef DEBUG
|
||||
char* saved_dst = dst;
|
||||
const char* saved_src = src;
|
||||
#endif
|
||||
DisallowHeapAllocation no_gc;
|
||||
// We rely on the distance between upper and lower case letters
|
||||
// being a known power of 2.
|
||||
DCHECK('a' - 'A' == (1 << 5));
|
||||
// Boundaries for the range of input characters than require conversion.
|
||||
static const char lo = Converter::kIsToLower ? 'A' - 1 : 'a' - 1;
|
||||
static const char hi = Converter::kIsToLower ? 'Z' + 1 : 'z' + 1;
|
||||
bool changed = false;
|
||||
uintptr_t or_acc = 0;
|
||||
const char* const limit = src + length;
|
||||
|
||||
// dst is newly allocated and always aligned.
|
||||
DCHECK(IsAligned(reinterpret_cast<intptr_t>(dst), sizeof(uintptr_t)));
|
||||
// Only attempt processing one word at a time if src is also aligned.
|
||||
if (IsAligned(reinterpret_cast<intptr_t>(src), sizeof(uintptr_t))) {
|
||||
// Process the prefix of the input that requires no conversion one aligned
|
||||
// (machine) word at a time.
|
||||
while (src <= limit - sizeof(uintptr_t)) {
|
||||
const uintptr_t w = *reinterpret_cast<const uintptr_t*>(src);
|
||||
or_acc |= w;
|
||||
if (AsciiRangeMask(w, lo, hi) != 0) {
|
||||
changed = true;
|
||||
break;
|
||||
}
|
||||
*reinterpret_cast<uintptr_t*>(dst) = w;
|
||||
src += sizeof(uintptr_t);
|
||||
dst += sizeof(uintptr_t);
|
||||
}
|
||||
// Process the remainder of the input performing conversion when
|
||||
// required one word at a time.
|
||||
while (src <= limit - sizeof(uintptr_t)) {
|
||||
const uintptr_t w = *reinterpret_cast<const uintptr_t*>(src);
|
||||
or_acc |= w;
|
||||
uintptr_t m = AsciiRangeMask(w, lo, hi);
|
||||
// The mask has high (7th) bit set in every byte that needs
|
||||
// conversion and we know that the distance between cases is
|
||||
// 1 << 5.
|
||||
*reinterpret_cast<uintptr_t*>(dst) = w ^ (m >> 2);
|
||||
src += sizeof(uintptr_t);
|
||||
dst += sizeof(uintptr_t);
|
||||
}
|
||||
}
|
||||
// Process the last few bytes of the input (or the whole input if
|
||||
// unaligned access is not supported).
|
||||
while (src < limit) {
|
||||
char c = *src;
|
||||
or_acc |= c;
|
||||
if (lo < c && c < hi) {
|
||||
c ^= (1 << 5);
|
||||
changed = true;
|
||||
}
|
||||
*dst = c;
|
||||
++src;
|
||||
++dst;
|
||||
}
|
||||
|
||||
if ((or_acc & kAsciiMask) != 0) return false;
|
||||
|
||||
DCHECK(CheckFastAsciiConvert(saved_dst, saved_src, length, changed,
|
||||
Converter::kIsToLower));
|
||||
|
||||
*changed_out = changed;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
template <class Converter>
|
||||
MUST_USE_RESULT static Object* ConvertCase(
|
||||
Handle<String> s, Isolate* isolate,
|
||||
@ -833,12 +718,13 @@ MUST_USE_RESULT static Object* ConvertCase(
|
||||
String::FlatContent flat_content = s->GetFlatContent();
|
||||
DCHECK(flat_content.IsFlat());
|
||||
bool has_changed_character = false;
|
||||
bool is_ascii = FastAsciiConvert<Converter>(
|
||||
int index_to_first_unprocessed = FastAsciiConvert<Converter::kIsToLower>(
|
||||
reinterpret_cast<char*>(result->GetChars()),
|
||||
reinterpret_cast<const char*>(flat_content.ToOneByteVector().start()),
|
||||
length, &has_changed_character);
|
||||
// If not ASCII, we discard the result and take the 2 byte path.
|
||||
if (is_ascii) return has_changed_character ? *result : *s;
|
||||
if (index_to_first_unprocessed == length)
|
||||
return has_changed_character ? *result : *s;
|
||||
}
|
||||
|
||||
Handle<SeqString> result; // Same length as input.
|
||||
@ -872,7 +758,6 @@ RUNTIME_FUNCTION(Runtime_StringToLowerCase) {
|
||||
return ConvertCase(s, isolate, isolate->runtime_state()->to_lower_mapping());
|
||||
}
|
||||
|
||||
|
||||
RUNTIME_FUNCTION(Runtime_StringToUpperCase) {
|
||||
HandleScope scope(isolate);
|
||||
DCHECK_EQ(args.length(), 1);
|
||||
|
@ -6,6 +6,7 @@
|
||||
#define V8_RUNTIME_RUNTIME_UTILS_H_
|
||||
|
||||
#include "src/base/logging.h"
|
||||
#include "src/globals.h"
|
||||
#include "src/runtime/runtime.h"
|
||||
|
||||
namespace v8 {
|
||||
|
130
src/string-case.cc
Normal file
130
src/string-case.cc
Normal file
@ -0,0 +1,130 @@
|
||||
// Copyright 2016 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "src/string-case.h"
|
||||
|
||||
#include "src/assert-scope.h"
|
||||
#include "src/base/logging.h"
|
||||
#include "src/globals.h"
|
||||
#include "src/utils.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
#ifdef DEBUG
|
||||
bool CheckFastAsciiConvert(char* dst, const char* src, int length, bool changed,
|
||||
bool is_to_lower) {
|
||||
bool expected_changed = false;
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (dst[i] == src[i]) continue;
|
||||
expected_changed = true;
|
||||
if (is_to_lower) {
|
||||
DCHECK('A' <= src[i] && src[i] <= 'Z');
|
||||
DCHECK(dst[i] == src[i] + ('a' - 'A'));
|
||||
} else {
|
||||
DCHECK('a' <= src[i] && src[i] <= 'z');
|
||||
DCHECK(dst[i] == src[i] - ('a' - 'A'));
|
||||
}
|
||||
}
|
||||
return (expected_changed == changed);
|
||||
}
|
||||
#endif
|
||||
|
||||
const uintptr_t kOneInEveryByte = kUintptrAllBitsSet / 0xFF;
|
||||
const uintptr_t kAsciiMask = kOneInEveryByte << 7;
|
||||
|
||||
// Given a word and two range boundaries returns a word with high bit
|
||||
// set in every byte iff the corresponding input byte was strictly in
|
||||
// the range (m, n). All the other bits in the result are cleared.
|
||||
// This function is only useful when it can be inlined and the
|
||||
// boundaries are statically known.
|
||||
// Requires: all bytes in the input word and the boundaries must be
|
||||
// ASCII (no greater than 0x7F).
|
||||
static inline uintptr_t AsciiRangeMask(uintptr_t w, char m, char n) {
|
||||
// Use strict inequalities since in edge cases the function could be
|
||||
// further simplified.
|
||||
DCHECK(0 < m && m < n);
|
||||
// Has high bit set in every w byte less than n.
|
||||
uintptr_t tmp1 = kOneInEveryByte * (0x7F + n) - w;
|
||||
// Has high bit set in every w byte greater than m.
|
||||
uintptr_t tmp2 = w + kOneInEveryByte * (0x7F - m);
|
||||
return (tmp1 & tmp2 & (kOneInEveryByte * 0x80));
|
||||
}
|
||||
|
||||
template <bool is_lower>
|
||||
int FastAsciiConvert(char* dst, const char* src, int length,
|
||||
bool* changed_out) {
|
||||
#ifdef DEBUG
|
||||
char* saved_dst = dst;
|
||||
#endif
|
||||
const char* saved_src = src;
|
||||
DisallowHeapAllocation no_gc;
|
||||
// We rely on the distance between upper and lower case letters
|
||||
// being a known power of 2.
|
||||
DCHECK('a' - 'A' == (1 << 5));
|
||||
// Boundaries for the range of input characters that require conversion.
|
||||
static const char lo = is_lower ? 'A' - 1 : 'a' - 1;
|
||||
static const char hi = is_lower ? 'Z' + 1 : 'z' + 1;
|
||||
bool changed = false;
|
||||
const char* const limit = src + length;
|
||||
|
||||
// dst is newly allocated and always aligned.
|
||||
DCHECK(IsAligned(reinterpret_cast<intptr_t>(dst), sizeof(uintptr_t)));
|
||||
// Only attempt processing one word at a time if src is also aligned.
|
||||
if (IsAligned(reinterpret_cast<intptr_t>(src), sizeof(uintptr_t))) {
|
||||
// Process the prefix of the input that requires no conversion one aligned
|
||||
// (machine) word at a time.
|
||||
while (src <= limit - sizeof(uintptr_t)) {
|
||||
const uintptr_t w = *reinterpret_cast<const uintptr_t*>(src);
|
||||
if ((w & kAsciiMask) != 0) return static_cast<int>(src - saved_src);
|
||||
if (AsciiRangeMask(w, lo, hi) != 0) {
|
||||
changed = true;
|
||||
break;
|
||||
}
|
||||
*reinterpret_cast<uintptr_t*>(dst) = w;
|
||||
src += sizeof(uintptr_t);
|
||||
dst += sizeof(uintptr_t);
|
||||
}
|
||||
// Process the remainder of the input performing conversion when
|
||||
// required one word at a time.
|
||||
while (src <= limit - sizeof(uintptr_t)) {
|
||||
const uintptr_t w = *reinterpret_cast<const uintptr_t*>(src);
|
||||
if ((w & kAsciiMask) != 0) return static_cast<int>(src - saved_src);
|
||||
uintptr_t m = AsciiRangeMask(w, lo, hi);
|
||||
// The mask has high (7th) bit set in every byte that needs
|
||||
// conversion and we know that the distance between cases is
|
||||
// 1 << 5.
|
||||
*reinterpret_cast<uintptr_t*>(dst) = w ^ (m >> 2);
|
||||
src += sizeof(uintptr_t);
|
||||
dst += sizeof(uintptr_t);
|
||||
}
|
||||
}
|
||||
// Process the last few bytes of the input (or the whole input if
|
||||
// unaligned access is not supported).
|
||||
while (src < limit) {
|
||||
char c = *src;
|
||||
if ((c & kAsciiMask) != 0) return static_cast<int>(src - saved_src);
|
||||
if (lo < c && c < hi) {
|
||||
c ^= (1 << 5);
|
||||
changed = true;
|
||||
}
|
||||
*dst = c;
|
||||
++src;
|
||||
++dst;
|
||||
}
|
||||
|
||||
DCHECK(
|
||||
CheckFastAsciiConvert(saved_dst, saved_src, length, changed, is_lower));
|
||||
|
||||
*changed_out = changed;
|
||||
return length;
|
||||
}
|
||||
|
||||
template int FastAsciiConvert<false>(char* dst, const char* src, int length,
|
||||
bool* changed_out);
|
||||
template int FastAsciiConvert<true>(char* dst, const char* src, int length,
|
||||
bool* changed_out);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
17
src/string-case.h
Normal file
17
src/string-case.h
Normal file
@ -0,0 +1,17 @@
|
||||
// Copyright 2016 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_STRING_CASE_H_
|
||||
#define V8_STRING_CASE_H_
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
template <bool is_lower>
|
||||
int FastAsciiConvert(char* dst, const char* src, int length, bool* changed_out);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_STRING_CASE_H_
|
@ -1228,6 +1228,8 @@
|
||||
'startup-data-util.h',
|
||||
'string-builder.cc',
|
||||
'string-builder.h',
|
||||
'string-case.cc',
|
||||
'string-case.h',
|
||||
'string-search.h',
|
||||
'string-stream.cc',
|
||||
'string-stream.h',
|
||||
|
@ -16,14 +16,33 @@ assertEquals("σς", "\u03A3\u03A3".toLowerCase());
|
||||
// Expand sharp s in latin1 fastpath
|
||||
assertEquals("ASSB", "A\u00DFB".toUpperCase());
|
||||
assertEquals("AB", "Ab".toUpperCase());
|
||||
// Find first upper case in fastpath
|
||||
// Find first uppercase in fastpath
|
||||
// Input length < a machine word size
|
||||
assertEquals("ab", "ab".toLowerCase());
|
||||
assertEquals("ab", "aB".toLowerCase());
|
||||
assertEquals("AÜ", "aü".toUpperCase());
|
||||
assertEquals("AÜ", "AÜ".toUpperCase());
|
||||
assertEquals("aü", "aü".toLowerCase());
|
||||
assertEquals("aü", "aÜ".toLowerCase());
|
||||
assertEquals("aü", "AÜ".toLowerCase());
|
||||
assertEquals("aü", "AÜ".toLowerCase());
|
||||
|
||||
// Input length >= a machine word size
|
||||
assertEquals("abcdefghij", "abcdefghij".toLowerCase());
|
||||
assertEquals("abcdefghij", "abcdefghiJ".toLowerCase());
|
||||
assertEquals("abçdefghij", "abçdefghiJ".toLowerCase());
|
||||
assertEquals("abçdefghij", "abÇdefghiJ".toLowerCase());
|
||||
assertEquals("abcdefghiá", "abcdeFghiá".toLowerCase());
|
||||
assertEquals("abcdefghiá", "abcdeFghiÁ".toLowerCase());
|
||||
|
||||
assertEquals("ABCDEFGHIJ", "ABCDEFGHIJ".toUpperCase());
|
||||
assertEquals("ABCDEFGHIJ", "ABCDEFGHIj".toUpperCase());
|
||||
assertEquals("ABÇDEFGHIJ", "ABÇDEFGHIj".toUpperCase());
|
||||
assertEquals("ABÇDEFGHIJ", "ABçDEFGHIj".toUpperCase());
|
||||
assertEquals("ABCDEFGHIÁ", "ABCDEfGHIÁ".toUpperCase());
|
||||
assertEquals("ABCDEFGHIÁ", "ABCDEfGHIá".toUpperCase());
|
||||
|
||||
|
||||
// Starts with fastpath, but switches to full Unicode path
|
||||
// U+00FF is uppercased to U+0178.
|
||||
assertEquals("AŸ", "aÿ".toUpperCase());
|
||||
@ -33,6 +52,10 @@ assertEquals("AΜ", "aµ".toUpperCase());
|
||||
// Buffer size increase
|
||||
assertEquals("CSSBẶ", "cßbặ".toUpperCase());
|
||||
assertEquals("FIFLFFIFFL", "\uFB01\uFB02\uFB03\uFB04".toUpperCase());
|
||||
assertEquals("ABCÀCSSA", "abcàcßa".toUpperCase());
|
||||
assertEquals("ABCDEFGHIÀCSSA", "ABCDEFGHIàcßa".toUpperCase());
|
||||
assertEquals("ABCDEFGHIÀCSSA", "abcdeFghiàcßa".toUpperCase());
|
||||
|
||||
// OneByte input with buffer size increase: non-fast path
|
||||
assertEquals("ABCSS", "abCß".toLocaleUpperCase("tr"));
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user