[string] Move String.p.toLowerCase to CSA

This CL migrates the CPP builtin to CSA with fast paths for strings
that can be unpacked to direct one-byte strings. Short strings are
handled directly in CSA, others need to call into C for conversion.

Microbenchmarks for "abcd".toLowerCase() show speedups of 2.5x.

BUG=v8:6353,v8:6344

Review-Url: https://codereview.chromium.org/2859203002
Cr-Commit-Position: refs/heads/master@{#45141}
This commit is contained in:
jgruber 2017-05-05 08:59:08 -07:00 committed by Commit bot
parent 1cda1732a7
commit f0e95769db
14 changed files with 261 additions and 49 deletions

View File

@ -919,6 +919,7 @@ v8_source_set("v8_builtins_generators") {
"src/builtins/builtins-ic-gen.cc",
"src/builtins/builtins-internal-gen.cc",
"src/builtins/builtins-interpreter-gen.cc",
"src/builtins/builtins-intl-gen.cc",
"src/builtins/builtins-math-gen.cc",
"src/builtins/builtins-number-gen.cc",
"src/builtins/builtins-object-gen.cc",
@ -996,6 +997,10 @@ v8_source_set("v8_builtins_generators") {
]
}
if (!v8_enable_i18n_support) {
sources -= [ "src/builtins/builtins-intl-gen.cc" ]
}
configs = [ ":internal_config" ]
}

View File

@ -91,6 +91,10 @@
#endif // Target architecture.
#endif // V8_INTERPRETED_REGEXP
#ifdef V8_INTL_SUPPORT
#include "src/intl.h"
#endif // V8_INTL_SUPPORT
namespace v8 {
namespace internal {
@ -1572,6 +1576,20 @@ ExternalReference ExternalReference::try_internalize_string_function(
isolate, FUNCTION_ADDR(StringTable::LookupStringIfExists_NoAllocate)));
}
#ifdef V8_INTL_SUPPORT
// Returns an external reference to the C++ function ConvertOneByteToLower,
// passed through Redirect() for the given isolate.
ExternalReference ExternalReference::intl_convert_one_byte_to_lower(
    Isolate* isolate) {
  Address entry = FUNCTION_ADDR(ConvertOneByteToLower);
  return ExternalReference(Redirect(isolate, entry));
}
// Returns an external reference wrapping the address of the table returned by
// ToLatin1LowerTable(). The isolate argument is not used by this function.
ExternalReference ExternalReference::intl_to_latin1_lower_table(
    Isolate* isolate) {
  const uint8_t* table = ToLatin1LowerTable();
  // The table is const; constness is cast away only to form an Address.
  return ExternalReference(
      reinterpret_cast<Address>(const_cast<uint8_t*>(table)));
}
#endif // V8_INTL_SUPPORT
// Explicit instantiations for all combinations of 1- and 2-byte strings.
template ExternalReference
ExternalReference::search_string_raw<const uint8_t, const uint8_t>(Isolate*);

View File

@ -993,6 +993,11 @@ class ExternalReference BASE_EMBEDDED {
static ExternalReference try_internalize_string_function(Isolate* isolate);
#ifdef V8_INTL_SUPPORT
static ExternalReference intl_convert_one_byte_to_lower(Isolate* isolate);
static ExternalReference intl_to_latin1_lower_table(Isolate* isolate);
#endif // V8_INTL_SUPPORT
template <typename SubjectChar, typename PatternChar>
static ExternalReference search_string_raw(Isolate* isolate);

View File

@ -4084,7 +4084,7 @@ void Genesis::InitializeGlobal_icu_case_mapping() {
SetFunction(string_prototype,
SimpleCreateFunction(isolate(), name,
Builtins::kStringPrototypeToLowerCaseIntl,
0, false),
0, true),
name);
}
{

View File

@ -1004,7 +1004,7 @@ namespace internal {
BUILTIN_LIST_BASE(CPP, API, TFJ, TFC, TFS, TFH, ASM, DBG) \
\
/* ES #sec-string.prototype.tolowercase */ \
CPP(StringPrototypeToLowerCaseIntl) \
TFJ(StringPrototypeToLowerCaseIntl, 0) \
/* ES #sec-string.prototype.touppercase */ \
CPP(StringPrototypeToUpperCaseIntl) \
/* ES #sec-string.prototype.normalize */ \

View File

@ -0,0 +1,113 @@
// Copyright 2017 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_INTL_SUPPORT
#error Internationalization is expected to be enabled.
#endif // V8_INTL_SUPPORT
#include "src/builtins/builtins-utils-gen.h"
#include "src/code-stub-assembler.h"
namespace v8 {
namespace internal {
// Assembler class used by the Intl builtins defined in this file. It adds no
// members of its own and only forwards the assembler state to
// CodeStubAssembler.
class IntlBuiltinsAssembler : public CodeStubAssembler {
public:
explicit IntlBuiltinsAssembler(compiler::CodeAssemblerState* state)
: CodeStubAssembler(state) {}
};
// CSA implementation of String.prototype.toLowerCase.
//
// Control flow:
//  - empty receiver string: returned unchanged (return_string);
//  - strings that cannot be unpacked to a direct string, and strings that are
//    not one-byte: handed to Runtime::kStringToLowerCaseIntl (runtime);
//  - one-byte strings of at most kMaxShortStringLength characters: converted
//    inline through the Latin-1 lower-case lookup table;
//  - remaining one-byte strings, and short strings whose data pointer cannot
//    be obtained: converted by the C function ConvertOneByteToLower (call_c).
TF_BUILTIN(StringPrototypeToLowerCaseIntl, IntlBuiltinsAssembler) {
Node* const maybe_string = Parameter(Descriptor::kReceiver);
Node* const context = Parameter(Descriptor::kContext);
// Convert the receiver with ToThisString; the method name is passed along
// (presumably for use in error messages -- confirm in ToThisString).
Node* const string =
ToThisString(context, maybe_string, "String.prototype.toLowerCase");
Label call_c(this), return_string(this), runtime(this, Label::kDeferred);
// Early exit on empty strings.
Node* const length = SmiUntag(LoadStringLength(string));
GotoIf(IntPtrEqual(length, IntPtrConstant(0)), &return_string);
// Unpack strings if possible, and bail to runtime unless we get a one-byte
// flat string. With kDontUnpackSlicedStrings, sliced strings take the bailout
// label as well instead of being unpacked to their parent.
ToDirectStringAssembler to_direct(
state(), string, ToDirectStringAssembler::kDontUnpackSlicedStrings);
to_direct.TryToDirect(&runtime);
Node* const instance_type = to_direct.instance_type();
CSA_ASSERT(this,
Word32BinaryNot(IsIndirectStringInstanceType(instance_type)));
GotoIfNot(IsOneByteStringInstanceType(instance_type), &runtime);
// For short strings, do the conversion in CSA through the lookup table.
// The destination string is allocated before the length check because both
// the inline loop and the C call below write their output into it.
Node* const dst = AllocateSeqOneByteString(context, length);
const int kMaxShortStringLength = 24; // Determined empirically.
GotoIf(IntPtrGreaterThan(length, IntPtrConstant(kMaxShortStringLength)),
&call_c);
{
Node* const dst_ptr = PointerToSeqStringData(dst);
// var_cursor is the write offset into dst, relative to dst_ptr.
VARIABLE(var_cursor, MachineType::PointerRepresentation(),
IntPtrConstant(0));
// If no data pointer can be obtained for the source string, fall back to
// the C call.
Node* const start_address = to_direct.PointerToData(&call_c);
Node* const end_address = IntPtrAdd(start_address, length);
Node* const to_lower_table_addr = ExternalConstant(
ExternalReference::intl_to_latin1_lower_table(isolate()));
VariableList push_vars({&var_cursor}, zone());
// Iterate over the source bytes between start_address and end_address in
// steps of kCharSize.
BuildFastLoop(
push_vars, start_address, end_address,
[=, &var_cursor](Node* current) {
// Load one source byte and widen it so it can index the table.
Node* c = ChangeInt32ToIntPtr(Load(MachineType::Uint8(), current));
Node* lower = Load(MachineType::Uint8(), to_lower_table_addr, c);
// Store the table entry at the current write offset and advance it.
StoreNoWriteBarrier(MachineRepresentation::kWord8, dst_ptr,
var_cursor.value(), lower);
Increment(var_cursor);
},
kCharSize, INTPTR_PARAMETERS, IndexAdvanceMode::kPost);
// All lower-case: dst now holds the converted characters.
Return(dst);
}
// Call into C for case conversion. The signature is:
// Object* ConvertOneByteToLower(String* src, String* dst, Isolate* isolate);
BIND(&call_c);
{
// Pass the unpacked direct string, not the original receiver string.
Node* const src = to_direct.string();
Node* const function_addr = ExternalConstant(
ExternalReference::intl_convert_one_byte_to_lower(isolate()));
Node* const isolate_ptr =
ExternalConstant(ExternalReference::isolate_address(isolate()));
MachineType type_ptr = MachineType::Pointer();
MachineType type_tagged = MachineType::AnyTagged();
// Machine types, in order: return value (tagged), src (tagged),
// dst (tagged), isolate (raw pointer).
Node* const result =
CallCFunction3(type_tagged, type_tagged, type_tagged, type_ptr,
function_addr, src, dst, isolate_ptr);
Return(result);
}
// Empty string: nothing to convert.
BIND(&return_string);
Return(string);
// Fallback for strings not handled above; note that the original string (not
// the unpacked one) is passed to the runtime function.
BIND(&runtime);
{
Node* const result =
CallRuntime(Runtime::kStringToLowerCaseIntl, context, string);
Return(result);
}
}
} // namespace internal
} // namespace v8

View File

@ -16,13 +16,6 @@
namespace v8 {
namespace internal {
// C++ builtin for String.prototype.toLowerCase: obtains the receiver string
// via TO_THIS_STRING, flattens it, and calls ConvertCase with is_upper set to
// false.
BUILTIN(StringPrototypeToLowerCaseIntl) {
HandleScope scope(isolate);
TO_THIS_STRING(string, "String.prototype.toLowerCase");
string = String::Flatten(string);
return ConvertCase(string, false, isolate);
}
BUILTIN(StringPrototypeToUpperCaseIntl) {
HandleScope scope(isolate);
TO_THIS_STRING(string, "String.prototype.toUpperCase");

View File

@ -1238,6 +1238,16 @@ Node* CodeStubAssembler::LoadStringLength(Node* object) {
return LoadObjectField(object, String::kLengthOffset);
}
// Computes the untagged address of the first character of a sequential
// string: the string's tagged pointer reinterpreted as a word, plus the
// header size, minus the heap object tag.
Node* CodeStubAssembler::PointerToSeqStringData(Node* seq_string) {
  CSA_ASSERT(this, IsString(seq_string));
  CSA_ASSERT(this,
             IsSequentialStringInstanceType(LoadInstanceType(seq_string)));
  // A single offset is valid for both encodings because the headers of
  // one-byte and two-byte sequential strings have the same size.
  STATIC_ASSERT(SeqOneByteString::kHeaderSize == SeqTwoByteString::kHeaderSize);
  Node* const tagged_address = BitcastTaggedToWord(seq_string);
  Node* const data_offset =
      IntPtrConstant(SeqOneByteString::kHeaderSize - kHeapObjectTag);
  return IntPtrAdd(tagged_address, data_offset);
}
Node* CodeStubAssembler::LoadJSValueValue(Node* object) {
CSA_ASSERT(this, IsJSValue(object));
return LoadObjectField(object, JSValue::kValueOffset);
@ -3137,6 +3147,13 @@ Node* CodeStubAssembler::IsConsStringInstanceType(Node* instance_type) {
Int32Constant(kConsStringTag));
}
// Masks the given string instance type with kIsIndirectStringMask and returns
// the resulting Word32. Because mask and tag are both 0x1 (see the static
// asserts), the result is 1 when the indirect-string bit is set and 0
// otherwise.
Node* CodeStubAssembler::IsIndirectStringInstanceType(Node* instance_type) {
  CSA_ASSERT(this, IsStringInstanceType(instance_type));
  STATIC_ASSERT(kIsIndirectStringMask == 0x1);
  STATIC_ASSERT(kIsIndirectStringTag == 0x1);
  Node* const indirect_mask = Int32Constant(kIsIndirectStringMask);
  return Word32And(instance_type, indirect_mask);
}
Node* CodeStubAssembler::IsExternalStringInstanceType(Node* instance_type) {
CSA_ASSERT(this, IsStringInstanceType(instance_type));
return Word32Equal(
@ -3660,12 +3677,13 @@ Node* CodeStubAssembler::SubString(Node* context, Node* string, Node* from,
}
ToDirectStringAssembler::ToDirectStringAssembler(
compiler::CodeAssemblerState* state, Node* string)
compiler::CodeAssemblerState* state, Node* string, Flags flags)
: CodeStubAssembler(state),
var_string_(this, MachineRepresentation::kTagged, string),
var_instance_type_(this, MachineRepresentation::kWord32),
var_offset_(this, MachineType::PointerRepresentation()),
var_is_external_(this, MachineRepresentation::kWord32) {
var_is_external_(this, MachineRepresentation::kWord32),
flags_(flags) {
CSA_ASSERT(this, TaggedIsNotSmi(string));
CSA_ASSERT(this, IsString(string));
@ -3722,16 +3740,20 @@ Node* ToDirectStringAssembler::TryToDirect(Label* if_bailout) {
// Sliced string. Fetch parent and correct start index by offset.
BIND(&if_issliced);
{
Node* const string = var_string_.value();
Node* const sliced_offset =
LoadAndUntagObjectField(string, SlicedString::kOffsetOffset);
var_offset_.Bind(IntPtrAdd(var_offset_.value(), sliced_offset));
if (flags_ & kDontUnpackSlicedStrings) {
Goto(if_bailout);
} else {
Node* const string = var_string_.value();
Node* const sliced_offset =
LoadAndUntagObjectField(string, SlicedString::kOffsetOffset);
var_offset_.Bind(IntPtrAdd(var_offset_.value(), sliced_offset));
Node* const parent = LoadObjectField(string, SlicedString::kParentOffset);
var_string_.Bind(parent);
var_instance_type_.Bind(LoadInstanceType(parent));
Node* const parent = LoadObjectField(string, SlicedString::kParentOffset);
var_string_.Bind(parent);
var_instance_type_.Bind(LoadInstanceType(parent));
Goto(&dispatch);
Goto(&dispatch);
}
}
// Thin string. Fetch the actual string.

View File

@ -420,6 +420,8 @@ class V8_EXPORT_PRIVATE CodeStubAssembler : public compiler::CodeAssembler {
// Load length field of a String object.
Node* LoadStringLength(Node* object);
// Loads a pointer to the sequential String char array.
Node* PointerToSeqStringData(Node* seq_string);
// Load value field of a JSValue object.
Node* LoadJSValueValue(Node* object);
// Load value field of a WeakCell object.
@ -742,6 +744,7 @@ class V8_EXPORT_PRIVATE CodeStubAssembler : public compiler::CodeAssembler {
Node* IsShortExternalStringInstanceType(Node* instance_type);
Node* IsSequentialStringInstanceType(Node* instance_type);
Node* IsConsStringInstanceType(Node* instance_type);
Node* IsIndirectStringInstanceType(Node* instance_type);
Node* IsString(Node* object);
Node* IsJSObject(Node* object);
Node* IsJSGlobalProxy(Node* object);
@ -1531,19 +1534,28 @@ class ToDirectStringAssembler : public CodeStubAssembler {
enum StringPointerKind { PTR_TO_DATA, PTR_TO_STRING };
public:
explicit ToDirectStringAssembler(compiler::CodeAssemblerState* state,
Node* string);
enum Flag {
kDontUnpackSlicedStrings = 1 << 0,
};
typedef base::Flags<Flag> Flags;
ToDirectStringAssembler(compiler::CodeAssemblerState* state, Node* string,
Flags flags = Flags());
// Converts flat cons, thin, and sliced strings and returns the direct
// string. The result can be either a sequential or external string.
// Jumps to if_bailout if the string is indirect and cannot
// be unpacked.
Node* TryToDirect(Label* if_bailout);
// Returns a pointer to the beginning of the string data.
// Jumps to if_bailout if the external string cannot be unpacked.
Node* PointerToData(Label* if_bailout) {
return TryToSequential(PTR_TO_DATA, if_bailout);
}
// Returns a pointer that, offset-wise, looks like a String.
// Jumps to if_bailout if the external string cannot be unpacked.
Node* PointerToString(Label* if_bailout) {
return TryToSequential(PTR_TO_STRING, if_bailout);
}
@ -1560,6 +1572,8 @@ class ToDirectStringAssembler : public CodeStubAssembler {
Variable var_instance_type_;
Variable var_offset_;
Variable var_is_external_;
const Flags flags_;
};
#ifdef DEBUG

View File

@ -250,6 +250,12 @@ void ExternalReferenceTable::AddReferences(Isolate* isolate) {
"libc_memset");
Add(ExternalReference::try_internalize_string_function(isolate).address(),
"try_internalize_string_function");
#ifdef V8_INTL_SUPPORT
Add(ExternalReference::intl_convert_one_byte_to_lower(isolate).address(),
"intl_convert_one_byte_to_lower");
Add(ExternalReference::intl_to_latin1_lower_table(isolate).address(),
"intl_to_latin1_lower_table");
#endif // V8_INTL_SUPPORT
Add(ExternalReference::search_string_raw<const uint8_t, const uint8_t>(
isolate)
.address(),

View File

@ -141,6 +141,8 @@ inline int FindFirstUpperOrNonAscii(Handle<String> s, int length) {
} // namespace
// Returns a pointer to the first entry of the kToLower table.
const uint8_t* ToLatin1LowerTable() { return &kToLower[0]; }
const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat,
std::unique_ptr<uc16[]>* dest,
int32_t length) {
@ -201,6 +203,41 @@ MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,
return *s;
}
// Lower-cases a flat one-byte string |src| into the pre-allocated sequential
// one-byte string |dst| of the same length. This is a reduced variant of
// ConvertToLower that performs no heap allocation; it is called from TF
// builtins.
//
// Returns |src| when the whole string was handled by the ASCII fast path and
// no character changed; otherwise returns |dst|.
MUST_USE_RESULT Object* ConvertOneByteToLower(String* src, String* dst,
                                              Isolate* isolate) {
  DCHECK_EQ(src->length(), dst->length());
  DCHECK(src->IsOneByteRepresentation());
  DCHECK(src->IsFlat());
  DCHECK(dst->IsSeqOneByteString());
  DisallowHeapAllocation no_gc;

  const int length = src->length();
  const uint8_t* input = src->GetFlatContent().ToOneByteVector().start();
  uint8_t* output = SeqOneByteString::cast(dst)->GetChars();

  // ASCII fast path; reports how far it got and whether anything changed.
  bool any_changed = false;
  const int first_unprocessed = FastAsciiConvert<true>(
      reinterpret_cast<char*>(output), reinterpret_cast<const char*>(input),
      length, &any_changed);

  if (first_unprocessed != length) {
    // A non-ASCII character stopped the fast path. Keep what it already wrote
    // and convert the remainder one character at a time.
    for (int i = first_unprocessed; i < length; ++i) {
      output[i] = ToLatin1Lower(static_cast<uint16_t>(input[i]));
    }
    return dst;
  }

  // The fast path covered the whole string.
  return any_changed ? dst : src;
}
MUST_USE_RESULT Object* ConvertToLower(Handle<String> s, Isolate* isolate) {
if (!s->HasOnlyOneByteChars()) {
// Use a slower implementation for strings with characters beyond U+00FF.
@ -230,36 +267,27 @@ MUST_USE_RESULT Object* ConvertToLower(Handle<String> s, Isolate* isolate) {
Handle<SeqOneByteString> result =
isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
if (s->IsOneByteRepresentation()) {
return ConvertOneByteToLower(*s, *result, isolate);
}
DisallowHeapAllocation no_gc;
DCHECK(s->IsFlat());
DCHECK(s->IsTwoByteRepresentation());
String::FlatContent flat = s->GetFlatContent();
uint8_t* dest = result->GetChars();
if (flat.IsOneByte()) {
const uint8_t* src = flat.ToOneByteVector().start();
bool has_changed_character = false;
index_to_first_unprocessed = FastAsciiConvert<true>(
reinterpret_cast<char*>(dest), reinterpret_cast<const char*>(src),
length, &has_changed_character);
// If not ASCII, we keep the result up to index_to_first_unprocessed and
// process the rest.
if (index_to_first_unprocessed == length)
return has_changed_character ? *result : *s;
DCHECK(flat.IsTwoByte());
for (int index = index_to_first_unprocessed; index < length; ++index) {
dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
}
} else {
if (index_to_first_unprocessed == length) {
DCHECK(!is_short);
index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);
}
// Nothing to do if the string is all ASCII with no uppercase.
if (index_to_first_unprocessed == length) return *s;
const uint16_t* src = flat.ToUC16Vector().start();
CopyChars(dest, src, index_to_first_unprocessed);
for (int index = index_to_first_unprocessed; index < length; ++index) {
dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
}
uint8_t* dest = result->GetChars();
if (index_to_first_unprocessed == length) {
DCHECK(!is_short);
index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);
}
// Nothing to do if the string is all ASCII with no uppercase.
if (index_to_first_unprocessed == length) return *s;
const uint16_t* src = flat.ToUC16Vector().start();
CopyChars(dest, src, index_to_first_unprocessed);
for (int index = index_to_first_unprocessed; index < length; ++index) {
dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
}
return *result;

View File

@ -30,6 +30,11 @@ MUST_USE_RESULT Object* ConvertToUpper(Handle<String> s, Isolate* isolate);
MUST_USE_RESULT Object* ConvertCase(Handle<String> s, bool is_upper,
Isolate* isolate);
MUST_USE_RESULT Object* ConvertOneByteToLower(String* src, String* dst,
Isolate* isolate);
const uint8_t* ToLatin1LowerTable();
// ICUTimezoneCache calls out to ICU for TimezoneCache
// functionality in a straightforward way.
class ICUTimezoneCache : public base::TimezoneCache {

View File

@ -528,6 +528,7 @@
'builtins/builtins-string.cc',
'builtins/builtins-string-gen.cc',
'builtins/builtins-intl.cc',
'builtins/builtins-intl-gen.cc',
'builtins/builtins-symbol.cc',
'builtins/builtins-symbol-gen.cc',
'builtins/builtins-typedarray.cc',
@ -1836,6 +1837,7 @@
}, { # v8_enable_i18n_support==0
'sources!': [
'builtins/builtins-intl.cc',
'builtins/builtins-intl-gen.cc',
'intl.cc',
'intl.h',
'objects/intl-objects.cc',

View File

@ -59,8 +59,9 @@ function test(length) {
strLower += String.fromCharCode(charCodeToLower(c));
strUpper += String.fromCharCode(charCodeToUpper(c));
}
%FlattenString(strLower);
%FlattenString(strUpper);
str = %FlattenString(str);
strLower = %FlattenString(strLower);
strUpper = %FlattenString(strUpper);
// Sequential string.
assertEquals(strLower, str.toLowerCase());
assertEquals(strUpper, str.toUpperCase());