Added back some utf8 optimizations

R=yangguo@chromium.org
BUG=https://code.google.com/p/v8/issues/detail?id=2551

Review URL: https://codereview.chromium.org/12390057

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13842 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
dcarney@chromium.org 2013-03-06 15:39:57 +00:00
parent 940ed0dc92
commit 07e2494735
3 changed files with 306 additions and 79 deletions

View File

@ -3831,68 +3831,200 @@ bool String::IsOneByte() const {
}
class Utf8LengthVisitor {
class Utf8LengthHelper : public i::AllStatic {
public:
explicit Utf8LengthVisitor()
: utf8_length_(0),
last_character_(unibrow::Utf16::kNoPreviousCharacter) {}
// Bit flags packed into the uint8_t state that the length helpers pass around.
enum State {
  // The text measured so far ends with a UTF-16 lead surrogate.
  kEndsWithLeadingSurrogate = 0x01,
  // The text measured so far starts with a UTF-16 trail surrogate.
  kStartsWithTrailingSurrogate = 0x02,
  // MergeLeafLeft has already recorded the leftmost edge.
  kLeftmostEdgeIsCalculated = 0x04,
  // MergeLeafRight has already recorded the rightmost edge.
  kRightmostEdgeIsCalculated = 0x08,
  // The recorded leftmost edge starts with a trail surrogate.
  kLeftmostEdgeIsSurrogate = 0x10,
  // The recorded rightmost edge ends with a lead surrogate.
  kRightmostEdgeIsSurrogate = 0x20
};
inline int GetLength() {
return utf8_length_;
static const uint8_t kInitialState = 0;
// Returns true when |state| has the kEndsWithLeadingSurrogate flag set.
static inline bool EndsWithSurrogate(uint8_t state) {
  return (state & kEndsWithLeadingSurrogate) != 0;
}
template<typename Char>
inline void Visit(const Char* chars, unsigned length) {
ASSERT(length > 0);
// TODO(dcarney) Add back ascii fast path.
int utf8_length = 0;
int last_character = last_character_;
for (unsigned i = 0; i < length; i++) {
uint16_t c = chars[i];
utf8_length += unibrow::Utf8::Length(c, last_character);
last_character = c;
// Returns true when |state| has the kStartsWithTrailingSurrogate flag set.
static inline bool StartsWithSurrogate(uint8_t state) {
  return (state & kStartsWithTrailingSurrogate) != 0;
}
// Measures the UTF-8 length of a single flat string and records whether that
// string begins with a trail surrogate and/or ends with a lead surrogate.
class Visitor {
 public:
  explicit Visitor() : utf8_length_(0), state_(kInitialState) {}

  // Stores in utf8_length_ the number of UTF-8 bytes needed for the |length|
  // code units at |chars|. The previous code unit is only tracked for
  // characters wider than one byte.
  template<typename Char>
  inline void Visit(const Char* chars, int length) {
    const bool track_previous = sizeof(Char) > 1;
    int previous = unibrow::Utf16::kNoPreviousCharacter;
    int byte_count = 0;
    int index = 0;
    while (index < length) {
      uint16_t code_unit = chars[index];
      byte_count += unibrow::Utf8::Length(code_unit, previous);
      if (track_previous) {
        previous = code_unit;
      }
      ++index;
    }
    utf8_length_ = byte_count;
  }

  // One-byte strings never contribute surrogate edge flags.
  void VisitOneByteString(const uint8_t* chars, int length) {
    Visit(chars, length);
    state_ = kInitialState;
  }

  // Measures the string, then inspects its first and last code units to
  // derive the surrogate edge flags.
  void VisitTwoByteString(const uint16_t* chars, int length) {
    Visit(chars, length);
    const bool starts_with_trail = unibrow::Utf16::IsTrailSurrogate(chars[0]);
    const bool ends_with_lead =
        unibrow::Utf16::IsLeadSurrogate(chars[length - 1]);
    state_ = static_cast<uint8_t>(
        (starts_with_trail ? kStartsWithTrailingSurrogate : 0) |
        (ends_with_lead ? kEndsWithLeadingSurrogate : 0));
  }

  // Runs a fresh Visitor over |string| through i::String::VisitFlat, copies
  // the measured length and edge flags to |length| and |state|, and returns
  // whatever i::String::VisitFlat returned (callers treat NULL as "the string
  // was flat and has been measured").
  static i::ConsString* VisitFlat(i::String* string,
                                  int* length,
                                  uint8_t* state) {
    Visitor leaf_visitor;
    i::ConsString* const pending_cons =
        i::String::VisitFlat(&leaf_visitor, string);
    *length = leaf_visitor.utf8_length_;
    *state = leaf_visitor.state_;
    return pending_cons;
  }

 private:
  int utf8_length_;
  uint8_t state_;
  DISALLOW_COPY_AND_ASSIGN(Visitor);
};
// Folds the edge flags of a leaf that lies to the left into the running
// |state|, adjusting |length| when a surrogate pair straddles the boundary.
//   length:     running UTF-8 byte count; reduced by
//               kBytesSavedByCombiningSurrogates when a pair is combined.
//   state:      running State flags, updated in place.
//   leaf_state: edge flags reported for the leaf being merged.
static inline void MergeLeafLeft(int* length,
uint8_t* state,
uint8_t leaf_state) {
bool edge_surrogate = StartsWithSurrogate(leaf_state);
if (!(*state & kLeftmostEdgeIsCalculated)) {
// First merge from the left: remember whether the overall left edge starts
// with a trail surrogate.
ASSERT(!(*state & kLeftmostEdgeIsSurrogate));
*state |= kLeftmostEdgeIsCalculated
| (edge_surrogate ? kLeftmostEdgeIsSurrogate : 0);
} else if (EndsWithSurrogate(*state) && edge_surrogate) {
// A lead surrogate recorded in |state| meets this leaf's trail surrogate.
*length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
}
// The running "ends with lead surrogate" flag now mirrors this leaf's end.
if (EndsWithSurrogate(leaf_state)) {
*state |= kEndsWithLeadingSurrogate;
} else {
*state &= ~kEndsWithLeadingSurrogate;
}
// NOTE(review): the two statements below use last_character_, utf8_length_,
// last_character and utf8_length, none of which are declared in this
// function; they look like leftovers of the removed Utf8LengthVisitor::Visit
// body -- confirm and drop.
last_character_ = last_character;
utf8_length_ += utf8_length;
}
inline void VisitOneByteString(const uint8_t* chars, unsigned length) {
Visit(chars, length);
// Folds the edge flags of a leaf that lies to the right into the running
// |state|, adjusting |length| when a surrogate pair straddles the boundary.
//   length:     running UTF-8 byte count; reduced by
//               kBytesSavedByCombiningSurrogates when a pair is combined.
//   state:      running State flags, updated in place.
//   leaf_state: edge flags reported for the leaf being merged.
static inline void MergeLeafRight(int* length,
                                  uint8_t* state,
                                  uint8_t leaf_state) {
  const bool leaf_ends_with_lead = EndsWithSurrogate(leaf_state);
  const bool right_edge_known = (*state & kRightmostEdgeIsCalculated) != 0;
  if (!right_edge_known) {
    // First merge from the right: remember whether the overall right edge
    // ends with a lead surrogate.
    ASSERT(!(*state & kRightmostEdgeIsSurrogate));
    *state |= (kRightmostEdgeIsCalculated
        | (leaf_ends_with_lead ? kRightmostEdgeIsSurrogate : 0));
  } else if (leaf_ends_with_lead && StartsWithSurrogate(*state)) {
    // This leaf's lead surrogate meets a trail surrogate recorded in |state|.
    *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
  }
  // The running "starts with trail surrogate" flag now mirrors this leaf.
  const uint8_t without_start_flag =
      static_cast<uint8_t>(*state & ~kStartsWithTrailingSurrogate);
  *state = StartsWithSurrogate(leaf_state)
      ? static_cast<uint8_t>(without_start_flag | kStartsWithTrailingSurrogate)
      : without_start_flag;
}
inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {
Visit(chars, length);
// Finishes a walk once both outer edges have been recorded: combines a final
// lead/trail pair if |state| says one is pending, then writes to |state_out|
// a summary containing only the start/end surrogate flags derived from the
// recorded leftmost and rightmost edges.
static inline void MergeTerminal(int* length,
                                 uint8_t state,
                                 uint8_t* state_out) {
  ASSERT((state & kLeftmostEdgeIsCalculated) &&
         (state & kRightmostEdgeIsCalculated));
  const bool pair_pending =
      EndsWithSurrogate(state) && StartsWithSurrogate(state);
  if (pair_pending) {
    *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
  }
  uint8_t summary = kInitialState;
  if (state & kLeftmostEdgeIsSurrogate) {
    summary |= kStartsWithTrailingSurrogate;
  }
  if (state & kRightmostEdgeIsSurrogate) {
    summary |= kEndsWithLeadingSurrogate;
  }
  *state_out = summary;
}
// Computes the UTF-8 length of the cons-string tree rooted at |current|.
// Flat children are measured with Visitor::VisitFlat and merged into a running
// length/state pair; surrogate pairs split across leaf boundaries are
// accounted for by the Merge* helpers. The loop descends in place whenever at
// most one child is a ConsString and only recurses when both are.
//   current:   root of the (sub)tree to measure.
//   state_out: receives the edge-surrogate summary written by MergeTerminal.
// Returns the accumulated UTF-8 length.
static int Calculate(i::ConsString* current, uint8_t* state_out) {
using namespace internal;
int total_length = 0;
uint8_t state = kInitialState;
while (true) {
i::String* left = current->first();
i::String* right = current->second();
uint8_t right_leaf_state;
uint8_t left_leaf_state;
int leaf_length;
// A NULL result is treated as "child was flat and has been measured".
ConsString* left_as_cons =
Visitor::VisitFlat(left, &leaf_length, &left_leaf_state);
if (left_as_cons == NULL) {
total_length += leaf_length;
MergeLeafLeft(&total_length, &state, left_leaf_state);
}
// leaf_length is reused for the right child; the left value was consumed
// above.
ConsString* right_as_cons =
Visitor::VisitFlat(right, &leaf_length, &right_leaf_state);
if (right_as_cons == NULL) {
total_length += leaf_length;
MergeLeafRight(&total_length, &state, right_leaf_state);
// Terminal node.
if (left_as_cons == NULL) {
MergeTerminal(&total_length, state, state_out);
return total_length;
}
} else if (left_as_cons != NULL) {
// Both strings are ConsStrings.
// Recurse on smallest.
// The child with the smaller length() is handled recursively and its
// summary state is merged like a leaf; the loop continues on the other.
if (left->length() < right->length()) {
total_length += Calculate(left_as_cons, &left_leaf_state);
MergeLeafLeft(&total_length, &state, left_leaf_state);
current = right_as_cons;
continue;
} else {
total_length += Calculate(right_as_cons, &right_leaf_state);
MergeLeafRight(&total_length, &state, right_leaf_state);
current = left_as_cons;
continue;
}
}
// 1 leaf node. Do in place descent.
if (left_as_cons != NULL) {
current = left_as_cons;
} else {
ASSERT(right_as_cons != NULL);
current = right_as_cons;
}
}
// The loop only exits through the return in the terminal-node branch.
UNREACHABLE();
return 0;
}
// Convenience overload for callers that only need the length: the edge
// summary produced by the two-argument Calculate is discarded.
static inline int Calculate(i::ConsString* current) {
  uint8_t discarded_edge_state = kInitialState;
  return Calculate(current, &discarded_edge_state);
}
private:
int utf8_length_;
int last_character_;
DISALLOW_COPY_AND_ASSIGN(Utf8LengthVisitor);
DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8LengthHelper);
};
static int Utf8Length(i::String* str, i::Isolate* isolate) {
unsigned length = static_cast<unsigned>(str->length());
int length = str->length();
if (length == 0) return 0;
int32_t type = str->map()->instance_type();
Utf8LengthVisitor visitor;
// Non ConsString branch.
if ((type & i::kStringRepresentationMask) != i::kConsStringTag) {
i::ConsStringNullOp null_op;
i::String::Visit(str, 0, visitor, null_op, type, length);
return visitor.GetLength();
}
i::ConsStringIteratorOp* op = isolate->write_iterator();
unsigned offset = 0;
i::String* leaf = op->Operate(str, &offset, &type, &length);
ASSERT(leaf != NULL);
while (leaf != NULL) {
i::ConsStringNullOp null_op;
ASSERT(offset == 0);
i::String::Visit(leaf, 0, visitor, null_op, type, length);
leaf = op->ContinueOperation(&type, &length);
}
return visitor.GetLength();
uint8_t state;
i::ConsString* cons_string =
Utf8LengthHelper::Visitor::VisitFlat(str, &length, &state);
if (cons_string == NULL) return length;
return Utf8LengthHelper::Calculate(cons_string);
}
@ -3906,12 +4038,14 @@ int String::Utf8Length() const {
class Utf8WriterVisitor {
public:
Utf8WriterVisitor(char* buffer, int capacity)
Utf8WriterVisitor(
char* buffer, int capacity, bool skip_capacity_check)
: early_termination_(false),
last_character_(unibrow::Utf16::kNoPreviousCharacter),
buffer_(buffer),
start_(buffer),
capacity_(capacity),
skip_capacity_check_(capacity == -1 || skip_capacity_check),
utf16_chars_read_(0) {
}
@ -3935,7 +4069,7 @@ class Utf8WriterVisitor {
// Can't encode using last_character as gcc has array bounds issues.
int written = Utf8::Encode(temp_buffer,
character,
unibrow::Utf16::kNoPreviousCharacter);
Utf16::kNoPreviousCharacter);
// Won't fit.
if (written > remaining) return 0;
// Copy over the character from temp_buffer.
@ -3948,45 +4082,55 @@ class Utf8WriterVisitor {
template<typename Char>
void Visit(const Char* chars, const int length) {
using namespace unibrow;
// TODO(dcarney): Add back ascii fast path.
ASSERT(!early_termination_);
ASSERT(length > 0);
if (length == 0) return;
// Copy state to stack.
char* buffer = buffer_;
int last_character = last_character_;
int last_character =
sizeof(Char) == 1 ? Utf16::kNoPreviousCharacter : last_character_;
int i = 0;
// Do a fast loop where there is no exit capacity check.
while (true) {
int fast_length;
if (capacity_ == -1) {
if (skip_capacity_check_) {
fast_length = length;
} else {
int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);
// Need enough space to write everything but one character.
STATIC_ASSERT(Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit == 3);
int writable_length = (remaining_capacity - 3)/3;
int max_size_per_char = sizeof(Char) == 1 ? 2 : 3;
int writable_length =
(remaining_capacity - max_size_per_char)/max_size_per_char;
// Need to drop into slow loop.
if (writable_length <= 0) break;
fast_length = i + writable_length;
if (fast_length > length) fast_length = length;
}
// Write the characters to the stream.
for (; i < fast_length; i++) {
uint16_t character = *chars++;
buffer += Utf8::Encode(buffer, character, last_character);
last_character = character;
ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
if (sizeof(Char) == 1) {
for (; i < fast_length; i++) {
buffer +=
Utf8::Encode(buffer, *chars++, Utf16::kNoPreviousCharacter);
ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
}
} else {
for (; i < fast_length; i++) {
uint16_t character = *chars++;
buffer += Utf8::Encode(buffer, character, last_character);
last_character = character;
ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
}
}
// Array is fully written. Exit.
if (fast_length == length) {
// Write state back out to object.
last_character_ = last_character;
buffer_ = buffer;
utf16_chars_read_ += i;
utf16_chars_read_ += length;
return;
}
}
ASSERT(capacity_ != -1);
ASSERT(!skip_capacity_check_);
// Slow loop. Must check capacity on each iteration.
int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);
ASSERT(remaining_capacity >= 0);
@ -4014,15 +4158,15 @@ class Utf8WriterVisitor {
return early_termination_;
}
inline void VisitOneByteString(const uint8_t* chars, unsigned length) {
Visit(chars, static_cast<int>(length));
// Visitor entry point for a run of |length| one-byte characters; forwards to
// the templated Visit.
inline void VisitOneByteString(const uint8_t* chars, int length) {
Visit(chars, length);
}
inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {
Visit(chars, static_cast<int>(length));
// Visitor entry point for a run of |length| two-byte characters; forwards to
// the templated Visit.
inline void VisitTwoByteString(const uint16_t* chars, int length) {
Visit(chars, length);
}
inline int CompleteWrite(bool write_null, int* utf16_chars_read_out) {
int CompleteWrite(bool write_null, int* utf16_chars_read_out) {
// Write out number of utf16 characters written to the stream.
if (utf16_chars_read_out != NULL) {
*utf16_chars_read_out = utf16_chars_read_;
@ -4042,11 +4186,32 @@ class Utf8WriterVisitor {
char* buffer_;
char* const start_;
int capacity_;
bool const skip_capacity_check_;
int utf16_chars_read_;
DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
};
// Writes |current| to |writer|, walking cons-string trees left to right.
// The left branch of each ConsString is handled by a real recursive call that
// consumes one unit of |recursion_budget|; the right branch is handled by
// looping, which does not consume budget.
// Returns false if a ConsString is encountered with no budget left (or a
// nested call reported that), true otherwise -- including when the writer
// reports it is done.
static bool RecursivelySerializeToUtf8(i::String* current,
                                       Utf8WriterVisitor* writer,
                                       int recursion_budget) {
  for (;;) {
    if (writer->IsDone()) return true;
    i::ConsString* const cons = i::String::VisitFlat(writer, current);
    // NULL means |current| was a leaf and has been handed to the writer.
    if (cons == NULL) return true;
    if (recursion_budget <= 0) return false;
    // The left branch must be emitted before the right one.
    if (!RecursivelySerializeToUtf8(cons->first(),
                                    writer,
                                    recursion_budget - 1)) {
      return false;
    }
    // Continue with the right branch in this frame.
    current = cons->second();
  }
}
int String::WriteUtf8(char* buffer,
int capacity,
int* nchars_ref,
@ -4059,23 +4224,41 @@ int String::WriteUtf8(char* buffer,
if (options & HINT_MANY_WRITES_EXPECTED) {
FlattenString(str); // Flatten the string for efficiency.
}
Utf8WriterVisitor writer(buffer, capacity);
i::ConsStringIteratorOp* op = isolate->write_iterator();
op->Reset();
int32_t type = str->map()->instance_type();
unsigned str_length = static_cast<unsigned>(str->length());
if (str_length != 0) {
i::String::Visit(*str, 0, writer, *op, type, str_length);
while (!writer.IsDone()) {
unsigned length_out;
i::String* next = op->ContinueOperation(&type, &length_out);
if (next == NULL) break;
// TODO(dcarney): need an asserting null op.
i::ConsStringNullOp null_op;
i::String::Visit(next, 0, writer, null_op, type, length_out);
const int string_length = str->length();
bool write_null = !(options & NO_NULL_TERMINATION);
// First check if we can just write the string without checking capacity.
if (capacity == -1 || capacity / 3 >= string_length) {
Utf8WriterVisitor writer(buffer, capacity, true);
const int kMaxRecursion = 100;
bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);
if (success) return writer.CompleteWrite(write_null, nchars_ref);
} else if (capacity >= string_length) {
// First check that the buffer is large enough.
int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate());
if (utf8_bytes <= capacity) {
// ASCII fast path.
if (utf8_bytes == string_length) {
WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options);
if (nchars_ref != NULL) *nchars_ref = string_length;
if (write_null && (utf8_bytes+1 <= capacity)) {
return string_length + 1;
}
return string_length;
}
if (write_null && (utf8_bytes+1 > capacity)) {
options |= NO_NULL_TERMINATION;
}
// Recurse once without a capacity limit.
// This will get into the first branch above.
// TODO(dcarney) Check max left rec. in Utf8Length and fall through.
return WriteUtf8(buffer, -1, nchars_ref, options);
}
}
return writer.CompleteWrite(!(options & NO_NULL_TERMINATION), nchars_ref);
// Recursive slow path can potentially be unreasonably slow. Flatten.
str = FlattenGetString(str);
Utf8WriterVisitor writer(buffer, capacity, false);
i::String::VisitFlat(&writer, *str);
return writer.CompleteWrite(write_null, nchars_ref);
}

View File

@ -2648,6 +2648,32 @@ void String::Visit(
}
// TODO(dcarney): Remove this class after conversion to VisitFlat.
// Cons-op used by String::VisitFlat: instead of iterating into a ConsString
// it simply remembers it in cons_string_ and returns NULL from Operate.
class ConsStringCaptureOp {
 public:
  ConsStringCaptureOp() : cons_string_(NULL) {}

  // Records |string| (cast to ConsString) and returns NULL. The three
  // trailing arguments are ignored.
  String* Operate(String* string, unsigned*, int32_t*, unsigned*) {
    ConsString* const captured = ConsString::cast(string);
    cons_string_ = captured;
    return NULL;
  }

  // The ConsString passed to Operate, or NULL if Operate was never called.
  ConsString* cons_string_;
};
// Runs String::Visit over |string| with a ConsStringCaptureOp and returns the
// ConsString that op captured, or NULL if it captured none.
// |length| must equal string->length() and |offset| must lie in [0, length].
template<class Visitor>
ConsString* String::VisitFlat(Visitor* visitor,
                              String* string,
                              int offset,
                              int length,
                              int32_t type) {
  ASSERT(length >= 0 && length == string->length());
  ASSERT(offset >= 0 && offset <= length);
  ConsStringCaptureOp capture;
  // Visit takes its length as unsigned.
  const unsigned unsigned_length = static_cast<unsigned>(length);
  Visit(string, offset, *visitor, capture, type, unsigned_length);
  return capture.cons_string_;
}
uint16_t SeqOneByteString::SeqOneByteStringGet(int index) {
ASSERT(index >= 0 && index < length());
return READ_BYTE_FIELD(this, kHeaderSize + index * kCharSize);

View File

@ -7337,6 +7337,8 @@ class Symbol: public Name {
};
class ConsString;
// The String abstract class captures JavaScript string values:
//
// Ecma-262:
@ -7615,6 +7617,7 @@ class String: public Name {
return NonOneByteStart(chars, length) >= length;
}
// TODO(dcarney): Replace all instances of this with VisitFlat.
template<class Visitor, class ConsOp>
static inline void Visit(String* string,
unsigned offset,
@ -7623,6 +7626,21 @@ class String: public Name {
int32_t type,
unsigned length);
// Visits |string| with |visitor|; the caller supplies the string's length and
// instance type explicitly. Returns a ConsString* (see the definition for
// when it is non-NULL).
template<class Visitor>
static inline ConsString* VisitFlat(Visitor* visitor,
                                    String* string,
                                    int offset,
                                    int length,
                                    int32_t type);

// Convenience overload: reads the instance type and length from |string|
// itself and forwards to the five-argument VisitFlat.
template<class Visitor>
static inline ConsString* VisitFlat(Visitor* visitor,
                                    String* string,
                                    int offset = 0) {
  const int32_t instance_type = string->map()->instance_type();
  const int whole_length = string->length();
  return VisitFlat(visitor, string, offset, whole_length, instance_type);
}
private:
friend class Name;