Simplify Utf8Length calculation.

Flattening the string upfront has performance benefits and we can
also simplify the implementation quite a bit.

Bug: v8:6780
Cq-Include-Trybots: luci.chromium.try:linux_chromium_rel_ng
Change-Id: Ic75bdfdf900d30e51e95cdf1cb8d09aab06332c6
Reviewed-on: https://chromium-review.googlesource.com/1014102
Reviewed-by: Marja Hölttä <marja@chromium.org>
Reviewed-by: Mathias Bynens <mathias@chromium.org>
Commit-Queue: Yang Guo <yangguo@chromium.org>
Cr-Commit-Position: refs/heads/master@{#52643}
This commit is contained in:
Yang Guo 2018-04-16 12:23:04 +02:00 committed by Commit Bot
parent fbf9b5e4ff
commit c3da929020

View File

@ -5405,202 +5405,28 @@ bool String::ContainsOnlyOneByte() const {
}
class Utf8LengthHelper : public i::AllStatic {
public:
enum State {
kEndsWithLeadingSurrogate = 1 << 0,
kStartsWithTrailingSurrogate = 1 << 1,
kLeftmostEdgeIsCalculated = 1 << 2,
kRightmostEdgeIsCalculated = 1 << 3,
kLeftmostEdgeIsSurrogate = 1 << 4,
kRightmostEdgeIsSurrogate = 1 << 5
};
static const uint8_t kInitialState = 0;
static inline bool EndsWithSurrogate(uint8_t state) {
return state & kEndsWithLeadingSurrogate;
}
static inline bool StartsWithSurrogate(uint8_t state) {
return state & kStartsWithTrailingSurrogate;
}
class Visitor {
public:
Visitor() : utf8_length_(0), state_(kInitialState) {}
void VisitOneByteString(const uint8_t* chars, int length) {
int utf8_length = 0;
// Add in length 1 for each non-Latin1 character.
for (int i = 0; i < length; i++) {
utf8_length += *chars++ >> 7;
}
// Add in length 1 for each character.
utf8_length_ = utf8_length + length;
state_ = kInitialState;
}
void VisitTwoByteString(const uint16_t* chars, int length) {
int utf8_length = 0;
int last_character = unibrow::Utf16::kNoPreviousCharacter;
for (int i = 0; i < length; i++) {
uint16_t c = chars[i];
utf8_length += unibrow::Utf8::Length(c, last_character);
last_character = c;
}
utf8_length_ = utf8_length;
uint8_t state = 0;
if (unibrow::Utf16::IsTrailSurrogate(chars[0])) {
state |= kStartsWithTrailingSurrogate;
}
if (unibrow::Utf16::IsLeadSurrogate(chars[length-1])) {
state |= kEndsWithLeadingSurrogate;
}
state_ = state;
}
static i::ConsString* VisitFlat(i::String* string,
int* length,
uint8_t* state) {
Visitor visitor;
i::ConsString* cons_string = i::String::VisitFlat(&visitor, string);
*length = visitor.utf8_length_;
*state = visitor.state_;
return cons_string;
}
private:
int utf8_length_;
uint8_t state_;
DISALLOW_COPY_AND_ASSIGN(Visitor);
};
static inline void MergeLeafLeft(int* length,
uint8_t* state,
uint8_t leaf_state) {
bool edge_surrogate = StartsWithSurrogate(leaf_state);
if (!(*state & kLeftmostEdgeIsCalculated)) {
DCHECK(!(*state & kLeftmostEdgeIsSurrogate));
*state |= kLeftmostEdgeIsCalculated
| (edge_surrogate ? kLeftmostEdgeIsSurrogate : 0);
} else if (EndsWithSurrogate(*state) && edge_surrogate) {
*length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
}
if (EndsWithSurrogate(leaf_state)) {
*state |= kEndsWithLeadingSurrogate;
} else {
*state &= ~kEndsWithLeadingSurrogate;
}
}
static inline void MergeLeafRight(int* length,
uint8_t* state,
uint8_t leaf_state) {
bool edge_surrogate = EndsWithSurrogate(leaf_state);
if (!(*state & kRightmostEdgeIsCalculated)) {
DCHECK(!(*state & kRightmostEdgeIsSurrogate));
*state |= (kRightmostEdgeIsCalculated
| (edge_surrogate ? kRightmostEdgeIsSurrogate : 0));
} else if (edge_surrogate && StartsWithSurrogate(*state)) {
*length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
}
if (StartsWithSurrogate(leaf_state)) {
*state |= kStartsWithTrailingSurrogate;
} else {
*state &= ~kStartsWithTrailingSurrogate;
}
}
static inline void MergeTerminal(int* length,
uint8_t state,
uint8_t* state_out) {
DCHECK((state & kLeftmostEdgeIsCalculated) &&
(state & kRightmostEdgeIsCalculated));
if (EndsWithSurrogate(state) && StartsWithSurrogate(state)) {
*length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
}
*state_out = kInitialState |
(state & kLeftmostEdgeIsSurrogate ? kStartsWithTrailingSurrogate : 0) |
(state & kRightmostEdgeIsSurrogate ? kEndsWithLeadingSurrogate : 0);
}
static int Calculate(i::ConsString* current, uint8_t* state_out) {
using internal::ConsString;
int total_length = 0;
uint8_t state = kInitialState;
while (true) {
i::String* left = current->first();
i::String* right = current->second();
uint8_t right_leaf_state;
uint8_t left_leaf_state;
int leaf_length;
ConsString* left_as_cons =
Visitor::VisitFlat(left, &leaf_length, &left_leaf_state);
if (left_as_cons == nullptr) {
total_length += leaf_length;
MergeLeafLeft(&total_length, &state, left_leaf_state);
}
ConsString* right_as_cons =
Visitor::VisitFlat(right, &leaf_length, &right_leaf_state);
if (right_as_cons == nullptr) {
total_length += leaf_length;
MergeLeafRight(&total_length, &state, right_leaf_state);
if (left_as_cons != nullptr) {
// 1 Leaf node. Descend in place.
current = left_as_cons;
continue;
} else {
// Terminal node.
MergeTerminal(&total_length, state, state_out);
return total_length;
}
} else if (left_as_cons == nullptr) {
// 1 Leaf node. Descend in place.
current = right_as_cons;
continue;
}
// Both strings are ConsStrings.
// Recurse on smallest.
if (left->length() < right->length()) {
total_length += Calculate(left_as_cons, &left_leaf_state);
MergeLeafLeft(&total_length, &state, left_leaf_state);
current = right_as_cons;
} else {
total_length += Calculate(right_as_cons, &right_leaf_state);
MergeLeafRight(&total_length, &state, right_leaf_state);
current = left_as_cons;
}
}
UNREACHABLE();
}
static inline int Calculate(i::ConsString* current) {
uint8_t state = kInitialState;
return Calculate(current, &state);
}
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8LengthHelper);
};
static int Utf8Length(i::String* str, i::Isolate* isolate) {
int length = str->length();
if (length == 0) return 0;
uint8_t state;
i::ConsString* cons_string =
Utf8LengthHelper::Visitor::VisitFlat(str, &length, &state);
if (cons_string == nullptr) return length;
return Utf8LengthHelper::Calculate(cons_string);
}
int String::Utf8Length() const {
i::Handle<i::String> str = Utils::OpenHandle(this);
str = i::String::Flatten(str);
i::Isolate* isolate = str->GetIsolate();
return v8::Utf8Length(*str, isolate);
int length = str->length();
if (length == 0) return 0;
i::DisallowHeapAllocation no_gc;
i::String::FlatContent flat = str->GetFlatContent();
DCHECK(flat.IsFlat());
int utf8_length = 0;
if (flat.IsOneByte()) {
for (uint8_t c : flat.ToOneByteVector()) {
utf8_length += c >> 7;
}
utf8_length += length;
} else {
int last_character = unibrow::Utf16::kNoPreviousCharacter;
for (uint16_t c : flat.ToUC16Vector()) {
utf8_length += unibrow::Utf8::Length(c, last_character);
last_character = c;
}
}
return utf8_length;
}
@ -5825,7 +5651,7 @@ int String::WriteUtf8(char* buffer,
if (success) return writer.CompleteWrite(write_null, nchars_ref);
} else if (capacity >= string_length) {
// First check that the buffer is large enough.
int utf8_bytes = v8::Utf8Length(*str, isolate);
int utf8_bytes = Utf8Length();
if (utf8_bytes <= capacity) {
// one-byte fast path.
if (utf8_bytes == string_length) {
@ -5845,8 +5671,6 @@ int String::WriteUtf8(char* buffer,
return WriteUtf8(buffer, -1, nchars_ref, options);
}
}
// Recursive slow path can potentially be unreasonable slow. Flatten.
str = i::String::Flatten(str);
Utf8WriterVisitor writer(buffer, capacity, false, replace_invalid_utf8);
i::String::VisitFlat(&writer, *str);
return writer.CompleteWrite(write_null, nchars_ref);
@ -9055,8 +8879,7 @@ String::Utf8Value::Utf8Value(v8::Isolate* isolate, v8::Local<v8::Value> obj)
TryCatch try_catch(isolate);
Local<String> str;
if (!obj->ToString(context).ToLocal(&str)) return;
i::Handle<i::String> i_str = Utils::OpenHandle(*str);
length_ = v8::Utf8Length(*i_str, i_isolate);
length_ = str->Utf8Length();
str_ = i::NewArray<char>(length_ + 1);
str->WriteUtf8(str_);
}