Simplify Utf8Length calculation.
Flattening the string upfront has performance benefits and we can also simplify the implementation quite a bit. Bug: v8:6780 Cq-Include-Trybots: luci.chromium.try:linux_chromium_rel_ng Change-Id: Ic75bdfdf900d30e51e95cdf1cb8d09aab06332c6 Reviewed-on: https://chromium-review.googlesource.com/1014102 Reviewed-by: Marja Hölttä <marja@chromium.org> Reviewed-by: Mathias Bynens <mathias@chromium.org> Commit-Queue: Yang Guo <yangguo@chromium.org> Cr-Commit-Position: refs/heads/master@{#52643}
This commit is contained in:
parent
fbf9b5e4ff
commit
c3da929020
219
src/api.cc
219
src/api.cc
@ -5405,202 +5405,28 @@ bool String::ContainsOnlyOneByte() const {
|
||||
}
|
||||
|
||||
|
||||
class Utf8LengthHelper : public i::AllStatic {
|
||||
public:
|
||||
enum State {
|
||||
kEndsWithLeadingSurrogate = 1 << 0,
|
||||
kStartsWithTrailingSurrogate = 1 << 1,
|
||||
kLeftmostEdgeIsCalculated = 1 << 2,
|
||||
kRightmostEdgeIsCalculated = 1 << 3,
|
||||
kLeftmostEdgeIsSurrogate = 1 << 4,
|
||||
kRightmostEdgeIsSurrogate = 1 << 5
|
||||
};
|
||||
|
||||
static const uint8_t kInitialState = 0;
|
||||
|
||||
static inline bool EndsWithSurrogate(uint8_t state) {
|
||||
return state & kEndsWithLeadingSurrogate;
|
||||
}
|
||||
|
||||
static inline bool StartsWithSurrogate(uint8_t state) {
|
||||
return state & kStartsWithTrailingSurrogate;
|
||||
}
|
||||
|
||||
class Visitor {
|
||||
public:
|
||||
Visitor() : utf8_length_(0), state_(kInitialState) {}
|
||||
|
||||
void VisitOneByteString(const uint8_t* chars, int length) {
|
||||
int utf8_length = 0;
|
||||
// Add in length 1 for each non-Latin1 character.
|
||||
for (int i = 0; i < length; i++) {
|
||||
utf8_length += *chars++ >> 7;
|
||||
}
|
||||
// Add in length 1 for each character.
|
||||
utf8_length_ = utf8_length + length;
|
||||
state_ = kInitialState;
|
||||
}
|
||||
|
||||
void VisitTwoByteString(const uint16_t* chars, int length) {
|
||||
int utf8_length = 0;
|
||||
int last_character = unibrow::Utf16::kNoPreviousCharacter;
|
||||
for (int i = 0; i < length; i++) {
|
||||
uint16_t c = chars[i];
|
||||
utf8_length += unibrow::Utf8::Length(c, last_character);
|
||||
last_character = c;
|
||||
}
|
||||
utf8_length_ = utf8_length;
|
||||
uint8_t state = 0;
|
||||
if (unibrow::Utf16::IsTrailSurrogate(chars[0])) {
|
||||
state |= kStartsWithTrailingSurrogate;
|
||||
}
|
||||
if (unibrow::Utf16::IsLeadSurrogate(chars[length-1])) {
|
||||
state |= kEndsWithLeadingSurrogate;
|
||||
}
|
||||
state_ = state;
|
||||
}
|
||||
|
||||
static i::ConsString* VisitFlat(i::String* string,
|
||||
int* length,
|
||||
uint8_t* state) {
|
||||
Visitor visitor;
|
||||
i::ConsString* cons_string = i::String::VisitFlat(&visitor, string);
|
||||
*length = visitor.utf8_length_;
|
||||
*state = visitor.state_;
|
||||
return cons_string;
|
||||
}
|
||||
|
||||
private:
|
||||
int utf8_length_;
|
||||
uint8_t state_;
|
||||
DISALLOW_COPY_AND_ASSIGN(Visitor);
|
||||
};
|
||||
|
||||
static inline void MergeLeafLeft(int* length,
|
||||
uint8_t* state,
|
||||
uint8_t leaf_state) {
|
||||
bool edge_surrogate = StartsWithSurrogate(leaf_state);
|
||||
if (!(*state & kLeftmostEdgeIsCalculated)) {
|
||||
DCHECK(!(*state & kLeftmostEdgeIsSurrogate));
|
||||
*state |= kLeftmostEdgeIsCalculated
|
||||
| (edge_surrogate ? kLeftmostEdgeIsSurrogate : 0);
|
||||
} else if (EndsWithSurrogate(*state) && edge_surrogate) {
|
||||
*length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
|
||||
}
|
||||
if (EndsWithSurrogate(leaf_state)) {
|
||||
*state |= kEndsWithLeadingSurrogate;
|
||||
} else {
|
||||
*state &= ~kEndsWithLeadingSurrogate;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void MergeLeafRight(int* length,
|
||||
uint8_t* state,
|
||||
uint8_t leaf_state) {
|
||||
bool edge_surrogate = EndsWithSurrogate(leaf_state);
|
||||
if (!(*state & kRightmostEdgeIsCalculated)) {
|
||||
DCHECK(!(*state & kRightmostEdgeIsSurrogate));
|
||||
*state |= (kRightmostEdgeIsCalculated
|
||||
| (edge_surrogate ? kRightmostEdgeIsSurrogate : 0));
|
||||
} else if (edge_surrogate && StartsWithSurrogate(*state)) {
|
||||
*length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
|
||||
}
|
||||
if (StartsWithSurrogate(leaf_state)) {
|
||||
*state |= kStartsWithTrailingSurrogate;
|
||||
} else {
|
||||
*state &= ~kStartsWithTrailingSurrogate;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void MergeTerminal(int* length,
|
||||
uint8_t state,
|
||||
uint8_t* state_out) {
|
||||
DCHECK((state & kLeftmostEdgeIsCalculated) &&
|
||||
(state & kRightmostEdgeIsCalculated));
|
||||
if (EndsWithSurrogate(state) && StartsWithSurrogate(state)) {
|
||||
*length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
|
||||
}
|
||||
*state_out = kInitialState |
|
||||
(state & kLeftmostEdgeIsSurrogate ? kStartsWithTrailingSurrogate : 0) |
|
||||
(state & kRightmostEdgeIsSurrogate ? kEndsWithLeadingSurrogate : 0);
|
||||
}
|
||||
|
||||
static int Calculate(i::ConsString* current, uint8_t* state_out) {
|
||||
using internal::ConsString;
|
||||
int total_length = 0;
|
||||
uint8_t state = kInitialState;
|
||||
while (true) {
|
||||
i::String* left = current->first();
|
||||
i::String* right = current->second();
|
||||
uint8_t right_leaf_state;
|
||||
uint8_t left_leaf_state;
|
||||
int leaf_length;
|
||||
ConsString* left_as_cons =
|
||||
Visitor::VisitFlat(left, &leaf_length, &left_leaf_state);
|
||||
if (left_as_cons == nullptr) {
|
||||
total_length += leaf_length;
|
||||
MergeLeafLeft(&total_length, &state, left_leaf_state);
|
||||
}
|
||||
ConsString* right_as_cons =
|
||||
Visitor::VisitFlat(right, &leaf_length, &right_leaf_state);
|
||||
if (right_as_cons == nullptr) {
|
||||
total_length += leaf_length;
|
||||
MergeLeafRight(&total_length, &state, right_leaf_state);
|
||||
if (left_as_cons != nullptr) {
|
||||
// 1 Leaf node. Descend in place.
|
||||
current = left_as_cons;
|
||||
continue;
|
||||
} else {
|
||||
// Terminal node.
|
||||
MergeTerminal(&total_length, state, state_out);
|
||||
return total_length;
|
||||
}
|
||||
} else if (left_as_cons == nullptr) {
|
||||
// 1 Leaf node. Descend in place.
|
||||
current = right_as_cons;
|
||||
continue;
|
||||
}
|
||||
// Both strings are ConsStrings.
|
||||
// Recurse on smallest.
|
||||
if (left->length() < right->length()) {
|
||||
total_length += Calculate(left_as_cons, &left_leaf_state);
|
||||
MergeLeafLeft(&total_length, &state, left_leaf_state);
|
||||
current = right_as_cons;
|
||||
} else {
|
||||
total_length += Calculate(right_as_cons, &right_leaf_state);
|
||||
MergeLeafRight(&total_length, &state, right_leaf_state);
|
||||
current = left_as_cons;
|
||||
}
|
||||
}
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
static inline int Calculate(i::ConsString* current) {
|
||||
uint8_t state = kInitialState;
|
||||
return Calculate(current, &state);
|
||||
}
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8LengthHelper);
|
||||
};
|
||||
|
||||
|
||||
static int Utf8Length(i::String* str, i::Isolate* isolate) {
|
||||
int length = str->length();
|
||||
if (length == 0) return 0;
|
||||
uint8_t state;
|
||||
i::ConsString* cons_string =
|
||||
Utf8LengthHelper::Visitor::VisitFlat(str, &length, &state);
|
||||
if (cons_string == nullptr) return length;
|
||||
return Utf8LengthHelper::Calculate(cons_string);
|
||||
}
|
||||
|
||||
|
||||
int String::Utf8Length() const {
|
||||
i::Handle<i::String> str = Utils::OpenHandle(this);
|
||||
str = i::String::Flatten(str);
|
||||
i::Isolate* isolate = str->GetIsolate();
|
||||
return v8::Utf8Length(*str, isolate);
|
||||
int length = str->length();
|
||||
if (length == 0) return 0;
|
||||
i::DisallowHeapAllocation no_gc;
|
||||
i::String::FlatContent flat = str->GetFlatContent();
|
||||
DCHECK(flat.IsFlat());
|
||||
int utf8_length = 0;
|
||||
if (flat.IsOneByte()) {
|
||||
for (uint8_t c : flat.ToOneByteVector()) {
|
||||
utf8_length += c >> 7;
|
||||
}
|
||||
utf8_length += length;
|
||||
} else {
|
||||
int last_character = unibrow::Utf16::kNoPreviousCharacter;
|
||||
for (uint16_t c : flat.ToUC16Vector()) {
|
||||
utf8_length += unibrow::Utf8::Length(c, last_character);
|
||||
last_character = c;
|
||||
}
|
||||
}
|
||||
return utf8_length;
|
||||
}
|
||||
|
||||
|
||||
@ -5825,7 +5651,7 @@ int String::WriteUtf8(char* buffer,
|
||||
if (success) return writer.CompleteWrite(write_null, nchars_ref);
|
||||
} else if (capacity >= string_length) {
|
||||
// First check that the buffer is large enough.
|
||||
int utf8_bytes = v8::Utf8Length(*str, isolate);
|
||||
int utf8_bytes = Utf8Length();
|
||||
if (utf8_bytes <= capacity) {
|
||||
// one-byte fast path.
|
||||
if (utf8_bytes == string_length) {
|
||||
@ -5845,8 +5671,6 @@ int String::WriteUtf8(char* buffer,
|
||||
return WriteUtf8(buffer, -1, nchars_ref, options);
|
||||
}
|
||||
}
|
||||
// Recursive slow path can potentially be unreasonable slow. Flatten.
|
||||
str = i::String::Flatten(str);
|
||||
Utf8WriterVisitor writer(buffer, capacity, false, replace_invalid_utf8);
|
||||
i::String::VisitFlat(&writer, *str);
|
||||
return writer.CompleteWrite(write_null, nchars_ref);
|
||||
@ -9055,8 +8879,7 @@ String::Utf8Value::Utf8Value(v8::Isolate* isolate, v8::Local<v8::Value> obj)
|
||||
TryCatch try_catch(isolate);
|
||||
Local<String> str;
|
||||
if (!obj->ToString(context).ToLocal(&str)) return;
|
||||
i::Handle<i::String> i_str = Utils::OpenHandle(*str);
|
||||
length_ = v8::Utf8Length(*i_str, i_isolate);
|
||||
length_ = str->Utf8Length();
|
||||
str_ = i::NewArray<char>(length_ + 1);
|
||||
str->WriteUtf8(str_);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user