Revert "[utf8] Rewrite NewStringFromUtf8 using Utf8::ValueOfIncremental"
This reverts commit 73dd9b5527.
Reason for revert: Broke telemetry layout tests - https://ci.chromium.org/p/chromium/builders/luci.chromium.try/win7-rel/9936 as can be seen in this roll - https://chromium-review.googlesource.com/c/chromium/src/+/1454259
Original change's description:
> [utf8] Rewrite NewStringFromUtf8 using Utf8::ValueOfIncremental
>
> This is 3-4x faster than using the Utf8Decoder. This matters for proper
> parse-time measurements using d8.
>
> Change-Id: I9870e9fbe400ec022a6eeb20491c80a2a32f8519
> Reviewed-on: https://chromium-review.googlesource.com/c/1451827
> Commit-Queue: Toon Verwaest <verwaest@chromium.org>
> Reviewed-by: Leszek Swirski <leszeks@chromium.org>
> Reviewed-by: Ulan Degenbaev <ulan@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#59347}
TBR=ulan@chromium.org,leszeks@chromium.org,verwaest@chromium.org
# Not skipping CQ checks because original CL landed > 1 day ago.
Change-Id: I3f8faebb61c19a41ee496a571228f53c0d5fc8dd
Reviewed-on: https://chromium-review.googlesource.com/c/1454495
Reviewed-by: Maya Lekova <mslekova@chromium.org>
Commit-Queue: Yang Guo <yangguo@chromium.org>
Cr-Commit-Position: refs/heads/master@{#59378}
This commit is contained in:
parent
85fcaff1b0
commit
ec30cf47c7
@@ -42,7 +42,7 @@
|
||||
#include "src/objects/stack-frame-info-inl.h"
|
||||
#include "src/objects/struct-inl.h"
|
||||
#include "src/unicode-cache.h"
|
||||
#include "src/unicode-inl.h"
|
||||
#include "src/unicode-decoder.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
@@ -661,38 +661,13 @@ MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string,
|
||||
return NewStringFromOneByte(Vector<const uint8_t>::cast(string), pretenure);
|
||||
}
|
||||
|
||||
std::unique_ptr<uint16_t[]> buffer(new uint16_t[length - non_ascii_start]);
|
||||
// Non-ASCII and we need to decode.
|
||||
auto non_ascii = string.SubVector(non_ascii_start, length);
|
||||
Access<UnicodeCache::Utf8Decoder> decoder(
|
||||
isolate()->unicode_cache()->utf8_decoder());
|
||||
decoder->Reset(non_ascii);
|
||||
|
||||
const uint8_t* cursor =
|
||||
reinterpret_cast<const uint8_t*>(&string[non_ascii_start]);
|
||||
const uint8_t* end = reinterpret_cast<const uint8_t*>(string.end());
|
||||
|
||||
uint16_t* output_cursor = buffer.get();
|
||||
|
||||
uint32_t incomplete_char = 0;
|
||||
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
|
||||
|
||||
while (cursor < end) {
|
||||
unibrow::uchar t =
|
||||
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
|
||||
|
||||
if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) {
|
||||
*(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
|
||||
} else if (t == unibrow::Utf8::kIncomplete) {
|
||||
continue;
|
||||
} else {
|
||||
*(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
|
||||
*(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
|
||||
}
|
||||
}
|
||||
|
||||
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
|
||||
if (t != unibrow::Utf8::kBufferEmpty) {
|
||||
*(output_cursor++) = static_cast<uc16>(t);
|
||||
}
|
||||
|
||||
DCHECK_LE(output_cursor, buffer.get() + length - non_ascii_start);
|
||||
int utf16_length = static_cast<int>(output_cursor - buffer.get());
|
||||
int utf16_length = static_cast<int>(decoder->Utf16Length());
|
||||
DCHECK_GT(utf16_length, 0);
|
||||
|
||||
// Allocate string.
|
||||
@@ -701,13 +676,15 @@ MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string,
|
||||
isolate(), result,
|
||||
NewRawTwoByteString(non_ascii_start + utf16_length, pretenure), String);
|
||||
|
||||
DCHECK_LE(non_ascii_start + utf16_length, length);
|
||||
|
||||
// Copy ASCII portion.
|
||||
DisallowHeapAllocation no_gc;
|
||||
uint16_t* data = result->GetChars(no_gc);
|
||||
CopyChars(data, ascii_data, non_ascii_start);
|
||||
CopyChars(data + non_ascii_start, buffer.get(), utf16_length);
|
||||
for (int i = 0; i < non_ascii_start; i++) {
|
||||
*data++ = *ascii_data++;
|
||||
}
|
||||
|
||||
// Now write the remainder.
|
||||
decoder->WriteUtf16(data, utf16_length, non_ascii);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@@ -514,38 +514,23 @@ bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
|
||||
unibrow::Utf8::State state = chunk.start.state;
|
||||
uint32_t incomplete_char = chunk.start.incomplete_char;
|
||||
size_t it = current_.pos.bytes - chunk.start.bytes;
|
||||
const uint8_t* cursor = &chunk.data[it];
|
||||
const uint8_t* end = &chunk.data[chunk.length];
|
||||
|
||||
size_t chars = current_.pos.chars;
|
||||
|
||||
if (V8_UNLIKELY(current_.pos.bytes < 3 && chars == 0)) {
|
||||
while (cursor < end) {
|
||||
unibrow::uchar t =
|
||||
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
|
||||
if (t == unibrow::Utf8::kIncomplete) continue;
|
||||
if (t != kUtf8Bom) {
|
||||
chars++;
|
||||
if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
while (cursor < end && chars < position) {
|
||||
unibrow::uchar t =
|
||||
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
|
||||
if (t != unibrow::Utf8::kIncomplete) {
|
||||
size_t chars = chunk.start.chars;
|
||||
while (it < chunk.length && chars < position) {
|
||||
unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
|
||||
chunk.data[it], &it, &state, &incomplete_char);
|
||||
if (t == kUtf8Bom && current_.pos.chars == 0) {
|
||||
// BOM detected at beginning of the stream. Don't copy it.
|
||||
} else if (t != unibrow::Utf8::kIncomplete) {
|
||||
chars++;
|
||||
if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
|
||||
}
|
||||
}
|
||||
|
||||
current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
|
||||
current_.pos.bytes += it;
|
||||
current_.pos.chars = chars;
|
||||
current_.pos.incomplete_char = incomplete_char;
|
||||
current_.pos.state = state;
|
||||
current_.chunk_no += (cursor == end);
|
||||
current_.chunk_no += (it == chunk.length);
|
||||
|
||||
return current_.pos.chars == position;
|
||||
}
|
||||
@@ -559,8 +544,8 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
|
||||
|
||||
// The buffer_ is writable, but buffer_*_ members are const. So we get a
|
||||
// non-const pointer into buffer that points to the same char as buffer_end_.
|
||||
uint16_t* output_cursor = buffer_ + (buffer_end_ - buffer_start_);
|
||||
DCHECK_EQ(output_cursor, buffer_end_);
|
||||
uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
|
||||
DCHECK_EQ(cursor, buffer_end_);
|
||||
|
||||
unibrow::Utf8::State state = current_.pos.state;
|
||||
uint32_t incomplete_char = current_.pos.incomplete_char;
|
||||
@@ -571,7 +556,7 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
|
||||
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
|
||||
if (t != unibrow::Utf8::kBufferEmpty) {
|
||||
DCHECK_EQ(t, unibrow::Utf8::kBadChar);
|
||||
*output_cursor = static_cast<uc16>(t);
|
||||
*cursor = static_cast<uc16>(t);
|
||||
buffer_end_++;
|
||||
current_.pos.chars++;
|
||||
current_.pos.incomplete_char = 0;
|
||||
@@ -581,50 +566,30 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
|
||||
}
|
||||
|
||||
size_t it = current_.pos.bytes - chunk.start.bytes;
|
||||
const uint8_t* cursor = chunk.data + it;
|
||||
const uint8_t* end = chunk.data + chunk.length;
|
||||
|
||||
// Deal with possible BOM.
|
||||
if (V8_UNLIKELY(current_.pos.bytes < 3 && current_.pos.chars == 0)) {
|
||||
while (cursor < end) {
|
||||
unibrow::uchar t =
|
||||
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
|
||||
if (V8_LIKELY(t < kUtf8Bom)) {
|
||||
*(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
|
||||
} else if (t == unibrow::Utf8::kIncomplete) {
|
||||
continue;
|
||||
} else if (t == kUtf8Bom) {
|
||||
// BOM detected at beginning of the stream. Don't copy it.
|
||||
} else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
|
||||
*(output_cursor++) = static_cast<uc16>(t);
|
||||
} else {
|
||||
*(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
|
||||
*(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
while (cursor < end && output_cursor + 1 < buffer_start_ + kBufferSize) {
|
||||
unibrow::uchar t =
|
||||
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
|
||||
if (V8_LIKELY(t < unibrow::Utf16::kMaxNonSurrogateCharCode)) {
|
||||
*(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
|
||||
while (it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize) {
|
||||
unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
|
||||
chunk.data[it], &it, &state, &incomplete_char);
|
||||
if (V8_LIKELY(t < kUtf8Bom)) {
|
||||
*(cursor++) = static_cast<uc16>(t); // The by most frequent case.
|
||||
} else if (t == unibrow::Utf8::kIncomplete) {
|
||||
continue;
|
||||
} else if (t == kUtf8Bom && current_.pos.bytes + it == 3) {
|
||||
// BOM detected at beginning of the stream. Don't copy it.
|
||||
} else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
|
||||
*(cursor++) = static_cast<uc16>(t);
|
||||
} else {
|
||||
*(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
|
||||
*(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
|
||||
*(cursor++) = unibrow::Utf16::LeadSurrogate(t);
|
||||
*(cursor++) = unibrow::Utf16::TrailSurrogate(t);
|
||||
}
|
||||
}
|
||||
|
||||
current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
|
||||
current_.pos.chars += (output_cursor - buffer_end_);
|
||||
current_.pos.bytes = chunk.start.bytes + it;
|
||||
current_.pos.chars += (cursor - buffer_end_);
|
||||
current_.pos.incomplete_char = incomplete_char;
|
||||
current_.pos.state = state;
|
||||
current_.chunk_no += (cursor == end);
|
||||
current_.chunk_no += (it == chunk.length);
|
||||
|
||||
buffer_end_ = output_cursor;
|
||||
buffer_end_ = cursor;
|
||||
}
|
||||
|
||||
bool Utf8ExternalStreamingStream::FetchChunk() {
|
||||
|
@@ -56,53 +56,6 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
|
||||
}
|
||||
}
|
||||
|
||||
// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they
|
||||
// stream in. This **must** be followed by a call to ValueOfIncrementalFinish
|
||||
// when the stream is complete, to ensure incomplete sequences are handled.
|
||||
uchar Utf8::ValueOfIncremental(const byte** cursor, State* state,
|
||||
Utf8IncrementalBuffer* buffer) {
|
||||
DCHECK_NOT_NULL(buffer);
|
||||
State old_state = *state;
|
||||
byte next = **cursor;
|
||||
*cursor += 1;
|
||||
|
||||
if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) {
|
||||
DCHECK_EQ(0u, *buffer);
|
||||
return static_cast<uchar>(next);
|
||||
}
|
||||
|
||||
// So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation
|
||||
// char in that sequence.
|
||||
Utf8DfaDecoder::Decode(next, state, buffer);
|
||||
|
||||
switch (*state) {
|
||||
case State::kAccept: {
|
||||
uchar t = *buffer;
|
||||
*buffer = 0;
|
||||
return t;
|
||||
}
|
||||
|
||||
case State::kReject:
|
||||
*state = State::kAccept;
|
||||
*buffer = 0;
|
||||
|
||||
// If we hit a bad byte, we need to determine if we were trying to start
|
||||
// a sequence or continue one. If we were trying to start a sequence,
|
||||
// that means it's just an invalid lead byte and we need to continue to
|
||||
// the next (which we already did above). If we were already in a
|
||||
// sequence, we need to reprocess this same byte after resetting to the
|
||||
// initial state.
|
||||
if (old_state != State::kAccept) {
|
||||
// We were trying to continue a sequence, so let's reprocess this byte
|
||||
// next time.
|
||||
*cursor -= 1;
|
||||
}
|
||||
return kBadChar;
|
||||
|
||||
default:
|
||||
return kIncomplete;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
|
||||
static const int kMask = ~(1 << 6);
|
||||
|
@@ -203,17 +203,62 @@ uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
|
||||
Utf8IncrementalBuffer buffer = 0;
|
||||
uchar t;
|
||||
|
||||
const byte* start = str;
|
||||
const byte* end = str + max_length;
|
||||
|
||||
size_t i = 0;
|
||||
do {
|
||||
t = ValueOfIncremental(&str, &state, &buffer);
|
||||
} while (str < end && t == kIncomplete);
|
||||
t = ValueOfIncremental(str[i], &i, &state, &buffer);
|
||||
} while (i < max_length && t == kIncomplete);
|
||||
|
||||
*cursor += str - start;
|
||||
*cursor += i;
|
||||
return (state == State::kAccept) ? t : kBadChar;
|
||||
}
|
||||
|
||||
// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they
|
||||
// stream in. This **must** be followed by a call to ValueOfIncrementalFinish
|
||||
// when the stream is complete, to ensure incomplete sequences are handled.
|
||||
uchar Utf8::ValueOfIncremental(byte next, size_t* cursor, State* state,
|
||||
Utf8IncrementalBuffer* buffer) {
|
||||
DCHECK_NOT_NULL(buffer);
|
||||
State old_state = *state;
|
||||
*cursor += 1;
|
||||
|
||||
if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) {
|
||||
DCHECK_EQ(0u, *buffer);
|
||||
return static_cast<uchar>(next);
|
||||
}
|
||||
|
||||
// So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation
|
||||
// char in that sequence.
|
||||
Utf8DfaDecoder::Decode(next, state, buffer);
|
||||
|
||||
switch (*state) {
|
||||
case State::kAccept: {
|
||||
uchar t = *buffer;
|
||||
*buffer = 0;
|
||||
return t;
|
||||
}
|
||||
|
||||
case State::kReject:
|
||||
*state = State::kAccept;
|
||||
*buffer = 0;
|
||||
|
||||
// If we hit a bad byte, we need to determine if we were trying to start
|
||||
// a sequence or continue one. If we were trying to start a sequence,
|
||||
// that means it's just an invalid lead byte and we need to continue to
|
||||
// the next (which we already did above). If we were already in a
|
||||
// sequence, we need to reprocess this same byte after resetting to the
|
||||
// initial state.
|
||||
if (old_state != State::kAccept) {
|
||||
// We were trying to continue a sequence, so let's reprocess this byte
|
||||
// next time.
|
||||
*cursor -= 1;
|
||||
}
|
||||
return kBadChar;
|
||||
|
||||
default:
|
||||
return kIncomplete;
|
||||
}
|
||||
}
|
||||
|
||||
// Finishes the incremental decoding, ensuring that if an unfinished sequence
|
||||
// is left that it is replaced by a replacement char.
|
||||
uchar Utf8::ValueOfIncrementalFinish(State* state) {
|
||||
|
@@ -163,8 +163,8 @@ class V8_EXPORT_PRIVATE Utf8 {
|
||||
static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor);
|
||||
|
||||
typedef uint32_t Utf8IncrementalBuffer;
|
||||
static inline uchar ValueOfIncremental(const byte** cursor, State* state,
|
||||
Utf8IncrementalBuffer* buffer);
|
||||
static uchar ValueOfIncremental(byte next_byte, size_t* cursor, State* state,
|
||||
Utf8IncrementalBuffer* buffer);
|
||||
static uchar ValueOfIncrementalFinish(State* state);
|
||||
|
||||
// Excludes non-characters from the set of valid code points.
|
||||
|
@@ -3,7 +3,6 @@
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "test/cctest/unicode-helpers.h"
|
||||
#include "src/unicode-inl.h"
|
||||
|
||||
int Ucs2CharLength(unibrow::uchar c) {
|
||||
if (c == unibrow::Utf8::kIncomplete || c == unibrow::Utf8::kBufferEmpty) {
|
||||
@@ -20,9 +19,10 @@ int Utf8LengthHelper(const char* s) {
|
||||
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
|
||||
|
||||
int length = 0;
|
||||
const uint8_t* c = reinterpret_cast<const uint8_t*>(s);
|
||||
while (*c != '\0') {
|
||||
unibrow::uchar tmp = unibrow::Utf8::ValueOfIncremental(&c, &state, &buffer);
|
||||
size_t i = 0;
|
||||
while (s[i] != '\0') {
|
||||
unibrow::uchar tmp =
|
||||
unibrow::Utf8::ValueOfIncremental(s[i], &i, &state, &buffer);
|
||||
length += Ucs2CharLength(tmp);
|
||||
}
|
||||
unibrow::uchar tmp = unibrow::Utf8::ValueOfIncrementalFinish(&state);
|
||||
|
@@ -50,11 +50,9 @@ void DecodeIncrementally(const std::vector<byte>& bytes,
|
||||
std::vector<unibrow::uchar>* output) {
|
||||
unibrow::Utf8::Utf8IncrementalBuffer buffer = 0;
|
||||
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
|
||||
const byte* cursor = &bytes[0];
|
||||
const byte* end = &bytes[bytes.size()];
|
||||
while (cursor < end) {
|
||||
for (size_t i = 0; i < bytes.size();) {
|
||||
unibrow::uchar result =
|
||||
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &buffer);
|
||||
unibrow::Utf8::ValueOfIncremental(bytes[i], &i, &state, &buffer);
|
||||
if (result != unibrow::Utf8::kIncomplete) {
|
||||
output->push_back(result);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user