Revert "[utf8] Rewrite NewStringFromUtf8 using Utf8::ValueOfIncremental"

This reverts commit 73dd9b5527.

Reason for revert: Broke telemetry layout tests - https://ci.chromium.org/p/chromium/builders/luci.chromium.try/win7-rel/9936 as can be seen in this roll - https://chromium-review.googlesource.com/c/chromium/src/+/1454259

Original change's description:
> [utf8] Rewrite NewStringFromUtf8 using Utf8::ValueOfIncremental
> 
> This is 3-4x faster than using the Utf8Decoder. This matters for proper
> parse-time measurements using d8.
> 
> Change-Id: I9870e9fbe400ec022a6eeb20491c80a2a32f8519
> Reviewed-on: https://chromium-review.googlesource.com/c/1451827
> Commit-Queue: Toon Verwaest <verwaest@chromium.org>
> Reviewed-by: Leszek Swirski <leszeks@chromium.org>
> Reviewed-by: Ulan Degenbaev <ulan@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#59347}

TBR=ulan@chromium.org,leszeks@chromium.org,verwaest@chromium.org

# Not skipping CQ checks because original CL landed > 1 day ago.

Change-Id: I3f8faebb61c19a41ee496a571228f53c0d5fc8dd
Reviewed-on: https://chromium-review.googlesource.com/c/1454495
Reviewed-by: Maya Lekova <mslekova@chromium.org>
Commit-Queue: Yang Guo <yangguo@chromium.org>
Cr-Commit-Position: refs/heads/master@{#59378}
This commit is contained in:
Maya Lekova 2019-02-05 16:20:28 +00:00 committed by Commit Bot
parent 85fcaff1b0
commit ec30cf47c7
7 changed files with 99 additions and 161 deletions

View File

@ -42,7 +42,7 @@
#include "src/objects/stack-frame-info-inl.h"
#include "src/objects/struct-inl.h"
#include "src/unicode-cache.h"
#include "src/unicode-inl.h"
#include "src/unicode-decoder.h"
namespace v8 {
namespace internal {
@ -661,38 +661,13 @@ MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string,
return NewStringFromOneByte(Vector<const uint8_t>::cast(string), pretenure);
}
std::unique_ptr<uint16_t[]> buffer(new uint16_t[length - non_ascii_start]);
// Non-ASCII and we need to decode.
auto non_ascii = string.SubVector(non_ascii_start, length);
Access<UnicodeCache::Utf8Decoder> decoder(
isolate()->unicode_cache()->utf8_decoder());
decoder->Reset(non_ascii);
const uint8_t* cursor =
reinterpret_cast<const uint8_t*>(&string[non_ascii_start]);
const uint8_t* end = reinterpret_cast<const uint8_t*>(string.end());
uint16_t* output_cursor = buffer.get();
uint32_t incomplete_char = 0;
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
while (cursor < end) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) {
*(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
} else if (t == unibrow::Utf8::kIncomplete) {
continue;
} else {
*(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
}
}
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
if (t != unibrow::Utf8::kBufferEmpty) {
*(output_cursor++) = static_cast<uc16>(t);
}
DCHECK_LE(output_cursor, buffer.get() + length - non_ascii_start);
int utf16_length = static_cast<int>(output_cursor - buffer.get());
int utf16_length = static_cast<int>(decoder->Utf16Length());
DCHECK_GT(utf16_length, 0);
// Allocate string.
@ -701,13 +676,15 @@ MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string,
isolate(), result,
NewRawTwoByteString(non_ascii_start + utf16_length, pretenure), String);
DCHECK_LE(non_ascii_start + utf16_length, length);
// Copy ASCII portion.
DisallowHeapAllocation no_gc;
uint16_t* data = result->GetChars(no_gc);
CopyChars(data, ascii_data, non_ascii_start);
CopyChars(data + non_ascii_start, buffer.get(), utf16_length);
for (int i = 0; i < non_ascii_start; i++) {
*data++ = *ascii_data++;
}
// Now write the remainder.
decoder->WriteUtf16(data, utf16_length, non_ascii);
return result;
}

View File

@ -514,38 +514,23 @@ bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
unibrow::Utf8::State state = chunk.start.state;
uint32_t incomplete_char = chunk.start.incomplete_char;
size_t it = current_.pos.bytes - chunk.start.bytes;
const uint8_t* cursor = &chunk.data[it];
const uint8_t* end = &chunk.data[chunk.length];
size_t chars = current_.pos.chars;
if (V8_UNLIKELY(current_.pos.bytes < 3 && chars == 0)) {
while (cursor < end) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (t == unibrow::Utf8::kIncomplete) continue;
if (t != kUtf8Bom) {
chars++;
if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
}
break;
}
}
while (cursor < end && chars < position) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (t != unibrow::Utf8::kIncomplete) {
size_t chars = chunk.start.chars;
while (it < chunk.length && chars < position) {
unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
chunk.data[it], &it, &state, &incomplete_char);
if (t == kUtf8Bom && current_.pos.chars == 0) {
// BOM detected at beginning of the stream. Don't copy it.
} else if (t != unibrow::Utf8::kIncomplete) {
chars++;
if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
}
}
current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
current_.pos.bytes += it;
current_.pos.chars = chars;
current_.pos.incomplete_char = incomplete_char;
current_.pos.state = state;
current_.chunk_no += (cursor == end);
current_.chunk_no += (it == chunk.length);
return current_.pos.chars == position;
}
@ -559,8 +544,8 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
// The buffer_ is writable, but buffer_*_ members are const. So we get a
// non-const pointer into buffer that points to the same char as buffer_end_.
uint16_t* output_cursor = buffer_ + (buffer_end_ - buffer_start_);
DCHECK_EQ(output_cursor, buffer_end_);
uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
DCHECK_EQ(cursor, buffer_end_);
unibrow::Utf8::State state = current_.pos.state;
uint32_t incomplete_char = current_.pos.incomplete_char;
@ -571,7 +556,7 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
if (t != unibrow::Utf8::kBufferEmpty) {
DCHECK_EQ(t, unibrow::Utf8::kBadChar);
*output_cursor = static_cast<uc16>(t);
*cursor = static_cast<uc16>(t);
buffer_end_++;
current_.pos.chars++;
current_.pos.incomplete_char = 0;
@ -581,50 +566,30 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
}
size_t it = current_.pos.bytes - chunk.start.bytes;
const uint8_t* cursor = chunk.data + it;
const uint8_t* end = chunk.data + chunk.length;
// Deal with possible BOM.
if (V8_UNLIKELY(current_.pos.bytes < 3 && current_.pos.chars == 0)) {
while (cursor < end) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (V8_LIKELY(t < kUtf8Bom)) {
*(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
} else if (t == unibrow::Utf8::kIncomplete) {
continue;
} else if (t == kUtf8Bom) {
// BOM detected at beginning of the stream. Don't copy it.
} else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
*(output_cursor++) = static_cast<uc16>(t);
} else {
*(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
}
break;
}
}
while (cursor < end && output_cursor + 1 < buffer_start_ + kBufferSize) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (V8_LIKELY(t < unibrow::Utf16::kMaxNonSurrogateCharCode)) {
*(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
while (it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize) {
unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
chunk.data[it], &it, &state, &incomplete_char);
if (V8_LIKELY(t < kUtf8Bom)) {
*(cursor++) = static_cast<uc16>(t); // By far the most frequent case.
} else if (t == unibrow::Utf8::kIncomplete) {
continue;
} else if (t == kUtf8Bom && current_.pos.bytes + it == 3) {
// BOM detected at beginning of the stream. Don't copy it.
} else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
*(cursor++) = static_cast<uc16>(t);
} else {
*(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
*(cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(cursor++) = unibrow::Utf16::TrailSurrogate(t);
}
}
current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
current_.pos.chars += (output_cursor - buffer_end_);
current_.pos.bytes = chunk.start.bytes + it;
current_.pos.chars += (cursor - buffer_end_);
current_.pos.incomplete_char = incomplete_char;
current_.pos.state = state;
current_.chunk_no += (cursor == end);
current_.chunk_no += (it == chunk.length);
buffer_end_ = output_cursor;
buffer_end_ = cursor;
}
bool Utf8ExternalStreamingStream::FetchChunk() {

View File

@ -56,53 +56,6 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
}
}
// Incremental UTF-8 decoding: consumes exactly one byte per call so that input
// can be decoded while it is still streaming in. The byte at |*cursor| is read
// and |*cursor| is advanced past it. The result is one of:
//  - the byte itself, when it is <= kMaxOneByteChar and |*state| was kAccept
//    on entry (no multi-byte sequence pending);
//  - the value accumulated in |*buffer|, when the decoder lands in kAccept
//    (|*buffer| is cleared afterwards);
//  - kBadChar, when the decoder lands in kReject (|*state| is reset to kAccept
//    and |*buffer| is cleared);
//  - kIncomplete, for every other decoder state.
// Once the stream is complete the caller **must** invoke
// ValueOfIncrementalFinish so that an unfinished sequence is handled.
uchar Utf8::ValueOfIncremental(const byte** cursor, State* state,
                               Utf8IncrementalBuffer* buffer) {
  DCHECK_NOT_NULL(buffer);
  const State entry_state = *state;
  const byte current = **cursor;
  ++*cursor;

  // Fast path: a one-byte character while no sequence is in flight.
  const bool is_standalone_one_byte =
      current <= kMaxOneByteChar && entry_state == State::kAccept;
  if (V8_LIKELY(is_standalone_one_byte)) {
    DCHECK_EQ(0u, *buffer);
    return static_cast<uchar>(current);
  }

  // Otherwise this byte either leads a 2/3/4-byte sequence or continues one
  // that is already in progress; let the DFA consume it.
  Utf8DfaDecoder::Decode(current, state, buffer);

  if (*state == State::kAccept) {
    // A full sequence has been assembled; hand it out and clear the buffer.
    const uchar decoded = *buffer;
    *buffer = 0;
    return decoded;
  }

  if (*state != State::kReject) {
    // Still in the middle of a sequence.
    return kIncomplete;
  }

  // Rejected byte: return to the initial state. What happens to the cursor
  // depends on where the failure occurred. A bad lead byte is simply skipped
  // (the cursor has already moved past it). A bad byte inside a sequence is
  // un-consumed so that the next call reprocesses it from the initial state.
  *buffer = 0;
  *state = State::kAccept;
  const bool was_inside_sequence = entry_state != State::kAccept;
  if (was_inside_sequence) {
    *cursor -= 1;
  }
  return kBadChar;
}
unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
static const int kMask = ~(1 << 6);

View File

@ -203,17 +203,62 @@ uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
Utf8IncrementalBuffer buffer = 0;
uchar t;
const byte* start = str;
const byte* end = str + max_length;
size_t i = 0;
do {
t = ValueOfIncremental(&str, &state, &buffer);
} while (str < end && t == kIncomplete);
t = ValueOfIncremental(str[i], &i, &state, &buffer);
} while (i < max_length && t == kIncomplete);
*cursor += str - start;
*cursor += i;
return (state == State::kAccept) ? t : kBadChar;
}
// Incremental UTF-8 decoding: consumes exactly one byte per call so that input
// can be decoded while it is still streaming in. |next| is the byte to decode
// and |*cursor| is the caller's index, which is advanced by one. The result is
// one of:
//  - |next| itself, when it is <= kMaxOneByteChar and |*state| was kAccept on
//    entry (no multi-byte sequence pending);
//  - the value accumulated in |*buffer|, when the decoder lands in kAccept
//    (|*buffer| is cleared afterwards);
//  - kBadChar, when the decoder lands in kReject (|*state| is reset to kAccept
//    and |*buffer| is cleared);
//  - kIncomplete, for every other decoder state.
// Once the stream is complete the caller **must** invoke
// ValueOfIncrementalFinish so that an unfinished sequence is handled.
uchar Utf8::ValueOfIncremental(byte next, size_t* cursor, State* state,
                               Utf8IncrementalBuffer* buffer) {
  DCHECK_NOT_NULL(buffer);
  const State entry_state = *state;
  ++*cursor;

  // Fast path: a one-byte character while no sequence is in flight.
  const bool is_standalone_one_byte =
      next <= kMaxOneByteChar && entry_state == State::kAccept;
  if (V8_LIKELY(is_standalone_one_byte)) {
    DCHECK_EQ(0u, *buffer);
    return static_cast<uchar>(next);
  }

  // Otherwise this byte either leads a 2/3/4-byte sequence or continues one
  // that is already in progress; let the DFA consume it.
  Utf8DfaDecoder::Decode(next, state, buffer);

  if (*state == State::kAccept) {
    // A full sequence has been assembled; hand it out and clear the buffer.
    const uchar decoded = *buffer;
    *buffer = 0;
    return decoded;
  }

  if (*state != State::kReject) {
    // Still in the middle of a sequence.
    return kIncomplete;
  }

  // Rejected byte: return to the initial state. What happens to the cursor
  // depends on where the failure occurred. A bad lead byte is simply skipped
  // (the cursor has already moved past it). A bad byte inside a sequence is
  // un-consumed so that the next call reprocesses it from the initial state.
  *buffer = 0;
  *state = State::kAccept;
  const bool was_inside_sequence = entry_state != State::kAccept;
  if (was_inside_sequence) {
    *cursor -= 1;
  }
  return kBadChar;
}
// Finishes the incremental decoding, ensuring that if an unfinished sequence
// is left that it is replaced by a replacement char.
uchar Utf8::ValueOfIncrementalFinish(State* state) {

View File

@ -163,8 +163,8 @@ class V8_EXPORT_PRIVATE Utf8 {
static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor);
typedef uint32_t Utf8IncrementalBuffer;
static inline uchar ValueOfIncremental(const byte** cursor, State* state,
Utf8IncrementalBuffer* buffer);
static uchar ValueOfIncremental(byte next_byte, size_t* cursor, State* state,
Utf8IncrementalBuffer* buffer);
static uchar ValueOfIncrementalFinish(State* state);
// Excludes non-characters from the set of valid code points.

View File

@ -3,7 +3,6 @@
// found in the LICENSE file.
#include "test/cctest/unicode-helpers.h"
#include "src/unicode-inl.h"
int Ucs2CharLength(unibrow::uchar c) {
if (c == unibrow::Utf8::kIncomplete || c == unibrow::Utf8::kBufferEmpty) {
@ -20,9 +19,10 @@ int Utf8LengthHelper(const char* s) {
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
int length = 0;
const uint8_t* c = reinterpret_cast<const uint8_t*>(s);
while (*c != '\0') {
unibrow::uchar tmp = unibrow::Utf8::ValueOfIncremental(&c, &state, &buffer);
size_t i = 0;
while (s[i] != '\0') {
unibrow::uchar tmp =
unibrow::Utf8::ValueOfIncremental(s[i], &i, &state, &buffer);
length += Ucs2CharLength(tmp);
}
unibrow::uchar tmp = unibrow::Utf8::ValueOfIncrementalFinish(&state);

View File

@ -50,11 +50,9 @@ void DecodeIncrementally(const std::vector<byte>& bytes,
std::vector<unibrow::uchar>* output) {
unibrow::Utf8::Utf8IncrementalBuffer buffer = 0;
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
const byte* cursor = &bytes[0];
const byte* end = &bytes[bytes.size()];
while (cursor < end) {
for (size_t i = 0; i < bytes.size();) {
unibrow::uchar result =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &buffer);
unibrow::Utf8::ValueOfIncremental(bytes[i], &i, &state, &buffer);
if (result != unibrow::Utf8::kIncomplete) {
output->push_back(result);
}