diff --git a/src/heap/factory.cc b/src/heap/factory.cc index 478b960a91..61b50ca2ba 100644 --- a/src/heap/factory.cc +++ b/src/heap/factory.cc @@ -42,7 +42,7 @@ #include "src/objects/stack-frame-info-inl.h" #include "src/objects/struct-inl.h" #include "src/unicode-cache.h" -#include "src/unicode-inl.h" +#include "src/unicode-decoder.h" namespace v8 { namespace internal { @@ -661,38 +661,13 @@ MaybeHandle Factory::NewStringFromUtf8(Vector string, return NewStringFromOneByte(Vector::cast(string), pretenure); } - std::unique_ptr buffer(new uint16_t[length - non_ascii_start]); + // Non-ASCII and we need to decode. + auto non_ascii = string.SubVector(non_ascii_start, length); + Access decoder( + isolate()->unicode_cache()->utf8_decoder()); + decoder->Reset(non_ascii); - const uint8_t* cursor = - reinterpret_cast(&string[non_ascii_start]); - const uint8_t* end = reinterpret_cast(string.end()); - - uint16_t* output_cursor = buffer.get(); - - uint32_t incomplete_char = 0; - unibrow::Utf8::State state = unibrow::Utf8::State::kAccept; - - while (cursor < end) { - unibrow::uchar t = - unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char); - - if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) { - *(output_cursor++) = static_cast(t); // The most frequent case. - } else if (t == unibrow::Utf8::kIncomplete) { - continue; - } else { - *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t); - *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t); - } - } - - unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state); - if (t != unibrow::Utf8::kBufferEmpty) { - *(output_cursor++) = static_cast(t); - } - - DCHECK_LE(output_cursor, buffer.get() + length - non_ascii_start); - int utf16_length = static_cast(output_cursor - buffer.get()); + int utf16_length = static_cast(decoder->Utf16Length()); DCHECK_GT(utf16_length, 0); // Allocate string. 
@@ -701,13 +676,15 @@ MaybeHandle Factory::NewStringFromUtf8(Vector string, isolate(), result, NewRawTwoByteString(non_ascii_start + utf16_length, pretenure), String); - DCHECK_LE(non_ascii_start + utf16_length, length); - + // Copy ASCII portion. DisallowHeapAllocation no_gc; uint16_t* data = result->GetChars(no_gc); - CopyChars(data, ascii_data, non_ascii_start); - CopyChars(data + non_ascii_start, buffer.get(), utf16_length); + for (int i = 0; i < non_ascii_start; i++) { + *data++ = *ascii_data++; + } + // Now write the remainder. + decoder->WriteUtf16(data, utf16_length, non_ascii); return result; } diff --git a/src/parsing/scanner-character-streams.cc b/src/parsing/scanner-character-streams.cc index 761c12ee80..32dcaacbf5 100644 --- a/src/parsing/scanner-character-streams.cc +++ b/src/parsing/scanner-character-streams.cc @@ -514,38 +514,23 @@ bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) { unibrow::Utf8::State state = chunk.start.state; uint32_t incomplete_char = chunk.start.incomplete_char; size_t it = current_.pos.bytes - chunk.start.bytes; - const uint8_t* cursor = &chunk.data[it]; - const uint8_t* end = &chunk.data[chunk.length]; - - size_t chars = current_.pos.chars; - - if (V8_UNLIKELY(current_.pos.bytes < 3 && chars == 0)) { - while (cursor < end) { - unibrow::uchar t = - unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char); - if (t == unibrow::Utf8::kIncomplete) continue; - if (t != kUtf8Bom) { - chars++; - if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++; - } - break; - } - } - - while (cursor < end && chars < position) { - unibrow::uchar t = - unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char); - if (t != unibrow::Utf8::kIncomplete) { + size_t chars = chunk.start.chars; + while (it < chunk.length && chars < position) { + unibrow::uchar t = unibrow::Utf8::ValueOfIncremental( + chunk.data[it], &it, &state, &incomplete_char); + if (t == kUtf8Bom && current_.pos.chars == 0) { + // BOM 
detected at beginning of the stream. Don't copy it. + } else if (t != unibrow::Utf8::kIncomplete) { chars++; if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++; } } - current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data); + current_.pos.bytes += it; current_.pos.chars = chars; current_.pos.incomplete_char = incomplete_char; current_.pos.state = state; - current_.chunk_no += (cursor == end); + current_.chunk_no += (it == chunk.length); return current_.pos.chars == position; } @@ -559,8 +544,8 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() { // The buffer_ is writable, but buffer_*_ members are const. So we get a // non-const pointer into buffer that points to the same char as buffer_end_. - uint16_t* output_cursor = buffer_ + (buffer_end_ - buffer_start_); - DCHECK_EQ(output_cursor, buffer_end_); + uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_); + DCHECK_EQ(cursor, buffer_end_); unibrow::Utf8::State state = current_.pos.state; uint32_t incomplete_char = current_.pos.incomplete_char; @@ -571,7 +556,7 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() { unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state); if (t != unibrow::Utf8::kBufferEmpty) { DCHECK_EQ(t, unibrow::Utf8::kBadChar); - *output_cursor = static_cast(t); + *cursor = static_cast(t); buffer_end_++; current_.pos.chars++; current_.pos.incomplete_char = 0; @@ -581,50 +566,30 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() { } size_t it = current_.pos.bytes - chunk.start.bytes; - const uint8_t* cursor = chunk.data + it; - const uint8_t* end = chunk.data + chunk.length; - - // Deal with possible BOM. - if (V8_UNLIKELY(current_.pos.bytes < 3 && current_.pos.chars == 0)) { - while (cursor < end) { - unibrow::uchar t = - unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char); - if (V8_LIKELY(t < kUtf8Bom)) { - *(output_cursor++) = static_cast(t); // The most frequent case. 
- } else if (t == unibrow::Utf8::kIncomplete) { - continue; - } else if (t == kUtf8Bom) { - // BOM detected at beginning of the stream. Don't copy it. - } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) { - *(output_cursor++) = static_cast(t); - } else { - *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t); - *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t); - } - break; - } - } - - while (cursor < end && output_cursor + 1 < buffer_start_ + kBufferSize) { - unibrow::uchar t = - unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char); - if (V8_LIKELY(t < unibrow::Utf16::kMaxNonSurrogateCharCode)) { - *(output_cursor++) = static_cast(t); // The most frequent case. + while (it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize) { + unibrow::uchar t = unibrow::Utf8::ValueOfIncremental( + chunk.data[it], &it, &state, &incomplete_char); + if (V8_LIKELY(t < kUtf8Bom)) { + *(cursor++) = static_cast(t); // The most frequent case. } else if (t == unibrow::Utf8::kIncomplete) { continue; + } else if (t == kUtf8Bom && current_.pos.bytes + it == 3) { + // BOM detected at beginning of the stream. Don't copy it. 
+ } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) { + *(cursor++) = static_cast(t); } else { - *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t); - *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t); + *(cursor++) = unibrow::Utf16::LeadSurrogate(t); + *(cursor++) = unibrow::Utf16::TrailSurrogate(t); } } - current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data); - current_.pos.chars += (output_cursor - buffer_end_); + current_.pos.bytes = chunk.start.bytes + it; + current_.pos.chars += (cursor - buffer_end_); current_.pos.incomplete_char = incomplete_char; current_.pos.state = state; - current_.chunk_no += (cursor == end); + current_.chunk_no += (it == chunk.length); - buffer_end_ = output_cursor; + buffer_end_ = cursor; } bool Utf8ExternalStreamingStream::FetchChunk() { diff --git a/src/unicode-inl.h b/src/unicode-inl.h index c96d78438a..0140858115 100644 --- a/src/unicode-inl.h +++ b/src/unicode-inl.h @@ -56,53 +56,6 @@ template int Mapping::CalculateValue(uchar c, uchar n, } } -// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they -// stream in. This **must** be followed by a call to ValueOfIncrementalFinish -// when the stream is complete, to ensure incomplete sequences are handled. -uchar Utf8::ValueOfIncremental(const byte** cursor, State* state, - Utf8IncrementalBuffer* buffer) { - DCHECK_NOT_NULL(buffer); - State old_state = *state; - byte next = **cursor; - *cursor += 1; - - if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) { - DCHECK_EQ(0u, *buffer); - return static_cast(next); - } - - // So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation - // char in that sequence. 
- Utf8DfaDecoder::Decode(next, state, buffer); - - switch (*state) { - case State::kAccept: { - uchar t = *buffer; - *buffer = 0; - return t; - } - - case State::kReject: - *state = State::kAccept; - *buffer = 0; - - // If we hit a bad byte, we need to determine if we were trying to start - // a sequence or continue one. If we were trying to start a sequence, - // that means it's just an invalid lead byte and we need to continue to - // the next (which we already did above). If we were already in a - // sequence, we need to reprocess this same byte after resetting to the - // initial state. - if (old_state != State::kAccept) { - // We were trying to continue a sequence, so let's reprocess this byte - // next time. - *cursor -= 1; - } - return kBadChar; - - default: - return kIncomplete; - } -} unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { static const int kMask = ~(1 << 6); diff --git a/src/unicode.cc b/src/unicode.cc index c7818dbaa0..4d7896ec37 100644 --- a/src/unicode.cc +++ b/src/unicode.cc @@ -203,17 +203,62 @@ uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { Utf8IncrementalBuffer buffer = 0; uchar t; - const byte* start = str; - const byte* end = str + max_length; - + size_t i = 0; do { - t = ValueOfIncremental(&str, &state, &buffer); - } while (str < end && t == kIncomplete); + t = ValueOfIncremental(str[i], &i, &state, &buffer); + } while (i < max_length && t == kIncomplete); - *cursor += str - start; + *cursor += i; return (state == State::kAccept) ? t : kBadChar; } +// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they +// stream in. This **must** be followed by a call to ValueOfIncrementalFinish +// when the stream is complete, to ensure incomplete sequences are handled. 
+uchar Utf8::ValueOfIncremental(byte next, size_t* cursor, State* state, + Utf8IncrementalBuffer* buffer) { + DCHECK_NOT_NULL(buffer); + State old_state = *state; + *cursor += 1; + + if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) { + DCHECK_EQ(0u, *buffer); + return static_cast(next); + } + + // So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation + // char in that sequence. + Utf8DfaDecoder::Decode(next, state, buffer); + + switch (*state) { + case State::kAccept: { + uchar t = *buffer; + *buffer = 0; + return t; + } + + case State::kReject: + *state = State::kAccept; + *buffer = 0; + + // If we hit a bad byte, we need to determine if we were trying to start + // a sequence or continue one. If we were trying to start a sequence, + // that means it's just an invalid lead byte and we need to continue to + // the next (which we already did above). If we were already in a + // sequence, we need to reprocess this same byte after resetting to the + // initial state. + if (old_state != State::kAccept) { + // We were trying to continue a sequence, so let's reprocess this byte + // next time. + *cursor -= 1; + } + return kBadChar; + + default: + return kIncomplete; + } +} + // Finishes the incremental decoding, ensuring that if an unfinished sequence // is left that it is replaced by a replacement char. 
uchar Utf8::ValueOfIncrementalFinish(State* state) { diff --git a/src/unicode.h b/src/unicode.h index 1bebfe3e8a..68e69324f9 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -163,8 +163,8 @@ class V8_EXPORT_PRIVATE Utf8 { static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor); typedef uint32_t Utf8IncrementalBuffer; - static inline uchar ValueOfIncremental(const byte** cursor, State* state, - Utf8IncrementalBuffer* buffer); + static uchar ValueOfIncremental(byte next_byte, size_t* cursor, State* state, + Utf8IncrementalBuffer* buffer); static uchar ValueOfIncrementalFinish(State* state); // Excludes non-characters from the set of valid code points. diff --git a/test/cctest/unicode-helpers.cc b/test/cctest/unicode-helpers.cc index 1a74e0ca94..524e5936fc 100644 --- a/test/cctest/unicode-helpers.cc +++ b/test/cctest/unicode-helpers.cc @@ -3,7 +3,6 @@ // found in the LICENSE file. #include "test/cctest/unicode-helpers.h" -#include "src/unicode-inl.h" int Ucs2CharLength(unibrow::uchar c) { if (c == unibrow::Utf8::kIncomplete || c == unibrow::Utf8::kBufferEmpty) { @@ -20,9 +19,10 @@ int Utf8LengthHelper(const char* s) { unibrow::Utf8::State state = unibrow::Utf8::State::kAccept; int length = 0; - const uint8_t* c = reinterpret_cast(s); - while (*c != '\0') { - unibrow::uchar tmp = unibrow::Utf8::ValueOfIncremental(&c, &state, &buffer); + size_t i = 0; + while (s[i] != '\0') { + unibrow::uchar tmp = + unibrow::Utf8::ValueOfIncremental(s[i], &i, &state, &buffer); length += Ucs2CharLength(tmp); } unibrow::uchar tmp = unibrow::Utf8::ValueOfIncrementalFinish(&state); diff --git a/test/unittests/unicode-unittest.cc b/test/unittests/unicode-unittest.cc index da1383c22c..1bede08343 100644 --- a/test/unittests/unicode-unittest.cc +++ b/test/unittests/unicode-unittest.cc @@ -50,11 +50,9 @@ void DecodeIncrementally(const std::vector& bytes, std::vector* output) { unibrow::Utf8::Utf8IncrementalBuffer buffer = 0; unibrow::Utf8::State state = 
unibrow::Utf8::State::kAccept; - const byte* cursor = &bytes[0]; - const byte* end = &bytes[bytes.size()]; - while (cursor < end) { + for (size_t i = 0; i < bytes.size();) { unibrow::uchar result = - unibrow::Utf8::ValueOfIncremental(&cursor, &state, &buffer); + unibrow::Utf8::ValueOfIncremental(bytes[i], &i, &state, &buffer); if (result != unibrow::Utf8::kIncomplete) { output->push_back(result); }