Revert "[utf8] Rewrite NewStringFromUtf8 using Utf8::ValueOfIncremental"

This reverts commit 73dd9b5527.

Reason for revert: Broke telemetry layout tests - https://ci.chromium.org/p/chromium/builders/luci.chromium.try/win7-rel/9936 as can be seen in this roll - https://chromium-review.googlesource.com/c/chromium/src/+/1454259

Original change's description:
> [utf8] Rewrite NewStringFromUtf8 using Utf8::ValueOfIncremental
>
> This is 3-4x faster than using the Utf8Decoder. This matters for proper
> parse-time measurements using d8.
>
> Change-Id: I9870e9fbe400ec022a6eeb20491c80a2a32f8519
> Reviewed-on: https://chromium-review.googlesource.com/c/1451827
> Commit-Queue: Toon Verwaest <verwaest@chromium.org>
> Reviewed-by: Leszek Swirski <leszeks@chromium.org>
> Reviewed-by: Ulan Degenbaev <ulan@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#59347}

TBR=ulan@chromium.org,leszeks@chromium.org,verwaest@chromium.org

# Not skipping CQ checks because original CL landed > 1 day ago.

Change-Id: I3f8faebb61c19a41ee496a571228f53c0d5fc8dd
Reviewed-on: https://chromium-review.googlesource.com/c/1454495
Reviewed-by: Maya Lekova <mslekova@chromium.org>
Commit-Queue: Yang Guo <yangguo@chromium.org>
Cr-Commit-Position: refs/heads/master@{#59378}
2019-02-05 16:20:28 +00:00 · 2019-02-05 16:20:28 +00:00 · ec30cf47c7
commit ec30cf47c7
parent 85fcaff1b0
7 changed files with 99 additions and 161 deletions
--- a/src/heap/factory.cc
+++ b/src/heap/factory.cc
@ -42,7 +42,7 @@
 #include "src/objects/stack-frame-info-inl.h"
 #include "src/objects/struct-inl.h"
 #include "src/unicode-cache.h"
-#include "src/unicode-inl.h"
+#include "src/unicode-decoder.h"

 namespace v8 {
 namespace internal {
@ -661,38 +661,13 @@ MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string,
    return NewStringFromOneByte(Vector<const uint8_t>::cast(string), pretenure);
  }

-  std::unique_ptr<uint16_t[]> buffer(new uint16_t[length - non_ascii_start]);
+  // Non-ASCII and we need to decode.
+  auto non_ascii = string.SubVector(non_ascii_start, length);
+  Access<UnicodeCache::Utf8Decoder> decoder(
+      isolate()->unicode_cache()->utf8_decoder());
+  decoder->Reset(non_ascii);

-  const uint8_t* cursor =
-      reinterpret_cast<const uint8_t*>(&string[non_ascii_start]);
-  const uint8_t* end = reinterpret_cast<const uint8_t*>(string.end());
-
-  uint16_t* output_cursor = buffer.get();
-
-  uint32_t incomplete_char = 0;
-  unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
-
-  while (cursor < end) {
-    unibrow::uchar t =
-        unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
-
-    if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) {
-      *(output_cursor++) = static_cast<uc16>(t);  // The most frequent case.
-    } else if (t == unibrow::Utf8::kIncomplete) {
-      continue;
-    } else {
-      *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
-      *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
-    }
-  }
-
-  unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
-  if (t != unibrow::Utf8::kBufferEmpty) {
-    *(output_cursor++) = static_cast<uc16>(t);
-  }
-
-  DCHECK_LE(output_cursor, buffer.get() + length - non_ascii_start);
-  int utf16_length = static_cast<int>(output_cursor - buffer.get());
+  int utf16_length = static_cast<int>(decoder->Utf16Length());
  DCHECK_GT(utf16_length, 0);

  // Allocate string.
@ -701,13 +676,15 @@ MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string,
      isolate(), result,
      NewRawTwoByteString(non_ascii_start + utf16_length, pretenure), String);

-  DCHECK_LE(non_ascii_start + utf16_length, length);
-
+  // Copy ASCII portion.
  DisallowHeapAllocation no_gc;
  uint16_t* data = result->GetChars(no_gc);
-  CopyChars(data, ascii_data, non_ascii_start);
-  CopyChars(data + non_ascii_start, buffer.get(), utf16_length);
+  for (int i = 0; i < non_ascii_start; i++) {
+    *data++ = *ascii_data++;
+  }

+  // Now write the remainder.
+  decoder->WriteUtf16(data, utf16_length, non_ascii);
  return result;
 }

--- a/src/parsing/scanner-character-streams.cc
+++ b/src/parsing/scanner-character-streams.cc
@ -514,38 +514,23 @@ bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
  unibrow::Utf8::State state = chunk.start.state;
  uint32_t incomplete_char = chunk.start.incomplete_char;
  size_t it = current_.pos.bytes - chunk.start.bytes;
-  const uint8_t* cursor = &chunk.data[it];
-  const uint8_t* end = &chunk.data[chunk.length];
-
-  size_t chars = current_.pos.chars;
-
-  if (V8_UNLIKELY(current_.pos.bytes < 3 && chars == 0)) {
-    while (cursor < end) {
-      unibrow::uchar t =
-          unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
-      if (t == unibrow::Utf8::kIncomplete) continue;
-      if (t != kUtf8Bom) {
-        chars++;
-        if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
-      }
-      break;
-    }
-  }
-
-  while (cursor < end && chars < position) {
-    unibrow::uchar t =
-        unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
-    if (t != unibrow::Utf8::kIncomplete) {
+  size_t chars = chunk.start.chars;
+  while (it < chunk.length && chars < position) {
+    unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
+        chunk.data[it], &it, &state, &incomplete_char);
+    if (t == kUtf8Bom && current_.pos.chars == 0) {
+      // BOM detected at beginning of the stream. Don't copy it.
+    } else if (t != unibrow::Utf8::kIncomplete) {
      chars++;
      if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
    }
  }

-  current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
+  current_.pos.bytes += it;
  current_.pos.chars = chars;
  current_.pos.incomplete_char = incomplete_char;
  current_.pos.state = state;
-  current_.chunk_no += (cursor == end);
+  current_.chunk_no += (it == chunk.length);

  return current_.pos.chars == position;
 }
@ -559,8 +544,8 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {

  // The buffer_ is writable, but buffer_*_ members are const. So we get a
  // non-const pointer into buffer that points to the same char as buffer_end_.
-  uint16_t* output_cursor = buffer_ + (buffer_end_ - buffer_start_);
-  DCHECK_EQ(output_cursor, buffer_end_);
+  uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
+  DCHECK_EQ(cursor, buffer_end_);

  unibrow::Utf8::State state = current_.pos.state;
  uint32_t incomplete_char = current_.pos.incomplete_char;
@ -571,7 +556,7 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
    unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
    if (t != unibrow::Utf8::kBufferEmpty) {
      DCHECK_EQ(t, unibrow::Utf8::kBadChar);
-      *output_cursor = static_cast<uc16>(t);
+      *cursor = static_cast<uc16>(t);
      buffer_end_++;
      current_.pos.chars++;
      current_.pos.incomplete_char = 0;
@ -581,50 +566,30 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
  }

  size_t it = current_.pos.bytes - chunk.start.bytes;
-  const uint8_t* cursor = chunk.data + it;
-  const uint8_t* end = chunk.data + chunk.length;
-
-  // Deal with possible BOM.
-  if (V8_UNLIKELY(current_.pos.bytes < 3 && current_.pos.chars == 0)) {
-    while (cursor < end) {
-      unibrow::uchar t =
-          unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
-      if (V8_LIKELY(t < kUtf8Bom)) {
-        *(output_cursor++) = static_cast<uc16>(t);  // The most frequent case.
-      } else if (t == unibrow::Utf8::kIncomplete) {
-        continue;
-      } else if (t == kUtf8Bom) {
-        // BOM detected at beginning of the stream. Don't copy it.
-      } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
-        *(output_cursor++) = static_cast<uc16>(t);
-      } else {
-        *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
-        *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
-      }
-      break;
-    }
-  }
-
-  while (cursor < end && output_cursor + 1 < buffer_start_ + kBufferSize) {
-    unibrow::uchar t =
-        unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
-    if (V8_LIKELY(t < unibrow::Utf16::kMaxNonSurrogateCharCode)) {
-      *(output_cursor++) = static_cast<uc16>(t);  // The most frequent case.
+  while (it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize) {
+    unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
+        chunk.data[it], &it, &state, &incomplete_char);
+    if (V8_LIKELY(t < kUtf8Bom)) {
+      *(cursor++) = static_cast<uc16>(t);  // The by far most frequent case.
    } else if (t == unibrow::Utf8::kIncomplete) {
      continue;
+    } else if (t == kUtf8Bom && current_.pos.bytes + it == 3) {
+      // BOM detected at beginning of the stream. Don't copy it.
+    } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
+      *(cursor++) = static_cast<uc16>(t);
    } else {
-      *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
-      *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
+      *(cursor++) = unibrow::Utf16::LeadSurrogate(t);
+      *(cursor++) = unibrow::Utf16::TrailSurrogate(t);
    }
  }

-  current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
-  current_.pos.chars += (output_cursor - buffer_end_);
+  current_.pos.bytes = chunk.start.bytes + it;
+  current_.pos.chars += (cursor - buffer_end_);
  current_.pos.incomplete_char = incomplete_char;
  current_.pos.state = state;
-  current_.chunk_no += (cursor == end);
+  current_.chunk_no += (it == chunk.length);

-  buffer_end_ = output_cursor;
+  buffer_end_ = cursor;
 }

 bool Utf8ExternalStreamingStream::FetchChunk() {
--- a/src/unicode-inl.h
+++ b/src/unicode-inl.h
@ -56,53 +56,6 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
  }
 }

-// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they
-// stream in. This **must** be followed by a call to ValueOfIncrementalFinish
-// when the stream is complete, to ensure incomplete sequences are handled.
-uchar Utf8::ValueOfIncremental(const byte** cursor, State* state,
-                               Utf8IncrementalBuffer* buffer) {
-  DCHECK_NOT_NULL(buffer);
-  State old_state = *state;
-  byte next = **cursor;
-  *cursor += 1;
-
-  if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) {
-    DCHECK_EQ(0u, *buffer);
-    return static_cast<uchar>(next);
-  }
-
-  // So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation
-  // char in that sequence.
-  Utf8DfaDecoder::Decode(next, state, buffer);
-
-  switch (*state) {
-    case State::kAccept: {
-      uchar t = *buffer;
-      *buffer = 0;
-      return t;
-    }
-
-    case State::kReject:
-      *state = State::kAccept;
-      *buffer = 0;
-
-      // If we hit a bad byte, we need to determine if we were trying to start
-      // a sequence or continue one. If we were trying to start a sequence,
-      // that means it's just an invalid lead byte and we need to continue to
-      // the next (which we already did above). If we were already in a
-      // sequence, we need to reprocess this same byte after resetting to the
-      // initial state.
-      if (old_state != State::kAccept) {
-        // We were trying to continue a sequence, so let's reprocess this byte
-        // next time.
-        *cursor -= 1;
-      }
-      return kBadChar;
-
-    default:
-      return kIncomplete;
-  }
-}

 unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
  static const int kMask = ~(1 << 6);
--- a/src/unicode.cc
+++ b/src/unicode.cc
@ -203,17 +203,62 @@ uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
  Utf8IncrementalBuffer buffer = 0;
  uchar t;

-  const byte* start = str;
-  const byte* end = str + max_length;
-
+  size_t i = 0;
  do {
-    t = ValueOfIncremental(&str, &state, &buffer);
-  } while (str < end && t == kIncomplete);
+    t = ValueOfIncremental(str[i], &i, &state, &buffer);
+  } while (i < max_length && t == kIncomplete);

-  *cursor += str - start;
+  *cursor += i;
  return (state == State::kAccept) ? t : kBadChar;
 }

+// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they
+// stream in. This **must** be followed by a call to ValueOfIncrementalFinish
+// when the stream is complete, to ensure incomplete sequences are handled.
+uchar Utf8::ValueOfIncremental(byte next, size_t* cursor, State* state,
+                               Utf8IncrementalBuffer* buffer) {
+  DCHECK_NOT_NULL(buffer);
+  State old_state = *state;
+  *cursor += 1;
+
+  if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) {
+    DCHECK_EQ(0u, *buffer);
+    return static_cast<uchar>(next);
+  }
+
+  // So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation
+  // char in that sequence.
+  Utf8DfaDecoder::Decode(next, state, buffer);
+
+  switch (*state) {
+    case State::kAccept: {
+      uchar t = *buffer;
+      *buffer = 0;
+      return t;
+    }
+
+    case State::kReject:
+      *state = State::kAccept;
+      *buffer = 0;
+
+      // If we hit a bad byte, we need to determine if we were trying to start
+      // a sequence or continue one. If we were trying to start a sequence,
+      // that means it's just an invalid lead byte and we need to continue to
+      // the next (which we already did above). If we were already in a
+      // sequence, we need to reprocess this same byte after resetting to the
+      // initial state.
+      if (old_state != State::kAccept) {
+        // We were trying to continue a sequence, so let's reprocess this byte
+        // next time.
+        *cursor -= 1;
+      }
+      return kBadChar;
+
+    default:
+      return kIncomplete;
+  }
+}
+
 // Finishes the incremental decoding, ensuring that if an unfinished sequence
 // is left that it is replaced by a replacement char.
 uchar Utf8::ValueOfIncrementalFinish(State* state) {
--- a/src/unicode.h
+++ b/src/unicode.h
@ -163,8 +163,8 @@ class V8_EXPORT_PRIVATE Utf8 {
  static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor);

  typedef uint32_t Utf8IncrementalBuffer;
-  static inline uchar ValueOfIncremental(const byte** cursor, State* state,
-                                         Utf8IncrementalBuffer* buffer);
+  static uchar ValueOfIncremental(byte next_byte, size_t* cursor, State* state,
+                                  Utf8IncrementalBuffer* buffer);
  static uchar ValueOfIncrementalFinish(State* state);

  // Excludes non-characters from the set of valid code points.
--- a/test/cctest/unicode-helpers.cc
+++ b/test/cctest/unicode-helpers.cc
@ -3,7 +3,6 @@
 // found in the LICENSE file.

 #include "test/cctest/unicode-helpers.h"
-#include "src/unicode-inl.h"

 int Ucs2CharLength(unibrow::uchar c) {
  if (c == unibrow::Utf8::kIncomplete || c == unibrow::Utf8::kBufferEmpty) {
@ -20,9 +19,10 @@ int Utf8LengthHelper(const char* s) {
  unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;

  int length = 0;
-  const uint8_t* c = reinterpret_cast<const uint8_t*>(s);
-  while (*c != '\0') {
-    unibrow::uchar tmp = unibrow::Utf8::ValueOfIncremental(&c, &state, &buffer);
+  size_t i = 0;
+  while (s[i] != '\0') {
+    unibrow::uchar tmp =
+        unibrow::Utf8::ValueOfIncremental(s[i], &i, &state, &buffer);
    length += Ucs2CharLength(tmp);
  }
  unibrow::uchar tmp = unibrow::Utf8::ValueOfIncrementalFinish(&state);
--- a/test/unittests/unicode-unittest.cc
+++ b/test/unittests/unicode-unittest.cc
@ -50,11 +50,9 @@ void DecodeIncrementally(const std::vector<byte>& bytes,
                         std::vector<unibrow::uchar>* output) {
  unibrow::Utf8::Utf8IncrementalBuffer buffer = 0;
  unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
-  const byte* cursor = &bytes[0];
-  const byte* end = &bytes[bytes.size()];
-  while (cursor < end) {
+  for (size_t i = 0; i < bytes.size();) {
    unibrow::uchar result =
-        unibrow::Utf8::ValueOfIncremental(&cursor, &state, &buffer);
+        unibrow::Utf8::ValueOfIncremental(bytes[i], &i, &state, &buffer);
    if (result != unibrow::Utf8::kIncomplete) {
      output->push_back(result);
    }