[api] simplify String::WriteUtf8.

Since we always flatten the string upfront, we don't need to implement the UTF8 conversion as a string visitor anymore. R=petermarshall@chromium.org Bug: v8:6780, v8:8605 Change-Id: I27946551d7c3742f47ac36d5c909c19a7f2b0371 Reviewed-on: https://chromium-review.googlesource.com/c/1371828 Commit-Queue: Yang Guo <yangguo@chromium.org> Reviewed-by: Jakob Kummerow <jkummerow@chromium.org> Cr-Commit-Position: refs/heads/master@{#58312}
2018-12-18 11:21:08 +01:00 · 2018-12-18 11:21:08 +01:00 · c30e65e15b
commit c30e65e15b
parent e66c6ef750
3 changed files with 257 additions and 223 deletions
--- a/src/api.cc
+++ b/src/api.cc
@ -5304,204 +5304,133 @@ int String::Utf8Length(Isolate* isolate) const {
  return utf8_length;
 }

-class Utf8WriterVisitor {
- public:
-  Utf8WriterVisitor(
-      char* buffer,
-      int capacity,
-      bool skip_capacity_check,
-      bool replace_invalid_utf8)
-    : early_termination_(false),
-      last_character_(unibrow::Utf16::kNoPreviousCharacter),
-      buffer_(buffer),
-      start_(buffer),
-      capacity_(capacity),
-      skip_capacity_check_(capacity == -1 || skip_capacity_check),
-      replace_invalid_utf8_(replace_invalid_utf8),
-      utf16_chars_read_(0) {
-  }
-
-  static int WriteEndCharacter(uint16_t character,
-                               int last_character,
-                               int remaining,
-                               char* const buffer,
-                               bool replace_invalid_utf8) {
-    DCHECK_GT(remaining, 0);
-    // We can't use a local buffer here because Encode needs to modify
-    // previous characters in the stream.  We know, however, that
-    // exactly one character will be advanced.
-    if (unibrow::Utf16::IsSurrogatePair(last_character, character)) {
-      int written = unibrow::Utf8::Encode(buffer, character, last_character,
-                                          replace_invalid_utf8);
-      DCHECK_EQ(written, 1);
-      return written;
+namespace {
+// Writes the flat content of a string to a buffer. This is done in two phases.
+// The first phase calculates a pessimistic estimate (writable_length) on how
+// many code units can be safely written without exceeding the buffer capacity
+// and without ending on a lone surrogate. The estimated number of code units
+// is then written out in one go, and the reported byte usage is used to
+// correct the estimate. This is repeated until the estimate becomes <= 0 or
+// all code units have been written out. The second phase writes out code
+// units until the buffer capacity is reached, would be exceeded by the next
+// unit, or all code units have been written out.
+template <typename Char>
+static int WriteUtf8Impl(i::Vector<const Char> string, char* write_start,
+                         int write_capacity, int options,
+                         int* utf16_chars_read_out) {
+  bool write_null = !(options & v8::String::NO_NULL_TERMINATION);
+  bool replace_invalid_utf8 = (options & v8::String::REPLACE_INVALID_UTF8);
+  char* current_write = write_start;
+  const Char* read_start = string.start();
+  int read_index = 0;
+  int read_length = string.length();
+  int prev_char = unibrow::Utf16::kNoPreviousCharacter;
+  // Do a fast loop where there is no exit capacity check.
+  // Need enough space to write everything but one character.
+  STATIC_ASSERT(unibrow::Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit == 3);
+  static const int kMaxSizePerChar = sizeof(Char) == 1 ? 2 : 3;
+  while (read_index < read_length) {
+    int up_to = read_length;
+    if (write_capacity != -1) {
+      int remaining_capacity =
+          write_capacity - static_cast<int>(current_write - write_start);
+      int writable_length =
+          (remaining_capacity - kMaxSizePerChar) / kMaxSizePerChar;
+      // Need to drop into slow loop.
+      if (writable_length <= 0) break;
+      up_to = std::min(up_to, read_index + writable_length);
    }
-    // Use a scratch buffer to check the required characters.
-    char temp_buffer[unibrow::Utf8::kMaxEncodedSize];
-    // Can't encode using last_character as gcc has array bounds issues.
-    int written = unibrow::Utf8::Encode(temp_buffer, character,
+    // Write the characters to the stream.
+    if (sizeof(Char) == 1) {
+      // Simply memcpy if we only have ASCII characters.
+      uint8_t char_mask = 0;
+      for (int i = read_index; i < up_to; i++) char_mask |= read_start[i];
+      if ((char_mask & 0x80) == 0) {
+        int copy_length = up_to - read_index;
+        memcpy(current_write, read_start + read_index, copy_length);
+        current_write += copy_length;
+        read_index = up_to;
+      } else {
+        for (; read_index < up_to; read_index++) {
+          current_write += unibrow::Utf8::EncodeOneByte(
+              current_write, static_cast<uint8_t>(read_start[read_index]));
+          DCHECK(write_capacity == -1 ||
+                 (current_write - write_start) <= write_capacity);
+        }
+      }
+    } else {
+      for (; read_index < up_to; read_index++) {
+        uint16_t character = read_start[read_index];
+        current_write += unibrow::Utf8::Encode(current_write, character,
+                                               prev_char, replace_invalid_utf8);
+        prev_char = character;
+        DCHECK(write_capacity == -1 ||
+               (current_write - write_start) <= write_capacity);
+      }
+    }
+  }
+  if (read_index < read_length) {
+    DCHECK_NE(-1, write_capacity);
+    // Aborted due to limited capacity. Check capacity on each iteration.
+    int remaining_capacity =
+        write_capacity - static_cast<int>(current_write - write_start);
+    DCHECK_GE(remaining_capacity, 0);
+    for (; read_index < read_length && remaining_capacity > 0; read_index++) {
+      uint32_t character = read_start[read_index];
+      int written = 0;
+      // We can't use a local buffer here because Encode needs to modify
+      // previous characters in the stream.  We know, however, that
+      // exactly one character will be advanced.
+      if (unibrow::Utf16::IsSurrogatePair(prev_char, character)) {
+        written = unibrow::Utf8::Encode(current_write, character, prev_char,
+                                        replace_invalid_utf8);
+        DCHECK_EQ(written, 1);
+      } else {
+        // Use a scratch buffer to check the required characters.
+        char temp_buffer[unibrow::Utf8::kMaxEncodedSize];
+        // Encoding a surrogate pair to Utf8 always takes 4 bytes.
+        static const int kSurrogatePairEncodedSize =
+            static_cast<int>(unibrow::Utf8::kMaxEncodedSize);
+        // For REPLACE_INVALID_UTF8, catch the case where we cut off in the
+        // middle of a surrogate pair. Abort before encoding the pair instead.
+        if (replace_invalid_utf8 &&
+            remaining_capacity < kSurrogatePairEncodedSize &&
+            unibrow::Utf16::IsLeadSurrogate(character) &&
+            read_index + 1 < read_length &&
+            unibrow::Utf16::IsTrailSurrogate(read_start[read_index + 1])) {
+          write_null = false;
+          break;
+        }
+        // Can't encode using prev_char as gcc has array bounds issues.
+        written = unibrow::Utf8::Encode(temp_buffer, character,
                                        unibrow::Utf16::kNoPreviousCharacter,
                                        replace_invalid_utf8);
-    // Won't fit.
-    if (written > remaining) return 0;
-    // Copy over the character from temp_buffer.
-    for (int j = 0; j < written; j++) {
-      buffer[j] = temp_buffer[j];
-    }
-    return written;
-  }
+        if (written > remaining_capacity) {
+          // Won't fit. Abort and do not null-terminate the result.
+          write_null = false;
+          break;
+        }
+        // Copy over the character from temp_buffer.
+        for (int i = 0; i < written; i++) current_write[i] = temp_buffer[i];
+      }

-  // Visit writes out a group of code units (chars) of a v8::String to the
-  // internal buffer_. This is done in two phases. The first phase calculates a
-  // pesimistic estimate (writable_length) on how many code units can be safely
-  // written without exceeding the buffer capacity and without writing the last
-  // code unit (it could be a lead surrogate). The estimated number of code
-  // units is then written out in one go, and the reported byte usage is used
-  // to correct the estimate. This is repeated until the estimate becomes <= 0
-  // or all code units have been written out. The second phase writes out code
-  // units until the buffer capacity is reached, would be exceeded by the next
-  // unit, or all units have been written out.
-  template<typename Char>
-  void Visit(const Char* chars, const int length) {
-    DCHECK(!early_termination_);
-    if (length == 0) return;
-    // Copy state to stack.
-    char* buffer = buffer_;
-    int last_character = sizeof(Char) == 1
-                             ? unibrow::Utf16::kNoPreviousCharacter
-                             : last_character_;
-    int i = 0;
-    // Do a fast loop where there is no exit capacity check.
-    while (true) {
-      int fast_length;
-      if (skip_capacity_check_) {
-        fast_length = length;
-      } else {
-        int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);
-        // Need enough space to write everything but one character.
-        STATIC_ASSERT(unibrow::Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit ==
-                      3);
-        int max_size_per_char =  sizeof(Char) == 1 ? 2 : 3;
-        int writable_length =
-            (remaining_capacity - max_size_per_char)/max_size_per_char;
-        // Need to drop into slow loop.
-        if (writable_length <= 0) break;
-        fast_length = i + writable_length;
-        if (fast_length > length) fast_length = length;
-      }
-      // Write the characters to the stream.
-      if (sizeof(Char) == 1) {
-        for (; i < fast_length; i++) {
-          buffer += unibrow::Utf8::EncodeOneByte(
-              buffer, static_cast<uint8_t>(*chars++));
-          DCHECK(capacity_ == -1 || (buffer - start_) <= capacity_);
-        }
-      } else {
-        for (; i < fast_length; i++) {
-          uint16_t character = *chars++;
-          buffer += unibrow::Utf8::Encode(buffer, character, last_character,
-                                          replace_invalid_utf8_);
-          last_character = character;
-          DCHECK(capacity_ == -1 || (buffer - start_) <= capacity_);
-        }
-      }
-      // Array is fully written. Exit.
-      if (fast_length == length) {
-        // Write state back out to object.
-        last_character_ = last_character;
-        buffer_ = buffer;
-        utf16_chars_read_ += length;
-        return;
-      }
-    }
-    DCHECK(!skip_capacity_check_);
-    // Slow loop. Must check capacity on each iteration.
-    int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);
-    DCHECK_GE(remaining_capacity, 0);
-    for (; i < length && remaining_capacity > 0; i++) {
-      uint16_t character = *chars++;
-      // remaining_capacity is <= 3 bytes at this point, so we do not write out
-      // an umatched lead surrogate.
-      if (replace_invalid_utf8_ && unibrow::Utf16::IsLeadSurrogate(character)) {
-        early_termination_ = true;
-        break;
-      }
-      int written = WriteEndCharacter(character,
-                                      last_character,
-                                      remaining_capacity,
-                                      buffer,
-                                      replace_invalid_utf8_);
-      if (written == 0) {
-        early_termination_ = true;
-        break;
-      }
-      buffer += written;
+      current_write += written;
      remaining_capacity -= written;
-      last_character = character;
+      prev_char = character;
    }
-    // Write state back out to object.
-    last_character_ = last_character;
-    buffer_ = buffer;
-    utf16_chars_read_ += i;
  }

-  inline bool IsDone() {
-    return early_termination_;
-  }
+  // Write out number of utf16 characters written to the stream.
+  if (utf16_chars_read_out != nullptr) *utf16_chars_read_out = read_index;

-  inline void VisitOneByteString(const uint8_t* chars, int length) {
-    Visit(chars, length);
+  // Only null-terminate if there's space.
+  if (write_null && (write_capacity == -1 ||
+                     (current_write - write_start) < write_capacity)) {
+    *current_write++ = '\0';
  }
-
-  inline void VisitTwoByteString(const uint16_t* chars, int length) {
-    Visit(chars, length);
-  }
-
-  int CompleteWrite(bool write_null, int* utf16_chars_read_out) {
-    // Write out number of utf16 characters written to the stream.
-    if (utf16_chars_read_out != nullptr) {
-      *utf16_chars_read_out = utf16_chars_read_;
-    }
-    // Only null terminate if all of the string was written and there's space.
-    if (write_null &&
-        !early_termination_ &&
-        (capacity_ == -1 || (buffer_ - start_) < capacity_)) {
-      *buffer_++ = '\0';
-    }
-    return static_cast<int>(buffer_ - start_);
-  }
-
- private:
-  bool early_termination_;
-  int last_character_;
-  char* buffer_;
-  char* const start_;
-  int capacity_;
-  bool const skip_capacity_check_;
-  bool const replace_invalid_utf8_;
-  int utf16_chars_read_;
-  DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
-};
-
-// TODO(yangguo): Simplify this. We can now expect the string to be flat.
-static bool RecursivelySerializeToUtf8(i::String current,
-                                       Utf8WriterVisitor* writer,
-                                       int recursion_budget) {
-  while (!writer->IsDone()) {
-    i::ConsString cons_string = i::String::VisitFlat(writer, current);
-    if (cons_string.is_null()) return true;  // Leaf node.
-    if (recursion_budget <= 0) return false;
-    // Must write the left branch first.
-    i::String first = cons_string->first();
-    bool success = RecursivelySerializeToUtf8(first,
-                                              writer,
-                                              recursion_budget - 1);
-    if (!success) return false;
-    // Inline tail recurse for right branch.
-    current = cons_string->second();
-  }
-  return true;
+  return static_cast<int>(current_write - write_start);
 }
+}  // anonymous namespace

 int String::WriteUtf8(Isolate* v8_isolate, char* buffer, int capacity,
                      int* nchars_ref, int options) const {
@ -5509,43 +5438,16 @@ int String::WriteUtf8(Isolate* v8_isolate, char* buffer, int capacity,
  i::Isolate* isolate = reinterpret_cast<i::Isolate*>(v8_isolate);
  LOG_API(isolate, String, WriteUtf8);
  ENTER_V8_NO_SCRIPT_NO_EXCEPTION(isolate);
-  str = i::String::Flatten(isolate, str);  // Flatten the string for efficiency.
-  const int string_length = str->length();
-  bool write_null = !(options & NO_NULL_TERMINATION);
-  bool replace_invalid_utf8 = (options & REPLACE_INVALID_UTF8);
-  int max16BitCodeUnitSize = unibrow::Utf8::kMax16BitCodeUnitSize;
-  // First check if we can just write the string without checking capacity.
-  if (capacity == -1 || capacity / max16BitCodeUnitSize >= string_length) {
-    Utf8WriterVisitor writer(buffer, capacity, true, replace_invalid_utf8);
-    const int kMaxRecursion = 100;
-    bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);
-    if (success) return writer.CompleteWrite(write_null, nchars_ref);
-  } else if (capacity >= string_length) {
-    // First check that the buffer is large enough.
-    int utf8_bytes = Utf8Length(v8_isolate);
-    if (utf8_bytes <= capacity) {
-      // one-byte fast path.
-      if (utf8_bytes == string_length) {
-        WriteOneByte(v8_isolate, reinterpret_cast<uint8_t*>(buffer), 0,
-                     capacity, options);
-        if (nchars_ref != nullptr) *nchars_ref = string_length;
-        if (write_null && (utf8_bytes+1 <= capacity)) {
-          return string_length + 1;
-        }
-        return string_length;
-      }
-      if (write_null && (utf8_bytes+1 > capacity)) {
-        options |= NO_NULL_TERMINATION;
-      }
-      // Recurse once without a capacity limit.
-      // This will get into the first branch above.
-      // TODO(dcarney) Check max left rec. in Utf8Length and fall through.
-      return WriteUtf8(v8_isolate, buffer, -1, nchars_ref, options);
-    }
+  str = i::String::Flatten(isolate, str);
+  i::DisallowHeapAllocation no_gc;
+  i::String::FlatContent content = str->GetFlatContent(no_gc);
+  if (content.IsOneByte()) {
+    return WriteUtf8Impl<uint8_t>(content.ToOneByteVector(), buffer, capacity,
+                                  options, nchars_ref);
+  } else {
+    return WriteUtf8Impl<uint16_t>(content.ToUC16Vector(), buffer, capacity,
+                                   options, nchars_ref);
  }
-  Utf8WriterVisitor writer(buffer, capacity, false, replace_invalid_utf8);
-  i::String::VisitFlat(&writer, *str);
-  return writer.CompleteWrite(write_null, nchars_ref);
 }

 template <typename CharType>
--- a/test/cctest/test-api.cc
+++ b/test/cctest/test-api.cc
@ -8452,6 +8452,13 @@ THREADED_TEST(StringWrite) {
  int len;
  int charlen;

+  memset(utf8buf, 0x1, 1000);
+  len = v8::String::Empty(isolate)->WriteUtf8(isolate, utf8buf, sizeof(utf8buf),
+                                              &charlen);
+  CHECK_EQ(1, len);
+  CHECK_EQ(0, charlen);
+  CHECK_EQ(0, strcmp(utf8buf, ""));
+
  memset(utf8buf, 0x1, 1000);
  len = str2->WriteUtf8(isolate, utf8buf, sizeof(utf8buf), &charlen);
  CHECK_EQ(9, len);
--- a/test/cctest/test-strings.cc
+++ b/test/cctest/test-strings.cc
@ -35,6 +35,7 @@
 #include "src/v8.h"

 #include "src/api-inl.h"
+#include "src/base/platform/elapsed-timer.h"
 #include "src/heap/factory.h"
 #include "src/messages.h"
 #include "src/objects-inl.h"
@ -957,6 +958,109 @@ TEST(Utf8Conversion) {
  }
 }

+TEST(Utf8ConversionPerf) {
+  // Smoke test for converting strings to utf-8.
+  LocalContext context;
+  v8::HandleScope handle_scope(CcTest::isolate());
+  v8::Local<v8::String> ascii_string =
+      CompileRun("'abc'.repeat(1E6)").As<v8::String>();
+  v8::Local<v8::String> one_byte_string =
+      CompileRun("'\\u0255\\u0254\\u0253'.repeat(1E6)").As<v8::String>();
+  v8::Local<v8::String> two_byte_string =
+      CompileRun("'\\u2255\\u2254\\u2253'.repeat(1E6)").As<v8::String>();
+  v8::Local<v8::String> surrogate_string =
+      CompileRun("'\\u{12345}\\u2244'.repeat(1E6)").As<v8::String>();
+  int size = 1E7;
+  char* buffer = new char[4 * size];
+  {
+    v8::base::ElapsedTimer timer;
+    timer.Start();
+    ascii_string->WriteUtf8(CcTest::isolate(), buffer, size, nullptr);
+    printf("ascii string %0.3f\n", timer.Elapsed().InMillisecondsF());
+    timer.Stop();
+  }
+  {
+    v8::base::ElapsedTimer timer;
+    timer.Start();
+    ascii_string->WriteUtf8(CcTest::isolate(), buffer, size, nullptr);
+    printf("ascii string %0.3f\n", timer.Elapsed().InMillisecondsF());
+    timer.Stop();
+  }
+  {
+    v8::base::ElapsedTimer timer;
+    timer.Start();
+    ascii_string->WriteUtf8(CcTest::isolate(), buffer, 4 * size, nullptr);
+    printf("ascii string %0.3f\n", timer.Elapsed().InMillisecondsF());
+    timer.Stop();
+  }
+
+  {
+    v8::base::ElapsedTimer timer;
+    timer.Start();
+    one_byte_string->WriteUtf8(CcTest::isolate(), buffer, size, nullptr);
+    printf("one byte string %0.3f\n", timer.Elapsed().InMillisecondsF());
+    timer.Stop();
+  }
+  {
+    v8::base::ElapsedTimer timer;
+    timer.Start();
+    one_byte_string->WriteUtf8(CcTest::isolate(), buffer, size, nullptr);
+    printf("one byte string %0.3f\n", timer.Elapsed().InMillisecondsF());
+    timer.Stop();
+  }
+  {
+    v8::base::ElapsedTimer timer;
+    timer.Start();
+    one_byte_string->WriteUtf8(CcTest::isolate(), buffer, 4 * size, nullptr);
+    printf("one byte string %0.3f\n", timer.Elapsed().InMillisecondsF());
+    timer.Stop();
+  }
+
+  {
+    v8::base::ElapsedTimer timer;
+    timer.Start();
+    two_byte_string->WriteUtf8(CcTest::isolate(), buffer, size, nullptr);
+    printf("two byte string %0.3f\n", timer.Elapsed().InMillisecondsF());
+    timer.Stop();
+  }
+  {
+    v8::base::ElapsedTimer timer;
+    timer.Start();
+    two_byte_string->WriteUtf8(CcTest::isolate(), buffer, size, nullptr);
+    printf("two byte string %0.3f\n", timer.Elapsed().InMillisecondsF());
+    timer.Stop();
+  }
+  {
+    v8::base::ElapsedTimer timer;
+    timer.Start();
+    two_byte_string->WriteUtf8(CcTest::isolate(), buffer, 4 * size, nullptr);
+    printf("two byte string %0.3f\n", timer.Elapsed().InMillisecondsF());
+    timer.Stop();
+  }
+
+  {
+    v8::base::ElapsedTimer timer;
+    timer.Start();
+    surrogate_string->WriteUtf8(CcTest::isolate(), buffer, size, nullptr);
+    printf("surrogate string %0.3f\n", timer.Elapsed().InMillisecondsF());
+    timer.Stop();
+  }
+  {
+    v8::base::ElapsedTimer timer;
+    timer.Start();
+    surrogate_string->WriteUtf8(CcTest::isolate(), buffer, size, nullptr);
+    printf("surrogate string %0.3f\n", timer.Elapsed().InMillisecondsF());
+    timer.Stop();
+  }
+  {
+    v8::base::ElapsedTimer timer;
+    timer.Start();
+    surrogate_string->WriteUtf8(CcTest::isolate(), buffer, 4 * size, nullptr);
+    printf("surrogate string %0.3f\n", timer.Elapsed().InMillisecondsF());
+    timer.Stop();
+  }
+  delete[] buffer;
+}

 TEST(ExternalShortStringAdd) {
  LocalContext context;
@ -1064,6 +1168,27 @@ TEST(ExternalShortStringAdd) {
  CHECK_EQ(0, CompileRun(source)->Int32Value(context.local()).FromJust());
 }

+TEST(ReplaceInvalidUtf8) {
+  LocalContext context;
+  v8::HandleScope handle_scope(CcTest::isolate());
+  v8::Local<v8::String> string = CompileRun("'ab\\ud800cd'").As<v8::String>();
+  char buffer[7];
+  memset(buffer, 0, 7);
+  int chars_written = 0;
+  int size = string->WriteUtf8(CcTest::isolate(), buffer, 7, &chars_written,
+                               v8::String::REPLACE_INVALID_UTF8);
+  CHECK_EQ(7, size);
+  CHECK_EQ(5, chars_written);
+  CHECK_EQ(0, memcmp("\x61\x62\xef\xbf\xbd\x63\x64", buffer, 7));
+
+  memset(buffer, 0, 7);
+  chars_written = 0;
+  size = string->WriteUtf8(CcTest::isolate(), buffer, 6, &chars_written,
+                           v8::String::REPLACE_INVALID_UTF8);
+  CHECK_EQ(6, size);
+  CHECK_EQ(4, chars_written);
+  CHECK_EQ(0, memcmp("\x61\x62\xef\xbf\xbd\x63", buffer, 6));
+}

 TEST(JSONStringifySliceMadeExternal) {
  if (!FLAG_string_slices) return;