Added back some utf8 optimizations

R=yangguo@chromium.org BUG=https://code.google.com/p/v8/issues/detail?id=2551 Review URL: https://codereview.chromium.org/12390057 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13842 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
2013-03-06 15:39:57 +00:00 · 2013-03-06 15:39:57 +00:00 · 07e2494735
commit 07e2494735
parent 940ed0dc92
3 changed files with 306 additions and 79 deletions
--- a/src/api.cc
+++ b/src/api.cc
@ -3831,68 +3831,200 @@ bool String::IsOneByte() const {
 }


-class Utf8LengthVisitor {
+class Utf8LengthHelper : public i::AllStatic {
 public:
-  explicit Utf8LengthVisitor()
-    : utf8_length_(0),
-      last_character_(unibrow::Utf16::kNoPreviousCharacter) {}
+  enum State {
+    kEndsWithLeadingSurrogate = 1 << 0,
+    kStartsWithTrailingSurrogate = 1 << 1,
+    kLeftmostEdgeIsCalculated = 1 << 2,
+    kRightmostEdgeIsCalculated = 1 << 3,
+    kLeftmostEdgeIsSurrogate = 1 << 4,
+    kRightmostEdgeIsSurrogate = 1 << 5
+  };

-  inline int GetLength() {
-    return utf8_length_;
+  static const uint8_t kInitialState = 0;
+
+  static inline bool EndsWithSurrogate(uint8_t state) {
+    return state & kEndsWithLeadingSurrogate;
  }

-  template<typename Char>
-  inline void Visit(const Char* chars, unsigned length) {
-    ASSERT(length > 0);
-    // TODO(dcarney) Add back ascii fast path.
-    int utf8_length = 0;
-    int last_character = last_character_;
-    for (unsigned i = 0; i < length; i++) {
-      uint16_t c = chars[i];
-      utf8_length += unibrow::Utf8::Length(c, last_character);
-      last_character = c;
+  static inline bool StartsWithSurrogate(uint8_t state) {
+    return state & kStartsWithTrailingSurrogate;
+  }
+
+  class Visitor {
+   public:
+    explicit Visitor()
+      : utf8_length_(0),
+        state_(kInitialState) {}
+
+    template<typename Char>
+    inline void Visit(const Char* chars, int length) {
+      int utf8_length = 0;
+      int last_character = unibrow::Utf16::kNoPreviousCharacter;
+      for (int i = 0; i < length; i++) {
+        uint16_t c = chars[i];
+        utf8_length += unibrow::Utf8::Length(c, last_character);
+        if (sizeof(Char) > 1) {
+          last_character = c;
+        }
+      }
+      utf8_length_ = utf8_length;
+    }
+
+    void VisitOneByteString(const uint8_t* chars, int length) {
+      Visit(chars, length);
+      state_ = kInitialState;
+    }
+
+    void VisitTwoByteString(const uint16_t* chars, int length) {
+      Visit(chars, length);
+      uint8_t state = 0;
+      if (unibrow::Utf16::IsTrailSurrogate(chars[0])) {
+        state |= kStartsWithTrailingSurrogate;
+      }
+      if (unibrow::Utf16::IsLeadSurrogate(chars[length-1])) {
+        state |= kEndsWithLeadingSurrogate;
+      }
+      state_ = state;
+    }
+
+    static i::ConsString* VisitFlat(i::String* string,
+                                    int* length,
+                                    uint8_t* state) {
+      Visitor visitor;
+      i::ConsString* cons_string = i::String::VisitFlat(&visitor, string);
+      *length = visitor.utf8_length_;
+      *state = visitor.state_;
+      return cons_string;
+    }
+
+   private:
+    int utf8_length_;
+    uint8_t state_;
+    DISALLOW_COPY_AND_ASSIGN(Visitor);
+  };
+
+  static inline void MergeLeafLeft(int* length,
+                                   uint8_t* state,
+                                   uint8_t leaf_state) {
+    bool edge_surrogate = StartsWithSurrogate(leaf_state);
+    if (!(*state & kLeftmostEdgeIsCalculated)) {
+      ASSERT(!(*state & kLeftmostEdgeIsSurrogate));
+      *state |= kLeftmostEdgeIsCalculated
+          | (edge_surrogate ? kLeftmostEdgeIsSurrogate : 0);
+    } else if (EndsWithSurrogate(*state) && edge_surrogate) {
+      *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
+    }
+    if (EndsWithSurrogate(leaf_state)) {
+      *state |= kEndsWithLeadingSurrogate;
+    } else {
+      *state &= ~kEndsWithLeadingSurrogate;
    }
-    last_character_ = last_character;
-    utf8_length_ += utf8_length;
  }

-  inline void VisitOneByteString(const uint8_t* chars, unsigned length) {
-    Visit(chars, length);
+  static inline void MergeLeafRight(int* length,
+                                    uint8_t* state,
+                                    uint8_t leaf_state) {
+    bool edge_surrogate = EndsWithSurrogate(leaf_state);
+    if (!(*state & kRightmostEdgeIsCalculated)) {
+      ASSERT(!(*state & kRightmostEdgeIsSurrogate));
+      *state |= (kRightmostEdgeIsCalculated
+          | (edge_surrogate ? kRightmostEdgeIsSurrogate : 0));
+    } else if (edge_surrogate && StartsWithSurrogate(*state)) {
+      *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
+    }
+    if (StartsWithSurrogate(leaf_state)) {
+      *state |= kStartsWithTrailingSurrogate;
+    } else {
+      *state &= ~kStartsWithTrailingSurrogate;
+    }
  }

-  inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {
-    Visit(chars, length);
+  static inline void MergeTerminal(int* length,
+                                   uint8_t state,
+                                   uint8_t* state_out) {
+    ASSERT((state & kLeftmostEdgeIsCalculated) &&
+           (state & kRightmostEdgeIsCalculated));
+    if (EndsWithSurrogate(state) && StartsWithSurrogate(state)) {
+      *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
+    }
+    *state_out = kInitialState |
+        (state & kLeftmostEdgeIsSurrogate ? kStartsWithTrailingSurrogate : 0) |
+        (state & kRightmostEdgeIsSurrogate ? kEndsWithLeadingSurrogate : 0);
+  }
+
+  static int Calculate(i::ConsString* current, uint8_t* state_out) {
+    using namespace internal;
+    int total_length = 0;
+    uint8_t state = kInitialState;
+    while (true) {
+      i::String* left = current->first();
+      i::String* right = current->second();
+      uint8_t right_leaf_state;
+      uint8_t left_leaf_state;
+      int leaf_length;
+      ConsString* left_as_cons =
+          Visitor::VisitFlat(left, &leaf_length, &left_leaf_state);
+      if (left_as_cons == NULL) {
+        total_length += leaf_length;
+        MergeLeafLeft(&total_length, &state, left_leaf_state);
+      }
+      ConsString* right_as_cons =
+          Visitor::VisitFlat(right, &leaf_length, &right_leaf_state);
+      if (right_as_cons == NULL) {
+        total_length += leaf_length;
+        MergeLeafRight(&total_length, &state, right_leaf_state);
+        // Terminal node.
+        if (left_as_cons == NULL) {
+          MergeTerminal(&total_length, state, state_out);
+          return total_length;
+        }
+      } else if (left_as_cons != NULL) {
+        // Both strings are ConsStrings.
+        // Recurse on smallest.
+        if (left->length() < right->length()) {
+          total_length += Calculate(left_as_cons, &left_leaf_state);
+          MergeLeafLeft(&total_length, &state, left_leaf_state);
+          current = right_as_cons;
+          continue;
+        } else {
+          total_length += Calculate(right_as_cons, &right_leaf_state);
+          MergeLeafRight(&total_length, &state, right_leaf_state);
+          current = left_as_cons;
+          continue;
+        }
+      }
+      // 1 leaf node. Do in place descent.
+      if (left_as_cons != NULL) {
+        current = left_as_cons;
+      } else {
+        ASSERT(right_as_cons != NULL);
+        current = right_as_cons;
+      }
+    }
+    UNREACHABLE();
+    return 0;
+  }
+
+  static inline int Calculate(i::ConsString* current) {
+    uint8_t state = kInitialState;
+    return Calculate(current, &state);
  }

 private:
-  int utf8_length_;
-  int last_character_;
-  DISALLOW_COPY_AND_ASSIGN(Utf8LengthVisitor);
+  DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8LengthHelper);
 };


 static int Utf8Length(i::String* str, i::Isolate* isolate) {
-  unsigned length = static_cast<unsigned>(str->length());
+  int length = str->length();
  if (length == 0) return 0;
-  int32_t type = str->map()->instance_type();
-  Utf8LengthVisitor visitor;
-  // Non ConsString branch.
-  if ((type & i::kStringRepresentationMask) != i::kConsStringTag) {
-    i::ConsStringNullOp null_op;
-    i::String::Visit(str, 0, visitor, null_op, type, length);
-    return visitor.GetLength();
-  }
-  i::ConsStringIteratorOp* op = isolate->write_iterator();
-  unsigned offset = 0;
-  i::String* leaf = op->Operate(str, &offset, &type, &length);
-  ASSERT(leaf != NULL);
-  while (leaf != NULL) {
-    i::ConsStringNullOp null_op;
-    ASSERT(offset == 0);
-    i::String::Visit(leaf, 0, visitor, null_op, type, length);
-    leaf = op->ContinueOperation(&type, &length);
-  }
-  return visitor.GetLength();
+  uint8_t state;
+  i::ConsString* cons_string =
+      Utf8LengthHelper::Visitor::VisitFlat(str, &length, &state);
+  if (cons_string == NULL) return length;
+  return Utf8LengthHelper::Calculate(cons_string);
 }


@ -3906,12 +4038,14 @@ int String::Utf8Length() const {

 class Utf8WriterVisitor {
 public:
-  Utf8WriterVisitor(char* buffer, int capacity)
+  Utf8WriterVisitor(
+      char* buffer, int capacity, bool skip_capacity_check)
    : early_termination_(false),
      last_character_(unibrow::Utf16::kNoPreviousCharacter),
      buffer_(buffer),
      start_(buffer),
      capacity_(capacity),
+      skip_capacity_check_(capacity == -1 || skip_capacity_check),
      utf16_chars_read_(0) {
  }

@ -3935,7 +4069,7 @@ class Utf8WriterVisitor {
    // Can't encode using last_character as gcc has array bounds issues.
    int written = Utf8::Encode(temp_buffer,
                               character,
-                               unibrow::Utf16::kNoPreviousCharacter);
+                               Utf16::kNoPreviousCharacter);
    // Won't fit.
    if (written > remaining) return 0;
    // Copy over the character from temp_buffer.
@ -3948,45 +4082,55 @@ class Utf8WriterVisitor {
  template<typename Char>
  void Visit(const Char* chars, const int length) {
    using namespace unibrow;
-    // TODO(dcarney): Add back ascii fast path.
    ASSERT(!early_termination_);
-    ASSERT(length > 0);
+    if (length == 0) return;
    // Copy state to stack.
    char* buffer = buffer_;
-    int last_character = last_character_;
+    int last_character =
+        sizeof(Char) == 1 ? Utf16::kNoPreviousCharacter : last_character_;
    int i = 0;
    // Do a fast loop where there is no exit capacity check.
    while (true) {
      int fast_length;
-      if (capacity_ == -1) {
+      if (skip_capacity_check_) {
        fast_length = length;
      } else {
        int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);
        // Need enough space to write everything but one character.
        STATIC_ASSERT(Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit == 3);
-        int writable_length = (remaining_capacity - 3)/3;
+        int max_size_per_char =  sizeof(Char) == 1 ? 2 : 3;
+        int writable_length =
+            (remaining_capacity - max_size_per_char)/max_size_per_char;
        // Need to drop into slow loop.
        if (writable_length <= 0) break;
        fast_length = i + writable_length;
        if (fast_length > length) fast_length = length;
      }
      // Write the characters to the stream.
-      for (; i < fast_length; i++) {
-        uint16_t character = *chars++;
-        buffer += Utf8::Encode(buffer, character, last_character);
-        last_character = character;
-        ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
+      if (sizeof(Char) == 1) {
+        for (; i < fast_length; i++) {
+          buffer +=
+              Utf8::Encode(buffer, *chars++, Utf16::kNoPreviousCharacter);
+          ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
+        }
+      } else {
+        for (; i < fast_length; i++) {
+          uint16_t character = *chars++;
+          buffer += Utf8::Encode(buffer, character, last_character);
+          last_character = character;
+          ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
+        }
      }
      // Array is fully written. Exit.
      if (fast_length == length) {
        // Write state back out to object.
        last_character_ = last_character;
        buffer_ = buffer;
-        utf16_chars_read_ += i;
+        utf16_chars_read_ += length;
        return;
      }
    }
-    ASSERT(capacity_ != -1);
+    ASSERT(!skip_capacity_check_);
    // Slow loop. Must check capacity on each iteration.
    int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);
    ASSERT(remaining_capacity >= 0);
@ -4014,15 +4158,15 @@ class Utf8WriterVisitor {
    return early_termination_;
  }

-  inline void VisitOneByteString(const uint8_t* chars, unsigned length) {
-    Visit(chars, static_cast<int>(length));
+  inline void VisitOneByteString(const uint8_t* chars, int length) {
+    Visit(chars, length);
  }

-  inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {
-    Visit(chars, static_cast<int>(length));
+  inline void VisitTwoByteString(const uint16_t* chars, int length) {
+    Visit(chars, length);
  }

-  inline int CompleteWrite(bool write_null, int* utf16_chars_read_out) {
+  int CompleteWrite(bool write_null, int* utf16_chars_read_out) {
    // Write out number of utf16 characters written to the stream.
    if (utf16_chars_read_out != NULL) {
      *utf16_chars_read_out = utf16_chars_read_;
@ -4042,11 +4186,32 @@ class Utf8WriterVisitor {
  char* buffer_;
  char* const start_;
  int capacity_;
+  bool const skip_capacity_check_;
  int utf16_chars_read_;
  DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
 };


+static bool RecursivelySerializeToUtf8(i::String* current,
+                                      Utf8WriterVisitor* writer,
+                                      int recursion_budget) {
+  while (!writer->IsDone()) {
+    i::ConsString* cons_string = i::String::VisitFlat(writer, current);
+    if (cons_string == NULL) return true;  // Leaf node.
+    if (recursion_budget <= 0) return false;
+    // Must write the left branch first.
+    i::String* first = cons_string->first();
+    bool success = RecursivelySerializeToUtf8(first,
+                                              writer,
+                                              recursion_budget - 1);
+    if (!success) return false;
+    // Inline tail recurse for right branch.
+    current = cons_string->second();
+  }
+  return true;
+}
+
+
 int String::WriteUtf8(char* buffer,
                      int capacity,
                      int* nchars_ref,
@ -4059,23 +4224,41 @@ int String::WriteUtf8(char* buffer,
  if (options & HINT_MANY_WRITES_EXPECTED) {
    FlattenString(str);  // Flatten the string for efficiency.
  }
-  Utf8WriterVisitor writer(buffer, capacity);
-  i::ConsStringIteratorOp* op = isolate->write_iterator();
-  op->Reset();
-  int32_t type = str->map()->instance_type();
-  unsigned str_length = static_cast<unsigned>(str->length());
-  if (str_length != 0) {
-    i::String::Visit(*str, 0, writer, *op, type, str_length);
-    while (!writer.IsDone()) {
-      unsigned length_out;
-      i::String* next = op->ContinueOperation(&type, &length_out);
-      if (next == NULL) break;
-      // TODO(dcarney): need an asserting null op.
-      i::ConsStringNullOp null_op;
-      i::String::Visit(next, 0, writer, null_op, type, length_out);
+  const int string_length = str->length();
+  bool write_null = !(options & NO_NULL_TERMINATION);
+  // First check if we can just write the string without checking capacity.
+  if (capacity == -1 || capacity / 3 >= string_length) {
+    Utf8WriterVisitor writer(buffer, capacity, true);
+    const int kMaxRecursion = 100;
+    bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);
+    if (success) return writer.CompleteWrite(write_null, nchars_ref);
+  } else if (capacity >= string_length) {
+    // First check that the buffer is large enough.
+    int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate());
+    if (utf8_bytes <= capacity) {
+      // ASCII fast path.
+      if (utf8_bytes == string_length) {
+        WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options);
+        if (nchars_ref != NULL) *nchars_ref = string_length;
+        if (write_null && (utf8_bytes+1 <= capacity)) {
+          return string_length + 1;
+        }
+        return string_length;
+      }
+      if (write_null && (utf8_bytes+1 > capacity)) {
+        options |= NO_NULL_TERMINATION;
+      }
+      // Recurse once without a capacity limit.
+      // This will get into the first branch above.
+      // TODO(dcarney) Check max left rec. in Utf8Length and fall through.
+      return WriteUtf8(buffer, -1, nchars_ref, options);
    }
  }
-  return writer.CompleteWrite(!(options & NO_NULL_TERMINATION), nchars_ref);
+  // Recursive slow path can potentially be unreasonable slow. Flatten.
+  str = FlattenGetString(str);
+  Utf8WriterVisitor writer(buffer, capacity, false);
+  i::String::VisitFlat(&writer, *str);
+  return writer.CompleteWrite(write_null, nchars_ref);
 }


--- a/src/objects-inl.h
+++ b/src/objects-inl.h
@ -2648,6 +2648,32 @@ void String::Visit(
 }


+// TODO(dcarney): Remove this class after conversion to VisitFlat.
+class ConsStringCaptureOp {
+ public:
+  inline ConsStringCaptureOp() : cons_string_(NULL) {}
+  inline String* Operate(String* string, unsigned*, int32_t*, unsigned*) {
+    cons_string_ = ConsString::cast(string);
+    return NULL;
+  }
+  ConsString* cons_string_;
+};
+
+
+template<class Visitor>
+ConsString* String::VisitFlat(Visitor* visitor,
+                              String* string,
+                              int offset,
+                              int length,
+                              int32_t type) {
+  ASSERT(length >= 0 && length == string->length());
+  ASSERT(offset >= 0 && offset <= length);
+  ConsStringCaptureOp op;
+  Visit(string, offset, *visitor, op, type, static_cast<unsigned>(length));
+  return op.cons_string_;
+}
+
+
 uint16_t SeqOneByteString::SeqOneByteStringGet(int index) {
  ASSERT(index >= 0 && index < length());
  return READ_BYTE_FIELD(this, kHeaderSize + index * kCharSize);
--- a/src/objects.h
+++ b/src/objects.h
@ -7337,6 +7337,8 @@ class Symbol: public Name {
 };


+class ConsString;
+
 // The String abstract class captures JavaScript string values:
 //
 // Ecma-262:
@ -7615,6 +7617,7 @@ class String: public Name {
    return NonOneByteStart(chars, length) >= length;
  }

+  // TODO(dcarney): Replace all instances of this with VisitFlat.
  template<class Visitor, class ConsOp>
  static inline void Visit(String* string,
                           unsigned offset,
@ -7623,6 +7626,21 @@ class String: public Name {
                           int32_t type,
                           unsigned length);

+  template<class Visitor>
+  static inline ConsString* VisitFlat(Visitor* visitor,
+                                      String* string,
+                                      int offset,
+                                      int length,
+                                      int32_t type);
+
+  template<class Visitor>
+  static inline ConsString* VisitFlat(Visitor* visitor,
+                                      String* string,
+                                      int offset = 0) {
+    int32_t type = string->map()->instance_type();
+    return VisitFlat(visitor, string, offset, string->length(), type);
+  }
+
 private:
  friend class Name;