Remove Utf8InputBuffer

R=yangguo@chromium.org BUG= Review URL: https://chromiumcodereview.appspot.com/11649018 Patch from Dan Carney <dcarney@google.com>. git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13248 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
2012-12-20 09:20:37 +00:00 · 2012-12-20 09:20:37 +00:00 · eedcaf1866
commit eedcaf1866
parent 9eccd63ccc
9 changed files with 235 additions and 159 deletions
--- a/src/debug-agent.cc
+++ b/src/debug-agent.cc
@ -192,21 +192,14 @@ void DebuggerAgentSession::Run() {
    }

    // Convert UTF-8 to UTF-16.
-    unibrow::Utf8InputBuffer<> buf(msg, StrLength(msg));
-    int len = 0;
-    while (buf.has_more()) {
-      buf.GetNext();
-      len++;
-    }
-    ScopedVector<int16_t> temp(len + 1);
-    buf.Reset(msg, StrLength(msg));
-    for (int i = 0; i < len; i++) {
-      temp[i] = buf.GetNext();
-    }
+    unibrow::Utf8Decoder<128> decoder(msg, StrLength(msg));
+    int utf16_length = decoder.Utf16Length();
+    ScopedVector<uint16_t> temp(utf16_length + 1);
+    decoder.WriteUtf16(temp.start(), utf16_length);

    // Send the request received to the debugger.
-    v8::Debug::SendCommand(reinterpret_cast<const uint16_t *>(temp.start()),
-                           len,
+    v8::Debug::SendCommand(temp.start(),
+                           utf16_length,
                           NULL,
                           reinterpret_cast<v8::Isolate*>(agent_->isolate()));

--- a/src/heap.cc
+++ b/src/heap.cc
@ -4546,37 +4546,31 @@ MaybeObject* Heap::AllocateStringFromUtf8Slow(Vector<const char> string,
                                              PretenureFlag pretenure) {
  // Continue counting the number of characters in the UTF-8 string, starting
  // from the first non-ascii character or word.
-  int chars = non_ascii_start;
  Access<UnicodeCache::Utf8Decoder>
      decoder(isolate_->unicode_cache()->utf8_decoder());
-  decoder->Reset(string.start() + non_ascii_start, string.length() - chars);
-  while (decoder->has_more()) {
-    uint32_t r = decoder->GetNext();
-    if (r <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
-      chars++;
-    } else {
-      chars += 2;
-    }
-  }
-
+  decoder->Reset(string.start() + non_ascii_start,
+                 string.length() - non_ascii_start);
+  int utf16_length = decoder->Utf16Length();
+  ASSERT(utf16_length > 0);
+  // Allocate string.
  Object* result;
-  { MaybeObject* maybe_result = AllocateRawTwoByteString(chars, pretenure);
+  {
+    int chars = non_ascii_start + utf16_length;
+    MaybeObject* maybe_result = AllocateRawTwoByteString(chars, pretenure);
    if (!maybe_result->ToObject(&result)) return maybe_result;
  }
-
  // Convert and copy the characters into the new object.
  SeqTwoByteString* twobyte = SeqTwoByteString::cast(result);
-  decoder->Reset(string.start(), string.length());
-  int i = 0;
-  while (i < chars) {
-    uint32_t r = decoder->GetNext();
-    if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
-      twobyte->SeqTwoByteStringSet(i++, unibrow::Utf16::LeadSurrogate(r));
-      twobyte->SeqTwoByteStringSet(i++, unibrow::Utf16::TrailSurrogate(r));
-    } else {
-      twobyte->SeqTwoByteStringSet(i++, r);
+  // Copy ascii portion.
+  uint16_t* data = twobyte->GetChars();
+  if (non_ascii_start != 0) {
+    const char* ascii_data = string.start();
+    for (int i = 0; i < non_ascii_start; i++) {
+      *data++ = *ascii_data++;
    }
  }
+  // Now write the remainder.
+  decoder->WriteUtf16(data, utf16_length);
  return result;
 }

--- a/src/objects.cc
+++ b/src/objects.cc
@ -7641,14 +7641,20 @@ bool String::MarkAsUndetectable() {


 bool String::IsEqualTo(Vector<const char> str) {
-  Isolate* isolate = GetIsolate();
  int slen = length();
-  Access<UnicodeCache::Utf8Decoder>
-      decoder(isolate->unicode_cache()->utf8_decoder());
-  decoder->Reset(str.start(), str.length());
+  // Can't check exact length equality, but we can check bounds.
+  int str_len = str.length();
+  if (str_len < slen ||
+      str_len > slen*static_cast<int>(unibrow::Utf8::kMaxEncodedSize)) {
+    return false;
+  }
  int i;
-  for (i = 0; i < slen && decoder->has_more(); i++) {
-    uint32_t r = decoder->GetNext();
+  unsigned remaining_in_str = static_cast<unsigned>(str_len);
+  const uint8_t* utf8_data = reinterpret_cast<const uint8_t*>(str.start());
+  for (i = 0; i < slen && remaining_in_str > 0; i++) {
+    unsigned cursor = 0;
+    uint32_t r = unibrow::Utf8::ValueOf(utf8_data, remaining_in_str, &cursor);
+    ASSERT(cursor > 0 && cursor <= remaining_in_str);
    if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
      if (i > slen - 1) return false;
      if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;
@ -7656,8 +7662,10 @@ bool String::IsEqualTo(Vector<const char> str) {
    } else {
      if (Get(i) != r) return false;
    }
+    utf8_data += cursor;
+    remaining_in_str -= cursor;
  }
-  return i == slen && !decoder->has_more();
+  return i == slen && remaining_in_str == 0;
 }


@ -7862,46 +7870,51 @@ uint32_t StringHasher::GetHashField() {
 }


-uint32_t StringHasher::ComputeHashField(unibrow::CharacterStream* buffer,
-                                        int length,
-                                        uint32_t seed) {
-  typedef unibrow::Utf16 u;
-  StringHasher hasher(length, seed);
-  // Very long strings have a trivial hash that doesn't inspect the
-  // string contents.
-  if (hasher.has_trivial_hash()) {
-    return hasher.GetHashField();
+uint32_t StringHasher::ComputeUtf8Hash(Vector<const char> chars,
+                                       uint32_t seed,
+                                       int* utf16_length_out) {
+  int vector_length = chars.length();
+  // Handle some edge cases
+  if (vector_length <= 1) {
+    ASSERT(vector_length == 0 ||
+           static_cast<uint8_t>(chars.start()[0]) <=
+               unibrow::Utf8::kMaxOneByteChar);
+    *utf16_length_out = vector_length;
+    return HashSequentialString(chars.start(), vector_length, seed);
  }
-  // Do the iterative array index computation as long as there is a
-  // chance this is an array index.
-  if (hasher.is_array_index_) {
-    while (buffer->has_more()) {
-      uint32_t c = buffer->GetNext();
-      if (c > u::kMaxNonSurrogateCharCode) {
-        uint16_t c1 = u::LeadSurrogate(c);
-        uint16_t c2 = u::TrailSurrogate(c);
-        hasher.AddCharacter(c1);
-        hasher.AddCharacter(c2);
-        if (!hasher.UpdateIndex(c1)) break;
-        if (!hasher.UpdateIndex(c2)) break;
-      } else {
-        hasher.AddCharacter(c);
-        if (!hasher.UpdateIndex(c)) break;
-      }
-    }
-  }
-  // Process the remaining characters without updating the array
-  // index.
-  while (buffer->has_more()) {
-    ASSERT(!hasher.is_array_index_);
-    uint32_t c = buffer->GetNext();
-    if (c > u::kMaxNonSurrogateCharCode) {
-      hasher.AddCharacter(u::LeadSurrogate(c));
-      hasher.AddCharacter(u::TrailSurrogate(c));
+  // Start with a fake length which won't affect computation.
+  // It will be updated later.
+  StringHasher hasher(String::kMaxArrayIndexSize, seed);
+  unsigned remaining = static_cast<unsigned>(vector_length);
+  const uint8_t* stream = reinterpret_cast<const uint8_t*>(chars.start());
+  int utf16_length = 0;
+  bool is_index = true;
+  ASSERT(hasher.is_array_index_);
+  while (remaining > 0) {
+    unsigned consumed = 0;
+    uint32_t c = unibrow::Utf8::ValueOf(stream, remaining, &consumed);
+    ASSERT(consumed > 0 && consumed <= remaining);
+    stream += consumed;
+    remaining -= consumed;
+    bool is_two_characters = c > unibrow::Utf16::kMaxNonSurrogateCharCode;
+    utf16_length += is_two_characters ? 2 : 1;
+    // No need to keep hashing. But we do need to calculate utf16_length.
+    if (utf16_length > String::kMaxHashCalcLength) continue;
+    if (is_two_characters) {
+      uint16_t c1 = unibrow::Utf16::LeadSurrogate(c);
+      uint16_t c2 = unibrow::Utf16::TrailSurrogate(c);
+      hasher.AddCharacter(c1);
+      hasher.AddCharacter(c2);
+      if (is_index) is_index = hasher.UpdateIndex(c1);
+      if (is_index) is_index = hasher.UpdateIndex(c2);
    } else {
      hasher.AddCharacter(c);
+      if (is_index) is_index = hasher.UpdateIndex(c);
    }
  }
+  *utf16_length_out = static_cast<int>(utf16_length);
+  // Must set length here so that hash computation is correct.
+  hasher.length_ = utf16_length;
  return hasher.GetHashField();
 }

@ -11716,10 +11729,7 @@ class Utf8SymbolKey : public HashTableKey {

  uint32_t Hash() {
    if (hash_field_ != 0) return hash_field_ >> String::kHashShift;
-    unibrow::Utf8InputBuffer<> buffer(string_.start(),
-                                      static_cast<unsigned>(string_.length()));
-    chars_ = buffer.Utf16Length();
-    hash_field_ = StringHasher::ComputeHashField(&buffer, chars_, seed_);
+    hash_field_ = StringHasher::ComputeUtf8Hash(string_, seed_, &chars_);
    uint32_t result = hash_field_ >> String::kHashShift;
    ASSERT(result != 0);  // Ensure that the hash value of 0 is never computed.
    return result;
--- a/src/objects.h
+++ b/src/objects.h
@ -6934,9 +6934,10 @@ class StringHasher {
                                              int length,
                                              uint32_t seed);

-  static uint32_t ComputeHashField(unibrow::CharacterStream* buffer,
-                                   int length,
-                                   uint32_t seed);
+  // Reads all the data, even for long strings and computes the utf16 length.
+  static uint32_t ComputeUtf8Hash(Vector<const char> chars,
+                                  uint32_t seed,
+                                  int* utf16_length_out);

  // Calculated hash value for a string consisting of 1 to
  // String::kMaxArrayIndexSize digits with no leading zeros (except "0").
--- a/src/scanner.h
+++ b/src/scanner.h
@ -145,7 +145,7 @@ class UnicodeCache {
 // Caching predicates used by scanners.
 public:
  UnicodeCache() {}
-  typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
+  typedef unibrow::Utf8Decoder<512> Utf8Decoder;

  StaticResource<Utf8Decoder>* utf8_decoder() {
    return &utf8_decoder_;
@ -315,8 +315,6 @@ class Scanner {
  // -1 is outside of the range of any real source code.
  static const int kNoOctalLocation = -1;

-  typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
-
  explicit Scanner(UnicodeCache* scanner_contants);

  void Initialize(Utf16CharacterStream* source);
--- a/src/unicode-inl.h
+++ b/src/unicode-inl.h
@ -240,10 +240,51 @@ void InputBuffer<R, I, s>::Seek(unsigned position) {
  buffer_ = R::ReadBlock(input_, util_buffer_, s, &remaining_, &offset_);
 }

-template <unsigned s>
-Utf8InputBuffer<s>::Utf8InputBuffer(const char* data, unsigned length)
-    : InputBuffer<Utf8, Buffer<const char*>, s>(Buffer<const char*>(data,
-                                                                    length)) {
+Utf8DecoderBase::Utf8DecoderBase()
+  : unbuffered_start_(NULL),
+    utf16_length_(0),
+    last_byte_of_buffer_unused_(false) {}
+
+Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer,
+                                 unsigned buffer_length,
+                                 const uint8_t* stream,
+                                 unsigned stream_length) {
+  Reset(buffer, buffer_length, stream, stream_length);
+}
+
+template<unsigned kBufferSize>
+Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
+  : Utf8DecoderBase(buffer_,
+                    kBufferSize,
+                    reinterpret_cast<const uint8_t*>(stream),
+                    length) {
+}
+
+template<unsigned kBufferSize>
+void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
+  Utf8DecoderBase::Reset(buffer_,
+                         kBufferSize,
+                         reinterpret_cast<const uint8_t*>(stream),
+                         length);
+}
+
+template <unsigned kBufferSize>
+unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
+                                              unsigned length) const {
+  ASSERT(length > 0);
+  if (length > utf16_length_) length = utf16_length_;
+  // memcpy everything in buffer.
+  unsigned buffer_length =
+      last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
+  unsigned memcpy_length = length <= buffer_length  ? length : buffer_length;
+  memcpy(data, buffer_, memcpy_length*sizeof(uint16_t));
+  if (length <= buffer_length) return length;
+  ASSERT(unbuffered_start_ != NULL);
+  // Copy the rest the slow way.
+  WriteUtf16Slow(unbuffered_start_,
+                 data + buffer_length,
+                 length - buffer_length);
+  return length;
 }

 }  // namespace unibrow
--- a/src/unicode.cc
+++ b/src/unicode.cc
@ -277,58 +277,6 @@ uchar Utf8::CalculateValue(const byte* str,
 }


-const byte* Utf8::ReadBlock(Buffer<const char*> str, byte* buffer,
-    unsigned capacity, unsigned* chars_read_ptr, unsigned* offset_ptr) {
-  unsigned offset = *offset_ptr;
-  // Bail out early if we've reached the end of the string.
-  if (offset == str.length()) {
-    *chars_read_ptr = 0;
-    return NULL;
-  }
-  const byte* data = reinterpret_cast<const byte*>(str.data());
-  if (data[offset] <= kMaxOneByteChar) {
-    // The next character is an ASCII char so we scan forward over
-    // the following ASCII characters and return the next pure ASCII
-    // substring
-    const byte* result = data + offset;
-    offset++;
-    while ((offset < str.length()) && (data[offset] <= kMaxOneByteChar))
-      offset++;
-    *chars_read_ptr = offset - *offset_ptr;
-    *offset_ptr = offset;
-    return result;
-  } else {
-    // The next character is non-ASCII so we just fill the buffer
-    unsigned cursor = 0;
-    unsigned chars_read = 0;
-    while (offset < str.length()) {
-      uchar c = data[offset];
-      if (c <= kMaxOneByteChar) {
-        // Fast case for ASCII characters
-        if (!CharacterStream::EncodeAsciiCharacter(c,
-                                                   buffer,
-                                                   capacity,
-                                                   cursor))
-          break;
-        offset += 1;
-      } else {
-        unsigned chars = 0;
-        c = Utf8::ValueOf(data + offset, str.length() - offset, &chars);
-        if (!CharacterStream::EncodeNonAsciiCharacter(c,
-                                                      buffer,
-                                                      capacity,
-                                                      cursor))
-          break;
-        offset += chars;
-      }
-      chars_read++;
-    }
-    *offset_ptr = offset;
-    *chars_read_ptr = chars_read;
-    return buffer;
-  }
-}
-
 unsigned CharacterStream::Length() {
  unsigned result = 0;
  while (has_more()) {
@ -356,6 +304,75 @@ void CharacterStream::Seek(unsigned position) {
  }
 }

+void Utf8DecoderBase::Reset(uint16_t* buffer,
+                            unsigned buffer_length,
+                            const uint8_t* stream,
+                            unsigned stream_length) {
+  // Assume everything will fit in the buffer and stream won't be needed.
+  last_byte_of_buffer_unused_ = false;
+  unbuffered_start_ = NULL;
+  bool writing_to_buffer = true;
+  // Loop until stream is read, writing to buffer as long as buffer has space.
+  unsigned utf16_length = 0;
+  while (stream_length != 0) {
+    unsigned cursor = 0;
+    uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
+    ASSERT(cursor > 0 && cursor <= stream_length);
+    stream += cursor;
+    stream_length -= cursor;
+    bool is_two_characters = character > Utf16::kMaxNonSurrogateCharCode;
+    utf16_length += is_two_characters ? 2 : 1;
+    // Don't need to write to the buffer, but still need utf16_length.
+    if (!writing_to_buffer) continue;
+    // Write out the characters to the buffer.
+    // Must check for equality with buffer_length as we've already updated it.
+    if (utf16_length <= buffer_length) {
+      if (is_two_characters) {
+        *buffer++ = Utf16::LeadSurrogate(character);
+        *buffer++ = Utf16::TrailSurrogate(character);
+      } else {
+        *buffer++ = character;
+      }
+      if (utf16_length == buffer_length) {
+        // Just wrote last character of buffer
+        writing_to_buffer = false;
+        unbuffered_start_ = stream;
+      }
+      continue;
+    }
+    // Have gone over buffer.
+    // Last char of buffer is unused, set cursor back.
+    ASSERT(is_two_characters);
+    writing_to_buffer = false;
+    last_byte_of_buffer_unused_ = true;
+    unbuffered_start_ = stream - cursor;
+  }
+  utf16_length_ = utf16_length;
+}
+
+
+void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream,
+                                     uint16_t* data,
+                                     unsigned data_length) {
+  while (data_length != 0) {
+    unsigned cursor = 0;
+    uint32_t character = Utf8::ValueOf(stream, Utf8::kMaxEncodedSize, &cursor);
+    // There's a total lack of bounds checking for stream
+    // as it was already done in Reset.
+    stream += cursor;
+    if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+      *data++ = Utf16::LeadSurrogate(character);
+      *data++ = Utf16::TrailSurrogate(character);
+      ASSERT(data_length > 1);
+      data_length -= 2;
+    } else {
+      *data++ = character;
+      data_length -= 1;
+    }
+  }
+}
+
+
 // Uppercase:            point.category == 'Lu'

 static const uint16_t kUppercaseTable0Size = 450;
--- a/src/unicode.h
+++ b/src/unicode.h
@ -29,7 +29,8 @@
 #define V8_UNICODE_H_

 #include <sys/types.h>
-
+#include <stdint.h>
+#include <globals.h>
 /**
 * \file
 * Definitions and convenience functions for working with unicode.
@ -140,10 +141,10 @@ class Utf16 {
  // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
  // The illegality stems from the surrogate not being part of a pair.
  static const int kUtf8BytesToCodeASurrogate = 3;
-  static inline uchar LeadSurrogate(int char_code) {
+  static inline uint16_t LeadSurrogate(uint32_t char_code) {
    return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
  }
-  static inline uchar TrailSurrogate(int char_code) {
+  static inline uint16_t TrailSurrogate(uint32_t char_code) {
    return 0xdc00 + (char_code & 0x3ff);
  }
 };
@ -154,8 +155,6 @@ class Utf8 {
  static inline uchar Length(uchar chr, int previous);
  static inline unsigned Encode(
      char* out, uchar c, int previous);
-  static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
-      unsigned capacity, unsigned* chars_read, unsigned* offset);
  static uchar CalculateValue(const byte* str,
                              unsigned length,
                              unsigned* cursor);
@ -241,17 +240,42 @@ class InputBuffer : public CharacterStream {
  byte util_buffer_[kSize];
 };

-// --- U t f 8   I n p u t   B u f f e r ---

-template <unsigned s = 256>
-class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
+class Utf8DecoderBase {
 public:
-  inline Utf8InputBuffer() { }
-  inline Utf8InputBuffer(const char* data, unsigned length);
-  inline void Reset(const char* data, unsigned length) {
-    InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
-        Buffer<const char*>(data, length));
-  }
+  // Initialization done in subclass.
+  inline Utf8DecoderBase();
+  inline Utf8DecoderBase(uint16_t* buffer,
+                         unsigned buffer_length,
+                         const uint8_t* stream,
+                         unsigned stream_length);
+  inline unsigned Utf16Length() const { return utf16_length_; }
+ protected:
+  // This reads all characters and sets the utf16_length_.
+  // The first buffer_length utf16 chars are cached in the buffer.
+  void Reset(uint16_t* buffer,
+             unsigned buffer_length,
+             const uint8_t* stream,
+             unsigned stream_length);
+  static void WriteUtf16Slow(const uint8_t* stream,
+                             uint16_t* data,
+                             unsigned length);
+  const uint8_t* unbuffered_start_;
+  unsigned utf16_length_;
+  bool last_byte_of_buffer_unused_;
+ private:
+  DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
+};
+
+template <unsigned kBufferSize>
+class Utf8Decoder : public Utf8DecoderBase {
+ public:
+  inline Utf8Decoder() {}
+  inline Utf8Decoder(const char* stream, unsigned length);
+  inline void Reset(const char* stream, unsigned length);
+  inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
+ private:
+  uint16_t buffer_[kBufferSize];
 };


--- a/test/cctest/test-regexp.cc
+++ b/test/cctest/test-regexp.cc
@ -98,7 +98,6 @@ static SmartArrayPointer<const char> Parse(const char* input) {
 static bool CheckSimple(const char* input) {
  V8::Initialize(NULL);
  v8::HandleScope scope;
-  unibrow::Utf8InputBuffer<> buffer(input, StrLength(input));
  ZoneScope zone_scope(Isolate::Current()->runtime_zone(), DELETE_ON_EXIT);
  FlatStringReader reader(Isolate::Current(), CStrVector(input));
  RegExpCompileData result;
@ -117,7 +116,6 @@ struct MinMaxPair {
 static MinMaxPair CheckMinMaxMatch(const char* input) {
  V8::Initialize(NULL);
  v8::HandleScope scope;
-  unibrow::Utf8InputBuffer<> buffer(input, StrLength(input));
  ZoneScope zone_scope(Isolate::Current()->runtime_zone(), DELETE_ON_EXIT);
  FlatStringReader reader(Isolate::Current(), CStrVector(input));
  RegExpCompileData result;