Streamline the scanner for external two byte string input.

Review URL: http://codereview.chromium.org/165403 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@2703 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
2009-08-18 07:14:02 +00:00 · 2009-08-18 07:14:02 +00:00 · 1c70072f7f
commit 1c70072f7f
parent 33fb11c12f
8 changed files with 132 additions and 52 deletions
--- a/src/api.cc
+++ b/src/api.cc
@ -1046,7 +1046,7 @@ void ObjectTemplate::SetInternalFieldCount(int value) {

 ScriptData* ScriptData::PreCompile(const char* input, int length) {
  unibrow::Utf8InputBuffer<> buf(input, length);
-  return i::PreParse(&buf, NULL);
+  return i::PreParse(i::Handle<i::String>(), &buf, NULL);
 }


--- a/src/compiler.cc
+++ b/src/compiler.cc
@ -266,7 +266,7 @@ Handle<JSFunction> Compiler::Compile(Handle<String> source,
    if (pre_data == NULL && source_length >= FLAG_min_preparse_length) {
      Access<SafeStringInputBuffer> buf(&safe_string_input_buffer);
      buf->Reset(source.location());
-      pre_data = PreParse(buf.value(), extension);
+      pre_data = PreParse(source, buf.value(), extension);
    }

    // Create a script object describing the script to be compiled.
--- a/src/factory.cc
+++ b/src/factory.cc
@ -87,8 +87,10 @@ Handle<String> Factory::NewStringFromUtf8(Vector<const char> string,
 }


-Handle<String> Factory::NewStringFromTwoByte(Vector<const uc16> string) {
-  CALL_HEAP_FUNCTION(Heap::AllocateStringFromTwoByte(string), String);
+Handle<String> Factory::NewStringFromTwoByte(Vector<const uc16> string,
+                                             PretenureFlag pretenure) {
+  CALL_HEAP_FUNCTION(Heap::AllocateStringFromTwoByte(string, pretenure),
+                     String);
 }


--- a/src/factory.h
+++ b/src/factory.h
@ -92,7 +92,8 @@ class Factory : public AllStatic {
      Vector<const char> str,
      PretenureFlag pretenure = NOT_TENURED);

-  static Handle<String> NewStringFromTwoByte(Vector<const uc16> str);
+  static Handle<String> NewStringFromTwoByte(Vector<const uc16> str,
+      PretenureFlag pretenure = NOT_TENURED);

  // Allocates and partially initializes a TwoByte String. The characters of
  // the string are uninitialized. Currently used in regexp code only, where
--- a/src/parser.cc
+++ b/src/parser.cc
@ -97,7 +97,7 @@ class Parser {

  // Pre-parse the program from the character stream; returns true on
  // success, false if a stack-overflow happened during parsing.
-  bool PreParseProgram(unibrow::CharacterStream* stream);
+  bool PreParseProgram(Handle<String> source, unibrow::CharacterStream* stream);

  void ReportMessage(const char* message, Vector<const char*> args);
  virtual void ReportMessageAt(Scanner::Location loc,
@ -1167,13 +1167,14 @@ Parser::Parser(Handle<Script> script,
 }


-bool Parser::PreParseProgram(unibrow::CharacterStream* stream) {
+bool Parser::PreParseProgram(Handle<String> source,
+                             unibrow::CharacterStream* stream) {
  HistogramTimerScope timer(&Counters::pre_parse);
  StackGuard guard;
  AssertNoZoneAllocation assert_no_zone_allocation;
  AssertNoAllocation assert_no_allocation;
  NoHandleAllocation no_handle_allocation;
-  scanner_.Init(Handle<String>(), stream, 0);
+  scanner_.Init(source, stream, 0);
  ASSERT(target_stack_ == NULL);
  mode_ = PARSE_EAGERLY;
  DummyScope top_scope;
@ -4593,7 +4594,8 @@ unsigned* ScriptDataImpl::Data() {
 }


-ScriptDataImpl* PreParse(unibrow::CharacterStream* stream,
+ScriptDataImpl* PreParse(Handle<String> source,
+                         unibrow::CharacterStream* stream,
                         v8::Extension* extension) {
  Handle<Script> no_script;
  bool allow_natives_syntax =
@ -4601,7 +4603,7 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream,
      FLAG_allow_natives_syntax ||
      Bootstrapper::IsActive();
  PreParser parser(no_script, allow_natives_syntax, extension);
-  if (!parser.PreParseProgram(stream)) return NULL;
+  if (!parser.PreParseProgram(source, stream)) return NULL;
  // The list owns the backing store so we need to clone the vector.
  // That way, the result will be exactly the right size rather than
  // the expected 50% too large.
--- a/src/parser.h
+++ b/src/parser.h
@ -143,7 +143,8 @@ FunctionLiteral* MakeAST(bool compile_in_global_context,
                         ScriptDataImpl* pre_data);


-ScriptDataImpl* PreParse(unibrow::CharacterStream* stream,
+ScriptDataImpl* PreParse(Handle<String> source,
+                         unibrow::CharacterStream* stream,
                         v8::Extension* extension);


--- a/src/scanner.cc
+++ b/src/scanner.cc
@ -92,18 +92,7 @@ void UTF8Buffer::AddCharSlow(uc32 c) {


 UTF16Buffer::UTF16Buffer()
-  : pos_(0),
-    pushback_buffer_(0),
-    last_(0),
-    stream_(NULL) { }
-
-
-void UTF16Buffer::Initialize(Handle<String> data,
-                             unibrow::CharacterStream* input) {
-  data_ = data;
-  pos_ = 0;
-  stream_ = input;
-}
+    : pos_(0), size_(0) { }


 Handle<String> UTF16Buffer::SubString(int start, int end) {
@ -111,14 +100,27 @@ Handle<String> UTF16Buffer::SubString(int start, int end) {
 }


-void UTF16Buffer::PushBack(uc32 ch) {
+// CharacterStreamUTF16Buffer
+CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()
+    : pushback_buffer_(0), last_(0), stream_(NULL) { }
+
+
+void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,
+                                            unibrow::CharacterStream* input) {
+  data_ = data;
+  pos_ = 0;
+  stream_ = input;
+}
+
+
+void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {
  pushback_buffer()->Add(last_);
  last_ = ch;
  pos_--;
 }


-uc32 UTF16Buffer::Advance() {
+uc32 CharacterStreamUTF16Buffer::Advance() {
  // NOTE: It is of importance to Persian / Farsi resources that we do
  // *not* strip format control characters in the scanner; see
  //
@ -135,7 +137,7 @@ uc32 UTF16Buffer::Advance() {
    uc32 next = stream_->GetNext();
    return last_ = next;
  } else {
-    // note: currently the following increment is necessary to avoid a
+    // Note: currently the following increment is necessary to avoid a
    // test-parser problem!
    pos_++;
    return last_ = static_cast<uc32>(-1);
@ -143,13 +145,53 @@ uc32 UTF16Buffer::Advance() {
 }


-void UTF16Buffer::SeekForward(int pos) {
+void CharacterStreamUTF16Buffer::SeekForward(int pos) {
  pos_ = pos;
  ASSERT(pushback_buffer()->is_empty());
  stream_->Seek(pos);
 }


+// TwoByteStringUTF16Buffer
+TwoByteStringUTF16Buffer::TwoByteStringUTF16Buffer()
+    : raw_data_(NULL) { }
+
+
+void TwoByteStringUTF16Buffer::Initialize(
+     Handle<ExternalTwoByteString> data) {
+  ASSERT(!data.is_null());
+
+  data_ = data;
+  pos_ = 0;  // Start reading at the first character.
+
+  raw_data_ = data->resource()->data();  // Indexed directly by Advance().
+  size_ = data->length();
+}
+
+
+uc32 TwoByteStringUTF16Buffer::Advance() {
+  if (pos_ < size_) {
+    return raw_data_[pos_++];
+  } else {
+    // Note: currently the following increment is necessary to avoid a
+    // test-parser problem!
+    pos_++;
+    return static_cast<uc32>(-1);  // End-of-input marker.
+  }
+}
+
+
+void TwoByteStringUTF16Buffer::PushBack(uc32 ch) {
+  pos_--;  // Un-read one char; ch is current again and sits at pos_ - 1.
+  ASSERT(pos_ >= 1 && raw_data_[pos_ - 1] == ch);
+}
+
+
+void TwoByteStringUTF16Buffer::SeekForward(int pos) {
+  pos_ = pos;  // The next Advance() reads from index pos.
+}
+
+
 // ----------------------------------------------------------------------------
 // Scanner

@ -161,7 +203,15 @@ Scanner::Scanner(bool pre) : stack_overflow_(false), is_pre_parsing_(pre) {
 void Scanner::Init(Handle<String> source, unibrow::CharacterStream* stream,
    int position) {
  // Initialize the source buffer.
-  source_.Initialize(source, stream);
+  if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) {
+    two_byte_string_buffer_.Initialize(
+        Handle<ExternalTwoByteString>::cast(source));
+    source_ = &two_byte_string_buffer_;
+  } else {
+    char_stream_buffer_.Initialize(source, stream);
+    source_ = &char_stream_buffer_;
+  }
+
  position_ = position;

  // Reset literals buffer
@ -180,7 +230,7 @@ void Scanner::Init(Handle<String> source, unibrow::CharacterStream* stream,


 Handle<String> Scanner::SubString(int start, int end) {
-  return source_.SubString(start - position_, end - position_);
+  return source_->SubString(start - position_, end - position_);
 }


@ -223,17 +273,6 @@ void Scanner::AddCharAdvance() {
 }


-void Scanner::Advance() {
-  c0_ = source_.Advance();
-}
-
-
-void Scanner::PushBack(uc32 ch) {
-  source_.PushBack(ch);
-  c0_ = ch;
-}
-
-
 static inline bool IsByteOrderMark(uc32 c) {
  // The Unicode value U+FFFE is guaranteed never to be assigned as a
  // Unicode character; this implies that in a Unicode context the
@ -583,7 +622,7 @@ void Scanner::Scan() {


 void Scanner::SeekForward(int pos) {
-  source_.SeekForward(pos - 1);
+  source_->SeekForward(pos - 1);
  Advance();
  Scan();
 }
--- a/src/scanner.h
+++ b/src/scanner.h
@ -73,24 +73,53 @@ class UTF8Buffer {
 class UTF16Buffer {
 public:
  UTF16Buffer();
+  virtual ~UTF16Buffer() {}
+
+  virtual void PushBack(uc32 ch) = 0;
+  // Returns a value < 0 when the end of the buffer is reached.
+  virtual uc32 Advance() = 0;
+  virtual void SeekForward(int pos) = 0;

-  void Initialize(Handle<String> data, unibrow::CharacterStream* stream);
-  void PushBack(uc32 ch);
-  uc32 Advance();  // returns a value < 0 when the buffer end is reached
-  uint16_t CharAt(int index);
  int pos() const { return pos_; }
  int size() const { return size_; }
  Handle<String> SubString(int start, int end);
-  List<uc32>* pushback_buffer() { return &pushback_buffer_; }
-  void SeekForward(int pos);

- private:
+ protected:
  Handle<String> data_;
  int pos_;
  int size_;
+};
+
+
+class CharacterStreamUTF16Buffer: public UTF16Buffer {
+ public:
+  CharacterStreamUTF16Buffer();
+  virtual ~CharacterStreamUTF16Buffer() {}
+  void Initialize(Handle<String> data, unibrow::CharacterStream* stream);
+  virtual void PushBack(uc32 ch);
+  virtual uc32 Advance();
+  virtual void SeekForward(int pos);
+
+ private:
  List<uc32> pushback_buffer_;
  uc32 last_;
  unibrow::CharacterStream* stream_;
+
+  List<uc32>* pushback_buffer() { return &pushback_buffer_; }
+};
+
+
+class TwoByteStringUTF16Buffer: public UTF16Buffer {
+ public:
+  TwoByteStringUTF16Buffer();
+  virtual ~TwoByteStringUTF16Buffer() {}
+  void Initialize(Handle<ExternalTwoByteString> data);
+  virtual void PushBack(uc32 ch);
+  virtual uc32 Advance();
+  virtual void SeekForward(int pos);
+
+ private:
+  const uint16_t* raw_data_;  // Set by Initialize() from the string resource.
 };


@ -184,8 +213,11 @@ class Scanner {
  static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;

 private:
+  CharacterStreamUTF16Buffer char_stream_buffer_;
+  TwoByteStringUTF16Buffer two_byte_string_buffer_;
+
  // Source.
-  UTF16Buffer source_;
+  UTF16Buffer* source_;
  int position_;

  // Buffer to hold literal values (identifiers, strings, numbers)
@ -219,8 +251,11 @@ class Scanner {
  void TerminateLiteral();

  // Low-level scanning support.
-  void Advance();
-  void PushBack(uc32 ch);
+  void Advance() { c0_ = source_->Advance(); }
+  void PushBack(uc32 ch) {
+    source_->PushBack(ch);
+    c0_ = ch;
+  }

  bool SkipWhiteSpace();
  Token::Value SkipSingleLineComment();
@ -243,7 +278,7 @@ class Scanner {

  // Return the current source position.
  int source_pos() {
-    return source_.pos() - kCharacterLookaheadBufferSize + position_;
+    return source_->pos() - kCharacterLookaheadBufferSize + position_;
  }

  // Decodes a unicode escape-sequence which is part of an identifier.