// Copyright 2006-2008 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "v8.h"

#include "ast.h"
#include "scanner.h"

namespace v8 { namespace internal {

// ----------------------------------------------------------------------------
// Character predicates

// NOTE(review): the template argument lists in this file (Predicate<...>,
// StaticResource<...>, Handle<...>, static_cast<...>, NewArray<...>) were
// missing from the text under review and have been restored; confirm them
// against the declarations in scanner.h.
unibrow::Predicate<IdentifierStart, 128> Scanner::kIsIdentifierStart;
unibrow::Predicate<IdentifierPart, 128> Scanner::kIsIdentifierPart;
unibrow::Predicate<unibrow::LineTerminator, 128> Scanner::kIsLineTerminator;
unibrow::Predicate<unibrow::WhiteSpace, 128> Scanner::kIsWhiteSpace;

StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_;


// ----------------------------------------------------------------------------
// UTF8Buffer

// Creates an empty buffer with no backing store.
UTF8Buffer::UTF8Buffer() : data_(NULL) {
  Initialize(NULL, 0);
}


// Releases the backing store, including one handed in via Initialize().
UTF8Buffer::~UTF8Buffer() {
  DeleteArray(data_);
}


// Replaces the backing store with 'src' (of 'length' bytes) and resets the
// buffer. The previous store is deleted, and 'src' will in turn be deleted
// by this buffer, so the caller must not free it.
void UTF8Buffer::Initialize(char* src, int length) {
  DeleteArray(data_);
  data_ = src;
  size_ = length;
  Reset();
}


// Appends the UTF-8 encoding of 'c', growing the backing store if fewer
// than kMaxEncodedSize bytes remain.
void UTF8Buffer::AddChar(uc32 c) {
  const int min_size = 1024;
  if (pos_ + static_cast<int>(unibrow::Utf8::kMaxEncodedSize) > size_) {
    // Double the capacity, but never allocate less than min_size.
    int new_size = size_ * 2;
    if (new_size < min_size) {
      new_size = min_size;
    }
    char* new_data = NewArray<char>(new_size);
    // data_ is NULL before the first growth; passing a null pointer to
    // memcpy is undefined even for a zero length, so only copy when there
    // is something to copy.
    if (pos_ > 0) {
      memcpy(new_data, data_, pos_);
    }
    DeleteArray(data_);
    data_ = new_data;
    size_ = new_size;
  }
  if (static_cast<unsigned>(c) < unibrow::Utf8::kMaxOneByteChar) {
    data_[pos_++] = c;  // common case: 7bit ASCII
  } else {
    pos_ += unibrow::Utf8::Encode(&data_[pos_], c);
  }
  ASSERT(pos_ <= size_);
}


// ----------------------------------------------------------------------------
// UTF16Buffer

UTF16Buffer::UTF16Buffer()
    : pos_(0), pushback_buffer_(0), last_(0), stream_(NULL) { }


// Binds the buffer to the string 'data' and the character stream 'input',
// and rewinds the position to 0.
void UTF16Buffer::Initialize(Handle<String> data,
                             unibrow::CharacterStream* input) {
  data_ = data;
  pos_ = 0;
  stream_ = input;
}


// Returns the substring [start, end) of the underlying string.
Handle<String> UTF16Buffer::SubString(int start, int end) {
  return internal::SubString(data_, start, end);
}


// Steps back one position: the character last returned by Advance() is
// saved on the pushback stack and 'ch' becomes the last character.
void UTF16Buffer::PushBack(uc32 ch) {
  pushback_buffer()->Add(last_);
  last_ = ch;
  pos_--;
}


// Returns the next character, taking pushed-back characters first and the
// stream second. Returns -1 once both are exhausted.
uc32 UTF16Buffer::Advance() {
  // NOTE: It is of importance to Persian / Farsi resources that we do
  // *not* strip format control characters in the scanner; see
  //
  //    https://bugzilla.mozilla.org/show_bug.cgi?id=274152
  //
  // So, even though ECMA-262, section 7.1, page 11, dictates that we
  // must remove Unicode format-control characters, we do not. This is
  // in line with how IE and SpiderMonkey handles it.
  if (!pushback_buffer()->is_empty()) {
    pos_++;
    return last_ = pushback_buffer()->RemoveLast();
  } else if (stream_->has_more()) {
    pos_++;
    uc32 next = stream_->GetNext();
    return last_ = next;
  } else {
    // note: currently the following increment is necessary to avoid a
    // test-parser problem!
    pos_++;
    return last_ = static_cast<uc32>(-1);
  }
}


// Moves both the buffer position and the underlying stream to 'pos'.
// Must not be called while characters are pushed back.
void UTF16Buffer::SeekForward(int pos) {
  pos_ = pos;
  ASSERT(pushback_buffer()->is_empty());
  stream_->Seek(pos);
}


// ----------------------------------------------------------------------------
// Scanner

Scanner::Scanner(bool pre) : stack_overflow_(false), is_pre_parsing_(pre) {
  Token::Initialize();
}


// Prepares the scanner to read 'source' through 'stream' and scans the
// first token. 'position' is the offset subtracted from the positions
// passed to SubString().
void Scanner::Init(Handle<String> source,
                   unibrow::CharacterStream* stream,
                   int position) {
  // Initialize the source buffer.
  source_.Initialize(source, stream);
  position_ = position;

  // Reset literals buffer
  literals_.Reset();

  // Set c0_ (one character ahead)
  ASSERT(kCharacterLookaheadBufferSize == 1);
  Advance();

  // Skip initial whitespace (allowing HTML comment ends) and scan
  // first token.
  SkipWhiteSpace(true);
  Scan();
}


// Returns the source text in [start, end), after subtracting position_
// from both bounds.
Handle<String> Scanner::SubString(int start, int end) {
  return source_.SubString(start - position_, end - position_);
}


// Makes the lookahead token the current token, scans a new lookahead
// token, and returns the current token. On stack overflow the
// stack_overflow_ flag is set and the lookahead token becomes ILLEGAL
// instead.
Token::Value Scanner::Next() {
  // BUG 1215673: Find a thread safe way to set a stack limit in
  // pre-parse mode. Otherwise, we cannot safely pre-parse from other
  // threads.
  current_ = next_;
  // Check for stack-overflow before returning any tokens.
  StackLimitCheck check;
  if (check.HasOverflowed()) {
    stack_overflow_ = true;
    next_.token = Token::ILLEGAL;
  } else {
    Scan();
  }
  return current_.token;
}


// Records where in the literal buffer the next token's literal starts.
void Scanner::StartLiteral() {
  next_.literal_pos = literals_.pos();
}


// Appends 'c' to the literal buffer.
void Scanner::AddChar(uc32 c) {
  literals_.AddChar(c);
}


// Records where the next token's literal ends and appends a terminating
// 0 character after it.
void Scanner::TerminateLiteral() {
  next_.literal_end = literals_.pos();
  AddChar(0);
}


// Appends the lookahead character to the literal buffer and advances.
void Scanner::AddCharAdvance() {
  AddChar(c0_);
  Advance();
}


// Reads the next source character into the lookahead c0_.
void Scanner::Advance() {
  c0_ = source_.Advance();
}


// Pushes 'ch' back onto the source and makes it the lookahead character.
void Scanner::PushBack(uc32 ch) {
  source_.PushBack(ch);
  c0_ = ch;
}


static inline bool IsByteOrderMark(uc32 c) {
  // The Unicode value U+FFFE is guaranteed never to be assigned as a
  // Unicode character; this implies that in a Unicode context the
  // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
  // character expressed in little-endian byte order (since it could
  // not be a U+FFFE character expressed in big-endian byte
  // order). Nevertheless, we check for it to be compatible with
  // Spidermonkey.
  return c == 0xFEFF || c == 0xFFFE;
}


// Skips whitespace, byte-order marks and HTML comment ends ('-->' at the
// start of a line), recording in has_line_terminator_before_next_ whether
// a line terminator was skipped. 'initial' is the starting value of that
// flag; passing true lets '-->' be recognized before any line terminator
// has been seen.
void Scanner::SkipWhiteSpace(bool initial) {
  has_line_terminator_before_next_ = initial;

  while (true) {
    // We treat byte-order marks (BOMs) as whitespace for better
    // compatibility with Spidermonkey and other JavaScript engines.
    while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
      // IsWhiteSpace() includes line terminators!
      if (kIsLineTerminator.get(c0_)) {
        // Ignore line terminators, but remember them. This is necessary
        // for automatic semicolon insertion.
        has_line_terminator_before_next_ = true;
      }
      Advance();
    }

    // If there is an HTML comment end '-->' at the beginning of a
    // line (with only whitespace in front of it), we treat the rest
    // of the line as a comment. This is in line with the way
    // SpiderMonkey handles it.
    if (c0_ == '-' && has_line_terminator_before_next_) {
      Advance();
      if (c0_ == '-') {
        Advance();
        if (c0_ == '>') {
          // Treat the rest of the line as a comment.
          SkipSingleLineComment();
          // Continue skipping white space after the comment.
          continue;
        }
        PushBack('-');  // undo Advance()
      }
      PushBack('-');  // undo Advance()
    }
    return;
  }
}


// Skips the current character and then everything up to, but not
// including, the next line terminator or the end of input.
Token::Value Scanner::SkipSingleLineComment() {
  Advance();

  // The line terminator at the end of the line is not considered
  // to be part of the single-line comment; it is recognized
  // separately by the lexical grammar and becomes part of the
  // stream of input elements for the syntactic grammar (see
  // ECMA-262, section 7.4, page 12).
  while (c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
    Advance();
  }

  return Token::COMMENT;
}


// Skips a multi-line comment; c0_ must be the '*' that follows the
// opening '/'. Returns Token::COMMENT, or Token::ILLEGAL if the input
// ends before the closing '*/'.
Token::Value Scanner::SkipMultiLineComment() {
  ASSERT(c0_ == '*');
  Advance();

  while (c0_ >= 0) {
    // Keep the full code point: narrowing to char would make a character
    // whose low byte is 0x2A (e.g. U+012A) look like '*' on typical
    // platforms, and a following '/' would then end the comment early.
    uc32 ch = c0_;
    Advance();
    // If we have reached the end of the multi-line comment, we
    // consume the '/' and insert a whitespace. This way all
    // multi-line comments are treated as whitespace - even the ones
    // containing line terminators. This contradicts ECMA-262, section
    // 7.4, page 12, that says that multi-line comments containing
    // line terminators should be treated as a line terminator, but it
    // matches the behaviour of SpiderMonkey and KJS.
    if (ch == '*' && c0_ == '/') {
      c0_ = ' ';
      return Token::COMMENT;
    }
  }

  // Unterminated multi-line comment.
  return Token::ILLEGAL;
}


Token::Value Scanner::ScanHtmlComment() { // Check for