diff --git a/src/scanner.cc b/src/scanner.cc index 34cac08c3d..1a8ed75342 100644 --- a/src/scanner.cc +++ b/src/scanner.cc @@ -42,27 +42,29 @@ unibrow::Predicate Scanner::kIsIdentifierPart; unibrow::Predicate Scanner::kIsLineTerminator; unibrow::Predicate Scanner::kIsWhiteSpace; + StaticResource Scanner::utf8_decoder_; + // ---------------------------------------------------------------------------- // UTF8Buffer -UTF8Buffer::UTF8Buffer() : - data_(NULL), limit_(NULL) { -} +UTF8Buffer::UTF8Buffer() : data_(NULL), limit_(NULL) { } + UTF8Buffer::~UTF8Buffer() { DeleteArray(data_); } + void UTF8Buffer::AddCharSlow(uc32 c) { static const int kCapacityGrowthLimit = 1 * MB; if (cursor_ > limit_) { int old_capacity = Capacity(); int old_position = pos(); - int new_capacity = Min(old_capacity * 3, old_capacity - + kCapacityGrowthLimit); - char* new_data = NewArray (new_capacity); + int new_capacity = + Min(old_capacity * 3, old_capacity + kCapacityGrowthLimit); + char* new_data = NewArray(new_capacity); memcpy(new_data, data_, old_position); DeleteArray(data_); data_ = new_data; @@ -70,30 +72,32 @@ void UTF8Buffer::AddCharSlow(uc32 c) { limit_ = ComputeLimit(new_data, new_capacity); ASSERT(Capacity() == new_capacity && pos() == old_position); } - if (static_cast (c) <= unibrow::Utf8::kMaxOneByteChar) { - *cursor_++ = c; // Common case: 7-bit ASCII. + if (static_cast(c) <= unibrow::Utf8::kMaxOneByteChar) { + *cursor_++ = c; // Common case: 7-bit ASCII. } else { cursor_ += unibrow::Utf8::Encode(cursor_, c); } ASSERT(pos() <= Capacity()); } + // ---------------------------------------------------------------------------- // UTF16Buffer -UTF16Buffer::UTF16Buffer() : - pos_(0), size_(0) { -} +UTF16Buffer::UTF16Buffer() + : pos_(0), size_(0) { } + Handle UTF16Buffer::SubString(int start, int end) { return internal::SubString(data_, start, end); } + // CharacterStreamUTF16Buffer -CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer() : - pushback_buffer_(0), last_(0), stream_(NULL) { -} +CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer() + : pushback_buffer_(0), last_(0), stream_(NULL) { } + void CharacterStreamUTF16Buffer::Initialize(Handle data, unibrow::CharacterStream* input) { @@ -102,12 +106,14 @@ void CharacterStreamUTF16Buffer::Initialize(Handle data, stream_ = input; } + void CharacterStreamUTF16Buffer::PushBack(uc32 ch) { pushback_buffer()->Add(last_); last_ = ch; pos_--; } + uc32 CharacterStreamUTF16Buffer::Advance() { // NOTE: It is of importance to Persian / Farsi resources that we do // *not* strip format control characters in the scanner; see @@ -128,22 +134,25 @@ uc32 CharacterStreamUTF16Buffer::Advance() { // Note: currently the following increment is necessary to avoid a // test-parser problem! pos_++; - return last_ = static_cast (-1); + return last_ = static_cast(-1); } } + void CharacterStreamUTF16Buffer::SeekForward(int pos) { pos_ = pos; ASSERT(pushback_buffer()->is_empty()); stream_->Seek(pos); } -// TwoByteStringUTF16Buffer -TwoByteStringUTF16Buffer::TwoByteStringUTF16Buffer() : - raw_data_(NULL) { -} -void TwoByteStringUTF16Buffer::Initialize(Handle data) { +// TwoByteStringUTF16Buffer +TwoByteStringUTF16Buffer::TwoByteStringUTF16Buffer() + : raw_data_(NULL) { } + + +void TwoByteStringUTF16Buffer::Initialize( + Handle data) { ASSERT(!data.is_null()); data_ = data; @@ -153,6 +162,7 @@ void TwoByteStringUTF16Buffer::Initialize(Handle data) { size_ = data->length(); } + uc32 TwoByteStringUTF16Buffer::Advance() { if (pos_ < size_) { return raw_data_[pos_++]; @@ -160,35 +170,50 @@ uc32 TwoByteStringUTF16Buffer::Advance() { // note: currently the following increment is necessary to avoid a // test-parser problem! pos_++; - return static_cast (-1); + return static_cast(-1); } } + void TwoByteStringUTF16Buffer::PushBack(uc32 ch) { pos_--; ASSERT(pos_ >= Scanner::kCharacterLookaheadBufferSize); ASSERT(raw_data_[pos_ - Scanner::kCharacterLookaheadBufferSize] == ch); } + void TwoByteStringUTF16Buffer::SeekForward(int pos) { pos_ = pos; } + // ---------------------------------------------------------------------------- // Keyword Matcher -KeywordMatcher::FirstState KeywordMatcher::first_states_[] = { { "break", - KEYWORD_PREFIX, Token::BREAK }, { NULL, C, Token::ILLEGAL }, { NULL, D, - Token::ILLEGAL }, { "else", KEYWORD_PREFIX, Token::ELSE }, { NULL, F, - Token::ILLEGAL }, { NULL, UNMATCHABLE, Token::ILLEGAL }, { NULL, - UNMATCHABLE, Token::ILLEGAL }, { NULL, I, Token::ILLEGAL }, { NULL, - UNMATCHABLE, Token::ILLEGAL }, { NULL, UNMATCHABLE, Token::ILLEGAL }, { - NULL, UNMATCHABLE, Token::ILLEGAL }, { NULL, UNMATCHABLE, Token::ILLEGAL }, - { NULL, N, Token::ILLEGAL }, { NULL, UNMATCHABLE, Token::ILLEGAL }, { NULL, - UNMATCHABLE, Token::ILLEGAL }, { NULL, UNMATCHABLE, Token::ILLEGAL }, { - "return", KEYWORD_PREFIX, Token::RETURN }, { "switch", KEYWORD_PREFIX, - Token::SWITCH }, { NULL, T, Token::ILLEGAL }, { NULL, UNMATCHABLE, - Token::ILLEGAL }, { NULL, V, Token::ILLEGAL }, { NULL, W, - Token::ILLEGAL } }; +KeywordMatcher::FirstState KeywordMatcher::first_states_[] = { + { "break", KEYWORD_PREFIX, Token::BREAK }, + { NULL, C, Token::ILLEGAL }, + { NULL, D, Token::ILLEGAL }, + { "else", KEYWORD_PREFIX, Token::ELSE }, + { NULL, F, Token::ILLEGAL }, + { NULL, UNMATCHABLE, Token::ILLEGAL }, + { NULL, UNMATCHABLE, Token::ILLEGAL }, + { NULL, I, Token::ILLEGAL }, + { NULL, UNMATCHABLE, Token::ILLEGAL }, + { NULL, UNMATCHABLE, Token::ILLEGAL }, + { NULL, UNMATCHABLE, Token::ILLEGAL }, + { NULL, UNMATCHABLE, Token::ILLEGAL }, + { NULL, N, Token::ILLEGAL }, + { NULL, UNMATCHABLE, Token::ILLEGAL }, + { NULL, UNMATCHABLE, Token::ILLEGAL }, + { NULL, UNMATCHABLE, Token::ILLEGAL }, + { "return", KEYWORD_PREFIX, Token::RETURN }, + { "switch", KEYWORD_PREFIX, Token::SWITCH }, + { NULL, T, Token::ILLEGAL }, + { NULL, UNMATCHABLE, Token::ILLEGAL }, + { NULL, V, Token::ILLEGAL }, + { NULL, W, Token::ILLEGAL } +}; + void KeywordMatcher::Step(uc32 input) { switch (state_) { @@ -222,56 +247,38 @@ void KeywordMatcher::Step(uc32 input) { token_ = Token::IDENTIFIER; break; case C: - if (MatchState(input, 'a', CA)) - return; - if (MatchState(input, 'o', CO)) - return; + if (MatchState(input, 'a', CA)) return; + if (MatchState(input, 'o', CO)) return; break; case CA: - if (MatchKeywordStart(input, "case", 2, Token::CASE)) - return; - if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) - return; + if (MatchKeywordStart(input, "case", 2, Token::CASE)) return; + if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return; break; case CO: - if (MatchState(input, 'n', CON)) - return; + if (MatchState(input, 'n', CON)) return; break; case CON: - if (MatchKeywordStart(input, "const", 3, Token::CONST)) - return; - if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) - return; + if (MatchKeywordStart(input, "const", 3, Token::CONST)) return; + if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return; break; case D: - if (MatchState(input, 'e', DE)) - return; - if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) - return; + if (MatchState(input, 'e', DE)) return; + if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return; break; case DE: - if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) - return; - if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) - return; - if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) - return; + if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return; + if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return; + if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return; break; case F: - if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) - return; - if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) - return; - if (MatchKeywordStart(input, "for", 1, Token::FOR)) - return; - if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) - return; + if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return; + if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return; + if (MatchKeywordStart(input, "for", 1, Token::FOR)) return; + if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return; break; case I: - if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) - return; - if (MatchKeyword(input, 'n', IN, Token::IN)) - return; + if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return; + if (MatchKeyword(input, 'n', IN, Token::IN)) return; break; case IN: token_ = Token::IDENTIFIER; @@ -280,44 +287,30 @@ void KeywordMatcher::Step(uc32 input) { } break; case N: - if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) - return; - if (MatchKeywordStart(input, "new", 1, Token::NEW)) - return; - if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) - return; + if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return; + if (MatchKeywordStart(input, "new", 1, Token::NEW)) return; + if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return; break; case T: - if (MatchState(input, 'h', TH)) - return; - if (MatchState(input, 'r', TR)) - return; - if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) - return; + if (MatchState(input, 'h', TH)) return; + if (MatchState(input, 'r', TR)) return; + if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return; break; case TH: - if (MatchKeywordStart(input, "this", 2, Token::THIS)) - return; - if (MatchKeywordStart(input, "throw", 2, Token::THROW)) - return; + if (MatchKeywordStart(input, "this", 2, Token::THIS)) return; + if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return; break; case TR: - if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) - return; - if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) - return; + if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return; + if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return; break; case V: - if (MatchKeywordStart(input, "var", 1, Token::VAR)) - return; - if (MatchKeywordStart(input, "void", 1, Token::VOID)) - return; + if (MatchKeywordStart(input, "var", 1, Token::VAR)) return; + if (MatchKeywordStart(input, "void", 1, Token::VOID)) return; break; case W: - if (MatchKeywordStart(input, "while", 1, Token::WHILE)) - return; - if (MatchKeywordStart(input, "with", 1, Token::WITH)) - return; + if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return; + if (MatchKeywordStart(input, "with", 1, Token::WITH)) return; break; default: UNREACHABLE(); @@ -326,20 +319,19 @@ void KeywordMatcher::Step(uc32 input) { state_ = UNMATCHABLE; } + // ---------------------------------------------------------------------------- // Scanner -Scanner::Scanner(bool pre) : - stack_overflow_(false), is_pre_parsing_(pre) { -} +Scanner::Scanner(bool pre) : stack_overflow_(false), is_pre_parsing_(pre) { } + void Scanner::Init(Handle source, unibrow::CharacterStream* stream, - int position) { + int position) { // Initialize the source buffer. if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) { two_byte_string_buffer_.Initialize( - Handle::cast( - source)); + Handle::cast(source)); source_ = &two_byte_string_buffer_; } else { char_stream_buffer_.Initialize(source, stream); @@ -359,10 +351,12 @@ void Scanner::Init(Handle source, unibrow::CharacterStream* stream, Scan(); } + Handle Scanner::SubString(int start, int end) { return source_->SubString(start - position_, end - position_); } + Token::Value Scanner::Next() { // BUG 1215673: Find a thread safe way to set a stack limit in // pre-parse mode. Otherwise, we cannot safely pre-parse from other @@ -380,30 +374,35 @@ Token::Value Scanner::Next() { return current_.token; } + void Scanner::StartLiteral() { // Use the first buffer unless it's currently in use by the current_ token. // In most cases we won't have two literals/identifiers in a row, so // the second buffer won't be used very often and is unlikely to grow much. UTF8Buffer* free_buffer = (current_.literal_buffer != &literal_buffer_1_) ? &literal_buffer_1_ - : &literal_buffer_2_; + : &literal_buffer_2_; next_.literal_buffer = free_buffer; free_buffer->Reset(); } + void Scanner::AddChar(uc32 c) { next_.literal_buffer->AddChar(c); } + void Scanner::TerminateLiteral() { AddChar(0); } + void Scanner::AddCharAdvance() { AddChar(c0_); Advance(); } + static inline bool IsByteOrderMark(uc32 c) { // The Unicode value U+FFFE is guaranteed never to be assigned as a // Unicode character; this implies that in a Unicode context the @@ -415,6 +414,7 @@ static inline bool IsByteOrderMark(uc32 c) { return c == 0xFEFF || c == 0xFFFE; } + bool Scanner::SkipWhiteSpace() { int start_position = source_pos(); @@ -445,15 +445,16 @@ bool Scanner::SkipWhiteSpace() { // Continue skipping white space after the comment. continue; } - PushBack('-'); // undo Advance() + PushBack('-'); // undo Advance() } - PushBack('-'); // undo Advance() + PushBack('-'); // undo Advance() } // Return whether or not we skipped any characters. return source_pos() != start_position; } } + Token::Value Scanner::SkipSingleLineComment() { Advance(); @@ -469,6 +470,7 @@ Token::Value Scanner::SkipSingleLineComment() { return Token::WHITESPACE; } + Token::Value Scanner::SkipMultiLineComment() { ASSERT(c0_ == '*'); Advance(); @@ -493,21 +495,22 @@ Token::Value Scanner::SkipMultiLineComment() { return Token::ILLEGAL; } + Token::Value Scanner::ScanHtmlComment() { // Check for