From 273e860cc1c953c8189577eeea8ec946cba71983 Mon Sep 17 00:00:00 2001
From: "lrn@chromium.org"
Date: Thu, 5 Nov 2009 10:11:38 +0000
Subject: [PATCH] Changed keyword token recognition to be done inline in the identifier scanner.

Review URL: http://codereview.chromium.org/360048

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@3221 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---
 src/scanner.cc              | 155 ++++++++++++++++++++++++++++++++----
 src/scanner.h               | 115 ++++++++++++++++++++++++++
 test/cctest/SConscript      |   1 +
 test/cctest/test-parsing.cc | 128 +++++++++++++++++++++++++++++
 4 files changed, 384 insertions(+), 15 deletions(-)
 create mode 100755 test/cctest/test-parsing.cc

diff --git a/src/scanner.cc b/src/scanner.cc
index 3dae414f9d..ec2b2c3fcf 100644
--- a/src/scanner.cc
+++ b/src/scanner.cc
@@ -193,6 +193,139 @@ void TwoByteStringUTF16Buffer::SeekForward(int pos) {
 }
 
 
+// ----------------------------------------------------------------------------
+// Keyword Matcher (first_states_ is indexed by first character minus 'b').
+KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {
+  { "break",  KEYWORD_PREFIX, Token::BREAK },    // b
+  { NULL,     C,              Token::ILLEGAL },  // c
+  { NULL,     D,              Token::ILLEGAL },  // d
+  { "else",   KEYWORD_PREFIX, Token::ELSE },     // e
+  { NULL,     F,              Token::ILLEGAL },  // f
+  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // g
+  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // h
+  { NULL,     I,              Token::ILLEGAL },  // i
+  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // j
+  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // k
+  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // l
+  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // m
+  { NULL,     N,              Token::ILLEGAL },  // n
+  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // o
+  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // p
+  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // q
+  { "return", KEYWORD_PREFIX, Token::RETURN },   // r
+  { "switch", KEYWORD_PREFIX, Token::SWITCH },   // s
+  { NULL,     T,              Token::ILLEGAL },  // t
+  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // u
+  { NULL,     V,              Token::ILLEGAL },  // v
+  { NULL,     W,              Token::ILLEGAL }   // w
+};
+
+
+void KeywordMatcher::Step(uc32 input) {
+  switch (state_) {
+    case INITIAL: {
+      // Matching the first character is the only state with significant fanout.
+      // Match only lower-case letters in range 'b'..'w'.
+      unsigned int offset = input - kFirstCharRangeMin;
+      if (offset < kFirstCharRangeLength) {
+        state_ = first_states_[offset].state;
+        if (state_ == KEYWORD_PREFIX) {
+          keyword_ = first_states_[offset].keyword;
+          counter_ = 1;
+          keyword_token_ = first_states_[offset].token;
+        }
+        return;
+      }
+      break;
+    }
+    case KEYWORD_PREFIX:
+      if (keyword_[counter_] == input) {
+        ASSERT_NE(input, '\0');
+        counter_++;
+        if (keyword_[counter_] == '\0') {
+          state_ = KEYWORD_MATCHED;
+          token_ = keyword_token_;
+        }
+        return;
+      }
+      break;
+    case KEYWORD_MATCHED:
+      token_ = Token::IDENTIFIER;  // Input continues past a complete keyword.
+      break;
+    case C:
+      if (MatchState(input, 'a', CA)) return;
+      if (MatchState(input, 'o', CO)) return;
+      break;
+    case CA:
+      if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;
+      if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;
+      break;
+    case CO:
+      if (MatchState(input, 'n', CON)) return;
+      break;
+    case CON:
+      if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;
+      if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;
+      break;
+    case D:
+      if (MatchState(input, 'e', DE)) return;
+      if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;
+      break;
+    case DE:
+      if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;
+      if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;
+      if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;
+      break;
+    case F:
+      if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;
+      if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;
+      if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;
+      if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;
+      break;
+    case I:
+      if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;
+      if (MatchKeyword(input, 'n', IN, Token::IN)) return;
+      break;
+    case IN:
+      token_ = Token::IDENTIFIER;  // "in" followed by more input is not "in".
+      if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) {
+        return;
+      }
+      break;
+    case N:
+      if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return;
+      if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;
+      if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;
+      break;
+    case T:
+      if (MatchState(input, 'h', TH)) return;
+      if (MatchState(input, 'r', TR)) return;
+      if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;
+      break;
+    case TH:
+      if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;
+      if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;
+      break;
+    case TR:
+      if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;
+      if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;
+      break;
+    case V:
+      if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;
+      if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;
+      break;
+    case W:
+      if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;
+      if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
+      break;
+    default:
+      UNREACHABLE();
+  }
+  // On fallthrough, it's a failure.
+  state_ = UNMATCHABLE;
+}
+
+
 // ----------------------------------------------------------------------------
 // Scanner
 
@@ -855,48 +988,40 @@ uc32 Scanner::ScanIdentifierUnicodeEscape() {
 
 Token::Value Scanner::ScanIdentifier() {
   ASSERT(kIsIdentifierStart.get(c0_));
-  bool has_escapes = false;
 
   StartLiteral();
+  KeywordMatcher keyword_match;
+
   // Scan identifier start character.
   if (c0_ == '\\') {
-    has_escapes = true;
     uc32 c = ScanIdentifierUnicodeEscape();
     // Only allow legal identifier start characters.
     if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL;
     AddChar(c);
+    keyword_match.Fail();  // Escaped identifiers are never keywords.
   } else {
     AddChar(c0_);
+    keyword_match.AddChar(c0_);
     Advance();
   }
 
   // Scan the rest of the identifier characters.
   while (kIsIdentifierPart.get(c0_)) {
     if (c0_ == '\\') {
-      has_escapes = true;
       uc32 c = ScanIdentifierUnicodeEscape();
       // Only allow legal identifier part characters.
       if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL;
       AddChar(c);
+      keyword_match.Fail();  // Escaped identifiers are never keywords.
     } else {
       AddChar(c0_);
+      keyword_match.AddChar(c0_);
       Advance();
     }
   }
   TerminateLiteral();
 
-  // We don't have any 1-letter keywords (this is probably a common case).
-  if ((next_.literal_end - next_.literal_pos) == 1) {
-    return Token::IDENTIFIER;
-  }
-
-  // If the identifier contains unicode escapes, it must not be
-  // resolved to a keyword.
-  if (has_escapes) {
-    return Token::IDENTIFIER;
-  }
-
-  return Token::Lookup(&literals_.data()[next_.literal_pos]);
+  return keyword_match.token();
 }
 
 
diff --git a/src/scanner.h b/src/scanner.h
index a201d0e976..201803da5e 100644
--- a/src/scanner.h
+++ b/src/scanner.h
@@ -123,6 +123,121 @@ class TwoByteStringUTF16Buffer: public UTF16Buffer {
 };
 
 
+class KeywordMatcher {
+// Incrementally recognize keywords, one character at a time.
+//
+// Recognized keywords:
+//   break case catch const* continue debugger* default delete do else
+//   finally false for function if in instanceof native* new null
+//   return switch this throw true try typeof var void while with
+//
+// *: Actually "future reserved keywords". These are the only ones we
+//    recognize; the remaining ones are allowed as identifiers.
+ public:
+  KeywordMatcher() : state_(INITIAL), token_(Token::IDENTIFIER) {}
+
+  Token::Value token() const { return token_; }
+
+  inline void AddChar(uc32 input) {
+    if (state_ != UNMATCHABLE) {
+      Step(input);
+    }
+  }
+
+  void Fail() {
+    token_ = Token::IDENTIFIER;
+    state_ = UNMATCHABLE;
+  }
+
+ private:
+  enum State {
+    UNMATCHABLE,
+    INITIAL,
+    KEYWORD_PREFIX,
+    KEYWORD_MATCHED,
+    C,
+    CA,
+    CO,
+    CON,
+    D,
+    DE,
+    F,
+    I,
+    IN,
+    N,
+    T,
+    TH,
+    TR,
+    V,
+    W
+  };
+
+  struct FirstState {
+    const char* keyword;
+    State state;
+    Token::Value token;
+  };
+
+  // Range of possible first characters of a keyword.
+  static const unsigned int kFirstCharRangeMin = 'b';
+  static const unsigned int kFirstCharRangeMax = 'w';
+  static const unsigned int kFirstCharRangeLength =
+      kFirstCharRangeMax - kFirstCharRangeMin + 1;
+  // State map for first keyword character range.
+  static FirstState first_states_[kFirstCharRangeLength];
+
+  // Current state.
+  State state_;
+  // Token for currently added characters.
+  Token::Value token_;
+
+  // Matching a specific keyword string (there is only one possible valid
+  // keyword with the current prefix). Only set in state KEYWORD_PREFIX.
+  const char* keyword_;
+  int counter_;
+  Token::Value keyword_token_;
+
+  // If input equals keyword's character at position, continue matching keyword
+  // from that position.
+  inline bool MatchKeywordStart(uc32 input,
+                                const char* keyword,
+                                int position,
+                                Token::Value token_if_match) {
+    if (input == keyword[position]) {
+      state_ = KEYWORD_PREFIX;
+      this->keyword_ = keyword;
+      this->counter_ = position + 1;
+      this->keyword_token_ = token_if_match;
+      return true;
+    }
+    return false;
+  }
+
+  // If input equals match character, transition to new state and return true.
+  inline bool MatchState(uc32 input, char match, State new_state) {
+    if (input == match) {
+      state_ = new_state;
+      return true;
+    }
+    return false;
+  }
+
+  inline bool MatchKeyword(uc32 input,
+                           char match,
+                           State new_state,
+                           Token::Value keyword_token) {
+    if (input == match) {  // Completes a keyword: "do", "if", "in", "try".
+      state_ = new_state;
+      token_ = keyword_token;
+      return true;
+    }
+    return false;
+  }
+
+  void Step(uc32 input);
+};
+
+
 class Scanner {
  public:
 
diff --git a/test/cctest/SConscript b/test/cctest/SConscript
index 9deefa5542..e6c81d80e4 100644
--- a/test/cctest/SConscript
+++ b/test/cctest/SConscript
@@ -52,6 +52,7 @@ SOURCES = {
     'test-log.cc',
     'test-log-utils.cc',
     'test-mark-compact.cc',
+    'test-parsing.cc',
     'test-regexp.cc',
     'test-serialize.cc',
     'test-sockets.cc',
diff --git a/test/cctest/test-parsing.cc b/test/cctest/test-parsing.cc
new file mode 100755
index 0000000000..eec7677d05
--- /dev/null
+++ b/test/cctest/test-parsing.cc
@@ -0,0 +1,128 @@
+// Copyright 2006-2009 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <string.h>  // strlen
+
+#include "v8.h"
+
+#include "token.h"
+#include "scanner.h"
+
+#include "cctest.h"
+
+namespace i = ::v8::internal;
+
+TEST(KeywordMatcher) {
+  struct KeywordToken {
+    const char* keyword;
+    i::Token::Value token;
+  };
+
+  static const KeywordToken keywords[] = {
+#define KEYWORD(t, s, d) { s, i::Token::t },
+#define IGNORE(t, s, d)  /* */
+      TOKEN_LIST(IGNORE, KEYWORD, IGNORE)
+#undef KEYWORD
+      { NULL, i::Token::IDENTIFIER }
+  };
+
+  static const char* future_keywords[] = {
+#define FUTURE(t, s, d) s,
+      TOKEN_LIST(IGNORE, IGNORE, FUTURE)
+#undef FUTURE
+#undef IGNORE
+      NULL
+  };
+
+  KeywordToken key_token;
+  for (int i = 0; (key_token = keywords[i]).keyword != NULL; i++) {
+    i::KeywordMatcher matcher;
+    const char* keyword = key_token.keyword;
+    int length = strlen(keyword);
+    for (int j = 0; j < length; j++) {
+      if (key_token.token == i::Token::INSTANCEOF && j == 2) {
+        // "in" is a prefix of "instanceof". It's the only keyword
+        // that is a prefix of another.
+        CHECK_EQ(i::Token::IN, matcher.token());
+      } else {
+        CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
+      }
+      matcher.AddChar(keyword[j]);
+    }
+    CHECK_EQ(key_token.token, matcher.token());
+    // Adding more characters will make keyword matching fail.
+    matcher.AddChar('z');
+    CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
+    // Adding a keyword later will not make it match again.
+    matcher.AddChar('i');
+    matcher.AddChar('f');
+    CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
+  }
+
+  // Future keywords are not recognized.
+  const char* future_keyword;
+  for (int i = 0; (future_keyword = future_keywords[i]) != NULL; i++) {
+    i::KeywordMatcher matcher;
+    int length = strlen(future_keyword);
+    for (int j = 0; j < length; j++) {
+      matcher.AddChar(future_keyword[j]);
+    }
+    CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
+  }
+
+  // A leading zero character is not ignored; it makes matching fail.
+  i::KeywordMatcher bad_start;
+  bad_start.AddChar(0);
+  CHECK_EQ(i::Token::IDENTIFIER, bad_start.token());
+  bad_start.AddChar('i');
+  bad_start.AddChar('f');
+  CHECK_EQ(i::Token::IDENTIFIER, bad_start.token());
+
+  // A trailing zero character is not ignored; it un-matches the keyword.
+  i::KeywordMatcher bad_end;
+  bad_end.AddChar('i');
+  bad_end.AddChar('f');
+  CHECK_EQ(i::Token::IF, bad_end.token());
+  bad_end.AddChar(0);
+  CHECK_EQ(i::Token::IDENTIFIER, bad_end.token());
+
+  // Case isn't ignored.
+  i::KeywordMatcher bad_case;
+  bad_case.AddChar('i');
+  bad_case.AddChar('F');
+  CHECK_EQ(i::Token::IDENTIFIER, bad_case.token());
+
+  // If we mark it as failure, continuing won't help.
+  i::KeywordMatcher full_stop;
+  full_stop.AddChar('i');
+  CHECK_EQ(i::Token::IDENTIFIER, full_stop.token());
+  full_stop.Fail();
+  CHECK_EQ(i::Token::IDENTIFIER, full_stop.token());
+  full_stop.AddChar('f');
+  CHECK_EQ(i::Token::IDENTIFIER, full_stop.token());
+}
+