Changed keyword token recognition to be done inline in the identifier scanner.

Review URL: http://codereview.chromium.org/360048


git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@3221 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
lrn@chromium.org 2009-11-05 10:11:38 +00:00
parent 877db0f539
commit 273e860cc1
4 changed files with 384 additions and 15 deletions

View File

@ -193,6 +193,139 @@ void TwoByteStringUTF16Buffer::SeekForward(int pos) {
}
// ----------------------------------------------------------------------------
// Keyword Matcher
KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {
{ "break", KEYWORD_PREFIX, Token::BREAK },
{ NULL, C, Token::ILLEGAL },
{ NULL, D, Token::ILLEGAL },
{ "else", KEYWORD_PREFIX, Token::ELSE },
{ NULL, F, Token::ILLEGAL },
{ NULL, UNMATCHABLE, Token::ILLEGAL },
{ NULL, UNMATCHABLE, Token::ILLEGAL },
{ NULL, I, Token::ILLEGAL },
{ NULL, UNMATCHABLE, Token::ILLEGAL },
{ NULL, UNMATCHABLE, Token::ILLEGAL },
{ NULL, UNMATCHABLE, Token::ILLEGAL },
{ NULL, UNMATCHABLE, Token::ILLEGAL },
{ NULL, N, Token::ILLEGAL },
{ NULL, UNMATCHABLE, Token::ILLEGAL },
{ NULL, UNMATCHABLE, Token::ILLEGAL },
{ NULL, UNMATCHABLE, Token::ILLEGAL },
{ "return", KEYWORD_PREFIX, Token::RETURN },
{ "switch", KEYWORD_PREFIX, Token::SWITCH },
{ NULL, T, Token::ILLEGAL },
{ NULL, UNMATCHABLE, Token::ILLEGAL },
{ NULL, V, Token::ILLEGAL },
{ NULL, W, Token::ILLEGAL }
};
void KeywordMatcher::Step(uc32 input) {
switch (state_) {
case INITIAL: {
// matching the first character is the only state with significant fanout.
// Match only lower-case letters in range 'b'..'w'.
unsigned int offset = input - kFirstCharRangeMin;
if (offset < kFirstCharRangeLength) {
state_ = first_states_[offset].state;
if (state_ == KEYWORD_PREFIX) {
keyword_ = first_states_[offset].keyword;
counter_ = 1;
keyword_token_ = first_states_[offset].token;
}
return;
}
break;
}
case KEYWORD_PREFIX:
if (keyword_[counter_] == input) {
ASSERT_NE(input, '\0');
counter_++;
if (keyword_[counter_] == '\0') {
state_ = KEYWORD_MATCHED;
token_ = keyword_token_;
}
return;
}
break;
case KEYWORD_MATCHED:
token_ = Token::IDENTIFIER;
break;
case C:
if (MatchState(input, 'a', CA)) return;
if (MatchState(input, 'o', CO)) return;
break;
case CA:
if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;
if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;
break;
case CO:
if (MatchState(input, 'n', CON)) return;
break;
case CON:
if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;
if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;
break;
case D:
if (MatchState(input, 'e', DE)) return;
if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;
break;
case DE:
if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;
if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;
if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;
break;
case F:
if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;
if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;
if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;
if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;
break;
case I:
if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;
if (MatchKeyword(input, 'n', IN, Token::IN)) return;
break;
case IN:
token_ = Token::IDENTIFIER;
if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) {
return;
}
break;
case N:
if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return;
if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;
if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;
break;
case T:
if (MatchState(input, 'h', TH)) return;
if (MatchState(input, 'r', TR)) return;
if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;
break;
case TH:
if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;
if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;
break;
case TR:
if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;
if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;
break;
case V:
if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;
if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;
break;
case W:
if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;
if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
break;
default:
UNREACHABLE();
}
// On fallthrough, it's a failure.
state_ = UNMATCHABLE;
}
// ----------------------------------------------------------------------------
// Scanner
@ -855,48 +988,40 @@ uc32 Scanner::ScanIdentifierUnicodeEscape() {
Token::Value Scanner::ScanIdentifier() {
ASSERT(kIsIdentifierStart.get(c0_));
bool has_escapes = false;
StartLiteral();
KeywordMatcher keyword_match;
// Scan identifier start character.
if (c0_ == '\\') {
has_escapes = true;
uc32 c = ScanIdentifierUnicodeEscape();
// Only allow legal identifier start characters.
if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL;
AddChar(c);
keyword_match.Fail();
} else {
AddChar(c0_);
keyword_match.AddChar(c0_);
Advance();
}
// Scan the rest of the identifier characters.
while (kIsIdentifierPart.get(c0_)) {
if (c0_ == '\\') {
has_escapes = true;
uc32 c = ScanIdentifierUnicodeEscape();
// Only allow legal identifier part characters.
if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL;
AddChar(c);
keyword_match.Fail();
} else {
AddChar(c0_);
keyword_match.AddChar(c0_);
Advance();
}
}
TerminateLiteral();
// We don't have any 1-letter keywords (this is probably a common case).
if ((next_.literal_end - next_.literal_pos) == 1) {
return Token::IDENTIFIER;
}
// If the identifier contains unicode escapes, it must not be
// resolved to a keyword.
if (has_escapes) {
return Token::IDENTIFIER;
}
return Token::Lookup(&literals_.data()[next_.literal_pos]);
return keyword_match.token();
}

View File

@ -123,6 +123,121 @@ class TwoByteStringUTF16Buffer: public UTF16Buffer {
};
class KeywordMatcher {
// Incrementally recognize keywords.
//
// Recognized keywords:
// break case catch const* continue debugger* default delete do else
// finally false for function if in instanceof native* new null
// return switch this throw true try typeof var void while with
//
// *: Actually "future reserved keywords". These are the only ones we
// recognized, the remaining are allowed as identifiers.
public:
KeywordMatcher() : state_(INITIAL), token_(Token::IDENTIFIER) {}
Token::Value token() { return token_; }
inline void AddChar(uc32 input) {
if (state_ != UNMATCHABLE) {
Step(input);
}
}
void Fail() {
token_ = Token::IDENTIFIER;
state_ = UNMATCHABLE;
}
private:
enum State {
UNMATCHABLE,
INITIAL,
KEYWORD_PREFIX,
KEYWORD_MATCHED,
C,
CA,
CO,
CON,
D,
DE,
F,
I,
IN,
N,
T,
TH,
TR,
V,
W
};
struct FirstState {
const char* keyword;
State state;
Token::Value token;
};
// Range of possible first characters of a keyword.
static const unsigned int kFirstCharRangeMin = 'b';
static const unsigned int kFirstCharRangeMax = 'w';
static const unsigned int kFirstCharRangeLength =
kFirstCharRangeMax - kFirstCharRangeMin + 1;
// State map for first keyword character range.
static FirstState first_states_[kFirstCharRangeLength];
// Current state.
State state_;
// Token for currently added characters.
Token::Value token_;
// Matching a specific keyword string (there is only one possible valid
// keyword with the current prefix).
const char* keyword_;
int counter_;
Token::Value keyword_token_;
// If input equals keyword's character at position, continue matching keyword
// from that position.
inline bool MatchKeywordStart(uc32 input,
const char* keyword,
int position,
Token::Value token_if_match) {
if (input == keyword[position]) {
state_ = KEYWORD_PREFIX;
this->keyword_ = keyword;
this->counter_ = position + 1;
this->keyword_token_ = token_if_match;
return true;
}
return false;
}
// If input equals match character, transition to new state and return true.
inline bool MatchState(uc32 input, char match, State new_state) {
if (input == match) {
state_ = new_state;
return true;
}
return false;
}
inline bool MatchKeyword(uc32 input,
char match,
State new_state,
Token::Value keyword_token) {
if (input == match) { // Matched "do".
state_ = new_state;
token_ = keyword_token;
return true;
}
return false;
}
void Step(uc32 input);
};
class Scanner {
public:

View File

@ -52,6 +52,7 @@ SOURCES = {
'test-log.cc',
'test-log-utils.cc',
'test-mark-compact.cc',
'test-parsing.cc',
'test-regexp.cc',
'test-serialize.cc',
'test-sockets.cc',

128
test/cctest/test-parsing.cc Executable file
View File

@ -0,0 +1,128 @@
// Copyright 2006-2009 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdlib.h>
#include "v8.h"
#include "token.h"
#include "scanner.h"
#include "cctest.h"
namespace i = ::v8::internal;
TEST(KeywordMatcher) {
struct KeywordToken {
const char* keyword;
i::Token::Value token;
};
static const KeywordToken keywords[] = {
#define KEYWORD(t, s, d) { s, i::Token::t },
#define IGNORE(t, s, d) /* */
TOKEN_LIST(IGNORE, KEYWORD, IGNORE)
#undef KEYWORD
{ NULL, i::Token::IDENTIFIER }
};
static const char* future_keywords[] = {
#define FUTURE(t, s, d) s,
TOKEN_LIST(IGNORE, IGNORE, FUTURE)
#undef FUTURE
#undef IGNORE
NULL
};
KeywordToken key_token;
for (int i = 0; (key_token = keywords[i]).keyword != NULL; i++) {
i::KeywordMatcher matcher;
const char* keyword = key_token.keyword;
int length = strlen(keyword);
for (int j = 0; j < length; j++) {
if (key_token.token == i::Token::INSTANCEOF && j == 2) {
// "in" is a prefix of "instanceof". It's the only keyword
// that is a prefix of another.
CHECK_EQ(i::Token::IN, matcher.token());
} else {
CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
}
matcher.AddChar(keyword[j]);
}
CHECK_EQ(key_token.token, matcher.token());
// Adding more characters will make keyword matching fail.
matcher.AddChar('z');
CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
// Adding a keyword later will not make it match again.
matcher.AddChar('i');
matcher.AddChar('f');
CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
}
// Future keywords are not recognized.
const char* future_keyword;
for (int i = 0; (future_keyword = future_keywords[i]) != NULL; i++) {
i::KeywordMatcher matcher;
int length = strlen(future_keyword);
for (int j = 0; j < length; j++) {
matcher.AddChar(future_keyword[j]);
}
CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
}
// Zero isn't ignored at first.
i::KeywordMatcher bad_start;
bad_start.AddChar(0);
CHECK_EQ(i::Token::IDENTIFIER, bad_start.token());
bad_start.AddChar('i');
bad_start.AddChar('f');
CHECK_EQ(i::Token::IDENTIFIER, bad_start.token());
// Zero isn't ignored at end.
i::KeywordMatcher bad_end;
bad_end.AddChar('i');
bad_end.AddChar('f');
CHECK_EQ(i::Token::IF, bad_end.token());
bad_end.AddChar(0);
CHECK_EQ(i::Token::IDENTIFIER, bad_end.token());
// Case isn't ignored.
i::KeywordMatcher bad_case;
bad_case.AddChar('i');
bad_case.AddChar('F');
CHECK_EQ(i::Token::IDENTIFIER, bad_case.token());
// If we mark it as failure, continuing won't help.
i::KeywordMatcher full_stop;
full_stop.AddChar('i');
CHECK_EQ(i::Token::IDENTIFIER, full_stop.token());
full_stop.Fail();
CHECK_EQ(i::Token::IDENTIFIER, full_stop.token());
full_stop.AddChar('f');
CHECK_EQ(i::Token::IDENTIFIER, full_stop.token());
}