[parser] Perfect hash for keywords

Use gperf to generate a perfect hash table for keyword lookup. Add a
Python script that post-processes gperf's output, applying additional
cleanup and optimisations.

Change-Id: I3656a7287dbd0688917893de3a671faef9e4578a
Reviewed-on: https://chromium-review.googlesource.com/c/1349240
Commit-Queue: Leszek Swirski <leszeks@chromium.org>
Reviewed-by: Toon Verwaest <verwaest@chromium.org>
Reviewed-by: Marja Hölttä <marja@chromium.org>
Cr-Commit-Position: refs/heads/master@{#57790}
This commit is contained in:
Leszek Swirski 2018-11-23 15:21:10 +01:00 committed by Commit Bot
parent 1e85444372
commit ca086a497c
4 changed files with 518 additions and 38 deletions

223
src/parsing/keywords-gen.h Normal file
View File

@ -0,0 +1,223 @@
// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file is automatically generated by gen-keywords-gen-h.py and should not
// be modified manually.
#ifndef V8_PARSING_KEYWORDS_GEN_H_
#define V8_PARSING_KEYWORDS_GEN_H_
#include "src/parsing/token.h"
namespace v8 {
namespace internal {
/* C++ code produced by gperf version 3.1 */
/* Command-line: gperf -m100 src/parsing/keywords.txt */
/* Computed positions: -k'1-2' */
#if !( \
(' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('%' == 37) && \
('&' == 38) && ('\'' == 39) && ('(' == 40) && (')' == 41) && \
('*' == 42) && ('+' == 43) && (',' == 44) && ('-' == 45) && ('.' == 46) && \
('/' == 47) && ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) && \
('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) && \
('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) && ('=' == 61) && \
('>' == 62) && ('?' == 63) && ('A' == 65) && ('B' == 66) && ('C' == 67) && \
('D' == 68) && ('E' == 69) && ('F' == 70) && ('G' == 71) && ('H' == 72) && \
('I' == 73) && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) && \
('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) && ('R' == 82) && \
('S' == 83) && ('T' == 84) && ('U' == 85) && ('V' == 86) && ('W' == 87) && \
('X' == 88) && ('Y' == 89) && ('Z' == 90) && ('[' == 91) && \
('\\' == 92) && (']' == 93) && ('^' == 94) && ('_' == 95) && \
('a' == 97) && ('b' == 98) && ('c' == 99) && ('d' == 100) && \
('e' == 101) && ('f' == 102) && ('g' == 103) && ('h' == 104) && \
('i' == 105) && ('j' == 106) && ('k' == 107) && ('l' == 108) && \
('m' == 109) && ('n' == 110) && ('o' == 111) && ('p' == 112) && \
('q' == 113) && ('r' == 114) && ('s' == 115) && ('t' == 116) && \
('u' == 117) && ('v' == 118) && ('w' == 119) && ('x' == 120) && \
('y' == 121) && ('z' == 122) && ('{' == 123) && ('|' == 124) && \
('}' == 125) && ('~' == 126))
/* The character set is not based on ISO-646. */
#error "gperf generated tables don't work with this execution character set."
// If you see this error, please report a bug to <bug-gperf@gnu.org>.
#endif
#line 16 "src/parsing/keywords.txt"
// One slot of kPerfectKeywordHashTable: a keyword spelling paired with the
// token it scans as.
struct PerfectKeywordHashTableEntry {
  // NUL-terminated keyword text; the empty string marks a slot with no keyword.
  const char* name;
  // Token for |name|; slots with no keyword carry Token::IDENTIFIER.
  Token::Value value;
};
// Summary constants emitted by gperf for the keyword set in keywords.txt.
enum {
  // Number of keywords stored in kPerfectKeywordHashTable.
  TOTAL_KEYWORDS = 47,
  // Length of the shortest keyword (e.g. "do", "if", "in").
  MIN_WORD_LENGTH = 2,
  // Length of the longest keyword (e.g. "instanceof", "implements").
  MAX_WORD_LENGTH = 10,
  // Smallest hash value (table index) occupied by a keyword.
  MIN_HASH_VALUE = 2,
  // Largest hash value (table index) occupied by a keyword.
  MAX_HASH_VALUE = 51
};
/* maximum key range = 50, duplicates = 0 */
// Static-only lookup class mapping a candidate identifier to its keyword
// token via the gperf-generated perfect hash.
class PerfectKeywordHash {
 private:
  // Hashes |str| from its length and its first two characters.
  static inline unsigned int Hash(const char* str, int len);

 public:
  // Returns the keyword token spelled by |str| (of length |len|), or
  // Token::IDENTIFIER when |str| is not a keyword.
  static inline Token::Value GetToken(const char* str, int len);
};
// Computes the gperf hash of a candidate keyword: its length plus the
// association values of its first two characters.
//
// |str| must point to at least two readable characters; GetToken ensures this
// by rejecting lengths below MIN_WORD_LENGTH before hashing.
inline unsigned int PerfectKeywordHash::Hash(const char* str, int len) {
  // One entry per 7-bit code point (the table is generated with %7bit).
  // Characters that never occur in the first two positions of a keyword use
  // the filler value 52, i.e. MAX_HASH_VALUE + 1.
  static const unsigned char asso_values[128] = {
      52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
      52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
      52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
      52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
      52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
      52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
      52, 8,  2,  6,  0,  0,  9,  52, 21, 0,  52, 52, 36, 40, 0,  3,
      6,  52, 17, 13, 16, 16, 38, 25, 6,  26, 52, 52, 52, 52, 52, 52};
  // The input is raw one-byte character data, so a byte >= 0x80 would index
  // past the end of the 128-entry table. Masking to 7 bits keeps every read in
  // bounds and leaves the hash of pure-ASCII input unchanged. A non-ASCII byte
  // may alias an ASCII slot, but GetToken's character-by-character comparison
  // against the stored keyword still rejects it.
  return len + asso_values[static_cast<unsigned char>(str[1]) & 0x7f] +
         asso_values[static_cast<unsigned char>(str[0]) & 0x7f];
}
// Length of the keyword stored at the same index of kPerfectKeywordHashTable,
// or 0 for a slot that holds no keyword. GetToken compares the candidate's
// length against this table before comparing characters. The table has 64
// entries so that the hash can be masked with 0x3f instead of range-checked.
static const unsigned char kPerfectKeywordLengthTable[64] = {
    0, 0, 2, 3, 4, 2, 6, 7, 8, 9, 10, 2, 6, 7, 5, 3, 7, 8, 4, 5, 4, 7,
    5, 6, 5, 0, 5, 0, 6, 4, 7, 5, 9, 8, 5, 6, 3, 4, 5, 3, 4, 4, 5, 0,
    6, 4, 6, 5, 6, 3, 10, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// Keyword table indexed by the masked hash computed in
// PerfectKeywordHash::GetToken. Slots that hold no keyword contain
// {"", Token::IDENTIFIER}. Entry order is significant: each keyword sits at
// the index equal to its hash value. The #line directives point back at the
// keywords.txt line each entry was generated from.
static const struct PerfectKeywordHashTableEntry kPerfectKeywordHashTable[64] =
    {{"", Token::IDENTIFIER},
     {"", Token::IDENTIFIER},
#line 41 "src/parsing/keywords.txt"
     {"in", Token::IN},
#line 45 "src/parsing/keywords.txt"
     {"new", Token::NEW},
#line 31 "src/parsing/keywords.txt"
     {"enum", Token::ENUM},
#line 29 "src/parsing/keywords.txt"
     {"do", Token::DO},
#line 28 "src/parsing/keywords.txt"
     {"delete", Token::DELETE},
#line 27 "src/parsing/keywords.txt"
     {"default", Token::DEFAULT},
#line 26 "src/parsing/keywords.txt"
     {"debugger", Token::DEBUGGER},
#line 43 "src/parsing/keywords.txt"
     {"interface", Token::FUTURE_STRICT_RESERVED_WORD},
#line 42 "src/parsing/keywords.txt"
     {"instanceof", Token::INSTANCEOF},
#line 38 "src/parsing/keywords.txt"
     {"if", Token::IF},
#line 32 "src/parsing/keywords.txt"
     {"export", Token::EXPORT},
#line 33 "src/parsing/keywords.txt"
     {"extends", Token::EXTENDS},
#line 24 "src/parsing/keywords.txt"
     {"const", Token::CONST},
#line 36 "src/parsing/keywords.txt"
     {"for", Token::FOR},
#line 35 "src/parsing/keywords.txt"
     {"finally", Token::FINALLY},
#line 25 "src/parsing/keywords.txt"
     {"continue", Token::CONTINUE},
#line 21 "src/parsing/keywords.txt"
     {"case", Token::CASE},
#line 22 "src/parsing/keywords.txt"
     {"catch", Token::CATCH},
#line 46 "src/parsing/keywords.txt"
     {"null", Token::NULL_LITERAL},
#line 47 "src/parsing/keywords.txt"
     {"package", Token::FUTURE_STRICT_RESERVED_WORD},
#line 34 "src/parsing/keywords.txt"
     {"false", Token::FALSE_LITERAL},
#line 51 "src/parsing/keywords.txt"
     {"return", Token::RETURN},
#line 20 "src/parsing/keywords.txt"
     {"break", Token::BREAK},
     {"", Token::IDENTIFIER},
#line 18 "src/parsing/keywords.txt"
     {"async", Token::ASYNC},
     {"", Token::IDENTIFIER},
#line 50 "src/parsing/keywords.txt"
     {"public", Token::FUTURE_STRICT_RESERVED_WORD},
#line 63 "src/parsing/keywords.txt"
     {"with", Token::WITH},
#line 48 "src/parsing/keywords.txt"
     {"private", Token::FUTURE_STRICT_RESERVED_WORD},
#line 64 "src/parsing/keywords.txt"
     {"yield", Token::YIELD},
#line 49 "src/parsing/keywords.txt"
     {"protected", Token::FUTURE_STRICT_RESERVED_WORD},
#line 37 "src/parsing/keywords.txt"
     {"function", Token::FUNCTION},
#line 53 "src/parsing/keywords.txt"
     {"super", Token::SUPER},
#line 52 "src/parsing/keywords.txt"
     {"static", Token::STATIC},
#line 58 "src/parsing/keywords.txt"
     {"try", Token::TRY},
#line 57 "src/parsing/keywords.txt"
     {"true", Token::TRUE_LITERAL},
#line 19 "src/parsing/keywords.txt"
     {"await", Token::AWAIT},
#line 44 "src/parsing/keywords.txt"
     {"let", Token::LET},
#line 30 "src/parsing/keywords.txt"
     {"else", Token::ELSE},
#line 55 "src/parsing/keywords.txt"
     {"this", Token::THIS},
#line 56 "src/parsing/keywords.txt"
     {"throw", Token::THROW},
     {"", Token::IDENTIFIER},
#line 54 "src/parsing/keywords.txt"
     {"switch", Token::SWITCH},
#line 61 "src/parsing/keywords.txt"
     {"void", Token::VOID},
#line 40 "src/parsing/keywords.txt"
     {"import", Token::IMPORT},
#line 23 "src/parsing/keywords.txt"
     {"class", Token::CLASS},
#line 59 "src/parsing/keywords.txt"
     {"typeof", Token::TYPEOF},
#line 60 "src/parsing/keywords.txt"
     {"var", Token::VAR},
#line 39 "src/parsing/keywords.txt"
     {"implements", Token::FUTURE_STRICT_RESERVED_WORD},
#line 62 "src/parsing/keywords.txt"
     {"while", Token::WHILE},
     {"", Token::IDENTIFIER},
     {"", Token::IDENTIFIER},
     {"", Token::IDENTIFIER},
     {"", Token::IDENTIFIER},
     {"", Token::IDENTIFIER},
     {"", Token::IDENTIFIER},
     {"", Token::IDENTIFIER},
     {"", Token::IDENTIFIER},
     {"", Token::IDENTIFIER},
     {"", Token::IDENTIFIER},
     {"", Token::IDENTIFIER},
     {"", Token::IDENTIFIER}};
// Returns the token for the keyword spelled by the |len| characters at |str|,
// or Token::IDENTIFIER when they do not spell a keyword.
inline Token::Value PerfectKeywordHash::GetToken(const char* str, int len) {
  // No keyword is shorter than MIN_WORD_LENGTH or longer than MAX_WORD_LENGTH.
  if (!IsInRange(len, MIN_WORD_LENGTH, MAX_WORD_LENGTH)) {
    return Token::IDENTIFIER;
  }

  // Both tables hold 64 entries, so masking the hash stands in for a range
  // check against MAX_HASH_VALUE.
  const unsigned int slot = Hash(str, len) & 0x3f;
  DCHECK_LT(slot, arraysize(kPerfectKeywordLengthTable));
  DCHECK_LT(slot, arraysize(kPerfectKeywordHashTable));

  // Slots without a keyword have length 0 and therefore never match here.
  if (len != kPerfectKeywordLengthTable[slot]) {
    return Token::IDENTIFIER;
  }

  // The lengths agree, so walking the NUL-terminated keyword reads exactly
  // |len| characters of |str|.
  for (const char* expected = kPerfectKeywordHashTable[slot].name;
       *expected != 0; ++expected, ++str) {
    if (*expected != *str) return Token::IDENTIFIER;
  }
  return kPerfectKeywordHashTable[slot].value;
}
} // namespace internal
} // namespace v8
#endif // V8_PARSING_KEYWORDS_GEN_H_

64
src/parsing/keywords.txt Normal file
View File

@ -0,0 +1,64 @@
%struct-type
%language=C++
%global-table
%define initializer-suffix ,Token::IDENTIFIER
%define hash-function-name Hash
%define lookup-function-name GetToken
%define class-name PerfectKeywordHash
%define word-array-name kPerfectKeywordHashTable
%define length-table-name kPerfectKeywordLengthTable
%7bit
%compare-lengths
%enum
%readonly-tables
%compare-strncmp
struct PerfectKeywordHashTableEntry { const char* name; Token::Value value; };
%%
async, Token::ASYNC
await, Token::AWAIT
break, Token::BREAK
case, Token::CASE
catch, Token::CATCH
class, Token::CLASS
const, Token::CONST
continue, Token::CONTINUE
debugger, Token::DEBUGGER
default, Token::DEFAULT
delete, Token::DELETE
do, Token::DO
else, Token::ELSE
enum, Token::ENUM
export, Token::EXPORT
extends, Token::EXTENDS
false, Token::FALSE_LITERAL
finally, Token::FINALLY
for, Token::FOR
function, Token::FUNCTION
if, Token::IF
implements, Token::FUTURE_STRICT_RESERVED_WORD
import, Token::IMPORT
in, Token::IN
instanceof, Token::INSTANCEOF
interface, Token::FUTURE_STRICT_RESERVED_WORD
let, Token::LET
new, Token::NEW
null, Token::NULL_LITERAL
package, Token::FUTURE_STRICT_RESERVED_WORD
private, Token::FUTURE_STRICT_RESERVED_WORD
protected, Token::FUTURE_STRICT_RESERVED_WORD
public, Token::FUTURE_STRICT_RESERVED_WORD
return, Token::RETURN
static, Token::STATIC
super, Token::SUPER
switch, Token::SWITCH
this, Token::THIS
throw, Token::THROW
true, Token::TRUE_LITERAL
try, Token::TRY
typeof, Token::TYPEOF
var, Token::VAR
void, Token::VOID
while, Token::WHILE
with, Token::WITH
yield, Token::YIELD

View File

@ -6,6 +6,7 @@
#define V8_PARSING_SCANNER_INL_H_ #define V8_PARSING_SCANNER_INL_H_
#include "src/char-predicates-inl.h" #include "src/char-predicates-inl.h"
#include "src/parsing/keywords-gen.h"
#include "src/parsing/scanner.h" #include "src/parsing/scanner.h"
namespace v8 { namespace v8 {
@ -90,44 +91,8 @@ constexpr bool IsKeywordStart(char c) {
V8_INLINE Token::Value KeywordOrIdentifierToken(const uint8_t* input, V8_INLINE Token::Value KeywordOrIdentifierToken(const uint8_t* input,
int input_length) { int input_length) {
DCHECK_GE(input_length, 1); DCHECK_GE(input_length, 1);
const int kMinLength = 2; return PerfectKeywordHash::GetToken(reinterpret_cast<const char*>(input),
const int kMaxLength = 10; input_length);
if (!IsInRange(input_length, kMinLength, kMaxLength)) {
return Token::IDENTIFIER;
}
switch (input[0]) {
default:
#define KEYWORD_GROUP_CASE(ch) \
break; \
case ch:
#define KEYWORD(keyword, token) \
{ \
/* 'keyword' is a char array, so sizeof(keyword) is */ \
/* strlen(keyword) plus 1 for the NUL char. */ \
const int keyword_length = sizeof(keyword) - 1; \
STATIC_ASSERT(keyword_length >= kMinLength); \
STATIC_ASSERT(keyword_length <= kMaxLength); \
DCHECK_EQ(input[0], keyword[0]); \
DCHECK(token == Token::FUTURE_STRICT_RESERVED_WORD || \
0 == strncmp(keyword, Token::String(token), sizeof(keyword))); \
if (input_length == keyword_length && input[1] == keyword[1] && \
(keyword_length <= 2 || input[2] == keyword[2]) && \
(keyword_length <= 3 || input[3] == keyword[3]) && \
(keyword_length <= 4 || input[4] == keyword[4]) && \
(keyword_length <= 5 || input[5] == keyword[5]) && \
(keyword_length <= 6 || input[6] == keyword[6]) && \
(keyword_length <= 7 || input[7] == keyword[7]) && \
(keyword_length <= 8 || input[8] == keyword[8]) && \
(keyword_length <= 9 || input[9] == keyword[9]) && \
(keyword_length <= 10 || input[10] == keyword[10])) { \
return token; \
} \
}
KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
}
return Token::IDENTIFIER;
#undef KEYWORD
#undef KEYWORD_GROUP_CASE
} }
// Recursive constexpr template magic to check if a character is in a given // Recursive constexpr template magic to check if a character is in a given

228
tools/gen-keywords-gen-h.py Executable file
View File

@ -0,0 +1,228 @@
#!/usr/bin/env python
# Copyright 2018 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import os
import sys
import subprocess
import re
import math
import datetime
INPUT_PATH = "src/parsing/keywords.txt"
OUTPUT_PATH = "src/parsing/keywords-gen.h"
def next_power_of_2(x):
  """Returns the smallest power of two that is greater than or equal to x.

  Uses exact integer arithmetic. The previous floating-point formulation,
  2**ceil(log(x, 2)), can overshoot by a factor of two when the logarithm of
  an exact power of two is rounded just above an integer.

  Args:
    x: A non-negative integer.

  Returns:
    1 if x is 0 or 1, otherwise the smallest 2**k with 2**k >= x.

  Raises:
    ValueError: If x is negative.
  """
  if x < 0:
    raise ValueError("next_power_of_2 requires a non-negative integer")
  if x <= 1:
    return 1
  # (x - 1).bit_length() is the number of bits needed to represent x - 1,
  # which is exactly the exponent of the next power of two at or above x.
  return 1 << (x - 1).bit_length()
def call_with_input(cmd, input_string=""):
  """Runs cmd, feeding input_string to its stdin, and returns its stdout.

  Args:
    cmd: Argument list passed to subprocess.Popen.
    input_string: Data written to the child's standard input.

  Returns:
    Everything the child wrote to standard output.

  Raises:
    subprocess.CalledProcessError: If the child exits with a non-zero status.
  """
  process = subprocess.Popen(
      cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
  captured_stdout = process.communicate(input_string)[0]
  exit_status = process.wait()
  if exit_status == 0:
    return captured_stdout
  raise subprocess.CalledProcessError(exit_status, cmd)
def checked_sub(pattern, sub, out, count=1, flags=0):
  """Replaces every match of pattern in out, insisting on an exact match count.

  Args:
    pattern: Regular expression to search for.
    sub: Replacement (as accepted by re.subn).
    out: Text to transform.
    count: Number of replacements that must occur.
    flags: Regular expression flags forwarded to re.subn.

  Returns:
    The transformed text.

  Raises:
    Exception: If the number of replacements made differs from count.
  """
  result, num_replacements = re.subn(pattern, sub, out, flags=flags)
  if num_replacements == count:
    return result
  raise Exception("Didn't get exactly %d replacement(s) for pattern: %s" %
                  (count, pattern))
def change_sizet_to_int(out):
  """Rewrites the size_t occurrences in the gperf output to int.

  Exactly four occurrences are expected; checked_sub raises otherwise.
  """
  # Literal buffer lengths are given as ints, not size_t
  size_t_pattern = r'\bsize_t\b'
  return checked_sub(size_t_pattern, 'int', out, count=4)
def trim_char_table(out):
  """Shrinks asso_values to its last 32 entries and masks each lookup to match.

  Potential keyword strings are known to be lowercase ascii, so the first 96
  table entries are chopped off and every character read is masked with 31.

  The number of asso_values reads depends on how many key positions gperf
  selected, so the expected replacement count is derived from the input rather
  than assumed to be one (a fixed count of one fails as soon as the hash reads
  two characters).

  Raises:
    Exception: If the table definition is not found exactly once, or if no
      asso_values read is found.
  """
  read_pattern = r'asso_values\[static_cast<unsigned char>\(str\[(\d+)\]\)\]'
  out = checked_sub(
      r'static const unsigned char asso_values\[\]\s*=\s*\{(\s*\d+\s*,){96}',
      r'static const unsigned char asso_values[] = {',
      out,
      flags=re.MULTILINE)
  num_reads = len(re.findall(read_pattern, out, flags=re.MULTILINE))
  out = checked_sub(
      read_pattern,
      r'asso_values[static_cast<unsigned char>(str[(\1)]&31)]',
      out,
      # Require at least one read so a format change in gperf's output is
      # still reported instead of silently skipped.
      count=max(num_reads, 1),
      flags=re.MULTILINE)
  return out
def use_isinrange(out):
  """Swaps gperf's min/max length comparison for a single IsInRange call.

  Exactly one occurrence is expected; checked_sub raises otherwise.
  """
  # Our IsInRange method is more efficient than checking for min/max length
  length_check = r'if \(len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH\)'
  range_check = r'if (IsInRange(len, MIN_WORD_LENGTH, MAX_WORD_LENGTH))'
  return checked_sub(length_check, range_check, out)
def pad_tables(out):
  """Pads both generated tables to a power-of-two size and masks the hash.

  Reads MAX_HASH_VALUE from the gperf output, rounds the table length
  (MAX_HASH_VALUE + 1) up to the next power of two, appends filler entries to
  kPerfectKeywordLengthTable and kPerfectKeywordHashTable, masks the Hash()
  result with (new length - 1), and replaces the `key <= MAX_HASH_VALUE` check
  with DCHECKs against the table sizes.

  Raises:
    AttributeError: If MAX_HASH_VALUE is not present in out (re.search
      returns None).
    Exception: If any of the checked substitutions does not match exactly
      once.
  """
  # We don't want to compare against the max hash value, so pad the tables up
  # to a power of two and mask the hash.
  # First get the new size
  max_hash_value = int(re.search(r'MAX_HASH_VALUE\s*=\s*(\d+)', out).group(1))
  old_table_length = max_hash_value + 1
  new_table_length = next_power_of_2(old_table_length)
  table_padding_len = new_table_length - old_table_length
  # Pad the length table.
  # The pattern is compiled with re.VERBOSE, so whitespace inside it is
  # ignored and literal spaces are written as "\ ".
  single_lengthtable_entry = r'\d+'
  out = checked_sub(
      r"""
static\ const\ unsigned\ char\ kPerfectKeywordLengthTable\[\]\s*=\s*\{
(
\s*%(single_lengthtable_entry)s\s*
(?:,\s*%(single_lengthtable_entry)s\s*)*
)
\}
""" % {'single_lengthtable_entry': single_lengthtable_entry},
      r'static const unsigned char kPerfectKeywordLengthTable[%d] = { \1 %s }'
      % (new_table_length, "".join([',0'] * table_padding_len)),
      out,
      flags=re.MULTILINE | re.VERBOSE)
  # Pad the word list.
  # Each entry may be preceded by a "#line" directive; "\#" keeps the hash
  # sign from starting a verbose-mode regex comment.
  single_wordlist_entry = r"""
(?:\#line\ \d+\ ".*"$\s*)?
\{\s*"[a-z]*"\s*,\s*Token::[A-Z_]+\}
"""
  out = checked_sub(
      r"""
static\ const\ struct\ PerfectKeywordHashTableEntry\ kPerfectKeywordHashTable\[\]\s*=\s*\{
(
\s*%(single_wordlist_entry)s\s*
(?:,\s*%(single_wordlist_entry)s\s*)*
)
\}
""" % {'single_wordlist_entry': single_wordlist_entry},
      r'static const struct PerfectKeywordHashTableEntry kPerfectKeywordHashTable[%d] = {\1 %s }'
      % (new_table_length, "".join(
          [',{"",Token::IDENTIFIER}'] * table_padding_len)),
      out,
      flags=re.MULTILINE | re.VERBOSE)
  # Mask the hash and replace the range check with DCHECKs.
  out = checked_sub(r'Hash\s*\(\s*str,\s*len\s*\)',
                    r'Hash(str, len)&0x%x' % (new_table_length - 1), out)
  out = checked_sub(
      r'if \(key <= MAX_HASH_VALUE\)',
      r'DCHECK_LT(key, arraysize(kPerfectKeywordLengthTable));DCHECK_LT(key, arraysize(kPerfectKeywordHashTable));',
      out)
  return out
def return_token(out):
  """Makes GetToken return a Token::Value instead of a table entry pointer.

  Three rewrites are applied in order: the return type (on both the
  declaration and the definition, which also become inline), the value
  returned on a hit, and the value returned on a miss. Each is verified by
  checked_sub.
  """
  # We want to return the actual token rather than the table entry.
  # Change the return type of the function. Make it inline too.
  entry_pointer_signature = (
      r'const\s*struct\s*PerfectKeywordHashTableEntry\s*\*\s*((?:PerfectKeywordHash::)?GetToken)')
  out = checked_sub(
      entry_pointer_signature, r'inline Token::Value \1', out, count=2)

  # Change the return value when the keyword is found
  found_return = r'return &kPerfectKeywordHashTable\[key\];'
  out = checked_sub(
      found_return, r'return kPerfectKeywordHashTable[key].value;', out)

  # Change the return value when the keyword is not found
  not_found_return = r'return 0;'
  return checked_sub(not_found_return, r'return Token::IDENTIFIER;', out)
def memcmp_to_while(out):
  """Replaces gperf's memcmp-based keyword comparison with a while loop.

  The match is anchored on the comparison statement followed by the token
  return, so it is expected to run after the return value has already been
  rewritten to `kPerfectKeywordHashTable[key].value`.
  """
  # It's faster to loop over the keyword with a while loop than calling memcmp.
  # Careful, this replacement is quite flaky, because otherwise the regex is
  # unreadable.
  memcmp_check = re.escape(
      "if (*str == *s && !memcmp (str + 1, s + 1, len - 1))")
  token_return = re.escape("return kPerfectKeywordHashTable[key].value;")
  loop_replacement = """
while(*s!=0) {
if (*s++ != *str++) return Token::IDENTIFIER;
}
return kPerfectKeywordHashTable[key].value;
"""
  return checked_sub(
      memcmp_check + r"\s*" + token_return,
      loop_replacement,
      out,
      flags=re.MULTILINE)
def wrap_namespace(out):
  """Wraps the generated code in the header boilerplate.

  The boilerplate consists of the license banner, the "automatically
  generated" notice, the include guard, the token.h include, and the
  v8::internal namespace.
  """
  header_template = """// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file is automatically generated by gen-keywords-gen-h.py and should not
// be modified manually.
#ifndef V8_PARSING_KEYWORDS_GEN_H_
#define V8_PARSING_KEYWORDS_GEN_H_
#include "src/parsing/token.h"
namespace v8 {
namespace internal {
%s
} // namespace internal
} // namespace v8
#endif // V8_PARSING_KEYWORDS_GEN_H_
"""
  return header_template % out
def trim_character_set_warning(out):
  """Shortens gperf's #error text, moving the bug-report hint into a comment.

  Text that does not contain the long message is returned unchanged.
  """
  # gperf generates an error message that is too large, trim it
  verbose_message = '"gperf generated tables don\'t work with this execution character set. Please report a bug to <bug-gperf@gnu.org>."'
  trimmed_message = '"gperf generated tables don\'t work with this execution character set."\\\n// If you see this error, please report a bug to <bug-gperf@gnu.org>.'
  return out.replace(verbose_message, trimmed_message)
def main():
  """Regenerates OUTPUT_PATH from INPUT_PATH.

  Runs gperf from the repository root (the parent of this script's
  directory), applies the munging passes in order, formats the result with
  clang-format and writes it to OUTPUT_PATH.

  Returns:
    0 on success, or the exit status of the failing subprocess.
  """
  try:
    root_dir = os.path.join(os.path.dirname(sys.argv[0]), '..')
    out = subprocess.check_output(["gperf", "-m100", INPUT_PATH], cwd=root_dir)

    # And now some munging of the generated file. Order matters: later passes
    # match text produced by earlier ones.
    munging_passes = (
        change_sizet_to_int,
        # TODO(leszeks): This seems to regress performance, investigate.
        # trim_char_table,
        use_isinrange,
        pad_tables,
        return_token,
        memcmp_to_while,
        wrap_namespace,
        trim_character_set_warning,
    )
    for munge in munging_passes:
      out = munge(out)

    # Final formatting.
    clang_format_path = os.path.join(root_dir,
                                     'third_party/depot_tools/clang-format')
    formatted = call_with_input([clang_format_path], out)

    with open(os.path.join(root_dir, OUTPUT_PATH), 'w') as output_file:
      output_file.write(formatted)
    return 0
  except subprocess.CalledProcessError as e:
    sys.stderr.write("Error calling '{}'\n".format(" ".join(e.cmd)))
    return e.returncode
# Run the generator when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == '__main__':
  sys.exit(main())