v8/tools/gen-keywords-gen-h.py
Leszek Swirski 47daa48696 Reland "[parser] Perfect hash for keywords"
This is a reland of ca086a497c

Original change's description:
> [parser] Perfect hash for keywords
> 
> Use gperf to generate a perfect hash table for keyword lookup. Adds a
> python script which munges the output of gperf and adds additional
> cleanup and optimisations.
> 
> Change-Id: I3656a7287dbd0688917893de3a671faef9e4578a
> Reviewed-on: https://chromium-review.googlesource.com/c/1349240
> Commit-Queue: Leszek Swirski <leszeks@chromium.org>
> Reviewed-by: Toon Verwaest <verwaest@chromium.org>
> Reviewed-by: Marja Hölttä <marja@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#57790}

Change-Id: Ifb53527ba3d0652ea4f5d03740f7c856ad5d91da
Reviewed-on: https://chromium-review.googlesource.com/c/1350121
Reviewed-by: Toon Verwaest <verwaest@chromium.org>
Commit-Queue: Leszek Swirski <leszeks@chromium.org>
Cr-Commit-Position: refs/heads/master@{#57831}
2018-11-26 14:10:24 +00:00

248 lines
7.6 KiB
Python
Executable File

#!/usr/bin/env python
# Copyright 2018 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import os
import sys
import subprocess
import re
import math
import datetime
# Keyword list handed to gperf. The path is relative to the directory that
# main() uses as gperf's working directory.
INPUT_PATH = "src/parsing/keywords.txt"
# Generated header written by main(); joined onto that same directory.
OUTPUT_PATH = "src/parsing/keywords-gen.h"
# When True, trim_and_dcheck_char_table() drops the first 96 asso_values
# entries (leaving a 32 entry table) and masks every lookup index with 31.
# When False the table is kept whole and only gets an explicit size of 128.
# TODO(leszeks): Trimming seems to regress performance, investigate.
TRIM_CHAR_TABLE = False
def next_power_of_2(x):
  """Return the smallest power of two that is greater than or equal to x.

  Args:
    x: A non-negative integer.

  Returns:
    1 when x is 0 or 1, otherwise the smallest 2**k such that 2**k >= x.

  Raises:
    ValueError: If x is negative.
  """
  if x < 0:
    # Keep the error the logarithm based implementation produced.
    raise ValueError("math domain error")
  if x == 0:
    return 1
  # Pure integer arithmetic: rounding up a floating point logarithm can
  # overshoot for exact powers of two (e.g. 2**29) and double the result.
  return 1 << (x - 1).bit_length()
def call_with_input(cmd, input_string=""):
  """Run cmd, feed input_string to its stdin and return what it printed.

  Args:
    cmd: Argument list of the command to run, as accepted by
      subprocess.Popen.
    input_string: Data written to the command's standard input. Text is
      encoded as UTF-8 first because the pipes are opened in binary mode.

  Returns:
    Everything the command wrote to standard output, exactly as read from
    the pipe.

  Raises:
    subprocess.CalledProcessError: If the command exits with a non-zero
      status. The captured output is attached to the exception.
  """
  if input_string is not None and not isinstance(input_string, bytes):
    # A text string can't be written to a binary pipe on Python 3. On
    # Python 2 `str` already is `bytes`, so plain strings skip this branch.
    input_string = input_string.encode("utf-8")
  p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
  stdout, _ = p.communicate(input_string)
  # communicate() only returns once the process has exited.
  retcode = p.returncode
  if retcode != 0:
    raise subprocess.CalledProcessError(retcode, cmd, output=stdout)
  return stdout
def checked_sub(pattern, sub, out, count=1, flags=0):
  """Substitute pattern in out, insisting on the number of replacements.

  Args:
    pattern: Regular expression to search for.
    sub: Replacement template, as understood by re.subn.
    out: Text to rewrite.
    count: Exact number of replacements that must take place.
    flags: Regular expression flags used when matching pattern.

  Returns:
    The rewritten text.

  Raises:
    Exception: If the number of replacements made differs from count.
  """
  rewritten, num_replacements = re.compile(pattern, flags).subn(sub, out)
  if num_replacements == count:
    return rewritten
  raise Exception("Didn't get exactly %d replacement(s) for pattern: %s" %
                  (count, pattern))
def change_sizet_to_int(out):
  """Rewrite every whole-word `size_t` in the generated code to `int`.

  Exactly four occurrences are expected; checked_sub raises if the number of
  replacements is anything else.

  Args:
    out: The generated source text.

  Returns:
    The source text with `size_t` replaced by `int`.
  """
  # Literal buffer lengths are given as ints, not size_t
  size_t_word = r'\bsize_t\b'
  return checked_sub(size_t_word, 'int', out, count=4)
def trim_and_dcheck_char_table(out):
  """Give asso_values an explicit size and DCHECK every character read.

  One `DCHECK_LT(str[N], 128);` is produced per `asso_values[...str[N]...]`
  read found in out, and all of them are inserted in front of the table
  declaration. With TRIM_CHAR_TABLE unset the table is only declared with a
  size of 128. With it set, the first 96 entries are dropped (leaving a
  table of 32) and every read index is masked with 31.

  Args:
    out: The generated source text.

  Returns:
    The rewritten source text.
  """
  # Potential keyword strings are known to be lowercase ascii, so chop off the
  # rest of the table and mask out the char
  reads_re = re.compile(
      r'asso_values\[static_cast<unsigned char>\(str\[(\d+)\]\)\]')
  dchecks = [
      "DCHECK_LT(str[%d], 128);" % int(table_read.group(1))
      for table_read in reads_re.finditer(out)
  ]
  dcheck_prefix = "".join(dchecks)

  if not TRIM_CHAR_TABLE:
    # Keep the whole table, only spell out its size.
    return checked_sub(
        r'static const unsigned char asso_values\[\]\s*=\s*\{',
        dcheck_prefix + r'static const unsigned char asso_values[128] = {',
        out,
        flags=re.MULTILINE)

  # Drop the leading 96 entries of the table...
  trimmed = checked_sub(
      r'static const unsigned char asso_values\[\]\s*=\s*\{(\s*\d+\s*,){96}',
      dcheck_prefix + r'static const unsigned char asso_values[32] = {',
      out,
      flags=re.MULTILINE)
  # ...and mask each of the reads found above so it stays inside what's left.
  return checked_sub(
      reads_re.pattern,
      r'asso_values[static_cast<unsigned char>(str[(\1)]&31)]',
      trimmed,
      count=len(dchecks),
      flags=re.MULTILINE)
def use_isinrange(out):
  """Replace the min/max word length comparison with one IsInRange call.

  Args:
    out: The generated source text.

  Returns:
    The source text with the length check rewritten. checked_sub raises
    unless exactly one such check is found.
  """
  # Our IsInRange method is more efficient than checking for min/max length
  length_check = r'if \(len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH\)'
  in_range_check = r'if (IsInRange(len, MIN_WORD_LENGTH, MAX_WORD_LENGTH))'
  return checked_sub(length_check, in_range_check, out)
def pad_tables(out):
  """Pad both lookup tables to a power-of-two length and mask the hash.

  The table length is derived from the MAX_HASH_VALUE found in out and
  rounded up with next_power_of_2. kPerfectKeywordLengthTable is padded with
  `0` entries and kPerfectKeywordHashTable with `{"",Token::IDENTIFIER}`
  entries; both get their new length spelled out in the declaration. The
  `Hash(str, len)` call is then masked with (new length - 1) and the
  `if (key <= MAX_HASH_VALUE)` check is replaced by DCHECKs against the two
  table sizes.

  Args:
    out: The generated source text.

  Returns:
    The rewritten source text.

  Raises:
    Exception: If MAX_HASH_VALUE can't be found in out, or (from
      checked_sub) if any of the rewrites doesn't apply exactly once.
  """
  # We don't want to compare against the max hash value, so pad the tables up
  # to a power of two and mask the hash.
  # First get the new size
  max_hash_match = re.search(r'MAX_HASH_VALUE\s*=\s*(\d+)', out)
  if max_hash_match is None:
    # Fail with a readable message, like the checked rewrites below do,
    # rather than with an AttributeError on None.
    raise Exception("Couldn't find MAX_HASH_VALUE in the gperf output")
  max_hash_value = int(max_hash_match.group(1))
  old_table_length = max_hash_value + 1
  new_table_length = next_power_of_2(old_table_length)
  table_padding_len = new_table_length - old_table_length
  # Pad the length table.
  single_lengthtable_entry = r'\d+'
  out = checked_sub(
      r"""
static\ const\ unsigned\ char\ kPerfectKeywordLengthTable\[\]\s*=\s*\{
(
\s*%(single_lengthtable_entry)s\s*
(?:,\s*%(single_lengthtable_entry)s\s*)*
)
\}
""" % {'single_lengthtable_entry': single_lengthtable_entry},
      r'static const unsigned char kPerfectKeywordLengthTable[%d] = { \1 %s }'
      % (new_table_length, "".join([',0'] * table_padding_len)),
      out,
      flags=re.MULTILINE | re.VERBOSE)
  # Pad the word list.
  # An entry may be preceded by a `#line` directive, which is kept as is.
  single_wordlist_entry = r"""
(?:\#line\ \d+\ ".*"$\s*)?
\{\s*"[a-z]*"\s*,\s*Token::[A-Z_]+\}
"""
  out = checked_sub(
      r"""
static\ const\ struct\ PerfectKeywordHashTableEntry\ kPerfectKeywordHashTable\[\]\s*=\s*\{
(
\s*%(single_wordlist_entry)s\s*
(?:,\s*%(single_wordlist_entry)s\s*)*
)
\}
""" % {'single_wordlist_entry': single_wordlist_entry},
      r'static const struct PerfectKeywordHashTableEntry kPerfectKeywordHashTable[%d] = {\1 %s }'
      % (new_table_length, "".join(
          [',{"",Token::IDENTIFIER}'] * table_padding_len)),
      out,
      flags=re.MULTILINE | re.VERBOSE)
  # Mask the hash and replace the range check with DCHECKs.
  out = checked_sub(r'Hash\s*\(\s*str,\s*len\s*\)',
                    r'Hash(str, len)&0x%x' % (new_table_length - 1), out)
  out = checked_sub(
      r'if \(key <= MAX_HASH_VALUE\)',
      r'DCHECK_LT(key, arraysize(kPerfectKeywordLengthTable));DCHECK_LT(key, arraysize(kPerfectKeywordHashTable));',
      out)
  return out
def return_token(out):
  """Make GetToken return the token value instead of a table entry pointer.

  Args:
    out: The generated source text.

  Returns:
    The rewritten source text. checked_sub raises if any of the rewrites
    doesn't apply the expected number of times.
  """
  # We want to return the actual token rather than the table entry.
  # Each rewrite is (pattern, replacement, expected number of replacements).
  rewrites = (
      # Change the return type of the function. Make it inline too.
      (r'const\s*struct\s*PerfectKeywordHashTableEntry\s*\*\s*((?:PerfectKeywordHash::)?GetToken)',
       r'inline Token::Value \1',
       2),
      # Change the return value when the keyword is found
      (r'return &kPerfectKeywordHashTable\[key\];',
       r'return kPerfectKeywordHashTable[key].value;',
       1),
      # Change the return value when the keyword is not found
      (r'return 0;', r'return Token::IDENTIFIER;', 1),
  )
  for pattern, replacement, expected in rewrites:
    out = checked_sub(pattern, replacement, out, count=expected)
  return out
def memcmp_to_while(out):
  """Replace the memcmp based keyword comparison with a while loop.

  The pattern looks for the memcmp `if` directly followed by
  `return kPerfectKeywordHashTable[key].value;`, i.e. the return statement
  in the form return_token() rewrites it to.

  Args:
    out: The generated source text.

  Returns:
    The rewritten source text. checked_sub raises unless the comparison is
    found exactly once.
  """
  # It's faster to loop over the keyword with a while loop than calling memcmp.
  # Careful, this replacement is quite flaky, because otherwise the regex is
  # unreadable.
  memcmp_check = re.escape(
      "if (*str == *s && !memcmp (str + 1, s + 1, len - 1))")
  found_return = re.escape("return kPerfectKeywordHashTable[key].value;")
  while_loop = """
while(*s!=0) {
if (*s++ != *str++) return Token::IDENTIFIER;
}
return kPerfectKeywordHashTable[key].value;
"""
  return checked_sub(
      memcmp_check + r"\s*" + found_return,
      while_loop,
      out,
      flags=re.MULTILINE)
def wrap_namespace(out):
  """Embed out in the complete keywords-gen.h header file.

  The result consists of the license and "generated file" notice, the include
  guard, the token.h include, and out nested in the v8::internal namespaces.

  Args:
    out: The generated source text.

  Returns:
    The text of the whole header file.
  """
  header_template = """// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file is automatically generated by gen-keywords-gen-h.py and should not
// be modified manually.
#ifndef V8_PARSING_KEYWORDS_GEN_H_
#define V8_PARSING_KEYWORDS_GEN_H_
#include "src/parsing/token.h"
namespace v8 {
namespace internal {
%s
} // namespace internal
} // namespace v8
#endif // V8_PARSING_KEYWORDS_GEN_H_
"""
  return header_template % out
def trim_character_set_warning(out):
  """Shorten gperf's execution character set error message.

  The request to report a bug is moved out of the message string into a
  comment on the following line; the message ends in a backslash followed by
  a newline.

  Args:
    out: The generated source text.

  Returns:
    The source text with every occurrence of the message shortened. Text
    that doesn't contain the message is returned unchanged.
  """
  # gperf generates an error message that is too large, trim it
  full_message = '"gperf generated tables don\'t work with this execution character set. Please report a bug to <bug-gperf@gnu.org>."'
  trimmed_message = '"gperf generated tables don\'t work with this execution character set."\\\n// If you see this error, please report a bug to <bug-gperf@gnu.org>.'
  return out.replace(full_message, trimmed_message)
def main():
  """Regenerate OUTPUT_PATH from INPUT_PATH using gperf and clang-format.

  gperf is run on INPUT_PATH from the parent directory of the directory
  containing this script. Its output is passed through the munging functions
  in this file, formatted with depot_tools' clang-format, and written to
  OUTPUT_PATH.

  Returns:
    0 on success, otherwise the exit status of the external command that
    failed.
  """
  try:
    script_dir = os.path.dirname(sys.argv[0])
    root_dir = os.path.join(script_dir, '..')
    out = subprocess.check_output(["gperf", "-m100", INPUT_PATH], cwd=root_dir)
    # check_output returns bytes on Python 3, while all of the munging below
    # works with text patterns. On Python 2 `str` is `bytes`, so nothing
    # changes there.
    if not isinstance(out, str):
      out = out.decode('utf-8')
    # And now some munging of the generated file.
    out = change_sizet_to_int(out)
    out = trim_and_dcheck_char_table(out)
    out = use_isinrange(out)
    out = pad_tables(out)
    out = return_token(out)
    out = memcmp_to_while(out)
    out = wrap_namespace(out)
    out = trim_character_set_warning(out)
    # Final formatting.
    clang_format_path = os.path.join(root_dir,
                                     'third_party/depot_tools/clang-format')
    # The pipes to clang-format are binary: hand over bytes, and turn what
    # comes back into text again for the text mode file below.
    if not isinstance(out, bytes):
      out = out.encode('utf-8')
    out = call_with_input([clang_format_path], out)
    if not isinstance(out, str):
      out = out.decode('utf-8')
    with open(os.path.join(root_dir, OUTPUT_PATH), 'w') as f:
      f.write(out)
    return 0
  except subprocess.CalledProcessError as e:
    sys.stderr.write("Error calling '{}'\n".format(" ".join(e.cmd)))
    return e.returncode
# Only run the generator when this file is executed as a script; main()'s
# return value becomes the process exit status.
if __name__ == '__main__':
  sys.exit(main())