47daa48696
This is a reland of ca086a497c
Original change's description:
> [parser] Perfect hash for keywords
>
> Use gperf to generate a perfect hash table for keyword lookup. Adds a
> python script which munges the output of gperf and adds additional
> cleanup and optimisations.
>
> Change-Id: I3656a7287dbd0688917893de3a671faef9e4578a
> Reviewed-on: https://chromium-review.googlesource.com/c/1349240
> Commit-Queue: Leszek Swirski <leszeks@chromium.org>
> Reviewed-by: Toon Verwaest <verwaest@chromium.org>
> Reviewed-by: Marja Hölttä <marja@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#57790}
Change-Id: Ifb53527ba3d0652ea4f5d03740f7c856ad5d91da
Reviewed-on: https://chromium-review.googlesource.com/c/1350121
Reviewed-by: Toon Verwaest <verwaest@chromium.org>
Commit-Queue: Leszek Swirski <leszeks@chromium.org>
Cr-Commit-Position: refs/heads/master@{#57831}
248 lines
7.6 KiB
Python
Executable File
248 lines
7.6 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# Copyright 2018 the V8 project authors. All rights reserved.
|
|
# Use of this source code is governed by a BSD-style license that can be
|
|
# found in the LICENSE file.
|
|
|
|
import os
|
|
import sys
|
|
import subprocess
|
|
import re
|
|
import math
|
|
import datetime
|
|
|
|
# gperf input file; main() passes this path to gperf, which is run with the
# repository root as its working directory.
INPUT_PATH = "src/parsing/keywords.txt"
# Generated header; main() writes it relative to the repository root.
OUTPUT_PATH = "src/parsing/keywords-gen.h"

# TODO(leszeks): Trimming seems to regress performance, investigate.
# When True, trim_and_dcheck_char_table() declares asso_values with 32 entries
# and masks every character read with &31 instead of keeping 128 entries.
TRIM_CHAR_TABLE = False
|
|
|
|
|
|
def next_power_of_2(x):
    """Return the smallest power of two that is greater than or equal to x.

    Args:
      x: A non-negative integer.

    Returns:
      1 when x is 0 or 1, otherwise the smallest 2**k with 2**k >= x.

    Raises:
      ValueError: If x is negative.
    """
    if x < 0:
        raise ValueError("next_power_of_2() requires a non-negative value")
    if x == 0:
        return 1
    # Use integer bit arithmetic rather than math.log(): the floating point
    # logarithm is not exact for every power of two (e.g. 2**29), in which
    # case ceil() rounds up and the result would be twice as large as needed.
    return 1 << (x - 1).bit_length()
|
|
|
|
|
|
def call_with_input(cmd, input_string=""):
    """Run cmd, feed input_string to its stdin and return what it printed.

    Args:
      cmd: Argument list of the command, as accepted by subprocess.Popen.
      input_string: Data written to the child's stdin. May be a byte string
        or a text string; text is encoded as UTF-8 before it is sent.

    Returns:
      Everything the command wrote to stdout, in the same type as
      input_string: text input yields text (decoded as UTF-8), byte input
      yields bytes.

    Raises:
      subprocess.CalledProcessError: If the command exits with a non-zero
        status. The captured stdout is attached as the error's output.
    """
    # On Python 2 `str` is `bytes`, so plain strings are passed through
    # untouched there; on Python 3 the binary pipes only accept bytes.
    is_text = not isinstance(input_string, bytes)
    stdin_data = input_string.encode("utf-8") if is_text else input_string

    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    # communicate() waits for the process to terminate, so returncode is set
    # once it returns.
    stdout, _ = p.communicate(stdin_data)
    if p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, cmd, output=stdout)
    return stdout.decode("utf-8") if is_text else stdout
|
|
|
|
|
|
def checked_sub(pattern, sub, out, count=1, flags=0):
    """Substitute every match of pattern in out and verify the match count.

    Args:
      pattern: Regular expression to search for.
      sub: Replacement, as accepted by re.subn.
      out: The text to rewrite.
      count: The number of replacements that must take place.
      flags: Regular expression flags used when matching pattern.

    Returns:
      The rewritten text.

    Raises:
      Exception: If the number of replacements differs from count.
    """
    rewritten, hits = re.compile(pattern, flags).subn(sub, out)
    if hits == count:
        return rewritten
    raise Exception(
        "Didn't get exactly %d replacement(s) for pattern: %s" %
        (count, pattern))
|
|
|
|
|
|
def change_sizet_to_int(out):
    """Replace the whole-word `size_t` occurrences in out with `int`.

    Literal buffer lengths are given as ints, not size_t. Exactly four
    occurrences are expected; checked_sub raises for any other number.
    """
    size_t_word = r'\bsize_t\b'
    expected_occurrences = 4
    return checked_sub(size_t_word, 'int', out, count=expected_occurrences)
|
|
|
|
|
|
def trim_and_dcheck_char_table(out):
    """Give asso_values an explicit size and DCHECK each character it indexes.

    For every `asso_values[static_cast<unsigned char>(str[N])]` read in out,
    a `DCHECK_LT(str[N], 128);` is emitted; all of them are placed directly
    in front of the asso_values declaration. With TRIM_CHAR_TABLE set, the
    first 96 table entries are additionally dropped, the table is declared
    with 32 entries and each read is masked with &31.

    Returns:
      The rewritten text.
    """
    # Potential keyword strings are known to be lowercase ascii, which is
    # what allows chopping off the rest of the table and masking the char.
    reads_re = re.compile(
        r'asso_values\[static_cast<unsigned char>\(str\[(\d+)\]\)\]')

    dchecks = [
        "DCHECK_LT(str[%d], 128);" % int(read.group(1))
        for read in reads_re.finditer(out)
    ]
    dcheck_prefix = "".join(dchecks)

    if not TRIM_CHAR_TABLE:
        # Only pin the table size; the entries themselves stay untouched.
        return checked_sub(
            r'static const unsigned char asso_values\[\]\s*=\s*\{',
            dcheck_prefix + r'static const unsigned char asso_values[128] = {',
            out,
            flags=re.MULTILINE)

    # Swallow the declaration together with its first 96 entries ...
    out = checked_sub(
        r'static const unsigned char asso_values\[\]\s*=\s*\{(\s*\d+\s*,){96}',
        dcheck_prefix + r'static const unsigned char asso_values[32] = {',
        out,
        flags=re.MULTILINE)
    # ... and mask every read so that it stays inside the trimmed table.
    return checked_sub(
        reads_re.pattern,
        r'asso_values[static_cast<unsigned char>(str[(\1)]&31)]',
        out,
        count=len(dchecks),
        flags=re.MULTILINE)
|
|
|
|
|
|
def use_isinrange(out):
    """Rewrite gperf's min/max word length comparison as an IsInRange call.

    Our IsInRange method is more efficient than checking for min/max length.
    The comparison must occur exactly once, otherwise checked_sub raises.
    """
    length_comparison = (
        r'if \(len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH\)')
    in_range_call = r'if (IsInRange(len, MIN_WORD_LENGTH, MAX_WORD_LENGTH))'
    return checked_sub(length_comparison, in_range_call, out)
|
|
|
|
|
|
def pad_tables(out):
    """Pad both keyword tables to a power-of-two length and mask the hash.

    We don't want to compare against the max hash value, so the tables are
    padded up to a power of two and the hash is masked instead. The length
    table is padded with `0` entries and the word list with
    `{"", Token::IDENTIFIER}` entries; the `key <= MAX_HASH_VALUE` range check
    is replaced with DCHECKs against the table sizes.

    Args:
      out: The generated code to rewrite.

    Returns:
      The rewritten code.

    Raises:
      Exception: If MAX_HASH_VALUE cannot be found in out, or (from
        checked_sub) if one of the rewritten constructs does not occur
        exactly once.
    """
    # First get the new size.
    max_hash_match = re.search(r'MAX_HASH_VALUE\s*=\s*(\d+)', out)
    if max_hash_match is None:
        # Report the missing constant instead of failing with an
        # AttributeError on None.
        raise Exception("Couldn't find MAX_HASH_VALUE in the gperf output")
    max_hash_value = int(max_hash_match.group(1))
    old_table_length = max_hash_value + 1
    new_table_length = next_power_of_2(old_table_length)
    table_padding_len = new_table_length - old_table_length

    # Pad the length table. The pattern is compiled with re.VERBOSE, so its
    # layout is free-form and literal spaces have to be escaped.
    single_lengthtable_entry = r'\d+'
    out = checked_sub(
        r"""
        static\ const\ unsigned\ char\ kPerfectKeywordLengthTable\[\]\s*=\s*\{
          (
            \s*%(single_lengthtable_entry)s\s*
            (?:,\s*%(single_lengthtable_entry)s\s*)*
          )
        \}
        """ % {'single_lengthtable_entry': single_lengthtable_entry},
        r'static const unsigned char kPerfectKeywordLengthTable[%d] = { \1 %s }'
        % (new_table_length, "".join([',0'] * table_padding_len)),
        out,
        flags=re.MULTILINE | re.VERBOSE)

    # Pad the word list. An entry may be preceded by a `#line` directive.
    single_wordlist_entry = r"""
          (?:\#line\ \d+\ ".*"$\s*)?
          \{\s*"[a-z]*"\s*,\s*Token::[A-Z_]+\}
        """
    out = checked_sub(
        r"""
        static\ const\ struct\ PerfectKeywordHashTableEntry\ kPerfectKeywordHashTable\[\]\s*=\s*\{
          (
            \s*%(single_wordlist_entry)s\s*
            (?:,\s*%(single_wordlist_entry)s\s*)*
          )
        \}
        """ % {'single_wordlist_entry': single_wordlist_entry},
        r'static const struct PerfectKeywordHashTableEntry kPerfectKeywordHashTable[%d] = {\1 %s }'
        % (new_table_length, "".join(
            [',{"",Token::IDENTIFIER}'] * table_padding_len)),
        out,
        flags=re.MULTILINE | re.VERBOSE)

    # Mask the hash and replace the range check with DCHECKs. The mask is
    # valid because new_table_length is a power of two.
    out = checked_sub(r'Hash\s*\(\s*str,\s*len\s*\)',
                      r'Hash(str, len)&0x%x' % (new_table_length - 1), out)
    out = checked_sub(
        r'if \(key <= MAX_HASH_VALUE\)',
        r'DCHECK_LT(key, arraysize(kPerfectKeywordLengthTable));DCHECK_LT(key, arraysize(kPerfectKeywordHashTable));',
        out)

    return out
|
|
|
|
|
|
def return_token(out):
    """Make GetToken return the token value rather than the table entry.

    Applies three rewrites in order; each is verified by checked_sub, which
    raises if the expected number of matches is not found.
    """
    rewrites = (
        # Change the return type of the function and make it inline too.
        # This signature is expected exactly twice.
        (r'const\s*struct\s*PerfectKeywordHashTableEntry\s*\*\s*((?:PerfectKeywordHash::)?GetToken)',
         r'inline Token::Value \1',
         2),
        # Change the return value when the keyword is found.
        (r'return &kPerfectKeywordHashTable\[key\];',
         r'return kPerfectKeywordHashTable[key].value;',
         1),
        # Change the return value when the keyword is not found.
        (r'return 0;',
         r'return Token::IDENTIFIER;',
         1),
    )
    for pattern, replacement, expected in rewrites:
        out = checked_sub(pattern, replacement, out, count=expected)
    return out
|
|
|
|
|
|
def memcmp_to_while(out):
    """Replace the memcmp-based keyword comparison with a character loop.

    It's faster to loop over the keyword with a while loop than calling
    memcmp. The match is made against the literal (escaped) text of the
    comparison followed by the `.value` return statement, so this rewrite is
    fragile: any change in that text makes checked_sub raise. That is the
    price for keeping the pattern readable.
    """
    memcmp_check = (
        re.escape("if (*str == *s && !memcmp (str + 1, s + 1, len - 1))")
        + r"\s*"
        + re.escape("return kPerfectKeywordHashTable[key].value;"))
    # The layout of the inserted code does not matter here; main() runs the
    # final result through clang-format.
    while_loop = """
while(*s!=0) {
if (*s++ != *str++) return Token::IDENTIFIER;
}
return kPerfectKeywordHashTable[key].value;
"""
    return checked_sub(memcmp_check, while_loop, out, flags=re.MULTILINE)
|
|
|
|
|
|
def wrap_namespace(out):
    """Embed out in the complete text of the generated header.

    The result consists of the license header, a note that the file is
    generated, the include guard, the token.h include and the v8::internal
    namespace, with out placed inside the namespace.
    """
    header_template = """// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file is automatically generated by gen-keywords-gen-h.py and should not
// be modified manually.

#ifndef V8_PARSING_KEYWORDS_GEN_H_
#define V8_PARSING_KEYWORDS_GEN_H_

#include "src/parsing/token.h"

namespace v8 {
namespace internal {

%s

} // namespace internal
} // namespace v8

#endif // V8_PARSING_KEYWORDS_GEN_H_
"""
    return header_template % out
|
|
|
|
|
|
def trim_character_set_warning(out):
    """Shorten gperf's character set error message.

    gperf generates an error message that is too large, so the request to
    report a bug is moved out of the message into a `//` comment on the
    following line (joined to the message line by a trailing backslash).
    Text without the message is returned unchanged.
    """
    long_message = '"gperf generated tables don\'t work with this execution character set. Please report a bug to <bug-gperf@gnu.org>."'
    short_message = '"gperf generated tables don\'t work with this execution character set."\\\n// If you see this error, please report a bug to <bug-gperf@gnu.org>.'
    return out.replace(long_message, short_message)
|
|
|
|
|
|
def main():
    """Generate OUTPUT_PATH from INPUT_PATH with gperf.

    Runs gperf on the keyword list, applies the munging steps to the code it
    generates, formats the result with clang-format and writes it to
    OUTPUT_PATH. All paths are resolved against the parent directory of the
    directory containing this script.

    Returns:
      0 on success, otherwise the exit status of the external command that
      failed.
    """
    try:
        script_dir = os.path.dirname(sys.argv[0])
        root_dir = os.path.join(script_dir, '..')

        # universal_newlines=True makes check_output() return text instead
        # of bytes on Python 3; the munging steps below use str regular
        # expressions, which cannot be applied to bytes.
        out = subprocess.check_output(["gperf", "-m100", INPUT_PATH],
                                      cwd=root_dir,
                                      universal_newlines=True)

        # And now some munging of the generated file. The order matters:
        # memcmp_to_while() matches text that return_token() produces.
        out = change_sizet_to_int(out)
        out = trim_and_dcheck_char_table(out)
        out = use_isinrange(out)
        out = pad_tables(out)
        out = return_token(out)
        out = memcmp_to_while(out)
        out = wrap_namespace(out)
        out = trim_character_set_warning(out)

        # Final formatting.
        clang_format_path = os.path.join(
            root_dir, 'third_party/depot_tools/clang-format')
        out = call_with_input([clang_format_path], out)

        with open(os.path.join(root_dir, OUTPUT_PATH), 'w') as f:
            f.write(out)

        return 0

    except subprocess.CalledProcessError as e:
        sys.stderr.write("Error calling '{}'\n".format(" ".join(e.cmd)))
        return e.returncode
|
|
|
|
|
|
# Run the generator only when executed as a script; the value returned by
# main() becomes the exit status of the process.
if __name__ == '__main__':
    sys.exit(main())
|