Reland "[parser] Perfect hash for keywords"

This is a reland of ca086a497c

Original change's description:
> [parser] Perfect hash for keywords
> 
> Use gperf to generate a perfect hash table for keyword lookup. Adds a
> python script which munges the output of gperf and adds additional
> cleanup and optimisations.
> 
> Change-Id: I3656a7287dbd0688917893de3a671faef9e4578a
> Reviewed-on: https://chromium-review.googlesource.com/c/1349240
> Commit-Queue: Leszek Swirski <leszeks@chromium.org>
> Reviewed-by: Toon Verwaest <verwaest@chromium.org>
> Reviewed-by: Marja Hölttä <marja@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#57790}

Change-Id: Ifb53527ba3d0652ea4f5d03740f7c856ad5d91da
Reviewed-on: https://chromium-review.googlesource.com/c/1350121
Reviewed-by: Toon Verwaest <verwaest@chromium.org>
Commit-Queue: Leszek Swirski <leszeks@chromium.org>
Cr-Commit-Position: refs/heads/master@{#57831}
This commit is contained in:
Leszek Swirski 2018-11-26 11:07:26 +01:00 committed by Commit Bot
parent 9b8937c9d0
commit 47daa48696
6 changed files with 556 additions and 42 deletions

225
src/parsing/keywords-gen.h Normal file
View File

@ -0,0 +1,225 @@
// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file is automatically generated by gen-keywords-gen-h.py and should not
// be modified manually.
#ifndef V8_PARSING_KEYWORDS_GEN_H_
#define V8_PARSING_KEYWORDS_GEN_H_
#include "src/parsing/token.h"
namespace v8 {
namespace internal {
/* C++ code produced by gperf version 3.1 */
/* Command-line: gperf -m100 src/parsing/keywords.txt */
/* Computed positions: -k'1-2' */
#if !( \
(' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('%' == 37) && \
('&' == 38) && ('\'' == 39) && ('(' == 40) && (')' == 41) && \
('*' == 42) && ('+' == 43) && (',' == 44) && ('-' == 45) && ('.' == 46) && \
('/' == 47) && ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) && \
('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) && \
('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) && ('=' == 61) && \
('>' == 62) && ('?' == 63) && ('A' == 65) && ('B' == 66) && ('C' == 67) && \
('D' == 68) && ('E' == 69) && ('F' == 70) && ('G' == 71) && ('H' == 72) && \
('I' == 73) && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) && \
('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) && ('R' == 82) && \
('S' == 83) && ('T' == 84) && ('U' == 85) && ('V' == 86) && ('W' == 87) && \
('X' == 88) && ('Y' == 89) && ('Z' == 90) && ('[' == 91) && \
('\\' == 92) && (']' == 93) && ('^' == 94) && ('_' == 95) && \
('a' == 97) && ('b' == 98) && ('c' == 99) && ('d' == 100) && \
('e' == 101) && ('f' == 102) && ('g' == 103) && ('h' == 104) && \
('i' == 105) && ('j' == 106) && ('k' == 107) && ('l' == 108) && \
('m' == 109) && ('n' == 110) && ('o' == 111) && ('p' == 112) && \
('q' == 113) && ('r' == 114) && ('s' == 115) && ('t' == 116) && \
('u' == 117) && ('v' == 118) && ('w' == 119) && ('x' == 120) && \
('y' == 121) && ('z' == 122) && ('{' == 123) && ('|' == 124) && \
('}' == 125) && ('~' == 126))
/* The character set is not based on ISO-646. */
#error "gperf generated tables don't work with this execution character set."
// If you see this error, please report a bug to <bug-gperf@gnu.org>.
#endif
#line 16 "src/parsing/keywords.txt"
// One row of the generated keyword table: a keyword string and the token
// value the scanner should produce for it. Empty slots use "" and
// Token::IDENTIFIER.
struct PerfectKeywordHashTableEntry {
const char* name;
Token::Value value;
};
// Table statistics emitted by gperf for the keywords listed in
// src/parsing/keywords.txt.
enum {
TOTAL_KEYWORDS = 47,
MIN_WORD_LENGTH = 2,
MAX_WORD_LENGTH = 10,
MIN_HASH_VALUE = 2,
MAX_HASH_VALUE = 51
};
/* maximum key range = 50, duplicates = 0 */
// Perfect (collision-free) hash for JavaScript keywords, generated by gperf
// and post-processed by tools/gen-keywords-gen-h.py.
class PerfectKeywordHash {
private:
static inline unsigned int Hash(const char* str, int len);
public:
// Returns the keyword token for |str|, or Token::IDENTIFIER if |str| is not
// a keyword.
static inline Token::Value GetToken(const char* str, int len);
};
// Perfect hash function: combines the string length with per-character
// weights of the first two characters (gperf was run with -k'1-2').
inline unsigned int PerfectKeywordHash::Hash(const char* str, int len) {
// Candidate strings are known to be 7-bit ASCII (the gperf input uses %7bit),
// so the 128-entry weight table below cannot be over-read.
DCHECK_LT(str[1], 128);
DCHECK_LT(str[0], 128);
// Weight 52 (> MAX_HASH_VALUE) marks characters that appear in no keyword at
// positions 0 or 1.
static const unsigned char asso_values[128] = {
52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
52, 8, 2, 6, 0, 0, 9, 52, 21, 0, 52, 52, 36, 40, 0, 3,
6, 52, 17, 13, 16, 16, 38, 25, 6, 26, 52, 52, 52, 52, 52, 52};
return len + asso_values[static_cast<unsigned char>(str[1])] +
asso_values[static_cast<unsigned char>(str[0])];
}
// Length of the keyword stored at each hash slot (0 for empty slots). The
// table is padded to 64 entries so lookups can mask the hash with 0x3f
// instead of range-checking it against MAX_HASH_VALUE.
static const unsigned char kPerfectKeywordLengthTable[64] = {
0, 0, 2, 3, 4, 2, 6, 7, 8, 9, 10, 2, 6, 7, 5, 3, 7, 8, 4, 5, 4, 7,
5, 6, 5, 0, 5, 0, 6, 4, 7, 5, 9, 8, 5, 6, 3, 4, 5, 3, 4, 4, 5, 0,
6, 4, 6, 5, 6, 3, 10, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// Keyword table indexed by the masked perfect-hash value. Empty slots hold
// {"", Token::IDENTIFIER}; the #line directives refer back to the gperf
// input file src/parsing/keywords.txt. Padded to 64 entries to match the
// hash mask (see GetToken).
static const struct PerfectKeywordHashTableEntry kPerfectKeywordHashTable[64] =
{{"", Token::IDENTIFIER},
{"", Token::IDENTIFIER},
#line 41 "src/parsing/keywords.txt"
{"in", Token::IN},
#line 45 "src/parsing/keywords.txt"
{"new", Token::NEW},
#line 31 "src/parsing/keywords.txt"
{"enum", Token::ENUM},
#line 29 "src/parsing/keywords.txt"
{"do", Token::DO},
#line 28 "src/parsing/keywords.txt"
{"delete", Token::DELETE},
#line 27 "src/parsing/keywords.txt"
{"default", Token::DEFAULT},
#line 26 "src/parsing/keywords.txt"
{"debugger", Token::DEBUGGER},
#line 43 "src/parsing/keywords.txt"
{"interface", Token::FUTURE_STRICT_RESERVED_WORD},
#line 42 "src/parsing/keywords.txt"
{"instanceof", Token::INSTANCEOF},
#line 38 "src/parsing/keywords.txt"
{"if", Token::IF},
#line 32 "src/parsing/keywords.txt"
{"export", Token::EXPORT},
#line 33 "src/parsing/keywords.txt"
{"extends", Token::EXTENDS},
#line 24 "src/parsing/keywords.txt"
{"const", Token::CONST},
#line 36 "src/parsing/keywords.txt"
{"for", Token::FOR},
#line 35 "src/parsing/keywords.txt"
{"finally", Token::FINALLY},
#line 25 "src/parsing/keywords.txt"
{"continue", Token::CONTINUE},
#line 21 "src/parsing/keywords.txt"
{"case", Token::CASE},
#line 22 "src/parsing/keywords.txt"
{"catch", Token::CATCH},
#line 46 "src/parsing/keywords.txt"
{"null", Token::NULL_LITERAL},
#line 47 "src/parsing/keywords.txt"
{"package", Token::FUTURE_STRICT_RESERVED_WORD},
#line 34 "src/parsing/keywords.txt"
{"false", Token::FALSE_LITERAL},
#line 51 "src/parsing/keywords.txt"
{"return", Token::RETURN},
#line 20 "src/parsing/keywords.txt"
{"break", Token::BREAK},
{"", Token::IDENTIFIER},
#line 18 "src/parsing/keywords.txt"
{"async", Token::ASYNC},
{"", Token::IDENTIFIER},
#line 50 "src/parsing/keywords.txt"
{"public", Token::FUTURE_STRICT_RESERVED_WORD},
#line 63 "src/parsing/keywords.txt"
{"with", Token::WITH},
#line 48 "src/parsing/keywords.txt"
{"private", Token::FUTURE_STRICT_RESERVED_WORD},
#line 64 "src/parsing/keywords.txt"
{"yield", Token::YIELD},
#line 49 "src/parsing/keywords.txt"
{"protected", Token::FUTURE_STRICT_RESERVED_WORD},
#line 37 "src/parsing/keywords.txt"
{"function", Token::FUNCTION},
#line 53 "src/parsing/keywords.txt"
{"super", Token::SUPER},
#line 52 "src/parsing/keywords.txt"
{"static", Token::STATIC},
#line 58 "src/parsing/keywords.txt"
{"try", Token::TRY},
#line 57 "src/parsing/keywords.txt"
{"true", Token::TRUE_LITERAL},
#line 19 "src/parsing/keywords.txt"
{"await", Token::AWAIT},
#line 44 "src/parsing/keywords.txt"
{"let", Token::LET},
#line 30 "src/parsing/keywords.txt"
{"else", Token::ELSE},
#line 55 "src/parsing/keywords.txt"
{"this", Token::THIS},
#line 56 "src/parsing/keywords.txt"
{"throw", Token::THROW},
{"", Token::IDENTIFIER},
#line 54 "src/parsing/keywords.txt"
{"switch", Token::SWITCH},
#line 61 "src/parsing/keywords.txt"
{"void", Token::VOID},
#line 40 "src/parsing/keywords.txt"
{"import", Token::IMPORT},
#line 23 "src/parsing/keywords.txt"
{"class", Token::CLASS},
#line 59 "src/parsing/keywords.txt"
{"typeof", Token::TYPEOF},
#line 60 "src/parsing/keywords.txt"
{"var", Token::VAR},
#line 39 "src/parsing/keywords.txt"
{"implements", Token::FUTURE_STRICT_RESERVED_WORD},
#line 62 "src/parsing/keywords.txt"
{"while", Token::WHILE},
{"", Token::IDENTIFIER},
{"", Token::IDENTIFIER},
{"", Token::IDENTIFIER},
{"", Token::IDENTIFIER},
{"", Token::IDENTIFIER},
{"", Token::IDENTIFIER},
{"", Token::IDENTIFIER},
{"", Token::IDENTIFIER},
{"", Token::IDENTIFIER},
{"", Token::IDENTIFIER},
{"", Token::IDENTIFIER},
{"", Token::IDENTIFIER}};
// Returns the token for |str| if its first |len| characters spell a keyword,
// or Token::IDENTIFIER otherwise. |str| need not be NUL-terminated.
inline Token::Value PerfectKeywordHash::GetToken(const char* str, int len) {
if (IsInRange(len, MIN_WORD_LENGTH, MAX_WORD_LENGTH)) {
// Mask rather than compare against MAX_HASH_VALUE: the tables are padded
// to 64 entries, so any masked key is a valid index.
unsigned int key = Hash(str, len) & 0x3f;
DCHECK_LT(key, arraysize(kPerfectKeywordLengthTable));
DCHECK_LT(key, arraysize(kPerfectKeywordHashTable));
// Checking the length first rejects most non-keywords without touching
// the string table (empty slots have length 0, so they never match).
if (len == kPerfectKeywordLengthTable[key]) {
const char* s = kPerfectKeywordHashTable[key].name;
// Character-by-character compare; the generator replaced gperf's memcmp
// with this loop for speed on short keywords.
while (*s != 0) {
if (*s++ != *str++) return Token::IDENTIFIER;
}
return kPerfectKeywordHashTable[key].value;
}
}
return Token::IDENTIFIER;
}
} // namespace internal
} // namespace v8
#endif // V8_PARSING_KEYWORDS_GEN_H_

64
src/parsing/keywords.txt Normal file
View File

@ -0,0 +1,64 @@
%struct-type
%language=C++
%global-table
%define initializer-suffix ,Token::IDENTIFIER
%define hash-function-name Hash
%define lookup-function-name GetToken
%define class-name PerfectKeywordHash
%define word-array-name kPerfectKeywordHashTable
%define length-table-name kPerfectKeywordLengthTable
%7bit
%compare-lengths
%enum
%readonly-tables
%compare-strncmp
struct PerfectKeywordHashTableEntry { const char* name; Token::Value value; };
%%
async, Token::ASYNC
await, Token::AWAIT
break, Token::BREAK
case, Token::CASE
catch, Token::CATCH
class, Token::CLASS
const, Token::CONST
continue, Token::CONTINUE
debugger, Token::DEBUGGER
default, Token::DEFAULT
delete, Token::DELETE
do, Token::DO
else, Token::ELSE
enum, Token::ENUM
export, Token::EXPORT
extends, Token::EXTENDS
false, Token::FALSE_LITERAL
finally, Token::FINALLY
for, Token::FOR
function, Token::FUNCTION
if, Token::IF
implements, Token::FUTURE_STRICT_RESERVED_WORD
import, Token::IMPORT
in, Token::IN
instanceof, Token::INSTANCEOF
interface, Token::FUTURE_STRICT_RESERVED_WORD
let, Token::LET
new, Token::NEW
null, Token::NULL_LITERAL
package, Token::FUTURE_STRICT_RESERVED_WORD
private, Token::FUTURE_STRICT_RESERVED_WORD
protected, Token::FUTURE_STRICT_RESERVED_WORD
public, Token::FUTURE_STRICT_RESERVED_WORD
return, Token::RETURN
static, Token::STATIC
super, Token::SUPER
switch, Token::SWITCH
this, Token::THIS
throw, Token::THROW
true, Token::TRUE_LITERAL
try, Token::TRY
typeof, Token::TYPEOF
var, Token::VAR
void, Token::VOID
while, Token::WHILE
with, Token::WITH
yield, Token::YIELD

View File

@ -6,6 +6,7 @@
#define V8_PARSING_SCANNER_INL_H_
#include "src/char-predicates-inl.h"
#include "src/parsing/keywords-gen.h"
#include "src/parsing/scanner.h"
namespace v8 {
@ -90,44 +91,8 @@ constexpr bool IsKeywordStart(char c) {
V8_INLINE Token::Value KeywordOrIdentifierToken(const uint8_t* input,
int input_length) {
DCHECK_GE(input_length, 1);
const int kMinLength = 2;
const int kMaxLength = 10;
if (!IsInRange(input_length, kMinLength, kMaxLength)) {
return Token::IDENTIFIER;
}
switch (input[0]) {
default:
#define KEYWORD_GROUP_CASE(ch) \
break; \
case ch:
#define KEYWORD(keyword, token) \
{ \
/* 'keyword' is a char array, so sizeof(keyword) is */ \
/* strlen(keyword) plus 1 for the NUL char. */ \
const int keyword_length = sizeof(keyword) - 1; \
STATIC_ASSERT(keyword_length >= kMinLength); \
STATIC_ASSERT(keyword_length <= kMaxLength); \
DCHECK_EQ(input[0], keyword[0]); \
DCHECK(token == Token::FUTURE_STRICT_RESERVED_WORD || \
0 == strncmp(keyword, Token::String(token), sizeof(keyword))); \
if (input_length == keyword_length && input[1] == keyword[1] && \
(keyword_length <= 2 || input[2] == keyword[2]) && \
(keyword_length <= 3 || input[3] == keyword[3]) && \
(keyword_length <= 4 || input[4] == keyword[4]) && \
(keyword_length <= 5 || input[5] == keyword[5]) && \
(keyword_length <= 6 || input[6] == keyword[6]) && \
(keyword_length <= 7 || input[7] == keyword[7]) && \
(keyword_length <= 8 || input[8] == keyword[8]) && \
(keyword_length <= 9 || input[9] == keyword[9]) && \
(keyword_length <= 10 || input[10] == keyword[10])) { \
return token; \
} \
}
KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
}
return Token::IDENTIFIER;
#undef KEYWORD
#undef KEYWORD_GROUP_CASE
return PerfectKeywordHash::GetToken(reinterpret_cast<const char*>(input),
input_length);
}
// Recursive constexpr template magic to check if a character is in a given
@ -269,9 +234,15 @@ static constexpr const uint8_t character_scan_flags[128] = {
#undef CALL_GET_SCAN_FLAGS
};
inline bool CharCanBeKeyword(uc32 c) {
return static_cast<uint32_t>(c) < arraysize(character_scan_flags) &&
CanBeKeyword(character_scan_flags[c]);
}
V8_INLINE Token::Value Scanner::ScanIdentifierOrKeywordInner() {
DCHECK(IsIdentifierStart(c0_));
bool escaped = false;
bool can_be_keyword = true;
STATIC_ASSERT(arraysize(character_scan_flags) == kMaxAscii + 1);
if (V8_LIKELY(static_cast<uint32_t>(c0_) <= kMaxAscii)) {
@ -310,6 +281,8 @@ V8_INLINE Token::Value Scanner::ScanIdentifierOrKeywordInner() {
Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
return KeywordOrIdentifierToken(chars.start(), chars.length());
}
can_be_keyword = CanBeKeyword(scan_flags);
} else {
// Special case for escapes at the start of an identifier.
escaped = true;
@ -319,10 +292,11 @@ V8_INLINE Token::Value Scanner::ScanIdentifierOrKeywordInner() {
return Token::ILLEGAL;
}
AddLiteralChar(c);
can_be_keyword = CharCanBeKeyword(c);
}
}
return ScanIdentifierOrKeywordInnerSlow(escaped);
return ScanIdentifierOrKeywordInnerSlow(escaped, can_be_keyword);
}
V8_INLINE Token::Value Scanner::SkipWhiteSpace() {

View File

@ -987,7 +987,8 @@ uc32 Scanner::ScanUnicodeEscape() {
return ScanHexNumber<capture_raw, unicode>(4);
}
Token::Value Scanner::ScanIdentifierOrKeywordInnerSlow(bool escaped) {
Token::Value Scanner::ScanIdentifierOrKeywordInnerSlow(bool escaped,
bool can_be_keyword) {
while (true) {
if (c0_ == '\\') {
escaped = true;
@ -999,16 +1000,18 @@ Token::Value Scanner::ScanIdentifierOrKeywordInnerSlow(bool escaped) {
if (c == '\\' || !IsIdentifierPart(c)) {
return Token::ILLEGAL;
}
can_be_keyword = can_be_keyword && CharCanBeKeyword(c);
AddLiteralChar(c);
} else if (IsIdentifierPart(c0_) ||
(CombineSurrogatePair() && IsIdentifierPart(c0_))) {
can_be_keyword = can_be_keyword && CharCanBeKeyword(c0_);
AddLiteralCharAdvance();
} else {
break;
}
}
if (next().literal_chars.is_one_byte()) {
if (can_be_keyword && next().literal_chars.is_one_byte()) {
Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
Token::Value token =
KeywordOrIdentifierToken(chars.start(), chars.length());

View File

@ -729,7 +729,8 @@ class Scanner {
Token::Value ScanNumber(bool seen_period);
V8_INLINE Token::Value ScanIdentifierOrKeyword();
V8_INLINE Token::Value ScanIdentifierOrKeywordInner();
Token::Value ScanIdentifierOrKeywordInnerSlow(bool escaped);
Token::Value ScanIdentifierOrKeywordInnerSlow(bool escaped,
bool can_be_keyword);
Token::Value ScanString();
Token::Value ScanPrivateName();

247
tools/gen-keywords-gen-h.py Executable file
View File

@ -0,0 +1,247 @@
#!/usr/bin/env python
# Copyright 2018 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import os
import sys
import subprocess
import re
import math
import datetime
# Paths are relative to the V8 root (the parent of this script's directory).
INPUT_PATH = "src/parsing/keywords.txt"
OUTPUT_PATH = "src/parsing/keywords-gen.h"
# When True, the 128-entry character table is trimmed to 32 entries and
# indexed with (char & 31).
# TODO(leszeks): Trimming seems to regress performance, investigate.
TRIM_CHAR_TABLE = False
def next_power_of_2(x):
  """Returns the smallest power of two that is >= x (1 for x <= 1).

  Uses exact integer bit arithmetic instead of the previous
  2**ceil(log(x, 2)) formulation: floating-point log can round an exact
  power of two slightly upwards, making ceil() overshoot and doubling the
  result for large inputs.
  """
  return 1 if x <= 1 else 1 << (x - 1).bit_length()
def call_with_input(cmd, input_string=""):
  """Runs |cmd|, feeding |input_string| to its stdin, and returns its stdout.

  Raises subprocess.CalledProcessError if the command exits non-zero.
  """
  proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
  output, _ = proc.communicate(input_string)
  if proc.wait() != 0:
    raise subprocess.CalledProcessError(proc.returncode, cmd)
  return output
def checked_sub(pattern, sub, out, count=1, flags=0):
  """Applies a regex substitution and verifies the number of replacements.

  Thin wrapper around re.subn that raises if anything other than exactly
  |count| substitutions were performed, so silent pattern drift in the gperf
  output is caught immediately.
  """
  result, num_replacements = re.subn(pattern, sub, out, flags=flags)
  if num_replacements == count:
    return result
  raise Exception("Didn't get exactly %d replacement(s) for pattern: %s" %
                  (count, pattern))
def change_sizet_to_int(out):
  """Rewrites the four size_t occurrences in the gperf output to int.

  Literal buffer lengths are given as ints, not size_t, in the scanner, so
  the generated hash/lookup functions must take int lengths as well.
  """
  return checked_sub(r'\bsize_t\b', 'int', out, count=4)
def trim_and_dcheck_char_table(out):
  """Adds 7-bit DCHECKs for character-table reads; optionally trims the table.

  Potential keyword strings are known to be lowercase ASCII, so each read of
  asso_values[] gets a DCHECK that the character is < 128. When
  TRIM_CHAR_TABLE is set, the table is additionally chopped to 32 entries and
  indexed with the character masked by 31.
  """
  # Matches each read of the gperf character-weight table, capturing the
  # string index being read (used to generate matching DCHECKs).
  reads_re = re.compile(
      r'asso_values\[static_cast<unsigned char>\(str\[(\d+)\]\)\]')
  dchecks = []
  for str_read in reads_re.finditer(out):
    dchecks.append("DCHECK_LT(str[%d], 128);" % int(str_read.group(1)))
  if TRIM_CHAR_TABLE:
    # Drop the first 96 entries (the masked index only needs 32) and prepend
    # the DCHECKs just before the trimmed table declaration.
    out = checked_sub(
        r'static const unsigned char asso_values\[\]\s*=\s*\{(\s*\d+\s*,){96}',
        "".join(dchecks) + r'static const unsigned char asso_values[32] = {',
        out,
        flags=re.MULTILINE)
    # Mask every table read down to the trimmed 32-entry range.
    out = checked_sub(
        reads_re.pattern,
        r'asso_values[static_cast<unsigned char>(str[(\1)]&31)]',
        out,
        count=len(dchecks),
        flags=re.MULTILINE)
  else:
    # Keep the full table but give it an explicit 128-entry size (it only
    # needs to cover 7-bit input) and prepend the DCHECKs.
    out = checked_sub(
        r'static const unsigned char asso_values\[\]\s*=\s*\{',
        "".join(dchecks) + r'static const unsigned char asso_values[128] = {',
        out,
        flags=re.MULTILINE)
  return out
def use_isinrange(out):
  """Replaces gperf's min/max length check with V8's IsInRange helper.

  Our IsInRange method is more efficient than checking the min and max
  lengths separately.
  """
  return checked_sub(r'if \(len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH\)',
                     r'if (IsInRange(len, MIN_WORD_LENGTH, MAX_WORD_LENGTH))',
                     out)
def pad_tables(out):
  """Pads the generated tables up to a power-of-two size and masks the hash.

  We don't want to compare against the max hash value at lookup time, so the
  length table and word table are padded to the next power of two and the
  hash is masked instead; the former range check becomes a pair of DCHECKs.
  """
  # First get the new size from the MAX_HASH_VALUE gperf emitted.
  max_hash_value = int(re.search(r'MAX_HASH_VALUE\s*=\s*(\d+)', out).group(1))
  old_table_length = max_hash_value + 1
  new_table_length = next_power_of_2(old_table_length)
  table_padding_len = new_table_length - old_table_length
  # Pad the length table with zeros (a zero length never matches a keyword).
  single_lengthtable_entry = r'\d+'
  out = checked_sub(
      r"""
static\ const\ unsigned\ char\ kPerfectKeywordLengthTable\[\]\s*=\s*\{
(
\s*%(single_lengthtable_entry)s\s*
(?:,\s*%(single_lengthtable_entry)s\s*)*
)
\}
""" % {'single_lengthtable_entry': single_lengthtable_entry},
      r'static const unsigned char kPerfectKeywordLengthTable[%d] = { \1 %s }'
      % (new_table_length, "".join([',0'] * table_padding_len)),
      out,
      flags=re.MULTILINE | re.VERBOSE)
  # Pad the word list with empty IDENTIFIER entries. Each real entry may be
  # preceded by a #line directive pointing into keywords.txt.
  single_wordlist_entry = r"""
(?:\#line\ \d+\ ".*"$\s*)?
\{\s*"[a-z]*"\s*,\s*Token::[A-Z_]+\}
"""
  out = checked_sub(
      r"""
static\ const\ struct\ PerfectKeywordHashTableEntry\ kPerfectKeywordHashTable\[\]\s*=\s*\{
(
\s*%(single_wordlist_entry)s\s*
(?:,\s*%(single_wordlist_entry)s\s*)*
)
\}
""" % {'single_wordlist_entry': single_wordlist_entry},
      r'static const struct PerfectKeywordHashTableEntry kPerfectKeywordHashTable[%d] = {\1 %s }'
      % (new_table_length, "".join(
          [',{"",Token::IDENTIFIER}'] * table_padding_len)),
      out,
      flags=re.MULTILINE | re.VERBOSE)
  # Mask the hash and replace the range check with DCHECKs (the mask makes
  # every key a valid index into the padded tables).
  out = checked_sub(r'Hash\s*\(\s*str,\s*len\s*\)',
                    r'Hash(str, len)&0x%x' % (new_table_length - 1), out)
  out = checked_sub(
      r'if \(key <= MAX_HASH_VALUE\)',
      r'DCHECK_LT(key, arraysize(kPerfectKeywordLengthTable));DCHECK_LT(key, arraysize(kPerfectKeywordHashTable));',
      out)
  return out
def return_token(out):
  """Makes GetToken return a Token::Value instead of a table-entry pointer.

  We want to return the actual token rather than the table entry, so the
  declaration and the definition of GetToken (hence count=2) are rewritten,
  along with both of its return statements.
  """
  # Change the return type of the function. Make it inline too.
  out = checked_sub(
      r'const\s*struct\s*PerfectKeywordHashTableEntry\s*\*\s*((?:PerfectKeywordHash::)?GetToken)',
      r'inline Token::Value \1',
      out,
      count=2)
  # Change the return value when the keyword is found.
  out = checked_sub(r'return &kPerfectKeywordHashTable\[key\];',
                    r'return kPerfectKeywordHashTable[key].value;', out)
  # Change the return value when the keyword is not found.
  out = checked_sub(r'return 0;', r'return Token::IDENTIFIER;', out)
  return out
def memcmp_to_while(out):
  """Replaces gperf's memcmp comparison with a character-by-character loop.

  It's faster to loop over the keyword with a while loop than calling memcmp.
  Careful, this replacement is quite flaky: the pattern is matched verbatim
  (via re.escape) because the equivalent raw regex would be unreadable.
  """
  return checked_sub(
      re.escape("if (*str == *s && !memcmp (str + 1, s + 1, len - 1))") + r"\s*"
      + re.escape("return kPerfectKeywordHashTable[key].value;"),
      """
while(*s!=0) {
if (*s++ != *str++) return Token::IDENTIFIER;
}
return kPerfectKeywordHashTable[key].value;
""",
      out,
      flags=re.MULTILINE)
def wrap_namespace(out):
  """Embeds the gperf output in V8's generated-header boilerplate.

  Adds the copyright banner, a do-not-edit notice, the include guard, the
  token.h include and the v8::internal namespace around |out|.
  """
  header = (
      '// Copyright 2018 the V8 project authors. All rights reserved.\n'
      '// Use of this source code is governed by a BSD-style license that can be\n'
      '// found in the LICENSE file.\n'
      '// This file is automatically generated by gen-keywords-gen-h.py and should not\n'
      '// be modified manually.\n'
      '#ifndef V8_PARSING_KEYWORDS_GEN_H_\n'
      '#define V8_PARSING_KEYWORDS_GEN_H_\n'
      '#include "src/parsing/token.h"\n'
      'namespace v8 {\n'
      'namespace internal {\n')
  footer = ('\n'
            '} // namespace internal\n'
            '} // namespace v8\n'
            '#endif // V8_PARSING_KEYWORDS_GEN_H_\n')
  return header + out + footer
def trim_character_set_warning(out):
  """Shortens gperf's over-long #error message.

  gperf generates an error-message string that is too long for one line, so
  the bug-reporting hint is moved onto a trailing comment line after the
  #error directive.
  """
  long_message = ('"gperf generated tables don\'t work with this execution '
                  'character set. Please report a bug to <bug-gperf@gnu.org>."')
  short_message = ('"gperf generated tables don\'t work with this execution '
                   'character set."\\\n'
                   '// If you see this error, please report a bug to '
                   '<bug-gperf@gnu.org>.')
  return out.replace(long_message, short_message)
def main():
  """Regenerates src/parsing/keywords-gen.h from src/parsing/keywords.txt.

  Runs gperf, munges its output through the helpers above, clang-formats the
  result and writes it to OUTPUT_PATH. Returns a process exit code
  (0 on success, gperf's/clang-format's code on failure).
  """
  try:
    # The script lives in tools/, so the V8 root is one directory up.
    script_dir = os.path.dirname(sys.argv[0])
    root_dir = os.path.join(script_dir, '..')
    out = subprocess.check_output(["gperf", "-m100", INPUT_PATH], cwd=root_dir)
    # And now some munging of the generated file.
    out = change_sizet_to_int(out)
    out = trim_and_dcheck_char_table(out)
    out = use_isinrange(out)
    out = pad_tables(out)
    out = return_token(out)
    out = memcmp_to_while(out)
    out = wrap_namespace(out)
    out = trim_character_set_warning(out)
    # Final formatting.
    clang_format_path = os.path.join(root_dir,
                                     'third_party/depot_tools/clang-format')
    out = call_with_input([clang_format_path], out)
    with open(os.path.join(root_dir, OUTPUT_PATH), 'w') as f:
      f.write(out)
    return 0
  except subprocess.CalledProcessError as e:
    sys.stderr.write("Error calling '{}'\n".format(" ".join(e.cmd)))
    return e.returncode


if __name__ == '__main__':
  sys.exit(main())