mirror of
https://github.com/google/brotli.git
synced 2024-11-21 19:20:09 +00:00
Add tools to download and transform static dictionary data. (#670)
This commit is contained in:
parent
f5ed35d065
commit
a4581c158e
3
scripts/dictionary/README.md
Normal file
3
scripts/dictionary/README.md
Normal file
@ -0,0 +1,3 @@
|
||||
Set of tools that can be used to download brotli RFC, extract and validate
|
||||
binary dictionary, and generate dictionary derivatives
|
||||
(e.g. Java `DictionaryData` class constants).
|
16
scripts/dictionary/step-01-download-rfc.py
Normal file
16
scripts/dictionary/step-01-download-rfc.py
Normal file
@ -0,0 +1,16 @@
|
||||
# Step 01 - download RFC7932.
|
||||
#
|
||||
# RFC is the ultimate source for brotli format and constants, including
|
||||
# static dictionary.
|
||||
|
||||
# Keep the historical module name so the call site reads the same on both
# interpreter generations.
try:
    import urllib.request as urllib2  # Python 3
except ImportError:
    import urllib2  # Python 2

# NOTE(review): confirm this host still serves the RFC text directly.
response = urllib2.urlopen('https://tools.ietf.org/rfc/rfc7932.txt')
try:
    text = response.read()
finally:
    # Release the connection even when the read fails part-way.
    response.close()

path = "rfc7932.txt"

# read() returns raw bytes; binary mode stores them unmodified, so the length
# reported below is exactly what lands on disk (no newline translation).
with open(path, "wb") as rfc:
    rfc.write(text)

print("Downloaded and saved " + str(len(text)) + " bytes to " + path)
|
34
scripts/dictionary/step-02-rfc-to-bin.py
Normal file
34
scripts/dictionary/step-02-rfc-to-bin.py
Normal file
@ -0,0 +1,34 @@
|
||||
# Step 02 - parse RFC.
|
||||
#
|
||||
# Static dictionary is described in "Appendix A" section in a hexadecimal form.
|
||||
# This tool locates dictionary data in RFC and converts it to raw binary format.
|
||||
|
||||
import re
|
||||
|
||||
rfc_path = "rfc7932.txt"

with open(rfc_path, "r") as rfc:
    lines = rfc.readlines()

# A data line carries 64 lowercase hex digits, i.e. 32 dictionary bytes.
# NOTE(review): the leading whitespace of this pattern has to match the
# indentation used in the RFC text exactly - confirm it against the RFC.
re_data_line = re.compile("^ [0-9a-f]{64}$")

# Number of bytes to collect before parsing stops.
dictionary_size = 122784

line_iter = iter(lines)

# any() stops consuming the iterator right after the first line that starts
# the appendix, so the loop below only sees the lines that follow it. When the
# heading is missing the iterator is exhausted and nothing is parsed.
appendix_a_found = any(line.startswith("Appendix A.") for line in line_iter)

dictionary = []
for line in line_iter:
    if re_data_line.match(line) is None:
        continue
    data = line.strip()
    # Decode the line two hex digits (one byte) at a time.
    dictionary.extend(int(data[pos:pos + 2], 16) for pos in range(0, 64, 2))
    if len(dictionary) == dictionary_size:
        break

bin_path = "dictionary.bin"

with open(bin_path, "wb") as output:
    output.write(bytearray(dictionary))

print("Parsed and saved " + str(len(dictionary)) + " bytes to " + bin_path)
|
38
scripts/dictionary/step-03-validate-bin.py
Normal file
38
scripts/dictionary/step-03-validate-bin.py
Normal file
@ -0,0 +1,38 @@
|
||||
# Step 03 - validate raw dictionary file.
|
||||
#
|
||||
# CRC32, MD5, SHA1 and SHA256 checksums for raw binary dictionary are checked.
|
||||
|
||||
import hashlib
|
||||
import zlib
|
||||
|
||||
bin_path = "dictionary.bin"

# Load the whole file as raw bytes; binary mode keeps every byte unmodified.
with open(bin_path, "rb") as raw:
    data = raw.read()
|
||||
|
||||
def check_digest(name, expected, actual):
    """Compare a computed digest with its reference value and report it.

    Prints "[OK] <name>" when the two strings are equal, otherwise
    "[ERROR] <name> | <expected> != <actual>".

    Args:
        name: label of the checksum, used in the printed message.
        expected: reference digest, as a string.
        actual: computed digest, as a string.

    Returns:
        True when the digests are equal, False otherwise, so a caller can
        react to a mismatch instead of relying on the printed text alone.
    """
    matches = expected == actual
    if matches:
        print("[OK] " + name)
    else:
        print("[ERROR] " + name + " | " + expected + " != " + actual)
    return matches
|
||||
|
||||
|
||||
# zlib.crc32 can return a signed value on Python 2; masking with 0xffffffff
# yields the same unsigned number on every version. The fixed-width format
# keeps leading zeros, which hex() would drop.
check_digest(
    "CRC32",  # This is the only checksum provided in RFC.
    "0x5136cb04",
    "0x%08x" % (zlib.crc32(data) & 0xffffffff))

check_digest(
    "MD5",
    "96cecd2ee7a666d5aa3627d74735b32a",
    hashlib.md5(data).hexdigest())

check_digest(
    "SHA1",
    "72b41051cb61a9281ba3c4414c289da50d9a7640",
    hashlib.sha1(data).hexdigest())

check_digest(
    "SHA256",
    "20e42eb1b511c21806d4d227d07e5dd06877d8ce7b3a817f378f313653f35c70",
    hashlib.sha256(data).hexdigest())
|
79
scripts/dictionary/step-04-generate-java-literals.py
Normal file
79
scripts/dictionary/step-04-generate-java-literals.py
Normal file
@ -0,0 +1,79 @@
|
||||
# Step 04 - generate Java literals.
|
||||
#
|
||||
# Java byte-code has ridiculous restrictions. There is no such thing as
|
||||
# "array literal" - those are implemented as series of data[x] = y;
|
||||
# as a consequence N-byte array will use 7N bytes in class, plus N bytes
|
||||
# in instantiated variable. Also no literal could be longer than 64KiB.
|
||||
#
|
||||
# To keep dictionary data compact both in source code and in compiled format
|
||||
# we use the following tricks:
|
||||
# * use String as a data container
|
||||
# * store only lowest 7 bits; i.e. all characters fit ASCII table; this allows
|
||||
# efficient conversion to byte array; also ASCII characters use only 1 byte
|
||||
#   of memory (UTF-8 encoding)
|
||||
# * RLE-compress sequence of 8-th bits
|
||||
#
|
||||
# This script generates literals used in Java code.
|
||||
|
||||
bin_path = "dictionary.bin"

with open(bin_path, "rb") as raw:
    data = raw.read()

# Python 3 has no unichr(); its chr() already accepts any code point.
try:
    unichr
except NameError:
    unichr = chr

# low: one character per input byte, holding the byte's lowest 7 bits.
low = []
# hi: run lengths of the 8-th bit, one character per run. Runs alternate
# between "skip" (bit clear) and "flip" (bit set), starting with "skip";
# each character's code is skip_flip_offset plus the run length.
hi = []
is_skip = True
skip_flip_offset = 36
cntr = skip_flip_offset
# bytearray iteration yields integers on both Python 2 and Python 3.
for value in bytearray(data):
    low.append(chr(value & 0x7F))
    high_bit_set = value >= 0x80
    if high_bit_set == is_skip:
        # This byte does not belong to the current run: emit the finished
        # run and start the opposite one, already counting this byte.
        hi.append(unichr(cntr))
        is_skip = not is_skip
        cntr = skip_flip_offset + 1
    else:
        cntr += 1
# Emit the run that was still open when the data ended.
hi.append(unichr(cntr))

# Split the 7-bit payload into two halves.
half = len(low) // 2
low0 = low[:half]
low1 = low[half:]
|
||||
|
||||
def escape(chars):
    """Escape characters so they can be placed inside a Java string literal.

    Args:
        chars: iterable of single-character strings.

    Returns:
        A list with one string per input character: a short backslash escape
        for CR, LF, TAB, double quote and backslash; a "\\uXXXX" escape for
        any other character with code below 32 or at least 127; and the
        character itself otherwise.
    """
    short_escapes = {
        "\r": "\\r",
        "\n": "\\n",
        "\t": "\\t",
        "\"": "\\\"",
        "\\": "\\\\",
    }
    result = []
    for c in chars:
        if c in short_escapes:
            result.append(short_escapes[c])
        elif ord(c) < 32 or ord(c) >= 127:
            result.append("\\u%04X" % ord(c))
        else:
            result.append(c)
    return result
|
||||
|
||||
|
||||
# Each constant is emitted as three pieces: the declaration up to the opening
# quote, the escaped payload, and the closing quote with the terminator.
source_code = [
    " private static final String DATA0 = \"", "".join(escape(low0)), "\";\n",
    " private static final String DATA1 = \"", "".join(escape(low1)), "\";\n",
    " private static final String SKIP_FLIP = \"", "".join(escape(hi)), "\";\n",
]

src_path = "DictionaryData.inc.java"

with open(src_path, "w") as source:
    source.write("".join(source_code))
|
Loading…
Reference in New Issue
Block a user