mirror of
https://github.com/google/brotli.git
synced 2025-01-01 12:50:10 +00:00
2368 lines
90 KiB
Python
2368 lines
90 KiB
Python
#!python3
|
|
"""Program to dump contents of Brotli compressed files showing the compression format.
Jurjen N.E. Bos, 2016.
I found the following issues with the Brotli format:
- The distance alphabet has size 16+(48<<POSTFIX),
  but the last symbols are useless.
  It could be lowered to 16+(44-POSTFIX<<POSTFIX), and this could matter.
- The block type code is useless if NBLTYPES==2, you would only need 1 symbol
  anyway, so why don't you just switch to "the other" type?
"""
|
|
import struct
|
|
from operator import itemgetter, methodcaller
|
|
from itertools import accumulate, repeat
|
|
from collections import defaultdict, deque
|
|
from functools import partial
|
|
|
|
#file name of the dictionary; presumably the Brotli static dictionary that is
#read elsewhere in this file -- TODO confirm against the code that opens it
DICTIONARY_PATH = 'dictionary.bin'
|
|
|
|
class InvalidStream(Exception):
    """Raised when the compressed stream contains a value the format forbids.
    """
|
|
#lookup table
#names of the three symbol categories; the second name is "insert&copy"
#(the "&copy" part had been turned into a copyright sign by HTML entity decoding)
L, I, D = "literal", "insert&copy", "distance"
#the same names with a 'P' prefix
pL, pI, pD = 'P'+L, 'P'+I, 'P'+D
|
|
|
|
def outputCharFormatter(c):
    """Show character in readable format

    c is a character code (an integer).
    Newline, carriage return and space get a fixed escaped spelling,
    codes 33..126 are shown as the character itself,
    and everything else is shown as a two digit hex escape.
    """
    #TODO 2: allow hex only output
    fixedSpellings = {10: '\\n', 13: '\\r', 32: '" "'}
    if c in fixedSpellings:
        return fixedSpellings[c]
    if 32 < c < 127:
        return chr(c)
    return '\\x{:02x}'.format(c)
|
|
|
|
def outputFormatter(s):
    """Show string or char.

    s is a sequence of character codes (e.g. a bytes object).
    Every code is rendered with outputCharFormatter, except that a space
    is shown as a plain space.
    Sequences of 200 or more codes are abbreviated to their first 100 and
    last 100 codes with '...' in between.
    """
    def formatSubString(part):
        for c in part:
            yield ' ' if c==32 else outputCharFormatter(c)
    #the length test must look at the input: the original tested an always
    #empty helper string, so long inputs were never abbreviated
    if len(s)<200:
        return ''.join(formatSubString(s))
    head = ''.join(formatSubString(s[:100]))
    tail = ''.join(formatSubString(s[-100:]))
    return head+'...'+tail
|
|
|
|
|
|
class BitStream:
    """Bit level reader on top of a bytes object.
    Reads bits (least significant bit of each byte first) and whole bytes,
    the way Brotli does.
    """
    def __init__(self, byteString):
        self.data = byteString
        #position in bits: byte pos is pos>>3, bit pos is pos&7
        self.pos = 0

    def __repr__(self):
        """Representation: byte position in hex, then bit position
        >>> olleke
        BitStream(pos=0:0)
        """
        bytePos, bitPos = divmod(self.pos, 8)
        return "BitStream(pos={:x}:{})".format(bytePos, bitPos)

    def read(self, n):
        """Read n bits from the stream and return as an integer.
        The position is advanced by n bits; if that moves it beyond the
        end of the data, ValueError is raised (the position stays advanced).
        >>> olleke.data[0]==27
        True
        >>> olleke.read(5)
        27

        >>> olleke
        BitStream(pos=0:5)
        """
        bits = self.peek(n)
        self.pos += n
        totalBits = 8*len(self.data)
        if self.pos>totalBits:
            raise ValueError('Read past end of stream')
        return bits

    def peek(self, n):
        """Peek an n bit integer from the stream without updating the pointer.
        It is not an error to read beyond the end of the stream:
        the missing bits are simply zero.
        >>> olleke.data[:2]==b'\x1b\x2e' and 0x2e1b==11803
        True
        >>> olleke.peek(15)
        11803
        >>> hex(olleke.peek(32))
        '0x2e1b'
        """
        #bytes that hold the requested bits
        firstByte = self.pos>>3
        endByte = (self.pos+n+7)>>3
        #interpret them as one little endian number
        window = int.from_bytes(self.data[firstByte:endByte], 'little')
        #drop the bits of the first byte that lie before the position
        window >>= self.pos&7
        #keep only the lowest n bits
        return window & ((1<<n)-1)

    def readBytes(self, n):
        """Read n bytes from the stream on a byte boundary.
        Raises ValueError when the position is not on a byte boundary.
        The result is shorter than n when the data runs out.
        """
        if self.pos&7:
            raise ValueError('readBytes: need byte boundary')
        start = self.pos>>3
        self.pos += 8*n
        return self.data[start:start+n]
|
|
|
|
#-----------------------Symbol-------------------------------------------
|
|
class Symbol:
    """One symbol of a code.
    Keeps a reference to the code it belongs to,
    and its index: the place of the symbol in the alphabet.
    Nearly all methods forward to the routine of the same name in the code.
    """
    __slots__ = 'code', 'index'

    def __init__(self, code, value):
        self.code, self.index = code, value

    def __repr__(self):
        return 'Symbol({}, {})'.format(self.code.name, self.index)

    def __len__(self):
        """Number of bits in the prefix notation of this symbol
        """
        owner = self.code
        return owner.length(self.index)

    def __int__(self):
        return self.index

    #these routines call equivalent routine in Code class
    def bitPattern(self):
        """Value of the symbol in the stream
        """
        owner = self.code
        return owner.bitPattern(self.index)

    def extraBits(self):
        """Number of extra bits to read for this symbol
        """
        owner = self.code
        return owner.extraBits(self.index)

    def __str__(self):
        """Short descriptor of the symbol without extra bits.
        """
        owner = self.code
        return owner.mnemonic(self.index)

    #requiring optional extra bits, if self.code supports them
    def value(self, extra=None):
        """The value used for processing. Can be a tuple.
        with optional extra bits
        For a code without extra bits, extra must be None.
        For a WithExtra code, extra must fit in the symbol's extra bits.
        Raises ValueError otherwise.
        """
        if not isinstance(self.code, WithExtra):
            if extra is not None:
                raise ValueError('value: no extra bits for this code')
            return self.code.value(self.index)
        if not 0<=extra<1<<self.extraBits():
            raise ValueError("value: extra value doesn't fit in extraBits")
        return self.code.value(self.index, extra)

    def explanation(self, extra=None):
        """Long explanation of the value from the numeric value
        with optional extra bits
        Used by Layout.verboseRead when printing the value
        The text is produced by the callback of the code;
        extra is only passed along for WithExtra codes.
        """
        if not isinstance(self.code, WithExtra):
            return self.code.callback(self)
        return self.code.callback(self, extra)
|
|
|
|
#========================Code definitions==================================
|
|
class RangeDecoder:
    """Decoder for the Code class for symbols that are simply
    numbered consecutively in binary, all with the same number of bits.
    Everything follows from the "alphabetSize" property:
    the symbols run from 0 to alphabetSize-1.
    This is the default decoder.
    """
    def __init__(self, *, alphabetSize=None, bitLength=None, **args):
        #a bit length, when given, determines the alphabet size
        if bitLength is not None:
            alphabetSize = 1<<bitLength
        if alphabetSize is None:
            #nothing known yet; attributes stay unset
            return
        self.alphabetSize = alphabetSize
        self.maxLength = (alphabetSize-1).bit_length()

    def __len__(self):
        return self.alphabetSize

    def __iter__(self):
        """Produce all symbols.
        """
        return (Symbol(self, index) for index in range(len(self)))

    def __getitem__(self, index):
        if index>=self.alphabetSize:
            raise ValueError('index out of range')
        return Symbol(self, index)

    def bitPattern(self, index):
        """The index in binary, zero padded to maxLength digits.
        """
        return format(index, '0{}b'.format(self.maxLength))

    def length(self, index):
        """Encoding length of given symbol.
        Does not depend on index in this case.
        """
        return self.maxLength

    def decodePeek(self, data):
        """Find which symbol index matches the given data (from peek, as a number)
        and return the number of bits decoded.
        Can also be used to figure out length of a symbol.
        Returns (number of bits, symbol); the index is the lowest
        maxLength bits of data.
        """
        width = self.maxLength
        lowBits = data & ((1<<width)-1)
        return width, Symbol(self, lowBits)
|
|
|
|
class PrefixDecoder:
    """A decoder for the Code class that uses a prefix code.
    The code is determined by the decode table:
    decodeTable[p] gives the index corresponding to bit pattern p
    (p is the pattern as it appears in the stream, first bit lowest).
    Use setDecode(decodeTable) to switch the decoder from the default
    to a prefix decoder, or pass decodeTable at init.
    You can also use setLength(lengthTable)
    to define the encoding from the lengths.
    The set of symbol values does not need to be consecutive.
    """
    def __init__(self, *, decodeTable=None, **args):
        if decodeTable is not None:
            self.setDecode(decodeTable)

    def __len__(self):
        return len(self.decodeTable)

    def __iter__(self):
        """Produce all symbols, ordered by their bit pattern read backwards.
        """
        def reversedPattern(index):
            return self.bitPattern(index)[::-1]
        ordered = sorted(self.decodeTable.values(), key=reversedPattern)
        return (Symbol(self, index) for index in ordered)

    def __getitem__(self, index):
        if index not in self.lengthTable:
            message = 'No symbol {}[{}]'.format(self.__class__.__name__, index)
            raise ValueError(message)
        return Symbol(self, index)

    def bitPattern(self, index):
        """Bit pattern of the symbol as a binary string of its code length.
        """
        pattern = next(
            bits for (bits, symbol) in self.decodeTable.items()
            if symbol==index)
        return format(pattern, '0{}b'.format(self.length(index)))

    def length(self, index):
        """Encoding length of given symbol.
        """
        return self.lengthTable[index]

    def decodePeek(self, data):
        """Find which symbol index matches the given data (from peek, as a number)
        and return the number of bits decoded.
        Can also be used to figure out length of a symbol.
        """
        #do binary search for word length
        #invariant: lo<=length<=hi
        lo, hi = self.minLength, self.maxLength
        while lo<=hi:
            mid = (lo+hi)>>1
            #lets see what happens if we guess length is mid
            lowBits = data & ((1<<mid)-1)
            try:
                index = self.decodeTable[lowBits]
            except KeyError:
                #too many bits specified, reduce estimated length
                hi = mid-1
            else:
                #we found a symbol, but there could be a longer match
                symbolLength = self.lengthTable[index]
                if symbolLength<=mid:
                    #all bits match, symbol must be right
                    return symbolLength, Symbol(self, index)
                #there must be more bits to match
                lo = mid+1
        #search interval exhausted: report the last symbol that was found
        return lo, Symbol(self, index)

    #routine to set up the tables
    def setDecode(self, decodeTable):
        """Store decodeTable,
        and compute lengthTable, minLength, maxLength from encodings.
        """
        self.decodeTable = decodeTable
        #bit patterns whose length is not known yet
        pending = set(decodeTable)
        lengthTable = {}
        #bit size under investigation
        maskLength = 0
        while pending:
            mask = (1<<maskLength)-1
            #group the remaining patterns by their lowest maskLength bits
            groups = defaultdict(list)
            for pattern in pending:
                groups[pattern&mask].append(pattern)
            #a group of one means that pattern is recognized with maskLength
            #bits: record the length and stop considering it
            for lowBits, members in groups.items():
                if len(members)==1:
                    lengthTable[self.decodeTable[lowBits]] = maskLength
                    pending.remove(lowBits)
            #now investigate with longer mask
            maskLength += 1
        #save result
        self.lengthTable = lengthTable
        lengths = lengthTable.values()
        self.minLength = min(lengths)
        self.maxLength = max(lengths)
        self.switchToPrefix()

    def setLength(self, lengthTable):
        """Given the bit pattern lengths for symbols given in lengthTable,
        set decodeTable, minLength, maxLength
        """
        self.lengthTable = lengthTable
        allLengths = list(lengthTable.values())
        self.minLength = min(allLengths)
        self.maxLength = max(allLengths)
        #compute the backwards codes first; then reverse them
        #first (backwards) code for every separate length
        nextCodes = []
        code = 0
        for bits in range(self.maxLength+1):
            code <<= 1
            nextCodes.append(code)
            #skip the codes that are used up by symbols of this length
            code += allLengths.count(bits)
        table = self.decodeTable = {}
        #hand out codes in symbol order, and store them reversed in the table
        for symbol in sorted(lengthTable):
            bits = lengthTable[symbol]
            backwards = format(nextCodes[bits], '0{}b'.format(bits))
            table[int(backwards[::-1], 2)] = symbol
            nextCodes[bits] += 1
        self.switchToPrefix()

    def switchToPrefix(self):
        """This routine makes sure the prefix decoder is activated.
        """
        self.mode = PrefixDecoder
|
|
|
|
class Code(RangeDecoder, PrefixDecoder):
    """An alphabet of symbols, that can be read from a stream.
    If you use setDecode or setLength, you have a prefix code,
    otherwise you have a range code.
    Features:
    code[index] produces symbol with given index
    value(index): value of symbol
    mnemonic(index): short description of symbol
    explanation(index): show meaning of symbol, shown in Layout.verboseRead
    iter(code): produce all symbols in some order
    name: show as context in Layout.verboseRead
    """
    name = '?'
    #callback is a function that gets the symbol and the extra bits
    #default callback calls explanation
    def __init__(self, name=None, *, callback=None, description='', **args):
        """Don't forget to set either alphabetSize or decodeTable

        name: overrides the class level name when given
        callback: replaces the default callback when given
        description: text used by the default explanation
        args: bitLength or alphabetSize select the range decoder,
        decodeTable selects the prefix decoder
        """
        #set name when provided, otherwise take class variable
        if name is not None: self.name = name
        if callback is not None: self.callback = callback
        self.description = description
        #mode switch
        if 'bitLength' in args or 'alphabetSize' in args:
            self.mode = RangeDecoder
            RangeDecoder.__init__(self, **args)
        elif 'decodeTable' in args:
            self.mode = PrefixDecoder
            PrefixDecoder.__init__(self, **args)
        else:
            #no mode yet: setDecode or setLength selects the prefix decoder later
            super().__init__(**args)

    def __repr__(self):
        return self.__class__.__name__+' '+self.name

    #the routines that get switched between RangeDecoder and PrefixDecoder
    def __len__(self): return self.mode.__len__(self)
    def __iter__(self): return self.mode.__iter__(self)
    def __getitem__(self, index): return self.mode.__getitem__(self, index)
    def bitPattern(self, index): return self.mode.bitPattern(self, index)
    def length(self, index): return self.mode.length(self, index)
    def decodePeek(self, data): return self.mode.decodePeek(self, data)

    #general routines
    def value(self, index, extra=None):
        """Get value of symbol for computations.
        Override where needed.
        Raises ValueError when extra is given: this code has no extra bits.
        """
        if extra is not None:
            raise ValueError('value: no extra for this symbol')
        return index

    def mnemonic(self, index):
        """Give mnemonic of symbol.
        Override where needed.
        """
        return str(self.value(index))

    def callback(self, symbol):
        """Default callback: the explanation of the symbol.
        """
        return self.explanation(symbol.index)

    def explanation(self, index):
        """Long explanation of the value from the numeric value
        This is a default routine.
        You can customize in three ways:
        - set description to add some text
        - override to get more control
        - set callback to make it dependent on you local variables
        """
        value = self.value(index)
        return '{0}{1}: {2}'.format(
            self.description and self.description+': ',
            self.bitPattern(index),
            value,
            )

    def extraBits(self, index):
        """Number of extra bits of the symbol: none for a plain code.
        """
        return 0

    #Routines that use the decode interface
    def showCode(self, width=80):
        """Show all words of the code in a nice format.

        Prints the symbols as "bitpattern:mnemonic" cells, filled column by
        column, using as many columns as fit in lines of width characters
        (always at least one column).
        """
        #make table of all symbols with binary strings
        symbolStrings = [
            (self.bitPattern(s.index), self.mnemonic(s.index))
            for s in self
            ]
        #widest bit pattern and widest mnemonic determine the cell size
        leftColWidth = max(len(bits) for bits, _ in symbolStrings)
        rightColWidth = max(len(text) for _, text in symbolStrings)
        colwidth = leftColWidth+rightColWidth
        #a cell takes colwidth+1 characters plus one separating space, and the
        #last cell needs no separator; honour width (this was a fixed 81 before)
        #and never go below one column, to avoid dividing by zero
        columns = max(1, (width+1)//(colwidth+2))
        #number of rows, rounded up
        rows = -(-len(symbolStrings)//columns)
        def justify(bs):
            b,s = bs
            return b.rjust(leftColWidth)+':'+s.ljust(rightColWidth)
        for i in range(rows):
            print(' '.join(map(justify, symbolStrings[i::rows])).rstrip())

    def readTuple(self, stream):
        """Read symbol from stream. Returns length, symbol.
        """
        length, symbol = self.decodePeek(stream.peek(self.maxLength))
        stream.pos += length
        return length, symbol

    def readTupleAndExtra(self, stream):
        """Same as readTuple, followed by (0, None) since there are no extra bits.
        """
        return self.readTuple(stream)+(0, None)
|
|
|
|
class WithExtra(Code):
    """Extension for Code so that symbol may have extra bits associated.
    You can define an extraTable,
    which allows to call extraBits to get the number of extraBits.
    Otherwise, you can supply extraBits yourself.
    Routine readTupleAndExtra now reads the extra bits too.
    Value probably needs to be overridden; see Enumerator.
    Note: this does not give you an decodeTable.
    """
    #redefine these if you don't want to use an extraTable
    def extraBits(self, index):
        """Get the number of extra bits for this symbol.
        """
        return self.extraTable[index]

    def mnemonic(self, index):
        """This value must be independent of extra.
        """
        return str(index)

    def readTupleAndExtra(self, stream):
        """Read symbol and extrabits from stream.
        Returns symbol length, symbol, extraBits, extra
        >>> olleke.pos = 6
        >>> MetablockLengthAlphabet().readTupleAndExtra(olleke)
        (2, Symbol(MLEN, 4), 16, 46)
        """
        peeked = stream.peek(self.maxLength)
        length, symbol = self.decodePeek(peeked)
        stream.pos += length
        extraBits = self.extraBits(symbol.index)
        extra = stream.read(extraBits)
        return length, symbol, extraBits, extra

    def explanation(self, index, extra=None):
        """Expanded version of Code.explanation supporting extra bits.
        If you don't supply extra, it is not mentioned.
        """
        extraBits = 0 if extra is None else self.extraBits(index)
        if not hasattr(self, 'extraTable'):
            #no table, so no range to show: just the value
            formatString = '{0}{3}'
            value = self.value(index, extra)
            lo = hi = value
        else:
            lo, hi = self.span(index)
            if extraBits==0:
                formatString = '{0}{2}: {3}'
                value = lo
            else:
                formatString = '{0}{1} {2}: {3}-{4}; {3}+{5}={6}'
                value = lo+extra
        prefix = self.description and self.description+': '
        return formatString.format(
            prefix,
            'x'*extraBits,
            self.bitPattern(index),
            lo, hi,
            extra,
            value,
            )

    def callback(self, symbol, extra):
        """Default callback: the explanation of the symbol with its extra bits.
        """
        return self.explanation(symbol.index, extra)
|
|
|
|
class BoolCode(Code):
    """One bit code, like Code(bitLength=1), whose value is a boolean.
    """
    def __init__(self, name=None, **args):
        super().__init__(name, bitLength=1, **args)

    def value(self, index, extra=None):
        """The value of the plain code, converted to bool.
        """
        plain = super().value(index, extra)
        return bool(plain)
|
|
|
|
class Enumerator(WithExtra):
    """Code that is defined by the ExtraTable.
    extraTable is a class variable that contains
    the extraBits of the symbols from 0
    value0 contains the value of symbol 0
    encodings is not necessary, but allowed.
    Note: place for FixedCode to make sure extraBits works
    """
    def __init__(self, name=None, **args):
        #if there is no decodeTable to determine length, compute it ourselves
        if 'decodeTable' not in args:
            args['alphabetSize'] = len(self.extraTable)
        super().__init__(name, **args)

    def __len__(self):
        return len(self.extraTable)

    def __getitem__(self, index):
        """Faster than PrefixDecoder
        """
        if index>=len(self.extraTable):
            message = "No symbol {}[{}]".format(self.__class__.__name__, index)
            raise ValueError(message)
        return Symbol(self, index)

    def value(self, index, extra):
        """Override if you don't define value0 and extraTable

        The value is the lower end of the span plus extra
        (a missing or zero extra counts as 0).
        Raises ValueError when the result lies beyond the span.
        """
        lower, upper = self.span(index)
        result = lower+(extra or 0)
        if result>upper:
            raise ValueError('value: extra out of range')
        return result

    def span(self, index):
        """Give the range of possible values in a tuple
        Useful for mnemonic and explanation
        """
        #every earlier symbol takes up 2**extraBits values, starting at value0
        lower = self.value0+sum(1<<bits for bits in self.extraTable[:index])
        size = 1<<self.extraTable[index]
        return lower, lower+size-1
|
|
|
|
#======================Code subclasses======================================
|
|
#Alphabets used in the metablock header----------------------------------
|
|
#For prefix codes
|
|
class PrefixCodeHeader(WithExtra):
    """Header of prefix codes: two bits, with two extra bits for symbol 1.
    """
    def __init__(self, codename):
        super().__init__('PFX', bitLength=2)
        #this is the name of the code that it describes
        self.codename = codename

    def extraBits(self, index):
        if index==1:
            return 2
        return 0

    def value(self, index, extra):
        """Returns ('Simple', #codewords) or ('Complex', HSKIP)
        """
        if index!=1:
            if extra:
                raise ValueError('value: extra out of range')
            return 'Complex', index
        if extra>3:
            raise ValueError('value: extra out of range')
        return 'Simple', extra+1

    def explanation(self, index, extra):
        if index==1:
            plural = 's' if extra else ''
            return '{} is simple with {} code word{}'.format(
                self.codename, extra+1, plural)
        lengths = [1, 2, 3, 4, 0, 5, 17, 6]
        #show the five entries starting at position index
        shown = ','.join(map(str, lengths[index:index+5]))
        return '{} is complex with lengths {}...'.format(self.codename, shown)
|
|
|
|
class TreeShapeAlhabet(BoolCode):
    """The bit used to indicate if four word code is "deep" or "wide"
    """
    name = 'SHAPE'

    def value(self, index):
        """The four code lengths selected by the bit.
        """
        shapes = [(2,2,2,2), (1,2,3,3)]
        return shapes[index]

    def explanation(self, index):
        lengths = self.value(index)
        return str(bool(index))+': lengths {},{},{},{}'.format(*lengths)
|
|
|
|
class LengthOfLengthAlphabet(Code):
    """For use in decoding complex code descriptors.
    >>> lengthOfLengthAlphabet = LengthOfLengthAlphabet('')
    >>> print(lengthOfLengthAlphabet[2])
    coded with 2 bits
    >>> len(lengthOfLengthAlphabet[0])
    2
    >>> [len(lengthOfLengthAlphabet[x]) for x in range(6)]
    [2, 4, 3, 2, 2, 4]
    >>> lengthOfLengthAlphabet.showCode()
      00:skipped             01:coded with 4 bits 0111:coded with 1 bits
      10:coded with 3 bits  011:coded with 2 bits 1111:coded with 5 bits
    """
    #bit pattern -> symbol
    decodeTable = {
        0b00: 0,
        0b0111: 1,
        0b011: 2,
        0b10: 3,
        0b01: 4,
        0b1111: 5,
        }

    def __init__(self, name=None, **args):
        super().__init__(name, decodeTable=self.decodeTable, **args)

    def mnemonic(self, index):
        if index!=0:
            return 'coded with {} bits'.format(index)
        return 'skipped'

    def explanation(self, index, extra=None):
        """The description followed by the mnemonic; extra is ignored.
        """
        return self.description+': '+self.mnemonic(index)
|
|
|
|
class LengthAlphabet(WithExtra):
    """Length of symbols
    Used during construction of a code.
    Has 18 symbols; symbols 16 and 17 carry 2 and 3 extra bits.
    """
    def __init__(self, name):
        super().__init__(name, alphabetSize=18)

    def extraBits(self, index):
        if index==16:
            return 2
        if index==17:
            return 3
        return 0

    def mnemonic(self, index):
        special = {0: 'unused', 16: 'rep xx', 17: 'zero xxx'}
        if index in special:
            return special[index]
        return 'len {}'.format(index)

    def explanation(self, index, extra):
        """Fill the symbol and the extra bits into the description.
        """
        symbol = self[index]
        return self.description.format(symbol, extra)

    def value(self, index, extra):
        #the caller got the length already, so extra is enough
        return extra
|
|
|
|
#Stream header
|
|
class WindowSizeAlphabet(Code):
    """The alphabet used for window size in the stream header.
    >>> WindowSizeAlphabet()[10].explanation()
    'windowsize=(1<<10)-16=1008'
    """
    #bit pattern -> number of window bits; the pattern that maps to None
    #has no window size
    decodeTable = {
        0b0100001: 10, 0b1100001: 14, 0b0011: 18, 0b1011: 22,
        0b0110001: 11, 0b1110001: 15, 0b0101: 19, 0b1101: 23,
        0b1000001: 12, 0b0: 16, 0b0111: 20, 0b1111: 24,
        0b1010001: 13, 0b0000001: 17, 0b1001: 21,
        0b0010001: None,
        }

    name = 'WSIZE'

    def __init__(self, name=None):
        super().__init__(name, decodeTable=self.decodeTable)

    def value(self, index):
        """Window size (1<<index)-16, or None for the pattern without a size.
        """
        #missing value gives index None
        if index is None: return None
        return (1<<index)-16

    def explanation(self, index):
        """Text showing how the window size follows from the symbol.

        The pattern without a window size (index None) gets a text of its
        own, in line with value(); it used to fail with a TypeError.
        """
        if index is None:
            return 'windowsize: reserved pattern, no window size'
        return 'windowsize=(1<<{})-16={}'.format(
            index, (1<<index)-16)
|
|
|
|
#Metablock
|
|
class MetablockLengthAlphabet(WithExtra):
    """Used for the meta block length;
    also indicates a block with no data
    >>> metablockLengthAlphabet = MetablockLengthAlphabet()
    >>> metablockLengthAlphabet[0]; str(metablockLengthAlphabet[0])
    Symbol(MLEN, 0)
    'empty'
    >>> metablockLengthAlphabet[3]
    Traceback (most recent call last):
    ...
    ValueError: No symbol MetablockLengthAlphabet[3]
    >>> print(metablockLengthAlphabet[4])
    hhhh00
    >>> metablockLengthAlphabet[4].value(0x1000)
    4097
    >>> metablockLengthAlphabet[5].value(0x1000)
    Traceback (most recent call last):
    ...
    InvalidStream: Zeros in high nibble of MLEN
    >>> metablockLengthAlphabet[5].explanation(0x12345)
    'data length: 12345h+1=74566'
    >>> metablockLengthAlphabet.showCode()
    00:hhhh00   10:hhhhhh10 01:hhhhh01  11:empty
    """
    #bit pattern -> number of nibbles (hex digits) that follow
    decodeTable = {0b11:0, 0b00:4, 0b01:5, 0b10:6}

    name = 'MLEN'

    def __init__(self, name=None):
        super().__init__(name, decodeTable=self.decodeTable)

    def extraBits(self, index):
        #index counts nibbles
        return index*4

    def mnemonic(self, index):
        if index==0:
            return 'empty'
        nibbles = self.extraBits(index)//4
        return 'h'*nibbles+self.bitPattern(index)

    def value(self, index, extra):
        """The length: extra+1, or 0 for the empty block.
        Raises ValueError when extra does not fit in the extra bits,
        and InvalidStream when more than 4 nibbles are used
        while the highest nibble is zero.
        """
        extraBits = self.extraBits(index)
        if not 0<=extra<1<<extraBits:
            raise ValueError('value: extra out of range')
        if index==0:
            return 0
        if index>4 and (extra>>(extraBits-4))==0:
            raise InvalidStream('Zeros in high nibble of MLEN')
        return extra+1

    def explanation(self, index, extra):
        if index==0:
            return '11: empty block'
        hexDigits = self.extraBits(index)//4
        return 'data length: {:0{}x}h+1={}'.format(extra, hexDigits, extra+1)
|
|
|
|
|
|
class ReservedAlphabet(BoolCode):
    """The reserved bit that must be zero.
    """
    name = 'RSVD'

    def value(self, index):
        """Check the bit: raises ValueError when it is set, else gives None.
        """
        if index:
            raise ValueError('Reserved bit is not zero')
        return None

    def explanation(self, index):
        return 'Reserved (must be zero)'
|
|
|
|
class FillerAlphabet(Code):
    """The bits between stream position streamPos and the next byte boundary.
    """
    def __init__(self, *, streamPos):
        #number of bits needed to reach a multiple of 8
        fillBits = (-streamPos)&7
        super().__init__('SKIP', bitLength=fillBits)

    def explanation(self, index):
        count = self.length(index)
        plural = '' if count==1 else 's'
        return '{} bit{} ignored'.format(count, plural)
|
|
|
|
class SkipLengthAlphabet(WithExtra):
    """Used for the skip length in an empty metablock
    >>> skipLengthAlphabet = SkipLengthAlphabet()
    >>> skipLengthAlphabet[0]; str(skipLengthAlphabet[0])
    Symbol(SKIP, 0)
    'empty'
    >>> skipLengthAlphabet[4]
    Traceback (most recent call last):
    ...
    ValueError: index out of range
    >>> print(skipLengthAlphabet[3])
    hhhhhh11
    >>> skipLengthAlphabet[2].value(0x1000)
    4097
    >>> skipLengthAlphabet[3].value(0x1000)
    Traceback (most recent call last):
    ...
    InvalidStream: Zeros in high byte of SKIPBYTES
    >>> skipLengthAlphabet[3].explanation(0x12345)
    'skip length: 12345h+1=74566'
    >>> skipLengthAlphabet.showCode()
    00:empty    01:hh01     10:hhhh10   11:hhhhhh11
    """
    def __init__(self):
        super().__init__('SKIP', bitLength=2)

    def extraBits(self, index):
        #index counts bytes
        return index*8

    def mnemonic(self, index):
        if index==0:
            return 'empty'
        nibbles = self.extraBits(index)//4
        return 'h'*nibbles+self.bitPattern(index)

    def value(self, index, extra):
        """The skip length: extra+1, or 0 when there is nothing to skip.
        Raises ValueError when extra does not fit in the extra bits,
        and InvalidStream when more than one byte is used
        while the highest byte is zero.
        """
        extraBits = self.extraBits(index)
        if not 0<=extra<1<<extraBits:
            raise ValueError('value: extra out of range')
        if index==0:
            return 0
        if index>1 and (extra>>(extraBits-8))==0:
            raise InvalidStream('Zeros in high byte of SKIPBYTES')
        return extra+1

    def explanation(self, index, extra):
        if index==0:
            return '00: no skip'
        byteCount = self.extraBits(index)//8
        return 'skip length: {:{}x}h+1={}'.format(extra, byteCount, extra+1)
|
|
|
|
|
|
class TypeCountAlphabet(Enumerator):
    """Used for giving block type counts and tree counts.
    >>> TypeCountAlphabet(description='').showCode()
       0:0            0101:xx,0101      1011:xxxxx,1011
    0001:0001         1101:xxxxxx,1101  0111:xxx,0111
    1001:xxxx,1001    0011:x,0011       1111:xxxxxxx,1111
    """
    #bit pattern -> symbol
    decodeTable = {
        0b0: 0,
        0b0001: 1,
        0b0011: 2,
        0b0101: 3,
        0b0111: 4,
        0b1001: 5,
        0b1011: 6,
        0b1101: 7,
        0b1111: 8,
        }

    value0 = 1
    extraTable = [0, 0, 1, 2, 3, 4, 5, 6, 7]
    name = 'BT#'

    def __init__(self, name=None, *, description):
        super().__init__(
            name,
            decodeTable=self.decodeTable,
            description=description)

    def mnemonic(self, index):
        #the first two symbols have no extra bits: show the bare pattern
        if index==0:
            return '0'
        if index==1:
            return '0001'
        extraMarks = 'x'*self.extraBits(index)
        return extraMarks+','+self.bitPattern(index)

    def explanation(self, index, extra):
        """Mnemonic, value and description;
        for a value of 1 the last character of the description is dropped.
        """
        value = self.value(index, extra)
        description = self.description
        if value==1:
            description = description[:-1]
        return '{}: {} {}'.format(self.mnemonic(index), value, description)
|
|
|
|
class BlockTypeAlphabet(Code):
    """The block types; this code works for all three kinds.
    >>> b = BlockTypeAlphabet('T', NBLTYPES=5)
    >>> print(*(x for x in b))
    prev +1 #0 #1 #2 #3 #4
    """
    def __init__(self, name, NBLTYPES, **args):
        #two symbols more than there are block types
        super().__init__(name, alphabetSize=NBLTYPES+2, **args)
        self.NBLTYPES = NBLTYPES

    def mnemonic(self, index):
        if index==0:
            return 'prev'
        if index==1:
            return '+1'
        return '#'+str(index-2)

    def value(self, index):
        return index-2

    def explanation(self, index):
        if index==0:
            return '0: previous'
        if index==1:
            return '1: increment'
        return 'Set block type to: '+str(index-2)
|
|
|
|
class BlockCountAlphabet(Enumerator):
    """Block counts
    >>> b = BlockCountAlphabet('L')
    >>> print(b[25])
    [24*x]: BC16625-16793840
    """

    value0 = 1
    extraTable = [
        2, 2, 2, 2, 3,
        3, 3, 3, 4, 4,
        4, 4, 5, 5, 5,
        5, 6, 6, 7, 8,
        9, 10, 11, 12, 13,
        24,
        ]

    def __init__(self, name, **args):
        super().__init__(name, alphabetSize=26, **args)

    def mnemonic(self, index):
        extraBits = self.extraBits(index)
        #the first five symbols spell out the x-es, the rest abbreviate them
        if index<5:
            extraMarks = 'x'*extraBits
        else:
            extraMarks = '[{}*x]'.format(extraBits)
        lower, upper = self.span(index)
        return '{}: BC{}-{}'.format(extraMarks, lower, upper)

    def explanation(self, index, extra):
        return 'Block count: '+super().explanation(index, extra)
|
|
|
|
class DistanceParamAlphabet(WithExtra):
    """The distance parameters NPOSTFIX and NDIRECT.
    Although these are treated as two in the description, this is easier.
    """
    def __init__(self):
        super().__init__('DIST', bitLength=2)

    def extraBits(self, index):
        #always four extra bits, whatever the symbol
        return 4

    def value(self, index, extra):
        """Returns NPOSTFIX and NDIRECT<<NPOSTFIX
        """
        if extra>15:
            raise ValueError('value: extra out of range')
        return index, extra<<index

    def explanation(self, index, extra):
        directCodes = extra<<index
        return '{} postfix bits and {:04b}<<{}={} direct codes'.format(
            index, extra, index, directCodes)

    def mnemonic(self, index):
        return 'PF'+str(index)
|
|
|
|
class LiteralContextMode(Code):
    """For the literal context modes.
    >>> LiteralContextMode().showCode()
    00:LSB6   01:MSB6   10:UTF8   11:Signed
    >>> LiteralContextMode().explanation(2)
    'Context mode for type 9: 2(UTF8)'
    """

    def __init__(self, *, number=9):
        super().__init__('LC'+str(number), bitLength=2)
        self.number = number

    def mnemonic(self, index):
        modeNames = ['LSB6', 'MSB6', 'UTF8', 'Signed']
        return modeNames[index]

    def explanation(self, index):
        modeName = self.mnemonic(index)
        return 'Context mode for type {}: {}({})'.format(
            self.number, index, modeName)
|
|
|
|
class RLEmaxAlphabet(Enumerator):
    """Used for describing the run length encoding used for describing context maps.
    >>> RLEmaxAlphabet().showCode()
    0:1    1:more
    """
    value0 = 0
    extraTable = [0, 4]
    name = 'RLE#'

    def mnemonic(self, index):
        names = ['1', 'more']
        return names[index]

    def explanation(self, index, extra):
        description = self.description and self.description+': '
        if index!=0:
            return '{}xxxx 1: RLEMAX={}'.format(description, extra+1)
        return description+'No RLE coding'
|
|
|
|
class TreeAlphabet(WithExtra):
    """The alphabet to enumerate entries (called trees) in the context map.
    parameters are RLEMAX and NTREES
    >>> t = TreeAlphabet('', RLEMAX=3, NTREES=5)
    >>> len(t)
    8
    >>> print(t[2])
    xx+4 zeroes
    >>> t[3].explanation(2)
    '8+010=10 zeroes'
    >>> t[0].value(0)
    (1, 0)
    """
    name = 'CMI'

    def __init__(self, name=None, *, RLEMAX, NTREES, **args):
        super().__init__(name, alphabetSize=RLEMAX+NTREES, **args)
        self.RLEMAX = RLEMAX
        self.NTREES = NTREES

    def extraBits(self, index):
        #run length symbols 1..RLEMAX have as many extra bits as their index
        if 0<index<=self.RLEMAX:
            return index
        return 0

    def mnemonic(self, index):
        if index==0:
            return 'map #0'
        if index>self.RLEMAX:
            return 'map #{}'.format(index-self.RLEMAX)
        return '{}+{} zeroes'.format('x'*index, 1<<index)

    def value(self, index, extra):
        """Give count and value."""
        if index==0:
            return 1, 0
        if index>self.RLEMAX:
            return 1, index-self.RLEMAX
        #a run of zeroes
        return (1<<index)+extra, 0

    def explanation(self, index, extra):
        description = self.description and self.description+': '
        if index==0:
            return description+'map #0'
        if index>self.RLEMAX:
            return '{}map #{}-{}={}'.format(
                description,
                index, self.RLEMAX, index-self.RLEMAX)
        runBase = 1<<index
        return '{}+{:0{}b}={} zeroes'.format(
            runBase,
            extra, self.extraBits(index),
            runBase+extra)
|
|
|
|
#Prefix alphabets for the data stream----------------------------------
|
|
class LiteralAlphabet(Code):
    """Alphabet of the 256 literal byte values.

    Symbols have no extra bits; the value of a symbol is its index.
    """
    minLength = 8
    maxLength = 8

    def __init__(self, number):
        #the code is named after the tree number: L0, L1, ...
        super().__init__('L{}'.format(number), alphabetSize=256)

    def mnemonic(self, index):
        """Readable form of the byte, as made by outputCharFormatter."""
        return outputCharFormatter(index)

    def value(self, index, extra=None):
        """The byte value itself; `extra` is ignored."""
        return index

    def explanation(self, index, extra=None):
        """Same text as the mnemonic; `extra` is ignored."""
        return self.mnemonic(index)
|
|
|
|
class InsertLengthAlphabet(Enumerator):
    """Internal code for insert counts.

    The first symbol stands for the value 0; extraTable gives the
    number of extra bits of each of the 24 symbols.
    """
    value0 = 0
    extraTable = (
        [0]*6
        +[1,1, 2,2, 3,3, 4,4, 5,5]
        +[6, 7, 8, 9, 10, 12, 14, 24]
        )
|
|
|
|
class CopyLengthAlphabet(Enumerator):
    """Internal code for copy lengths.

    The first symbol stands for the value 2; extraTable gives the
    number of extra bits of each of the 24 symbols.
    """
    value0 = 2
    extraTable = (
        [0]*8
        +[1,1, 2,2, 3,3, 4,4, 5,5]
        +[6, 7, 8, 9, 10, 24]
        )
|
|
|
|
class InsertAndCopyAlphabet(WithExtra):
    """The insert and copy code.

    Each of the 704 symbols combines an insert length symbol, a copy
    length symbol and a flag telling that the last distance is reused.

    >>> for x in range(0,704,704//13):  # doctest: +NORMALIZE_WHITESPACE
    ...    print('{:10b}'.format(x), InsertAndCopyAlphabet()[x])
    0 I0C2&D=0
    110110 I6+xC8&D=0
    1101100 I5C22+xxx&D=0
    10100010 I4C4
    11011000 I3C10+x
    100001110 I14+xxC8
    101000100 I10+xxC22+xxx
    101111010 I98+xxxxxC14+xx
    110110000 I6+xC70+xxxxx
    111100110 I1090+[10*x]C8
    1000011100 I26+xxxC326+[8*x]
    1001010010 I322+[8*x]C14+xx
    1010001000 I194+[7*x]C70+xxxxx
    1010111110 I22594+[24*x]C1094+[10*x]
    """
    insertLengthAlphabet = InsertLengthAlphabet(None)
    copyLengthAlphabet = CopyLengthAlphabet(None)

    def __init__(self, number=''):
        super().__init__('IC{}'.format(number), bitLength=10)

    def __len__(self):
        return 704

    def extraBits(self, index):
        """Total number of extra bits: those of insert plus those of copy."""
        insertSymbol, copySymbol, _ = self.splitSymbol(index)
        insertBits = InsertLengthAlphabet.extraTable[insertSymbol.index]
        copyBits = CopyLengthAlphabet.extraTable[copySymbol.index]
        return insertBits+copyBits

    def splitSymbol(self, index):
        """Give relevant values for computations:
        (insertSymbol, copySymbol, dist0flag)
        """
        #the upper bits select a cell (row, col) of the code table;
        #row 0 and row 1 both mean insert codes 0..7,
        #but row 0 additionally implies distance code 0
        cells = [(0,0), (0,1), (1,0), (1,1), (2,0), (2,1),
                 (1,2), (3,0), (2,2), (3,1), (3,2)]
        row, col = cells[index>>6]
        #lower bits give the position inside the cell
        insertLengthCode = (max(row-1, 0)<<3) | ((index>>3)&7)
        copyLengthCode = (col<<3) | (index&7)
        insertSymbol = Symbol(self.insertLengthAlphabet, insertLengthCode)
        copySymbol = Symbol(self.copyLengthAlphabet, copyLengthCode)
        return insertSymbol, copySymbol, row==0

    def mnemonic(self, index):
        """Make a nice mnemonic, like I6+xC8&D=0
        """
        def part(letter, symbol):
            #lower bound of the range, plus a marker for the extra bits
            lower, _ = symbol.code.span(symbol.index)
            count = symbol.extraBits()
            if not count: marker = ''
            elif count<6: marker = '+'+'x'*count
            else: marker = '+[{}*x]'.format(count)
            return '{}{}{}'.format(letter, lower, marker)
        insertSymbol, copySymbol, dist0 = self.splitSymbol(index)
        text = part('I', insertSymbol)+part('C', copySymbol)
        if dist0: text += '&D=0'
        return text

    def value(self, index, extra):
        """Give (insert length, copy length, dist0flag).

        The lower bits of `extra` belong to the insert length,
        the bits above them to the copy length.
        """
        insertSymbol, copySymbol, dist0 = self.splitSymbol(index)
        insertBits = insertSymbol.extraBits()
        copyExtra = extra>>insertBits
        insertExtra = extra & ((1<<insertBits)-1)
        insert = insertSymbol.value(insertExtra)
        copy = copySymbol.value(copyExtra)
        return insert, copy, dist0

    def explanation(self, index, extra):
        """Sentence giving the insert length, copy length and distance reuse."""
        insert, copy, dist0 = self.value(index, extra)
        if not dist0:
            return 'Literal: {}, copy: {}'.format(insert, copy)
        return 'Literal: {}, copy: {}, same distance'.format(insert, copy)
|
|
|
|
class DistanceAlphabet(WithExtra):
    """Represent the distance encoding.
    Dynamically generated alphabet, depending on NPOSTFIX and NDIRECT.

    The first 16 symbols refer to recently used distances, the next
    NDIRECT symbols are direct distances, and the rest uses the "long"
    encoding. The original author explains the long encoding like this:
    Ignoring offsets for the moment, write the distance in binary as
    1xy..yz..z, then the distance symbol consists of n..nxz..z
    Where:
    n is one less than number of bits in y
    x is a single bit
    y..y are n+1 extra bits (encoded in the bit stream)
    z..z is NPOSTFIX bits that are part of the symbol
    The offsets are so as to start at the lowest useable value:
    if 1xyyyyz = distance +(4<<POSTFIX)-NDIRECT-1
    then n..nxz..z is symbol -NDIRECT-16

    >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
    >>> print(d[4], d[17], d[34])
    last-1 1 10xx00-5
    >>> [str(d[x]) for x in range(26, 32)]
    ['10x00-5', '10x01-5', '10x10-5', '10x11-5', '11x00-5', '11x01-5']
    """
    def __init__(self, number, *, NPOSTFIX, NDIRECT):
        self.NPOSTFIX = NPOSTFIX
        self.NDIRECT = NDIRECT
        #Size of the alphabet. The original author notes that not all
        #of these symbols are used,
        #only NDIRECT+16+(44-2*POSTFIX<<NPOSTFIX)
        size = self.NDIRECT+16+(48<<self.NPOSTFIX)
        super().__init__('D'+str(number), alphabetSize=size)

    def extraBits(self, index):
        """Indicate how many extra bits are needed to interpret symbol
        >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
        >>> [d[i].extraBits() for i in range(26)]
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        >>> [d[i].extraBits() for i in range(26,36)]
        [1, 1, 1, 1, 1, 1, 1, 1, 2, 2]
        """
        firstLong = 16+self.NDIRECT
        if index>=firstLong:
            return 1 + ((index-firstLong) >> (self.NPOSTFIX+1))
        #recent and direct distances have no extra bits
        return 0

    def value(self, dcode, dextra):
        """Decode value of symbol together with the extra bits.
        Gives (n, offset): offset relative to the n-th last distance,
        or (0, distance) for an absolute distance.
        >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
        >>> d[34].value(2)
        (0, 35)
        """
        if dcode<16:
            #n-th last distance itself, then small steps around
            #the last and the second last distance
            recent = [(n, 0) for n in (1, 2, 3, 4)]
            recent += [(n, sign*step)
                for n in (1, 2)
                for step in (1, 2, 3)
                for sign in (-1, +1)]
            return recent[dcode]
        if dcode<16+self.NDIRECT:
            return (0, dcode-16)
        #long encoding, following the formulas of the format description
        relative = dcode - self.NDIRECT - 16
        postfixMask = (1 << self.NPOSTFIX) - 1
        ndistbits = 1 + (relative >> (self.NPOSTFIX + 1))
        hcode = relative >> self.NPOSTFIX
        lcode = relative & postfixMask
        offset = ((2 + (hcode & 1)) << ndistbits) - 4
        distance = ((offset + dextra) << self.NPOSTFIX) + lcode + self.NDIRECT + 1
        return (0, distance)

    def mnemonic(self, index, verbose=False):
        """Give mnemonic representation of meaning.
        Long runs of x's are shortened to [n*x],
        unless verbose is set: then every x is written out.
        """
        if index<16:
            names = ['last', '2last', '3last', '4last']
            names += ['{}last{:+d}'.format(prefix, sign*step)
                for prefix in ('', '2')
                for step in (1, 2, 3)
                for sign in (-1, +1)]
            return names[index]
        if index<16+self.NDIRECT:
            return str(index-16)
        #construct strings like "1xx01-15"
        relative = index-(self.NDIRECT+16)
        hcode = relative >> self.NPOSTFIX
        lcode = relative & ((1<<self.NPOSTFIX)-1)
        count = (2+hcode)>>1
        if hcode<13 or verbose: extraMarker = 'x'*count
        else: extraMarker = '[{}*x]'.format(count)
        #the postfix bits are only shown when there are any
        if self.NPOSTFIX: postfix = '{:0{}b}'.format(lcode, self.NPOSTFIX)
        else: postfix = ''
        return '1{}{}{}{:+d}'.format(
            hcode&1,
            extraMarker,
            postfix,
            self.NDIRECT+1-(4<<self.NPOSTFIX))

    def explanation(self, index, extra):
        """Mnemonic with the extra bits filled in, followed by the value.
        >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
        >>> d[55].explanation(13)
        '11[1101]01-5: [0]+240'
        """
        count = self.extraBits(index)
        shownBits = '[{:0{}b}]'.format(extra, count)
        #the verbose mnemonic has exactly one x for every extra bit
        pattern = self.mnemonic(index, True)
        pattern = pattern.replace('x'*(count or 1), shownBits)
        return '{0}: [{1[0]}]{1[1]:+d}'.format(
            pattern,
            self.value(index, extra))
|
|
|
|
#Classes for doing actual work------------------------------------------
|
|
class ContextModeKeeper:
    """For computing the literal context mode.
    Feed it the output bytes with add(); getIndex() then gives the
    index in the context map for the next literal.
    """
    def __init__(self, mode):
        #the last two bytes, oldest first; both start as 0
        self.chars = deque([0,0], maxlen=2)
        self.mode = mode

    def setContextMode(self, mode):
        """Switch to given context mode (0..3)"""
        self.mode = mode

    def getIndex(self):
        """Context index for the current mode, computed from the last two bytes.
        Gives None if the mode is not one of 0..3.
        """
        secondLast, last = self.chars
        mode = self.mode
        if mode==0:
            #LSB6
            return last&0x3f
        if mode==1:
            #MSB6
            return last>>2
        if mode==2:
            #UTF8: character class of previous and a bit of the second
            return self.lut0[last]|self.lut1[secondLast]
        if mode==3:
            #Signed: initial bits of last two bytes
            return (self.lut2[last]<<3)|self.lut2[secondLast]
        return None

    def add(self, index):
        """Adjust the context for output char (as int)."""
        self.chars.append(index)

    #lut0 classes of the ASCII half:
    #0: control       #16: quote  #32: ,:;  #48: AEIOU
    #4: tab/lf/cr     #20: %      #36: .    #52: BC..Z
    #8: space         #24: (<[{   #40: =    #56: aeiou
    #12:!#$&*+-/?@|   #28: )>]}   #44: 0-9  #60: bc..z
    lut0 = [
         0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  4,  0,  0,  4,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
        44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
        12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
        52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
        12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
        60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12,  0,
        ]+[0,1]*32+[2,3]*32
    #lut1 classes of the ASCII half:
    #0: space 1:punctuation 2:digit/upper 3:lower
    lut1 = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
        1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
        1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
        ]+[0]*96+[2]*32
    #initial bits: 8*0, 4*0, 2*0, 1*0, 1*1, 2*1, 4*1, 8*1
    lut2 = [0]+[1]*15+[2]*48+[3]*64+[4]*64+[5]*48+[6]*15+[7]
    assert len(lut0)==len(lut1)==len(lut2)==256
|
|
|
|
class WordList:
    """Word list: the static dictionary plus the word transformations.

    The dictionary file is opened on construction and kept open for
    word(); call close(), or use the object in a with statement,
    to release it.

    >>> WordList().word(7, 35555)
    b'Program to '
    """
    #number of index bits for each word length (only 4..24 are used)
    NDBITS = [0, 0, 0, 0, 10, 10, 11, 11, 10, 10,
              10, 10, 10, 9, 9, 8, 7, 7, 8, 7,
              7, 6, 6, 5, 5]

    def __init__(self):
        #build the table first, so a broken table cannot leak the file
        self.compileActions()
        self.file = open(DICTIONARY_PATH, 'rb')

    def close(self):
        """Close the dictionary file."""
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, *excInfo):
        self.close()

    def word(self, size, dist):
        """Get word of length `size`; `dist` holds index and action.
        The lower NDBITS[size] bits of dist are the index of the word,
        the bits above that select the transformation.
        """
        #split dist in index and action
        ndbits = self.NDBITS[size]
        index = dist & ((1<<ndbits)-1)
        action = dist>>ndbits
        #compute position in file: skip all shorter words first
        position = sum(n<<self.NDBITS[n] for n in range(4,size))+size*index
        self.file.seek(position)
        return self.doAction(self.file.read(size), action)

    @staticmethod
    def _upperCaseAt(buffer, i):
        """Uppercase the character starting at buffer[i], in place.
        Gives the number of bytes the character is taken to occupy.
        This is the byte level model of the Brotli format
        (reviewer note: written after RFC 7932 section 8 and the
        reference decoder; please confirm against the RFC),
        which differs from Python's Unicode aware upper().
        """
        lead = buffer[i]
        if lead<0xC0:
            if 0x61<=lead<=0x7A: buffer[i] = lead^32
            return 1
        if lead<0xE0:
            #two byte character: simplified uppercasing
            if i+1<len(buffer): buffer[i+1] ^= 32
            return 2
        #three byte character: arbitrary but fixed transform
        if i+2<len(buffer): buffer[i+2] ^= 5
        return 3

    def upperCase1(self, word):
        """UppercaseFirst: uppercase the first character of word (bytes)."""
        buffer = bytearray(word)
        if buffer: self._upperCaseAt(buffer, 0)
        return bytes(buffer)

    def upperCaseAll(self, word):
        """UppercaseAll: uppercase every character of word (bytes)."""
        buffer = bytearray(word)
        i = 0
        while i<len(buffer):
            i += self._upperCaseAt(buffer, i)
        return bytes(buffer)

    #Super compact form of action table.
    #_ means space, .U means UpperCaseAll, U(w) means UpperCaseFirst
    #Entries are "number:action"; the layout in columns is only
    #cosmetic, compileActions does not depend on it.
    actionTable = r"""
      0:w         25:w+_for_     50:w+\n\t        75:w+. This_  100:w+ize_
      1:w+_       26:w[3:]       51:w+:           76:w+,        101:w.U+.
      2:_+w+_     27:w[:-2]      52:_+w+._        77:.+w+_      102:\xc2\xa0+w
      3:w[1:]     28:w+_a_       53:w+ed_         78:U(w)+(     103:_+w+,
      4:U(w)+_    29:w+_that_    54:w[9:]         79:U(w)+.     104:U(w)+="
      5:w+_the_   30:_+U(w)      55:w[7:]         80:w+_not_    105:w.U+="
      6:_+w       31:w+._        56:w[:-6]        81:_+w+="     106:w+ous_
      7:s_+w+_    32:.+w         57:w+(           82:w+er_      107:w.U+,_
      8:w+_of_    33:_+w+,_      58:U(w)+,_       83:_+w.U+_    108:U(w)+=\'
      9:U(w)      34:w[4:]       59:w[:-8]        84:w+al_      109:_+U(w)+,
     10:w+_and_   35:w+_with_    60:w+_at_        85:_+w.U      110:_+w.U+="
     11:w[2:]     36:w+\'        61:w+ly_         86:w+=\'      111:_+w.U+,_
     12:w[:-1]    37:w+_from_    62:_the_+w+_of_  87:w.U+"      112:_+w.U+,
     13:,_+w+_    38:w+_by_      63:w[:-5]        88:U(w)+._    113:w.U+(
     14:w+,_      39:w[5:]       64:w[:-9]        89:_+w+(      114:w.U+._
     15:_+U(w)+_  40:w[6:]       65:_+U(w)+,_     90:w+ful_     115:_+w.U+.
     16:w+_in_    41:_the_+w     66:U(w)+"        91:_+U(w)+._  116:w.U+=\'
     17:w+_to_    42:w[:-4]      67:.+w+(         92:w+ive_     117:_+w.U+._
     18:e_+w+_    43:w+. The_    68:w.U+_         93:w+less_    118:_+U(w)+="
     19:w+"       44:w.U         69:U(w)+">       94:w.U+\'     119:_+w.U+=\'
     20:w+.       45:w+_on_      70:w+="          95:w+est_     120:_+U(w)+=\'
     21:w+">      46:w+_as_      71:_+w+.         96:_+U(w)+.
     22:w+\n      47:w+_is_      72:.com/+w       97:w.U+">
     23:w[:-3]    48:w[:-7]                       98:_+w+=\'
     24:w+]       49:w[:-1]+ing_ 74:U(w)+\'       99:U(w)+,
    """

    def compileActions(self):
        """Build the action table from the text above.
        Fills self.actionList with one Python expression (a string)
        per action; doAction evaluates these.
        Raises ValueError if an entry is malformed or missing.
        """
        import re
        self.actionList = actions = [None]*121
        #Action 73, which is too long, looks like this when expanded:
        actions[73] = "b' the '+w+b' of the '"
        #An entry starts at "number:". Numbers directly after [ or a
        #digit are part of a slice like w[3:], not the start of an entry.
        #Finding the entries this way does not depend on the column
        #positions, so it survives entries that touch each other
        #and any change of the spacing.
        entryStart = re.compile(r"(?<![\[\d])(\d{1,3}):")
        for line in self.actionTable.splitlines():
            marks = list(entryStart.finditer(line))
            for mark, following in zip(marks, marks[1:]+[None]):
                end = following.start() if following else len(line)
                #remove filler spaces at right
                action = line[mark.end():end].rstrip()
                #replace space symbols
                action = action.replace('_', ' ')
                if 'w' not in action:
                    raise ValueError(
                        'action {} does not use the word'.format(mark.group(1)))
                #add quotes around left string when present
                #translation: any pattern from beginning, up to
                #(but not including) a + following by a w later on
                action = re.sub(r"^(.*)(?=\+[U(]*w)", r"b'\1'", action)
                #add quotes around right string when present
                #translation: anything with a w in it, followed by a +
                #and a pattern up to the end
                #(there is no variable lookbehind assertion,
                #so we have to copy the pattern)
                action = re.sub(r"(w[[:\-1\]).U]*)\+(.*)$", r"\1+b'\2'", action)
                #expand shortcut for uppercaseAll
                action = action.replace("w.U", "A(w)")
                #store action
                actions[int(mark.group(1))] = action
        missing = [n for n, action in enumerate(actions) if action is None]
        if missing:
            raise ValueError('actions missing from table: {}'.format(missing))

    def doAction(self, w, action):
        """Perform the proper action on word w (bytes); gives bytes.
        """
        #set environment for the expressions in the action list.
        #eval is only applied to expressions built from actionTable,
        #never to data from the stream.
        U = self.upperCase1
        A = self.upperCaseAll
        return eval(self.actionList[action], locals())
|
|
|
|
class Layout:
|
|
"""Class to layout the output.
|
|
"""
|
|
#display width of hexdata+bitdata
|
|
width = 25
|
|
#general
|
|
def __init__(self, stream):
|
|
self.stream = stream
|
|
self.bitPtr = self.width
|
|
|
|
def makeHexData(self, pos):
|
|
"""Produce hex dump of all data containing the bits
|
|
from pos to stream.pos
|
|
"""
|
|
firstAddress = pos+7>>3
|
|
lastAddress = self.stream.pos+7>>3
|
|
return ''.join(map('{:02x} '.format,
|
|
self.stream.data[firstAddress:lastAddress]))
|
|
|
|
def formatBitData(self, pos, width1, width2=0):
|
|
"""Show formatted bit data:
|
|
Bytes are separated by commas
|
|
whole bytes are displayed in hex
|
|
>>> Layout(olleke).formatBitData(6, 2, 16)
|
|
'|00h|2Eh,|00'
|
|
>>> Layout(olleke).formatBitData(4, 1, 0)
|
|
'1'
|
|
"""
|
|
result = []
|
|
#make empty prefix code explicit
|
|
if width1==0: result = ['()', ',']
|
|
for width in width1, width2:
|
|
#skip empty width2
|
|
if width==0: continue
|
|
#build result backwards in a list
|
|
while width>0:
|
|
availableBits = 8-(pos&7)
|
|
if width<availableBits:
|
|
#read partial byte, beginning nor ending at boundary
|
|
data = self.stream.data[pos>>3] >> (pos&7) & (1<<width)-1
|
|
result.append('{:0{}b}'.format(data, width))
|
|
elif availableBits<8:
|
|
#read rest of byte, ending at boundary
|
|
data = self.stream.data[pos>>3] >> (pos&7)
|
|
result.append('|{:0{}b}'.format(data, availableBits))
|
|
else:
|
|
#read whole byte (in hex), beginning and ending at boundary
|
|
data = self.stream.data[pos>>3]
|
|
result.append('|{:02X}h'.format(data))
|
|
width -= availableBits
|
|
pos += availableBits
|
|
#if width overshot from the availableBits subtraction, fix it
|
|
pos += width
|
|
#add comma to separate fields
|
|
result.append(',')
|
|
#concatenate pieces, reversed, skipping the last space
|
|
return ''.join(result[-2::-1])
|
|
|
|
def readPrefixCode(self, alphabet):
    """Give alphabet the prefix code that is read from the stream,
    and return the alphabet.
    The header tells if a complex code follows (handled by
    readComplexCode) or a simple code of 1 to 4 symbols.
    The alphabet in question must have a "logical" order,
    otherwise the assignment of symbols doesn't work.
    """
    kind, count = self.verboseRead(PrefixCodeHeader(alphabet.name))
    if kind=='Complex':
        #for a complex code, the number read means hskip
        self.readComplexCode(count, alphabet)
        return alphabet
    #Simple code. The lengths of 4 symbols are not known yet
    #while the symbols are read, so they are shown as ?
    bitLengths = ([0], [1,1], [1,2,2], '????')[count-1]
    def annotated(index):
        #mnemonic with the code length in front;
        #position is the number of the symbol that is being read
        width = bitLengths[position]
        return '{} bit{}: {}'.format(
            width,
            '' if width==1 else 's',
            type(alphabet).mnemonic(alphabet, index)
            )
    #temporarily override the mnemonic function of this alphabet
    alphabet.mnemonic = annotated
    symbols = []
    for position in range(count):
        symbols.append(self.verboseRead(alphabet, skipExtra=True).index)
    #restore mnemonic
    del alphabet.mnemonic
    if count==4:
        #read tree shape to get the real lengths
        bitLengths = self.verboseRead(TreeShapeAlhabet())
    #construct the alphabet prefix code
    alphabet.setLength(dict(zip(symbols, bitLengths)))
    return alphabet
|
|
|
|
def readComplexCode(self, hskip, alphabet):
    """Read complex code: first the code used for the code lengths,
    then the lengths of the symbols of alphabet, written in that code.
    hskip is the number of leading length symbols that is skipped.
    Sets the resulting prefix code on alphabet and shows it.
    """
    #order in which the lengths for the length code appear
    order = [1,2,3,4,0,5,17,6,16,7,8,9,10,11,12,13,14,15][hskip:]
    codeLengths = {}
    space = 0
    lol = LengthOfLengthAlphabet('##'+alphabet.name)
    #lengthCode will be used for coding the lengths of the new code
    #it is only used for display at first; its code is set below
    lengthCode = LengthAlphabet('#'+alphabet.name)
    for lengthSymbol in order:
        #stop as soon as the code space (in units of 1/32) is full
        if space>=32: break
        lol.description = str(lengthCode[lengthSymbol])
        length = self.verboseRead(lol)
        if length:
            codeLengths[lengthSymbol] = length
            space += 32>>length
    if space>32: raise ValueError("Stream format")
    if len(codeLengths)==1:
        #a single symbol is coded in zero bits
        (onlySymbol,) = codeLengths
        codeLengths[onlySymbol] = 0
    #Now set the encoding of the lengthCode
    lengthCode.setLength(codeLengths)
    print("***** Lengths for {} will be coded as:".format(alphabet.name))
    lengthCode.showCode()

    def peekLength():
        #look ahead to see which length symbol comes next
        peeked = self.stream.peek(lengthCode.maxLength)
        return lengthCode.decodePeek(peeked)[1].index

    #Now determine the symbol lengths with the lengthCode
    symbolLengths = {}
    #code space is now counted in units of 1/32768
    space = 0
    lastLength = 8
    symbols = iter(alphabet)
    while space<32768:
        length = peekLength()
        #in every branch, set lengthCode.description to explanatory text
        #lengthCode calls format(symbol, extra) with this string
        if length==0:
            #unused symbol
            symbol = next(symbols)
            lengthCode.description = 'symbol {} unused'.format(symbol)
            self.verboseRead(lengthCode)
        elif length==16:
            lengthCode.description = \
                '{1}+3 symbols of length '+str(lastLength)
            extra = self.verboseRead(lengthCode)
            #scan series of 16s (repeat counts)
            #a repeat is at least 3, so two symbols are certain
            first = next(symbols)
            second = next(symbols)
            symbolLengths[first.index] = lastLength
            symbolLengths[second.index] = lastLength
            #count the two just defined symbols
            space += 2*32768>>lastLength
            repeat = 2
            #note: loop may end because we're there
            #even if a 16 _appears_ to follow
            while True:
                previous = repeat
                repeat = ((repeat-2)<<2)+extra+3
                #read as many symbols as repeat increased
                for _ in range(previous, repeat):
                    symbolLengths[next(symbols).index] = lastLength
                #compute new total; it may be end of loop
                space += (repeat-previous)*32768>>lastLength
                if space>=32768: break
                #see if there is more to do
                if peekLength()!=16: break
                lengthCode.description = 'total {}+{{1}} symbols'.format(
                    ((repeat-2)<<2)+3)
                extra = self.verboseRead(lengthCode)
        elif length==17:
            #read, and show explanation
            lengthCode.description = '{1}+3 unused'
            extra = self.verboseRead(lengthCode)
            #scan series of 17s (groups of zero counts)
            #a repeat is at least 3, so two symbols are certain
            next(symbols)
            next(symbols)
            repeat = 2
            #note: loop will not end with a full code space,
            #since unused symbols take no space
            while True:
                previous = repeat
                repeat = ((repeat-2)<<3)+extra+3
                #skip as many symbols as repeat increased
                for _ in range(repeat-previous):
                    next(symbols)
                #see if there is more to do
                if peekLength()!=17: break
                lengthCode.description = 'total {}+{{1}} unused'.format(
                    ((repeat-2)<<3)+3)
                extra = self.verboseRead(lengthCode)
        else:
            symbol = next(symbols)
            #double braces for format
            char = str(symbol)
            if char in '{}': char *= 2
            lengthCode.description = \
                'Length for {} is {{0.index}} bits'.format(char)
            #output is not needed (will be 0)
            self.verboseRead(lengthCode)
            symbolLengths[symbol.index] = length
            space += 32768>>length
            lastLength = length
    assert space==32768
    alphabet.setLength(symbolLengths)
    print('End of table. Prefix code '+alphabet.name+':')
    alphabet.showCode()
|
|
|
|
#stream
|
|
def processStream(self):
    """Process a brotli stream: show the stream header,
    then header and contents of every meta block until the last one.
    The decoded data is collected in self.output.
    """
    print('addr hex{:{}s}binary context explanation'.format(
        '', self.width-10))
    print('Stream header'.center(60, '-'))
    self.windowSize = self.verboseRead(WindowSizeAlphabet())
    print('Metablock header'.center(60, '='))
    self.output = bytearray()
    self.ISLAST = False
    while not self.ISLAST:
        self.ISLAST = self.verboseRead(
            BoolCode('LAST', description="Last block"))
        #only a last block can be flagged as empty
        if self.ISLAST and self.verboseRead(
                BoolCode('EMPTY', description="Empty block")):
            break
        #a block without data was skipped by metablockLength
        if self.metablockLength(): continue
        #only a block that is not the last can be uncompressed
        if not self.ISLAST and self.uncompressed(): continue
        print('Block type descriptors'.center(60, '-'))
        self.numberOfBlockTypes = {}
        self.currentBlockCounts = {}
        self.blockTypeCodes = {}
        self.blockCountCodes = {}
        for kind in (L,I,D):
            self.blockType(kind)
        print('Distance code parameters'.center(60, '-'))
        self.NPOSTFIX, self.NDIRECT = self.verboseRead(DistanceParamAlphabet())
        self.readLiteralContextModes()
        print('Context maps'.center(60, '-'))
        self.cmaps = {}
        #keep the number of each kind of prefix tree for the last loop
        #insert&copy has no context map: one tree for each block type
        treeCounts = {I: self.numberOfBlockTypes[I]}
        for kind in (L,D):
            treeCounts[kind] = self.contextMap(kind)
        print('Prefix code lists'.center(60, '-'))
        self.prefixCodes = {}
        for kind in (L,I,D):
            self.readPrefixArray(kind, treeCounts[kind])
        self.metablock()
|
|
|
|
#metablock header
|
|
def verboseRead(self, alphabet, context='', skipExtra=False):
|
|
"""Read symbol and extra from stream and explain what happens.
|
|
Returns the value of the symbol
|
|
>>> olleke.pos = 0
|
|
>>> l = Layout(olleke)
|
|
>>> l.verboseRead(WindowSizeAlphabet())
|
|
0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
|
|
4194288
|
|
"""
|
|
#TODO 2: verbosity level, e.g. show only codes and maps in header
|
|
stream = self.stream
|
|
pos = stream.pos
|
|
if skipExtra:
|
|
length, symbol = alphabet.readTuple(stream)
|
|
extraBits, extra = 0, None
|
|
else:
|
|
length, symbol, extraBits, extra = alphabet.readTupleAndExtra(
|
|
stream)
|
|
#fields: address, hex data, binary data, name of alphabet, explanation
|
|
hexdata = self.makeHexData(pos)
|
|
addressField = '{:04x}'.format(pos+7>>3) if hexdata else ''
|
|
bitdata = self.formatBitData(pos, length, extraBits)
|
|
#bitPtr moves bitdata so that the bytes are easier to read
|
|
#jump back to right if a new byte starts
|
|
if '|' in bitdata[1:]:
|
|
#start over on the right side
|
|
self.bitPtr = self.width
|
|
fillWidth = self.bitPtr-(len(hexdata)+len(bitdata))
|
|
if fillWidth<0: fillWidth = 0
|
|
print('{:<5s} {:<{}s} {:7s} {}'.format(
|
|
addressField,
|
|
hexdata+' '*fillWidth+bitdata, self.width,
|
|
context+alphabet.name,
|
|
symbol if skipExtra else symbol.explanation(extra),
|
|
))
|
|
#jump to the right if we started with a '|'
|
|
#because we didn't jump before printing
|
|
if bitdata.startswith('|'): self.bitPtr = self.width
|
|
else: self.bitPtr -= len(bitdata)
|
|
return symbol if skipExtra else symbol.value(extra)
|
|
|
|
def metablockLength(self):
    """Read MNIBBLES and meta block length into self.MLEN;
    if empty block, skip block and return True, otherwise return False.
    """
    self.MLEN = self.verboseRead(MetablockLengthAlphabet())
    if self.MLEN:
        return False
    #empty block; skip the given number of bytes and return True
    self.verboseRead(ReservedAlphabet())
    skipBytes = self.verboseRead(SkipLengthAlphabet())
    self.verboseRead(FillerAlphabet(streamPos=self.stream.pos))
    self.stream.pos += 8*skipBytes
    print("Skipping to {:x}".format(self.stream.pos>>3))
    return True
|
|
|
|
def uncompressed(self):
    """Read the flag that tells if the block is uncompressed.
    If it is, copy self.MLEN bytes from the stream to the output
    and show them. Returns the flag.
    """
    flag = self.verboseRead(
        BoolCode('UNCMPR', description='Is uncompressed?'))
    if not flag:
        return flag
    #the data follows after the filler bits
    self.verboseRead(FillerAlphabet(streamPos=self.stream.pos))
    print('Uncompressed data:')
    self.output += self.stream.readBytes(self.MLEN)
    print(outputFormatter(self.output[-self.MLEN:]))
    return flag
|
|
|
|
def blockType(self, kind):
    """Read block type switch descriptor for given kind of blockType.
    Stores the number of block types and the first block count;
    if there are two or more block types, also reads and stores
    the prefix codes for block types and block counts.
    """
    letter = kind[0].upper()
    NBLTYPES = self.verboseRead(TypeCountAlphabet(
        'BT#'+letter,
        description='{} block types'.format(kind),
        ))
    self.numberOfBlockTypes[kind] = NBLTYPES
    if NBLTYPES<2:
        #single block type: there is no block switch, one fixed count
        self.currentBlockCounts[kind] = 1<<24
        return
    self.blockTypeCodes[kind] = self.readPrefixCode(
        BlockTypeAlphabet('BT'+letter, NBLTYPES))
    self.blockCountCodes[kind] = self.readPrefixCode(
        BlockCountAlphabet('BC'+letter))
    self.currentBlockCounts[kind] = self.verboseRead(
        self.blockCountCodes[kind])
|
|
|
|
def readLiteralContextModes(self):
    """Read literal context modes, one for every literal block type,
    into self.literalContextModes.
    The modes, as described by the original author:
    LSB6: lower 6 bits of last char
    MSB6: upper 6 bits of last char
    UTF8: roughly dependent on categories:
        upper 4 bits depend on category of last char:
            control/whitespace/space/ punctuation/quote/%/open/close/
            comma/period/=/digits/ VOWEL/CONSONANT/vowel/consonant
        lower 2 bits depend on category of 2nd last char:
            space/punctuation/digit or upper/lowercase
    signed: hamming weight of last 2 chars
    """
    print('Context modes'.center(60, '-'))
    modes = self.literalContextModes = []
    modes.extend(
        self.verboseRead(LiteralContextMode(number=blockType))
        for blockType in range(self.numberOfBlockTypes[L]))
|
|
|
|
def contextMap(self, kind):
    """Read context map for kind (literal or distance)
    and store it in self.cmaps[kind].
    The map has 64 (literal) or 4 (distance) entries
    for every block type of that kind.
    Returns the number of different values on the context map
    (In other words, the number of prefix trees)
    """
    NTREES = self.verboseRead(TypeCountAlphabet(
        kind[0].upper()+'T#',
        description='{} prefix trees'.format(kind)))
    mapSize = {L:64, D:4}[kind]
    tableSize = mapSize*self.numberOfBlockTypes[kind]
    if NTREES<2:
        #a single tree: the map is all zero and not present in the stream.
        #It still needs entries for every block type, since it is
        #indexed with the block type in the upper bits.
        self.cmaps[kind] = [0]*tableSize
    else:
        #read CMAPkind
        RLEMAX = self.verboseRead(RLEmaxAlphabet(
            'RLE#'+kind[0].upper(),
            description=kind+' context map'))
        alphabet = TreeAlphabet('CM'+kind[0].upper(), NTREES=NTREES, RLEMAX=RLEMAX)
        cmapCode = self.readPrefixCode(alphabet)
        cmap = []
        while len(cmap)<tableSize:
            #the description shows where in the map we are
            cmapCode.description = 'map {}, entry {}'.format(
                *divmod(len(cmap), mapSize))
            #each symbol gives one entry, or a run of zeroes
            count, value = self.verboseRead(cmapCode)
            cmap.extend([value]*count)
        #a run of zeroes must not overshoot the end of the map
        assert len(cmap)==tableSize
        IMTF = self.verboseRead(BoolCode('IMTF', description='Apply inverse MTF'))
        if IMTF:
            self.IMTF(cmap)
        if kind==L:
            print('Context maps for literal data:')
            #one line for every block type, in groups of 8 entries
            for i in range(0, len(cmap), 64):
                print(*(
                    ''.join(map(str, cmap[j:j+8]))
                    for j in range(i, i+64, 8)
                    ))
        else:
            print('Context map for distances:')
            #one group of 4 hex digits for every block type
            print(*(
                ''.join(map('{:x}'.format, cmap[i:i+4]))
                for i in range(0, len(cmap), 4)
                ))
        self.cmaps[kind] = cmap
    return NTREES
|
|
|
|
@staticmethod
|
|
def IMTF(v):
|
|
"""In place inverse move to front transform.
|
|
"""
|
|
#mtf is initialized virtually with range(infinity)
|
|
mtf = []
|
|
for i, vi in enumerate(v):
|
|
#get old value from mtf. If never seen, take virtual value
|
|
try: value = mtf.pop(vi)
|
|
except IndexError: value = vi
|
|
#put value at front
|
|
mtf.insert(0, value)
|
|
#replace transformed value
|
|
v[i] = value
|
|
|
|
def readPrefixArray(self, kind, numberOfTrees):
    """Read prefix code array: numberOfTrees prefix codes for the
    alphabet that belongs to kind, stored as list in
    self.prefixCodes[kind].
    """
    codes = []
    for number in range(numberOfTrees):
        #every tree gets its own alphabet object, named by its number
        if kind==L:
            alphabet = LiteralAlphabet(number)
        elif kind==I:
            alphabet = InsertAndCopyAlphabet(number)
        elif kind==D:
            alphabet = DistanceAlphabet(
                number, NPOSTFIX=self.NPOSTFIX, NDIRECT=self.NDIRECT)
        self.readPrefixCode(alphabet)
        codes.append(alphabet)
    self.prefixCodes[kind] = codes
|
|
|
|
#metablock data
|
|
def metablock(self):
    """Process the data of a meta block: decode and show the
    insert&copy commands until self.MLEN bytes have been produced.
    Relevant variables of self:
    numberOfBlockTypes[kind]: number of block types
    currentBlockTypes[kind]: current block types (=0)
    literalContextModes: the context modes for the literal block types
    currentBlockCounts[kind]: counters for block types
    blockTypeCodes[kind]: code for block type
    blockCountCodes[kind]: code for block count
    cmaps[kind]: the context maps (not for I)
    prefixCodes[kind][#]: the prefix codes
    lastDistances: the last four distances
    contextMode: keeps the last two chars
    output: the result

    Raises InvalidStream if a distance is zero or negative.
    """
    print('Meta block contents'.center(60, '='))
    self.currentBlockTypes = {L:0, I:0, D:0, pL:1, pI:1, pD:1}
    #The last distances belong to the stream, not to the meta block:
    #they are set at the start of the stream only (fourth last 16,
    #third last 15, second last 11, last 4). As long as there is no
    #output, no distance can have been used, so resetting is harmless.
    if not self.output or not hasattr(self, 'lastDistances'):
        self.lastDistances = deque([16,15,11,4], maxlen=4)
    #the current context mode is for block type 0
    self.contextMode = ContextModeKeeper(self.literalContextModes[0])
    #the context is made from the last two bytes of the stream,
    #also if those come from an earlier meta block
    for previous in self.output[-2:]:
        self.contextMode.add(previous)
    wordList = WordList()

    #setup distance callback function
    def distanceCallback(symbol, extra):
        "callback function for displaying decoded distance"
        index, offset = symbol.value(extra)
        if index:
            #recent distance
            distance = self.lastDistances[-index]+offset
            return 'Distance: {}last{:+d}={}'.format(index, offset, distance)
        #absolute value
        if offset<=maxDistance:
            return 'Absolute value: {} (pos {})'.format(
                offset, len(self.output)-offset)
        #word list value; same computation as for the actual fetch below
        action, word = divmod(
            offset-maxDistance-1, 1<<wordList.NDBITS[copyLen])
        return '{}-{}-1 gives word {},{} action {}'.format(
            offset, maxDistance, copyLen, word, action)
    for dpc in self.prefixCodes[D]: dpc.callback = distanceCallback

    blockLen = 0
    try:
        #there we go
        while blockLen<self.MLEN:
            #get insert&copy command
            litLen, copyLen, dist0Flag = self.verboseRead(
                self.prefixCodes[I][
                    self.figureBlockType(I)])
            #literal data
            for i in range(litLen):
                bt = self.figureBlockType(L)
                #every literal block type has its own context mode
                self.contextMode.setContextMode(self.literalContextModes[bt])
                cm = self.contextMode.getIndex()
                ct = self.cmaps[L][bt<<6|cm]
                char = self.verboseRead(
                    self.prefixCodes[L][ct],
                    context='{},{}='.format(bt,cm))
                self.contextMode.add(char)
                self.output.append(char)
            blockLen += litLen
            #check if we're done
            if blockLen>=self.MLEN: return
            #distance
            #distances are computed relative to output length, at most window size
            maxDistance = min(len(self.output), self.windowSize)
            if dist0Flag:
                distance = self.lastDistances[-1]
            else:
                bt = self.figureBlockType(D)
                #the context of a distance is given by the copy length
                cm = {2:0, 3:1, 4:2}.get(copyLen, 3)
                ct = self.cmaps[D][bt<<2|cm]
                index, offset = self.verboseRead(
                    self.prefixCodes[D][ct],
                    context='{},{}='.format(bt,cm))
                distance = self.lastDistances[-index]+offset if index else offset
                if index==1 and offset==0:
                    #to make sure distance is not put in last distance list
                    dist0Flag = True
            if distance<=0:
                #would silently index from the wrong end of the output
                raise InvalidStream(
                    'distance {} is not positive'.format(distance))
            if distance<=maxDistance:
                #copy from output; the distance counts back from the
                #end of the output, which is beyond maxDistance
                #as soon as the output is larger than the window
                start = len(self.output)-distance
                #byte by byte, since source and target may overlap
                for i in range(start, start+copyLen):
                    self.output.append(self.output[i])
                if not dist0Flag: self.lastDistances.append(distance)
                comment = 'Seen before'
            else:
                #fetch from wordlist
                newWord = wordList.word(copyLen, distance-maxDistance-1)
                self.output.extend(newWord)
                #adjust copyLen to reflect actual new data
                copyLen = len(newWord)
                comment = 'From wordlist'
            blockLen += copyLen
            print(' '*40,
                comment,
                ': "',
                outputFormatter(self.output[-copyLen:]),
                '"',
                sep='')
            self.contextMode.add(self.output[-2])
            self.contextMode.add(self.output[-1])
    finally:
        #the word list keeps the dictionary file open
        wordList.file.close()
|
|
|
|
def figureBlockType(self, kind):
    """Return the block type that applies to the next symbol of `kind`.

    `kind` is the key used in the per-kind tables (callers in this file
    pass L, I or D). Every call uses up one symbol of the current block.
    When the remaining count for `kind` is zero, a block switch is read
    from the stream first:

    - a block type symbol is read with blockTypeCodes[kind];
      -2 selects the previously used type (stored under 'P'+kind),
      -1 selects the current type plus one, modulo
      numberOfBlockTypes[kind], and any other value is used as is;
    - the type that was current becomes the new "previous" type;
    - a fresh block count is read with blockCountCodes[kind].

    Returns the (possibly updated) current block type for `kind`.
    """
    counts, types = self.currentBlockCounts, self.currentBlockTypes
    if counts[kind] == 0:
        #current block is used up: read a block switch command
        newType = self.verboseRead(self.blockTypeCodes[kind])
        if newType == -2:
            newType = types['P'+kind]
        elif newType == -1:
            newType = (types[kind]+1) % self.numberOfBlockTypes[kind]
        #remember the outgoing type before replacing it
        types['P'+kind] = types[kind]
        types[kind] = newType
        counts[kind] = self.verboseRead(self.blockCountCodes[kind])
    #account for the symbol the caller is about to read
    counts[kind] -= 1
    return types[kind]
|
|
|
|
# Doctest registry: doctest.testmod() (called from the __main__ block at the
# end of this file) runs the ">>>" examples found in each string value; the
# keys only label the groups.
# Notes for maintainers:
#  - 'Layout' and 'olleke' use the module-level `olleke` BitStream, which is
#    only assigned in the __main__ block when `brotli` can be imported.
#  - 'XY' calls `brotli.compress` directly, so it needs that import as well.
#  - 'file' opens a hard-coded absolute path (H:/Downloads/...), so it can
#    only pass on a machine that has that file.
#  - testmod() is called with REPORT_NDIFF only, so the expected output in
#    these strings is compared literally, including spacing.
__test__ = {
|
|
'BitStream': """
|
|
>>> bs = BitStream(b'Jurjen')
|
|
>>> bs.readBytes(2)
|
|
b'Ju'
|
|
>>> bs.read(6) #r=01110010
|
|
50
|
|
>>> bs
|
|
BitStream(pos=2:6)
|
|
>>> bs.peek(5) #j=01101010
|
|
9
|
|
>>> bs.readBytes(2)
|
|
Traceback (most recent call last):
|
|
...
|
|
ValueError: readBytes: need byte boundary
|
|
""",
|
|
|
|
'Symbol': """
|
|
>>> a=Symbol(MetablockLengthAlphabet(),5)
|
|
>>> len(a)
|
|
2
|
|
>>> int(a)
|
|
5
|
|
>>> a.bitPattern()
|
|
'01'
|
|
>>> a.value(200000)
|
|
200001
|
|
>>> a.explanation(300000)
|
|
'data length: 493e0h+1=300001'
|
|
""",
|
|
|
|
'RangeDecoder': """
|
|
>>> a=RangeDecoder(bitLength=3)
|
|
>>> len(a)
|
|
8
|
|
>>> a.name='t'
|
|
>>> list(a)
|
|
[Symbol(t, 0), Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4), Symbol(t, 5), Symbol(t, 6), Symbol(t, 7)]
|
|
>>> a[2]
|
|
Symbol(t, 2)
|
|
>>> a.bitPattern(4)
|
|
'100'
|
|
>>> a.length(2)
|
|
3
|
|
>>> a.decodePeek(15)
|
|
(3, Symbol(t, 7))
|
|
>>>
|
|
|
|
""",
|
|
|
|
'PrefixDecoder': """
|
|
>>> a=PrefixDecoder(decodeTable={0:1,1:2,3:3,7:4})
|
|
>>> len(a)
|
|
4
|
|
>>> a.name='t'
|
|
>>> list(a)
|
|
[Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4)]
|
|
>>> a.decodePeek(22)
|
|
(1, Symbol(t, 1))
|
|
>>> a.decodePeek(27)
|
|
(3, Symbol(t, 3))
|
|
>>> a.length(1)
|
|
1
|
|
>>> a.length(4)
|
|
3
|
|
""",
|
|
|
|
'Code': """
|
|
>>> a=Code('t',alphabetSize=10)
|
|
>>> len(a)
|
|
10
|
|
>>> a.showCode()
|
|
0000:0 0001:1 0010:2 0011:3 0100:4 0101:5 0110:6 0111:7 1000:8 1001:9
|
|
>>> a.setLength({2:1,3:2,5:3,6:3})
|
|
>>> a.showCode()
|
|
0:2 01:3 011:5 111:6
|
|
>>> len(a)
|
|
4
|
|
>>> def callback(i): return 'call{}back'.format(i)
|
|
>>> a=Code('t',callback=callback,bitLength=3)
|
|
>>> a[6].explanation()
|
|
'call6back'
|
|
""",
|
|
|
|
'WithExtra': """
|
|
>>> class A(WithExtra):
|
|
... extraTable = [0,1,1,2,2]
|
|
>>> a=A('t',alphabetSize=5)
|
|
>>> a[1]
|
|
Symbol(t, 1)
|
|
>>> a.extraBits(2)
|
|
1
|
|
>>> a.mnemonic(4)
|
|
'4'
|
|
>>> a.readTupleAndExtra(BitStream(b'\x5b'))
|
|
(3, Symbol(t, 3), 2, 3)
|
|
""",
|
|
|
|
'BoolCode': """
|
|
>>> BoolCode('test')[0].explanation()
|
|
'0: False'
|
|
""",
|
|
|
|
'Enumerator': """
|
|
>>> class A(Enumerator):
|
|
... extraTable = [0,1,1,2,2]
|
|
... value0=3
|
|
>>> a=A(alphabetLength=5)
|
|
>>> a.value(3)
|
|
Traceback (most recent call last):
|
|
...
|
|
TypeError: value() missing 1 required positional argument: 'extra'
|
|
>>> a.explanation(3,4)
|
|
'xx 011: 8-11; 8+4=12'
|
|
""",
|
|
|
|
'WindowSizeAlphabet': """
|
|
>>> windowSizeAlphabet = WindowSizeAlphabet()
|
|
>>> windowSizeAlphabet[0]
|
|
Traceback (most recent call last):
|
|
...
|
|
ValueError: No symbol WindowSizeAlphabet[0]
|
|
>>> len(windowSizeAlphabet)
|
|
16
|
|
>>> windowSizeAlphabet[21]
|
|
Symbol(WSIZE, 21)
|
|
>>> windowSizeAlphabet[21].bitPattern()
|
|
'1001'
|
|
>>> windowSizeAlphabet[21].extraBits()
|
|
0
|
|
>>> windowSizeAlphabet[21].index
|
|
21
|
|
>>> windowSizeAlphabet[10].value()
|
|
1008
|
|
>>> windowSizeAlphabet[10].explanation()
|
|
'windowsize=(1<<10)-16=1008'
|
|
>>> windowSizeAlphabet.showCode()
|
|
0:65520 1100001:16368 1110001:32752 0011:262128
|
|
0000001:131056 0010001:None 1001:2097136 1011:4194288
|
|
1000001:4080 1010001:8176 0101:524272 0111:1048560
|
|
0100001:1008 0110001:2032 1101:8388592 1111:16777200
|
|
""",
|
|
|
|
'TypeCountAlphabet': """
|
|
>>> typeCountAlphabet = TypeCountAlphabet(description='bananas')
|
|
>>> len(typeCountAlphabet)
|
|
9
|
|
>>> typeCountAlphabet[3]
|
|
Symbol(BT#, 3)
|
|
>>> typeCountAlphabet[9]
|
|
Traceback (most recent call last):
|
|
...
|
|
ValueError: No symbol TypeCountAlphabet[9]
|
|
>>> print(typeCountAlphabet[3])
|
|
xx,0101
|
|
>>> typeCountAlphabet[8].value(127)
|
|
256
|
|
>>> typeCountAlphabet[4].explanation(2)
|
|
'xxx,0111: 11 bananas'
|
|
>>> typeCountAlphabet[0].explanation()
|
|
'0: 1 banana'
|
|
""",
|
|
|
|
'DistanceParamAlphabet': """
|
|
>>> dpa = DistanceParamAlphabet()
|
|
>>> dpa.showCode()
|
|
00:PF0 01:PF1 10:PF2 11:PF3
|
|
>>> dpa.readTupleAndExtra(BitStream(b'\\x29'))
|
|
(2, Symbol(DIST, 1), 4, 10)
|
|
>>> dpa.explanation(2, 5)
|
|
'2 postfix bits and 0101<<2=20 direct codes'
|
|
""",
|
|
|
|
'LiteralAlphabet': """
|
|
>>> LiteralAlphabet(-1).showCode() #doctest: +ELLIPSIS
|
|
00000000:\\x00 00110100:4 01101000:h 10011100:\\x9c 11010000:\\xd0
|
|
00000001:\\x01 00110101:5 01101001:i 10011101:\\x9d 11010001:\\xd1
|
|
00000010:\\x02 00110110:6 01101010:j 10011110:\\x9e 11010010:\\xd2
|
|
...
|
|
00101111:/ 01100011:c 10010111:\\x97 11001011:\\xcb 11111111:\\xff
|
|
00110000:0 01100100:d 10011000:\\x98 11001100:\\xcc
|
|
00110001:1 01100101:e 10011001:\\x99 11001101:\\xcd
|
|
00110010:2 01100110:f 10011010:\\x9a 11001110:\\xce
|
|
00110011:3 01100111:g 10011011:\\x9b 11001111:\\xcf
|
|
""",
|
|
|
|
'BlockCountAlphabet': """
|
|
>>> bc=BlockCountAlphabet('BCL')
|
|
>>> len(bc)
|
|
26
|
|
>>> bs=BitStream(b'\\x40\\x83\\xc8\\x59\\12\\x02')
|
|
>>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
|
|
'Block count: xx 00000: 1-4; 1+2=3'
|
|
>>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
|
|
'Block count: xxx 00110: 33-40; 33+0=33'
|
|
>>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
|
|
'Block count: xxxxxx 10001: 305-368; 305+28=333'
|
|
>>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
|
|
'Block count: xxxxxxxxxxx 10110: 2289-4336; 2289+1044=3333'
|
|
""",
|
|
|
|
'Layout': """
|
|
>>> olleke.pos = 0
|
|
>>> l = Layout(olleke)
|
|
>>> l.verboseRead(WindowSizeAlphabet())
|
|
0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
|
|
4194288
|
|
>>> l.verboseRead(BoolCode('LAST', description="Last block"))
|
|
1 LAST Last block: 1: True
|
|
True
|
|
>>> l.verboseRead(BoolCode('EMPTY', description="Empty block"))
|
|
0 EMPTY Empty block: 0: False
|
|
False
|
|
>>> l.verboseRead(MetablockLengthAlphabet())
|
|
0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47
|
|
47
|
|
>>> olleke.pos = 76
|
|
>>> l = Layout(olleke)
|
|
>>> x = l.verboseRead(DistanceAlphabet(0,NPOSTFIX=0,NDIRECT=0), skipExtra=True)
|
|
000a 82 10|1100 D0 10[15*x]-3
|
|
>>> x.explanation(0x86a3)
|
|
'10[1000011010100011]-3: [0]+100000'
|
|
""",
|
|
|
|
'olleke': """
|
|
>>> olleke.pos = 0
|
|
>>> try: Layout(olleke).processStream()
|
|
... except NotImplementedError: pass
|
|
... #doctest: +REPORT_NDIFF
|
|
addr hex binary context explanation
|
|
-----------------------Stream header------------------------
|
|
0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
|
|
======================Metablock header======================
|
|
1 LAST Last block: 1: True
|
|
0 EMPTY Empty block: 0: False
|
|
0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47
|
|
-------------------Block type descriptors-------------------
|
|
0003 00 0 BT#L 0: 1 literal block type
|
|
0 BT#I 0: 1 insert© block type
|
|
0 BT#D 0: 1 distance block type
|
|
------------------Distance code parameters------------------
|
|
0004 44 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes
|
|
-----------------------Context modes------------------------
|
|
10 LC0 Context mode for type 0: 2(UTF8)
|
|
------------------------Context maps------------------------
|
|
0 LT# 0: 1 literal prefix tree
|
|
0 DT# 0: 1 distance prefix tree
|
|
---------------------Prefix code lists----------------------
|
|
10 PFX L0 is complex with lengths 3,4,0,5,17...
|
|
0005 4f 1|0 ##L0 len 3: coded with 3 bits
|
|
0111 ##L0 len 4: coded with 1 bits
|
|
10 ##L0 unused: coded with 3 bits
|
|
0006 d6 0|0 ##L0 len 5: skipped
|
|
011 ##L0 zero xxx: coded with 2 bits
|
|
***** Lengths for L0 will be coded as:
|
|
0:len 4 01:zero xxx 011:unused 111:len 3
|
|
0007 95 1|11,01 #L0 7+3 unused
|
|
0 #L0 Length for \\n is 4 bits
|
|
001,01 #L0 1+3 unused
|
|
0008 44 010,0|1 #L0 total 19+2 unused
|
|
0 #L0 Length for " " is 4 bits
|
|
0 #L0 Length for ! is 4 bits
|
|
0009 cb 011,|01 #L0 3+3 unused
|
|
|110,01 #L0 total 35+6 unused
|
|
000a 82 0 #L0 Length for K is 4 bits
|
|
000,01 #L0 0+3 unused
|
|
0 #L0 Length for O is 4 bits
|
|
000b 4d 01|1 #L0 symbol P unused
|
|
011 #L0 symbol Q unused
|
|
0 #L0 Length for R is 4 bits
|
|
000c 88 000,|01 #L0 0+3 unused
|
|
|100,01 #L0 total 11+4 unused
|
|
000d b6 0 #L0 Length for b is 4 bits
|
|
011 #L0 symbol c unused
|
|
011 #L0 symbol d unused
|
|
000e 27 11|1 #L0 Length for e is 3 bits
|
|
010,01 #L0 2+3 unused
|
|
|0 #L0 Length for k is 4 bits
|
|
000f 1f 111 #L0 Length for l is 3 bits
|
|
011 #L0 symbol m unused
|
|
0 #L0 Length for n is 4 bits
|
|
|0 #L0 Length for o is 4 bits
|
|
0010 c1 000,01 #L0 0+3 unused
|
|
0 #L0 Length for s is 4 bits
|
|
0011 b4 0|11 #L0 symbol t unused
|
|
0 #L0 Length for u is 4 bits
|
|
End of table. Prefix code L0:
|
|
000:e 0010:\\n 0110:! 0001:O 0101:b 0011:n 0111:s
|
|
100:l 1010:" " 1110:K 1001:R 1101:k 1011:o 1111:u
|
|
11,01 PFX IC0 is simple with 4 code words
|
|
0012 2a |2Ah|10 IC0 ? bits: I5C4
|
|
0013 b5 ec 00|B5h IC0 ? bits: I6+xC7
|
|
0015 22 0010|111011 IC0 ? bits: I8+xC5
|
|
0016 8c 001100|0010 IC0 ? bits: I0C14+xx
|
|
0 SHAPE False: lengths 2,2,2,2
|
|
0017 74 10,0|1 PFX D0 is simple with 3 code words
|
|
0018 a6 0|01110 D0 1 bit: 2last-3
|
|
010011 D0 2 bits: 11xx-3
|
|
0019 aa 01010|1 D0 2 bits: 11xxx-3
|
|
====================Meta block contents=====================
|
|
|1,01 IC0 Literal: 9, copy: 5
|
|
001a 41 0001 0,0=L0 O
|
|
100 0,48=L0 l
|
|
001b a2 10|0 0,62=L0 l
|
|
000 0,63=L0 e
|
|
001c a1 1|101 0,59=L0 k
|
|
000 0,63=L0 e
|
|
|1010 0,59=L0 " "
|
|
001d b5 0101 0,11=L0 b
|
|
|1011 0,60=L0 o
|
|
001e 24 0 0,3=D0 Distance: 2last-3=8
|
|
Seen before: "lleke"
|
|
0,10 IC0 Literal: 6, copy: 7
|
|
|0010 0,59=L0 \\n
|
|
001f 89 1001 0,7=L0 R
|
|
000 0,52=L0 e
|
|
0020 fa 010|1 0,58=L0 b
|
|
1111 0,63=L0 u
|
|
0021 eb 011|1 0,59=L0 s
|
|
11,01 0,3=D0 Absolute value: 12 (pos 8)
|
|
Seen before: "olleke\\n"
|
|
0022 db 01,1|1 IC0 Literal: 0, copy: 15
|
|
|110,11 0,3=D0 Absolute value: 27 (pos 0)
|
|
Seen before: "Olleke bolleke\\n"
|
|
0023 f8 00 IC0 Literal: 5, copy: 4
|
|
1110 0,7=L0 K
|
|
0024 2c 00|11 0,52=L0 n
|
|
1011 0,62=L0 o
|
|
0025 0d 1|00 0,59=L0 l
|
|
0110 0,63=L0 !
|
|
""",
|
|
|
|
'file': """
|
|
>>> try: Layout(BitStream(
|
|
... open("H:/Downloads/brotli-master/tests/testdata/10x10y.compressed",'rb')
|
|
... .read())).processStream()
|
|
... except NotImplementedError: pass
|
|
addr hex binary context explanation
|
|
-----------------------Stream header------------------------
|
|
0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
|
|
======================Metablock header======================
|
|
1 LAST Last block: 1: True
|
|
0 EMPTY Empty block: 0: False
|
|
0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20
|
|
-------------------Block type descriptors-------------------
|
|
0003 00 0 BT#L 0: 1 literal block type
|
|
0 BT#I 0: 1 insert© block type
|
|
0 BT#D 0: 1 distance block type
|
|
------------------Distance code parameters------------------
|
|
0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes
|
|
-----------------------Context modes------------------------
|
|
10 LC0 Context mode for type 0: 2(UTF8)
|
|
------------------------Context maps------------------------
|
|
0 LT# 0: 1 literal prefix tree
|
|
0 DT# 0: 1 distance prefix tree
|
|
---------------------Prefix code lists----------------------
|
|
0005 b0 0|1,01 PFX L0 is simple with 2 code words
|
|
0006 b2 0|1011000 L0 1 bit: X
|
|
0007 ea 0|1011001 L0 1 bit: Y
|
|
01,01 PFX IC0 is simple with 2 code words
|
|
0008 81 0000001|111 IC0 1 bit: I1C9&D=0
|
|
0009 47 02 0|47h|1 IC0 1 bit: I1C9
|
|
00,01 PFX D0 is simple with 1 code word
|
|
000b 8a 010|000 D0 0 bits: 10x-3
|
|
====================Meta block contents=====================
|
|
1 IC0 Literal: 1, copy: 9
|
|
0 0,0=L0 X
|
|
0,() 0,3=D0 Absolute value: 1 (pos 0)
|
|
Seen before: "XXXXXXXXX"
|
|
0 IC0 Literal: 1, copy: 9, same distance
|
|
|1 0,54=L0 Y
|
|
Seen before: "YYYYYYYYY"
|
|
""",
|
|
|
|
'XY': """
|
|
>>> try: Layout(BitStream(brotli.compress('X'*10+'Y'*10))).processStream()
|
|
... except NotImplementedError: pass
|
|
addr hex binary context explanation
|
|
-----------------------Stream header------------------------
|
|
0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
|
|
======================Metablock header======================
|
|
1 LAST Last block: 1: True
|
|
0 EMPTY Empty block: 0: False
|
|
0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20
|
|
-------------------Block type descriptors-------------------
|
|
0003 00 0 BT#L 0: 1 literal block type
|
|
0 BT#I 0: 1 insert© block type
|
|
0 BT#D 0: 1 distance block type
|
|
------------------Distance code parameters------------------
|
|
0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes
|
|
-----------------------Context modes------------------------
|
|
10 LC0 Context mode for type 0: 2(UTF8)
|
|
------------------------Context maps------------------------
|
|
0 LT# 0: 1 literal prefix tree
|
|
0 DT# 0: 1 distance prefix tree
|
|
---------------------Prefix code lists----------------------
|
|
0005 b0 0|1,01 PFX L0 is simple with 2 code words
|
|
0006 b2 0|1011000 L0 1 bit: X
|
|
0007 82 0|1011001 L0 1 bit: Y
|
|
00,01 PFX IC0 is simple with 1 code word
|
|
0008 84 0000100|100 IC0 0 bits: I4C6&D=0
|
|
0009 00 00,0|1 PFX D0 is simple with 1 code word
|
|
000a e0 0|00000 D0 0 bits: last
|
|
====================Meta block contents=====================
|
|
() IC0 Literal: 4, copy: 6, same distance
|
|
0 0,0=L0 X
|
|
0 0,52=L0 X
|
|
0 0,54=L0 X
|
|
0 0,54=L0 X
|
|
Seen before: "XXXXXX"
|
|
() IC0 Literal: 4, copy: 6, same distance
|
|
1 0,54=L0 Y
|
|
1 0,54=L0 Y
|
|
|1 0,54=L0 Y
|
|
000b 01 1 0,54=L0 Y
|
|
Seen before: "YYYYYY"
|
|
""",
|
|
|
|
'empty': """
|
|
>>> try: Layout(BitStream(b'\\x81\\x16\\x00\\x58')).processStream()
|
|
... except NotImplementedError: pass
|
|
addr hex binary context explanation
|
|
-----------------------Stream header------------------------
|
|
0000 81 0000001 WSIZE windowsize=(1<<17)-16=131056
|
|
======================Metablock header======================
|
|
|1 LAST Last block: 1: True
|
|
0001 16 0 EMPTY Empty block: 0: False
|
|
11 MLEN 11: empty block
|
|
0 RSVD Reserved (must be zero)
|
|
0002 00 000000|00,01 SKIP skip length: 0h+1=1
|
|
|00 SKIP 2 bits ignored
|
|
Skipping to 4
|
|
""",
|
|
|
|
}
|
|
|
|
if __name__ == '__main__':
    #Script entry point.
    #  with a file argument: dump the structure of that Brotli stream
    #  without arguments:    self test; build sample data and run the doctests
    import os
    import sys
    #resolve the dictionary relative to this script instead of the cwd
    here = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    DICTIONARY_PATH = os.path.realpath(os.path.join(here, DICTIONARY_PATH))
    if len(sys.argv) > 1:
        #read the whole input and close the handle before decoding starts
        with open(sys.argv[1], 'rb') as inputFile:
            l = Layout(BitStream(inputFile.read()))
        l.processStream()
    else:
        #extra search location for the brotli module
        #(presumably the author's private install directory)
        sys.path.append("h:/Persoonlijk/bin")
        try:
            import brotli
        except ImportError:
            #best effort: the doctests still run, but the ones that use
            #`olleke` or `brotli` will fail because those names are missing
            print('brotli module not found: doctests that need it will fail',
                file=sys.stderr)
        else:
            #brotli.compress needs bytes, so read the source in binary mode
            with open('brotlidump.py', 'rb') as sourceFile:
                sourceBytes = sourceFile.read()
            with open('brotlidump.br', 'wb') as compressedFile:
                compressedFile.write(brotli.compress(sourceBytes))
            #sample stream used by the 'Layout' and 'olleke' doctests
            olleke = BitStream(brotli.compress(
                b'Olleke bolleke\nRebusolleke\nOlleke bolleke\nKnol!'))
        import doctest
        doctest.testmod(optionflags=doctest.REPORT_NDIFF
            #|doctest.FAIL_FAST
            )
|