Network: Use public suffix database in DAFSA format

This saves 91kiB of text size and makes it easier to use
a locally installed database when available.

[ChangeLog][Third-Party Code] Moved attribution of
The Public Suffix List from Qt Core to Qt Network.

[ChangeLog][Third-Party Code] Added attribution of new libpsl
library to Qt Network. libpsl is available under
the BSD 3-Clause "New" or "Revised" License.

Task-number: QTBUG-95889
Change-Id: Ibd37c7a94fdf235e75d96fec20d427fb5c2bd2a4
Reviewed-by: Kai Koehne <kai.koehne@qt.io>
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Ievgenii Meshcheriakov 2022-02-21 15:04:05 +01:00
parent d0b22762be
commit cba40055b1
10 changed files with 5446 additions and 14912 deletions

src/3rdparty/libpsl/qt_attribution.json vendored Normal file

@@ -0,0 +1,15 @@
{
"Id": "libpsl",
"Name": "libpsl - C library to handle the Public Suffix List",
"QDocModule": "qtnetwork",
"Description": "Libpsl allows checking domains against the Public Suffix List.",
"Files": "src/lookup_string_in_fixed_set.c src/psl-make-dafsa",
"QtUsage": "Used to compress the embedded copy of publicsuffix list and
to lookup entries in it.",
"Homepage": "https://github.com/rockdaboot/libpsl",
"Version": "664f3dc85259ec65e30248a61fa1c45b7b0e4c3f",
"License": "BSD 3-clause \"New\" or \"Revised\" License",
"LicenseFile": "src/LICENSE.chromium",
"LicenseId": "BSD-3-Clause",
"Copyright": "Copyright 2014-2016 The Chromium Authors. All rights reserved."
}


@@ -0,0 +1,30 @@
* The following License is for the source code files
psl-make-dafsa and lookup_string_in_fixed_set.c.
// Copyright 2015 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


@@ -0,0 +1,273 @@
/* Copyright 2015-2016 The Chromium Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE.chromium file.
*
* Converted to C89 2015 by Tim Rühsen
*/
#include <stddef.h>
#define CHECK_LT(a, b) if ((a) >= b) return 0
static const char multibyte_length_table[16] = {
0, 0, 0, 0, /* 0x00-0x3F */
0, 0, 0, 0, /* 0x40-0x7F */
0, 0, 0, 0, /* 0x80-0xBF */
2, 2, 3, 4, /* 0xC0-0xFF */
};
/*
* Get length of multibyte character sequence starting at a given byte.
* Returns zero if the byte is not a valid leading byte in UTF-8.
*/
static int GetMultibyteLength(char c) {
return multibyte_length_table[((unsigned char)c) >> 4];
}
/*
* Moves pointers one byte forward.
*/
static void NextPos(const unsigned char** pos,
const char** key,
const char** multibyte_start)
{
++*pos;
if (*multibyte_start) {
/* Advance key to next byte in multibyte sequence. */
++*key;
/* Reset multibyte_start if last byte in multibyte sequence was consumed. */
if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
*multibyte_start = 0;
} else {
if (GetMultibyteLength(**key)) {
/* Multibyte prefix was matched in the dafsa, start matching multibyte
* content in next round. */
*multibyte_start = *key;
} else {
/* Advance key as a single byte character was matched. */
++*key;
}
}
}
/*
* Read next offset from pos.
* Returns true if an offset could be read, false otherwise.
*/
static int GetNextOffset(const unsigned char** pos,
const unsigned char* end,
const unsigned char** offset)
{
size_t bytes_consumed;
if (*pos == end)
return 0;
/* When reading an offset the byte array must always contain at least
* three more bytes to consume. First the offset to read, then a node
* to skip over and finally a destination node. No object can be smaller
* than one byte. */
CHECK_LT(*pos + 2, end);
switch (**pos & 0x60) {
case 0x60: /* Read three byte offset */
*offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
bytes_consumed = 3;
break;
case 0x40: /* Read two byte offset */
*offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
bytes_consumed = 2;
break;
default:
*offset += (*pos)[0] & 0x3F;
bytes_consumed = 1;
}
if ((**pos & 0x80) != 0) {
*pos = end;
} else {
*pos += bytes_consumed;
}
return 1;
}
/*
* Check if byte at offset is last in label.
*/
static int IsEOL(const unsigned char* offset, const unsigned char* end)
{
CHECK_LT(offset, end);
return (*offset & 0x80) != 0;
}
/*
* Check if byte at offset matches first character in key.
* This version assumes a range check was already performed by the caller.
*/
static int IsMatchUnchecked(const unsigned char matcher,
const char* key,
const char* multibyte_start)
{
if (multibyte_start) {
/* Multibyte matching mode. */
if (multibyte_start == key) {
/* Match leading byte, which will also match the sequence length. */
return (matcher ^ 0x80) == (const unsigned char)*key;
} else {
/* Match following bytes. */
return (matcher ^ 0xC0) == (const unsigned char)*key;
}
}
/* If key points at a leading byte in a multibyte sequence, but we are not yet
* in multibyte mode, then the dafsa should contain a special byte to indicate
* a mode switch. */
if (GetMultibyteLength(*key)) {
return matcher == 0x1F;
}
/* Normal matching of a single byte character. */
return matcher == (const unsigned char)*key;
}
/*
* Check if byte at offset matches first character in key.
* This version matches characters not last in label.
*/
static int IsMatch(const unsigned char* offset,
const unsigned char* end,
const char* key,
const char* multibyte_start)
{
CHECK_LT(offset, end);
return IsMatchUnchecked(*offset, key, multibyte_start);
}
/*
* Check if byte at offset matches first character in key.
* This version matches characters last in label.
*/
static int IsEndCharMatch(const unsigned char* offset,
const unsigned char* end,
const char* key,
const char* multibyte_start)
{
CHECK_LT(offset, end);
return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
}
/*
* Read return value at offset.
* Returns true if a return value could be read, false otherwise.
*/
static int GetReturnValue(const unsigned char* offset,
const unsigned char* end,
const char* multibyte_start,
int* return_value)
{
CHECK_LT(offset, end);
if (!multibyte_start && (*offset & 0xE0) == 0x80) {
*return_value = *offset & 0x0F;
return 1;
}
return 0;
}
/*
* Looks up the string |key| with length |key_length| in a fixed set of
* strings. The set of strings must be known at compile time. It is converted to
* a graph structure named a DAFSA (Deterministic Acyclic Finite State
* Automaton) by the script psl-make-dafsa during compilation. This permits
* efficient (in time and space) lookup. The graph generated by psl-make-dafsa
* takes the form of a constant byte array which should be supplied via the
* |graph| and |length| parameters. The return value is kDafsaNotFound,
* kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule,
* kDafsaWildcardRule and kDafsaPrivateRule ORed together.
*
* Lookup a domain key in a byte array generated by psl-make-dafsa.
*/
/* prototype to skip warning with -Wmissing-prototypes */
int LookupStringInFixedSet(const unsigned char*, size_t, const char*, size_t);
int LookupStringInFixedSet(const unsigned char* graph,
size_t length,
const char* key,
size_t key_length)
{
const unsigned char* pos = graph;
const unsigned char* end = graph + length;
const unsigned char* offset = pos;
const char* key_end = key + key_length;
const char* multibyte_start = 0;
while (GetNextOffset(&pos, end, &offset)) {
/* char <char>+ end_char offsets
* char <char>+ return value
* char end_char offsets
* char return value
* end_char offsets
* return_value
*/
int did_consume = 0;
if (key != key_end && !IsEOL(offset, end)) {
/* Leading <char> is not a match. Don't dive into this child */
if (!IsMatch(offset, end, key, multibyte_start))
continue;
did_consume = 1;
NextPos(&offset, &key, &multibyte_start);
/* Possible matches at this point:
* <char>+ end_char offsets
* <char>+ return value
* end_char offsets
* return value
*/
/* Remove all remaining <char> nodes possible */
while (!IsEOL(offset, end) && key != key_end) {
if (!IsMatch(offset, end, key, multibyte_start))
return -1;
NextPos(&offset, &key, &multibyte_start);
}
}
/* Possible matches at this point:
* end_char offsets
* return_value
* If one or more <char> elements were consumed, a failure
* to match is terminal. Otherwise, try the next node.
*/
if (key == key_end) {
int return_value;
if (GetReturnValue(offset, end, multibyte_start, &return_value))
return return_value;
/* The DAFSA guarantees that if the first char is a match, all
* remaining char elements MUST match if the key is truly present.
*/
if (did_consume)
return -1;
continue;
}
if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
if (did_consume)
return -1; /* Unexpected */
continue;
}
NextPos(&offset, &key, &multibyte_start);
pos = offset; /* Dive into child */
}
return -1; /* No match */
}
/* prototype to skip warning with -Wmissing-prototypes */
int GetUtfMode(const unsigned char *graph, size_t length);
int GetUtfMode(const unsigned char *graph, size_t length)
{
return length > 0 && graph[length - 1] < 0x80;
}
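
A minimal usage sketch (not part of this patch): it exercises the decoder against the
seven-byte example graph from the psl-make-dafsa documentation below, which encodes
"aa" -> 1 and "a" -> 2, and assumes it is compiled as C++ and linked together with
lookup_string_in_fixed_set.c.

#include <cassert>
#include <cstddef>

/* Declared the same way qtldurl.cpp declares it further down in this commit. */
extern "C" int LookupStringInFixedSet(const unsigned char *graph, std::size_t length,
                                      const char *key, std::size_t key_length);

int main()
{
    /* Example 1 from psl-make-dafsa: words "aa" (return value 1) and "a" (return value 2). */
    static const unsigned char dafsa[7] = {
        0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
    };
    assert(LookupStringInFixedSet(dafsa, sizeof(dafsa), "aa", 2) == 1);
    assert(LookupStringInFixedSet(dafsa, sizeof(dafsa), "a", 1) == 2);
    assert(LookupStringInFixedSet(dafsa, sizeof(dafsa), "b", 1) == -1); /* not in the set */
    return 0;
}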

src/3rdparty/libpsl/src/psl-make-dafsa vendored Executable file

@@ -0,0 +1,692 @@
#!/usr/bin/env python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE.chromium file.
"""
A Deterministic acyclic finite state automaton (DAFSA) is a compact
representation of an unordered word list (dictionary).
https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
This python program converts a list of strings to a byte array in C++.
It fetches strings and return values from a gperf file and generates a C++ file
with a byte array representing a graph that can be used as a memory-efficient
replacement for the perfect hash table.
The input strings must consist of printable 7-bit ASCII characters or UTF-8
multibyte sequences. Control characters in the range [0x00-0x1F] are not
allowed. The return values must be one-digit integers.
In this program a DAFSA is a diamond shaped graph starting at a common
source node and ending at a common sink node. All internal nodes contain
a label and each word is represented by the labels in one path from
the source node to the sink node.
The following python representation is used for nodes:
Source node: [ children ]
Internal node: (label, [ children ])
Sink node: None
The graph is first compressed by prefixes like a trie. In the next step
suffixes are compressed so that the graph gets diamond shaped. Finally
one to one linked nodes are replaced by nodes with the labels joined.
The order of the operations is crucial since lookups will be performed
starting from the source with no backtracking. Thus a node must have at
most one child with a label starting with the same character. The output
is also arranged so that all jumps are to increasing addresses, thus forward
in memory.
The generated output has suffix free decoding so that the sign of leading
bits in a link (a reference to a child node) indicates if it has a size of one,
two or three bytes and if it is the last outgoing link from the actual node.
A node label is terminated by a byte with the leading bit set.
The generated byte array can be described by the following BNF:
<byte> ::= < 8-bit value in range [0x00-0xFF] >
<char> ::= < byte in range [0x1F-0x7F] >
<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
<offset1> ::= < byte in range [0x00-0x3F] >
<offset2> ::= < byte in range [0x40-0x5F] >
<offset3> ::= < byte in range [0x60-0x7F] >
<end_offset1> ::= < byte in range [0x80-0xBF] >
<end_offset2> ::= < byte in range [0xC0-0xDF] >
<end_offset3> ::= < byte in range [0xE0-0xFF] >
<prefix> ::= <char>
<label> ::= <end_char>
| <char> <label>
<end_label> ::= <return_value>
| <char> <end_label>
<offset> ::= <offset1>
| <offset2> <byte>
| <offset3> <byte> <byte>
<end_offset> ::= <end_offset1>
| <end_offset2> <byte>
| <end_offset3> <byte> <byte>
<offsets> ::= <end_offset>
| <offset> <offsets>
<source> ::= <offsets>
<node> ::= <label> <offsets>
| <prefix> <node>
| <end_label>
<graph> ::= <graph>
| <graph> <node>
<version> ::= <empty> # The DAFSA was generated in ASCII mode.
| < byte value 0x01 > # The DAFSA was generated in UTF-8 mode.
<dafsa> ::= <graph> <version>
Decoding:
<char> -> character
<end_char> & 0x7F -> character
<return value> & 0x0F -> integer
<offset1 & 0x3F> -> integer
((<offset2> & 0x1F>) << 8) + <byte> -> integer
((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer
end_offset1, end_offset2 and end_offset3 are decoded the same as offset1,
offset2 and offset3 respectively.
The first offset in a list of offsets is the distance in bytes between the
offset itself and the first child node. Subsequent offsets are the distance
between previous child node and next child node. Thus each offset links a node
to a child node. The distance is always counted between start addresses, i.e.
first byte in decoded offset or first byte in child node.
Transcoding of UTF-8 multibyte sequences:
The original DAFSA format was limited to 7-bit printable ASCII characters in
range [0x20-0xFF], but has been extended to allow UTF-8 multibyte sequences.
By transcoding of such characters the new format preserves compatibility with
old parsers, so that a DAFSA in the extended format can be used by an old
parser without false positives, although strings containing transcoded
characters will never match. Since the format is extended rather than being
changed, a parser supporting the new format will automatically support data
generated in the old format.
Transcoding is performed by insertion of a start byte with the special value
0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
the range of printable ASCII.
2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
Example 1:
%%
aa, 1
a, 2
%%
The input is first parsed to a list of words:
["aa1", "a2"]
A fully expanded graph is created from the words:
source = [node1, node4]
node1 = ("a", [node2])
node2 = ("a", [node3])
node3 = ("\x01", [sink])
node4 = ("a", [node5])
node5 = ("\x02", [sink])
sink = None
Compression results in the following graph:
source = [node1]
node1 = ("a", [node2, node3])
node2 = ("\x02", [sink])
node3 = ("a\x01", [sink])
sink = None
A C++ representation of the compressed graph is generated:
const unsigned char dafsa[7] = {
0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
};
The bytes in the generated array have the following meaning:
0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1
1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a"
2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4
3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5
4: 0x82 <return_value> 0x82 & 0x0F -> return 2
5: 0x61 <char> label character 0x61 -> match "a"
6: 0x81 <return_value> 0x81 & 0x0F -> return 1
Example 2:
%%
aa, 1
bbb, 2
baa, 1
%%
The input is first parsed to a list of words:
["aa1", "bbb2", "baa1"]
Compression results in the following graph:
source = [node1, node2]
node1 = ("b", [node2, node3])
node2 = ("aa\x01", [sink])
node3 = ("bb\x02", [sink])
sink = None
A C++ representation of the compressed graph is generated:
const unsigned char dafsa[11] = {
0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
};
The bytes in the generated array have the following meaning:
0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2
1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5
2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b"
3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5
4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8
5: 0x61 <char> label character 0x61 -> match "a"
6: 0x61 <char> label character 0x61 -> match "a"
7: 0x81 <return_value> 0x81 & 0x0F -> return 1
8: 0x62 <char> label character 0x62 -> match "b"
9: 0x62 <char> label character 0x62 -> match "b"
10: 0x82 <return_value> 0x82 & 0x0F -> return 2
"""
import sys
import os.path
import hashlib
class InputError(Exception):
"""Exception raised for errors in the input file."""
# Length of a character starting at a given byte.
char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x0F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x10-0x1F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x20-0x2F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x30-0x3F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x40-0x4F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x50-0x5F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x60-0x6F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x70-0x7F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x80-0x8F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x90-0x9F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xA0-0xAF
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xB0-0xBF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xC0-0xCF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xD0-0xDF
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
def to_bytes(n):
"""Converts an integer value to a bytes object."""
return bytes(bytearray((n,)))
def to_dafsa(words, utf_mode):
"""Generates a DAFSA from a word list and returns the source node.
Each word is split into characters so that each character is represented by
a unique node. It is assumed the word list is not empty.
"""
if not words:
raise InputError('The domain list must not be empty')
def to_nodes(word, multibyte_length):
"""Split words into characters"""
byte = ord(word[:1])
if multibyte_length:
# Consume next byte in multibyte sequence.
if byte & 0xC0 != 0x80:
raise InputError('Invalid UTF-8 multibyte sequence')
return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
char_length = char_length_table[byte]
if char_length == 1:
# 7-bit printable ASCII.
if len(word) == 1:
return to_bytes(int(word[:1], 16) & 0x0F), [None]
return word[:1], [to_nodes(word[1:], 0)]
elif char_length > 1:
# Leading byte in multibyte sequence.
if not utf_mode:
raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
if len(word) <= char_length:
raise InputError('Unterminated UTF-8 multibyte sequence')
return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
# Unexpected character.
raise InputError('Domain names must be printable ASCII or UTF-8')
return [to_nodes(word, 0) for word in words]
def to_words(node):
"""Generates a word list from all paths starting from an internal node."""
if not node:
return [b'']
return [(node[0] + word) for child in node[1] for word in to_words(child)]
def reverse(dafsa):
"""Generates a new DAFSA that is reversed, so that the old sink node becomes
the new source node.
"""
sink = []
nodemap = {}
def dfs(node, parent):
"""Creates reverse nodes.
A new reverse node will be created for each old node. The new node will
get a reversed label and the parents of the old node as children.
"""
if not node:
sink.append(parent)
elif id(node) not in nodemap:
nodemap[id(node)] = (node[0][::-1], [parent])
for child in node[1]:
dfs(child, nodemap[id(node)])
else:
nodemap[id(node)][1].append(parent)
for node in dafsa:
dfs(node, None)
return sink
def join_labels(dafsa):
"""Generates a new DAFSA where internal nodes are merged if there is a one to
one connection.
"""
parentcount = {id(None): 2}
nodemap = {id(None): None}
def count_parents(node):
"""Count incoming references"""
if id(node) in parentcount:
parentcount[id(node)] += 1
else:
parentcount[id(node)] = 1
for child in node[1]:
count_parents(child)
def join(node):
"""Create new nodes"""
if id(node) not in nodemap:
children = [join(child) for child in node[1]]
if len(children) == 1 and parentcount[id(node[1][0])] == 1:
child = children[0]
nodemap[id(node)] = (node[0] + child[0], child[1])
else:
nodemap[id(node)] = (node[0], children)
return nodemap[id(node)]
for node in dafsa:
count_parents(node)
return [join(node) for node in dafsa]
def join_suffixes(dafsa):
"""Generates a new DAFSA where nodes that represent the same word lists
towards the sink are merged.
"""
nodemap = {frozenset((b'',)): None}
def join(node):
"""Returns a matching node. A new node is created if no matching node
exists. The graph is accessed in dfs order.
"""
suffixes = frozenset(to_words(node))
if suffixes not in nodemap:
nodemap[suffixes] = (node[0], [join(child) for child in node[1]])
return nodemap[suffixes]
return [join(node) for node in dafsa]
def top_sort(dafsa):
"""Generates list of nodes in topological sort order."""
incoming = {}
def count_incoming(node):
"""Counts incoming references."""
if node:
if id(node) not in incoming:
incoming[id(node)] = 1
for child in node[1]:
count_incoming(child)
else:
incoming[id(node)] += 1
for node in dafsa:
count_incoming(node)
for node in dafsa:
incoming[id(node)] -= 1
waiting = [node for node in dafsa if incoming[id(node)] == 0]
nodes = []
while waiting:
node = waiting.pop()
assert incoming[id(node)] == 0
nodes.append(node)
for child in node[1]:
if child:
incoming[id(child)] -= 1
if incoming[id(child)] == 0:
waiting.append(child)
return nodes
def encode_links(children, offsets, current):
"""Encodes a list of children as one, two or three byte offsets."""
if not children[0]:
# This is an <end_label> node and no links follow such nodes
assert len(children) == 1
return []
guess = 3 * len(children)
assert children
children = sorted(children, key=lambda x: -offsets[id(x)])
while True:
offset = current + guess
buf = []
for child in children:
last = len(buf)
distance = offset - offsets[id(child)]
assert distance > 0 and distance < (1 << 21)
if distance < (1 << 6):
# A 6-bit offset: "s0xxxxxx"
buf.append(distance)
elif distance < (1 << 13):
# A 13-bit offset: "s10xxxxxxxxxxxxx"
buf.append(0x40 | (distance >> 8))
buf.append(distance & 0xFF)
else:
# A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
buf.append(0x60 | (distance >> 16))
buf.append((distance >> 8) & 0xFF)
buf.append(distance & 0xFF)
# Distance in first link is relative to following record.
# Distance in other links are relative to previous link.
offset -= distance
if len(buf) == guess:
break
guess = len(buf)
# Set most significant bit to mark end of links in this node.
buf[last] |= (1 << 7)
buf.reverse()
return buf
def encode_prefix(label):
"""Encodes a node label as a list of bytes without a trailing high byte.
This method encodes a node if there is exactly one child and the
child follows immediately after so that no jump is needed. This label
will then be a prefix to the label in the child node.
"""
assert label
return [c for c in bytearray(reversed(label))]
def encode_label(label):
"""Encodes a node label as a list of bytes with a trailing high byte >0x80.
"""
buf = encode_prefix(label)
# Set most significant bit to mark end of label in this node.
buf[0] |= (1 << 7)
return buf
def encode(dafsa, utf_mode):
"""Encodes a DAFSA to a list of bytes"""
output = []
offsets = {}
for node in reversed(top_sort(dafsa)):
if (len(node[1]) == 1 and node[1][0] and
(offsets[id(node[1][0])] == len(output))):
output.extend(encode_prefix(node[0]))
else:
output.extend(encode_links(node[1], offsets, len(output)))
output.extend(encode_label(node[0]))
offsets[id(node)] = len(output)
output.extend(encode_links(dafsa, offsets, len(output)))
output.reverse()
if utf_mode:
output.append(0x01)
return output
def to_cxx(data, codecs):
"""Generates C++ code from a list of encoded bytes."""
text = b'/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
text += b'The byte array encodes effective tld names. See psl-make-dafsa source for'
text += b' documentation.'
text += b'*/\n\n'
text += b'static const unsigned char kDafsa['
text += bytes(str(len(data)), **codecs)
text += b'] = {\n'
for i in range(0, len(data), 12):
text += b' '
text += bytes(', '.join('0x%02x' % byte for byte in data[i:i + 12]), **codecs)
text += b',\n'
text += b'};\n'
return text
def sha1_file(name):
sha1 = hashlib.sha1()
with open(name, 'rb') as f:
while True:
data = f.read(65536)
if not data:
break
sha1.update(data)
return sha1.hexdigest()
def to_cxx_plus(data, codecs):
"""Generates C++ code from a word list plus some variable assignments as needed by libpsl"""
text = to_cxx(data, codecs)
text += b'static time_t _psl_file_time = %d;\n' % os.stat(psl_input_file).st_mtime
text += b'static int _psl_nsuffixes = %d;\n' % psl_nsuffixes
text += b'static int _psl_nexceptions = %d;\n' % psl_nexceptions
text += b'static int _psl_nwildcards = %d;\n' % psl_nwildcards
text += b'static const char _psl_sha1_checksum[] = "%s";\n' % bytes(sha1_file(psl_input_file), **codecs)
text += b'static const char _psl_filename[] = "%s";\n' % bytes(psl_input_file, **codecs)
return text
def words_to_whatever(words, converter, utf_mode, codecs):
"""Generates C++ code from a word list"""
dafsa = to_dafsa(words, utf_mode)
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
dafsa = fun(dafsa)
return converter(encode(dafsa, utf_mode), codecs)
def words_to_cxx(words, utf_mode, codecs):
"""Generates C++ code from a word list"""
return words_to_whatever(words, to_cxx, utf_mode, codecs)
def words_to_cxx_plus(words, utf_mode, codecs):
"""Generates C++ code from a word list plus some variable assignments as needed by libpsl"""
return words_to_whatever(words, to_cxx_plus, utf_mode, codecs)
def words_to_binary(words, utf_mode, codecs):
"""Generates C++ code from a word list"""
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, lambda x, _: bytearray(x), utf_mode, codecs)
def parse_psl(infile, utf_mode, codecs):
"""Parses PSL file and extract strings and return code"""
PSL_FLAG_EXCEPTION = (1<<0)
PSL_FLAG_WILDCARD = (1<<1)
PSL_FLAG_ICANN = (1<<2) # entry of ICANN section
PSL_FLAG_PRIVATE = (1<<3) # entry of PRIVATE section
PSL_FLAG_PLAIN = (1<<4) #just used for PSL syntax checking
global psl_nsuffixes, psl_nexceptions, psl_nwildcards
psl = {}
section = 0
for line in infile:
line = bytes(line.strip(), **codecs)
if not line:
continue
if line.startswith(b'//'):
if section == 0:
if b'===BEGIN ICANN DOMAINS===' in line:
section = PSL_FLAG_ICANN
elif b'===BEGIN PRIVATE DOMAINS===' in line:
section = PSL_FLAG_PRIVATE
elif section == PSL_FLAG_ICANN and b'===END ICANN DOMAINS===' in line:
section = 0
elif section == PSL_FLAG_PRIVATE and b'===END PRIVATE DOMAINS===' in line:
section = 0
continue # skip comments
if line[:1] == b'!':
psl_nexceptions += 1
flags = PSL_FLAG_EXCEPTION | section
line = line[1:]
elif line[:1] == b'*':
if line[1:2] != b'.':
print('Unsupported kind of rule (ignored): %s' % line)
continue
psl_nwildcards += 1
psl_nsuffixes += 1
flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
line = line[2:]
else:
psl_nsuffixes += 1
flags = PSL_FLAG_PLAIN | section
punycode = line.decode('utf-8').encode('idna')
if punycode in psl:
"""Found existing entry:
Combination of exception and plain rule is ambiguous
!foo.bar
foo.bar
Allowed:
!foo.bar + *.foo.bar
foo.bar + *.foo.bar
"""
print('Found %s/%X (now %X)' % (punycode, psl[punycode], flags))
continue
if utf_mode:
psl[line] = flags
psl[punycode] = flags
# with open("psl.out", 'w') as outfile:
# for (domain, flags) in sorted(psl.iteritems()):
# outfile.write(domain + "%X" % (flags & 0x0F) + "\n")
return [domain + bytes('%X' % (flags & 0x0F), **codecs) for (domain, flags) in sorted(psl.items())]
def usage():
"""Prints the usage"""
print('usage: %s [options] infile outfile' % sys.argv[0])
print(' --output-format=cxx Write DAFSA as C/C++ code (default)')
print(' --output-format=cxx+ Write DAFSA as C/C++ code plus statistical assignments')
print(' --output-format=binary Write DAFSA binary data')
print(' --encoding=ascii 7-bit ASCII mode')
print(' --encoding=utf-8 UTF-8 mode (default)')
exit(1)
def main():
"""Convert PSL file into C or binary DAFSA file"""
if len(sys.argv) < 3:
usage()
converter = words_to_cxx
parser = parse_psl
utf_mode = True
codecs = dict()
if sys.version_info.major > 2:
codecs['encoding'] = 'utf-8'
for arg in sys.argv[1:-2]:
# Check --input-format for backward compatibility
if arg.startswith('--input-format='):
value = arg[15:].lower()
if value == 'psl':
parser = parse_psl
else:
print("Unknown input format '%s'" % value)
return 1
elif arg.startswith('--output-format='):
value = arg[16:].lower()
if value == 'binary':
converter = words_to_binary
elif value == 'cxx':
converter = words_to_cxx
elif value == 'cxx+':
converter = words_to_cxx_plus
else:
print("Unknown output format '%s'" % value)
return 1
elif arg.startswith('--encoding='):
value = arg[11:].lower()
if value == 'ascii':
utf_mode = False
elif value == 'utf-8':
utf_mode = True
else:
print("Unknown encoding '%s'" % value)
return 1
else:
usage()
if sys.argv[-2] == '-':
with open(sys.argv[-1], 'wb') as outfile:
outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
else:
"""Some statistical data for --output-format=cxx+"""
global psl_input_file, psl_nsuffixes, psl_nexceptions, psl_nwildcards
psl_input_file = sys.argv[-2]
psl_nsuffixes = 0
psl_nexceptions = 0
psl_nwildcards = 0
with open(sys.argv[-2], 'r', **codecs) as infile, open(sys.argv[-1], 'wb') as outfile:
outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))
return 0
if __name__ == '__main__':
sys.exit(main())
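
For orientation, a header produced with --output-format=cxx has roughly the shape
sketched below (an illustration based on to_cxx() above; the bytes shown are merely
the seven-byte array from Example 1, not real publicsuffix data). The regenerated
qurltlds_p.h consumed by qtldurl.cpp follows this pattern, and kDafsa together with
sizeof(kDafsa) is what gets handed to LookupStringInFixedSet().

/* This file has been generated by psl-make-dafsa. DO NOT EDIT!

The byte array encodes effective tld names. See psl-make-dafsa source for documentation.*/

static const unsigned char kDafsa[7] = {
  0x81, 0xe1, 0x02, 0x81, 0x82, 0x61, 0x81,
};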


@@ -156,6 +156,7 @@ qt_internal_extend_target(Network CONDITION QT_FEATURE_topleveldomain
SOURCES
kernel/qtldurl.cpp kernel/qtldurl_p.h
kernel/qurltlds_p.h
../3rdparty/libpsl/src/lookup_string_in_fixed_set.c
)
qt_internal_extend_target(Network CONDITION QT_FEATURE_dnslookup


@@ -1,7 +1,7 @@
{
"Id": "psl",
"Name": "The Public Suffix List",
"QDocModule": "qtcore",
"QDocModule": "qtnetwork",
"Description": "The Public Suffix List is an initiative of Mozilla,
but is maintained as a community resource. It is available for use in any software,
but was originally created to meet the needs of browser manufacturers.
@@ -14,8 +14,7 @@ It allows browsers to, for example:
- Accurately sort history entries by site",
"Files": "qurltlds_p.h",
"QtUsage": "See util/publicSuffix/ for code-generator",
"QtUsage": "Used in Qt Core to avoid setting \"supercookies\" in the cookie jar
"QtUsage": "Used in Qt Network to avoid setting \"supercookies\" in the cookie jar
supported by Qt (by the QNetworkCookieJar class).",
"Homepage": "Consult https://github.com/publicsuffix/list for the sha1 but download from ...",


@@ -43,68 +43,25 @@
#if QT_CONFIG(topleveldomain)
#include "qplatformdefs.h"
#include "qurl.h"
#include "private/qurltlds_p.h"
#include "private/qtldurl_p.h"
#include "QtCore/qlist.h"
#include "QtCore/qstring.h"
#include "qurltlds_p.h"
// Defined in src/3rdparty/libpsl/src/lookup_string_in_fixed_set.c
extern "C" int LookupStringInFixedSet(const unsigned char *graph, std::size_t length,
const char *key, std::size_t key_length);
QT_BEGIN_NAMESPACE
enum TLDMatchType {
ExactMatch,
SuffixMatch,
ExceptionMatch,
};
static constexpr int PSL_NOT_FOUND = -1;
static constexpr int PSL_FLAG_EXCEPTION = 1 << 0;
static constexpr int PSL_FLAG_WILDCARD = 1 << 1;
// Scan the auto-generated table of TLDs for an entry. For more details
// see comments in file: util/publicSuffix/main.cpp
static bool containsTLDEntry(QStringView entry, TLDMatchType match)
static int lookupDomain(QByteArrayView domain)
{
const QStringView matchSymbols[] = {
u"",
u"*",
u"!",
};
const auto symbol = matchSymbols[match];
const int index = qt_hash(entry, qt_hash(symbol)) % tldCount;
// select the right chunk from the big table
short chunk = 0;
uint chunkIndex = tldIndices[index], offset = 0;
// The offset in the big string, of the group that our entry hashes into.
const auto tldGroupOffset = tldIndices[index];
// It should always be inside all chunks' total size.
Q_ASSERT(tldGroupOffset < tldChunks[tldChunkCount - 1]);
// All offsets are stored in non-decreasing order.
// This check is within bounds as tldIndices has length tldCount+1.
Q_ASSERT(tldGroupOffset <= tldIndices[index + 1]);
// The last extra entry in tldIndices
// should be equal to the total of all chunks' lengths.
static_assert(tldIndices[tldCount] == tldChunks[tldChunkCount - 1]);
// Find which chunk contains the tldGroupOffset
while (tldGroupOffset >= tldChunks[chunk]) {
chunkIndex -= tldChunks[chunk];
offset += tldChunks[chunk];
chunk++;
// We can not go above the number of chunks we have, since all our
// indices are less than the total chunks' size (see asserts above).
Q_ASSERT(chunk < tldChunkCount);
}
// check all the entries from the given offset
while (chunkIndex < tldIndices[index+1] - offset) {
const auto utf8 = tldData[chunk] + chunkIndex;
if ((symbol.isEmpty() || QLatin1Char(*utf8) == symbol) && entry == QString::fromUtf8(utf8 + symbol.size()))
return true;
chunkIndex += uint(qstrlen(utf8)) + 1; // +1 for the ending \0
}
return false;
return LookupStringInFixedSet(kDafsa, sizeof(kDafsa), domain.data(), domain.size());
}
/*!
@@ -118,19 +75,28 @@ static bool containsTLDEntry(QStringView entry, TLDMatchType match)
Q_NETWORK_EXPORT bool qIsEffectiveTLD(QStringView domain)
{
// for domain 'foo.bar.com':
// 1. return if TLD table contains 'foo.bar.com'
// 2. else if table contains '*.bar.com',
// 3. test that table does not contain '!foo.bar.com'
// 1. return false if TLD table contains '!foo.bar.com'
// 2. return true if TLD table contains 'foo.bar.com'
// 3. return true if the table contains '*.bar.com'
if (containsTLDEntry(domain, ExactMatch)) // 1
return true;
QByteArray decodedDomain = domain.toUtf8();
QByteArrayView domainView(decodedDomain);
const auto dot = domain.indexOf(QLatin1Char('.'));
auto ret = lookupDomain(domainView);
if (ret != PSL_NOT_FOUND) {
if (ret & PSL_FLAG_EXCEPTION) // 1
return false;
if ((ret & PSL_FLAG_WILDCARD) == 0) // 2
return true;
}
const auto dot = domainView.indexOf('.');
if (dot < 0) // Actual TLD: may be effective if the subject of a wildcard rule:
return containsTLDEntry(QString(QLatin1Char('.') + domain), SuffixMatch);
if (containsTLDEntry(domain.mid(dot), SuffixMatch)) // 2
return !containsTLDEntry(domain, ExceptionMatch); // 3
return false;
return ret != PSL_NOT_FOUND;
ret = lookupDomain(domainView.sliced(dot + 1)); // 3
if (ret == PSL_NOT_FOUND)
return false;
return (ret & PSL_FLAG_WILDCARD) != 0;
}
QT_END_NAMESPACE
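
To make the new rule handling concrete, a small hypothetical check (not part of the
patch; it assumes the private qtldurl_p.h header is reachable, as it is from Qt's own
autotests) would be expected to behave as follows for some well-known publicsuffix rules:

#include <QtNetwork/private/qtldurl_p.h>
#include <QtCore/qdebug.h>

int main()
{
    // Plain rule "co.uk": the suffix itself is effective, a domain registered under it is not.
    qDebug() << qIsEffectiveTLD(u"co.uk");         // expected: true
    qDebug() << qIsEffectiveTLD(u"example.co.uk"); // expected: false
    // Wildcard rule "*.ck" with exception "!www.ck".
    qDebug() << qIsEffectiveTLD(u"foo.ck");        // expected: true  (covered by *.ck)
    qDebug() << qIsEffectiveTLD(u"www.ck");        // expected: false (exception rule)
    return 0;
}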

File diff suppressed because it is too large.


@@ -1,14 +1,15 @@
The file qurltlds_p.h is generated from the Public Suffix
List (see [1] and [2]), by the program residing at
qtbase/util/publicSuffix/ in the Qt source tree.
qtbase/src/3rdparty/libpsl/src/psl-make-dafsa in the Qt source tree.
That program generates a character array and an index array from the
list to provide fast lookups of elements within C++.
To regenerate the file, run the following command from the qtbase tree:
src/3rdparty/libpsl/src/psl-make-dafsa public_suffix_list.dat src/network/kernel/qurltlds_p.h
Those arrays in qurltlds_p.h are derived from the Public
Suffix List ([2]), which was originally provided by
Jo Hermans <jo.hermans@gmail.com>.
----
[1] list: http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1
[2] homepage: http://publicsuffix.org/
[1] list: https://publicsuffix.org/list/public_suffix_list.dat
[2] homepage: https://publicsuffix.org/


@@ -82,7 +82,8 @@
@qpa_headers = ( qr/^qplatform/, qr/^qwindowsystem/ );
my @internal_zlib_headers = ( "crc32.h", "deflate.h", "gzguts.h", "inffast.h", "inffixed.h", "inflate.h", "inftrees.h", "trees.h", "zutil.h" );
my @zlib_headers = ( "zconf.h", "zlib.h" );
@ignore_headers = ( @internal_zlib_headers );
my @internal_qtnetwork_headers = ( "qurltlds_p.h" );
@ignore_headers = ( @internal_zlib_headers, @internal_qtnetwork_headers );
@ignore_for_include_check = ( "qsystemdetection.h", "qcompilerdetection.h", "qprocessordetection.h", @zlib_headers);
@ignore_for_qt_begin_namespace_check = ( "qt_windows.h", @zlib_headers);
%inject_headers = (