Network: Use public suffix database in DAFSA format
This saves 91kiB of text size and makes it easier to use a locally installed database when available. [ChangeLog][Third-Party Code] Moved attribution of The Public Suffix List from Qt Core to Qt Network. [ChangeLog][Third-Party Code] Added attribution of new libpsl library to Qt Network. libpsl is available under the BSD 3-Clause "New" or "Revised" License. Task-number: QTBUG-95889 Change-Id: Ibd37c7a94fdf235e75d96fec20d427fb5c2bd2a4 Reviewed-by: Kai Koehne <kai.koehne@qt.io> Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
This commit is contained in:
parent
d0b22762be
commit
cba40055b1
15
src/3rdparty/libpsl/qt_attribution.json
vendored
Normal file
15
src/3rdparty/libpsl/qt_attribution.json
vendored
Normal file
@ -0,0 +1,15 @@
|
||||
{
|
||||
"Id": "libpsl",
|
||||
"Name": "libpsl - C library to handle the Public Suffix List",
|
||||
"QDocModule": "qtnetwork",
|
||||
"Description": "Libpsl allows checking domains against the Public Suffix List.",
|
||||
"Files": "src/lookup_string_in_fixed_set.c src/psl-make-dafsa",
|
||||
"QtUsage": "Used to compress the embedded copy of publicsuffix list and
|
||||
to lookup entries in it.",
|
||||
"Homepage": "https://github.com/rockdaboot/libpsl",
|
||||
"Version": "664f3dc85259ec65e30248a61fa1c45b7b0e4c3f",
|
||||
"License": "BSD 3-clause \"New\" or \"Revised\" License",
|
||||
"LicenseFile": "src/LICENSE.chromium",
|
||||
"LicenseId": "BSD-3-Clause",
|
||||
"Copyright": "Copyright 2014-2016 The Chromium Authors. All rights reserved."
|
||||
}
|
30
src/3rdparty/libpsl/src/LICENSE.chromium
vendored
Normal file
30
src/3rdparty/libpsl/src/LICENSE.chromium
vendored
Normal file
@ -0,0 +1,30 @@
|
||||
* The following License is for the source code files
|
||||
psl-make-dafsa and lookup_string_in_fixed_set.c.
|
||||
|
||||
// Copyright 2015 The Chromium Authors. All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above
|
||||
// copyright notice, this list of conditions and the following disclaimer
|
||||
// in the documentation and/or other materials provided with the
|
||||
// distribution.
|
||||
// * Neither the name of Google Inc. nor the names of its
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
273
src/3rdparty/libpsl/src/lookup_string_in_fixed_set.c
vendored
Normal file
273
src/3rdparty/libpsl/src/lookup_string_in_fixed_set.c
vendored
Normal file
@ -0,0 +1,273 @@
|
||||
/* Copyright 2015-2016 The Chromium Authors. All rights reserved.
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE.chromium file.
|
||||
*
|
||||
* Converted to C89 2015 by Tim Rühsen
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#define CHECK_LT(a, b) if ((a) >= b) return 0
|
||||
|
||||
static const char multibyte_length_table[16] = {
|
||||
0, 0, 0, 0, /* 0x00-0x3F */
|
||||
0, 0, 0, 0, /* 0x40-0x7F */
|
||||
0, 0, 0, 0, /* 0x80-0xBF */
|
||||
2, 2, 3, 4, /* 0xC0-0xFF */
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Get length of multibyte character sequence starting at a given byte.
|
||||
* Returns zero if the byte is not a valid leading byte in UTF-8.
|
||||
*/
|
||||
static int GetMultibyteLength(char c) {
|
||||
return multibyte_length_table[((unsigned char)c) >> 4];
|
||||
}
|
||||
|
||||
/*
|
||||
* Moves pointers one byte forward.
|
||||
*/
|
||||
static void NextPos(const unsigned char** pos,
|
||||
const char** key,
|
||||
const char** multibyte_start)
|
||||
{
|
||||
++*pos;
|
||||
if (*multibyte_start) {
|
||||
/* Advance key to next byte in multibyte sequence. */
|
||||
++*key;
|
||||
/* Reset multibyte_start if last byte in multibyte sequence was consumed. */
|
||||
if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
|
||||
*multibyte_start = 0;
|
||||
} else {
|
||||
if (GetMultibyteLength(**key)) {
|
||||
/* Multibyte prefix was matched in the dafsa, start matching multibyte
|
||||
* content in next round. */
|
||||
*multibyte_start = *key;
|
||||
} else {
|
||||
/* Advance key as a single byte character was matched. */
|
||||
++*key;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Read next offset from pos.
|
||||
* Returns true if an offset could be read, false otherwise.
|
||||
*/
|
||||
|
||||
static int GetNextOffset(const unsigned char** pos,
|
||||
const unsigned char* end,
|
||||
const unsigned char** offset)
|
||||
{
|
||||
size_t bytes_consumed;
|
||||
|
||||
if (*pos == end)
|
||||
return 0;
|
||||
|
||||
/* When reading an offset the byte array must always contain at least
|
||||
* three more bytes to consume. First the offset to read, then a node
|
||||
* to skip over and finally a destination node. No object can be smaller
|
||||
* than one byte. */
|
||||
CHECK_LT(*pos + 2, end);
|
||||
switch (**pos & 0x60) {
|
||||
case 0x60: /* Read three byte offset */
|
||||
*offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
|
||||
bytes_consumed = 3;
|
||||
break;
|
||||
case 0x40: /* Read two byte offset */
|
||||
*offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
|
||||
bytes_consumed = 2;
|
||||
break;
|
||||
default:
|
||||
*offset += (*pos)[0] & 0x3F;
|
||||
bytes_consumed = 1;
|
||||
}
|
||||
if ((**pos & 0x80) != 0) {
|
||||
*pos = end;
|
||||
} else {
|
||||
*pos += bytes_consumed;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if byte at offset is last in label.
|
||||
*/
|
||||
|
||||
static int IsEOL(const unsigned char* offset, const unsigned char* end)
|
||||
{
|
||||
CHECK_LT(offset, end);
|
||||
return(*offset & 0x80) != 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if byte at offset matches first character in key.
|
||||
* This version assumes a range check was already performed by the caller.
|
||||
*/
|
||||
|
||||
static int IsMatchUnchecked(const unsigned char matcher,
|
||||
const char* key,
|
||||
const char* multibyte_start)
|
||||
{
|
||||
if (multibyte_start) {
|
||||
/* Multibyte matching mode. */
|
||||
if (multibyte_start == key) {
|
||||
/* Match leading byte, which will also match the sequence length. */
|
||||
return (matcher ^ 0x80) == (const unsigned char)*key;
|
||||
} else {
|
||||
/* Match following bytes. */
|
||||
return (matcher ^ 0xC0) == (const unsigned char)*key;
|
||||
}
|
||||
}
|
||||
/* If key points at a leading byte in a multibyte sequence, but we are not yet
|
||||
* in multibyte mode, then the dafsa should contain a special byte to indicate
|
||||
* a mode switch. */
|
||||
if (GetMultibyteLength(*key)) {
|
||||
return matcher == 0x1F;
|
||||
}
|
||||
/* Normal matching of a single byte character. */
|
||||
return matcher == (const unsigned char)*key;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if byte at offset matches first character in key.
|
||||
* This version matches characters not last in label.
|
||||
*/
|
||||
|
||||
static int IsMatch(const unsigned char* offset,
|
||||
const unsigned char* end,
|
||||
const char* key,
|
||||
const char* multibyte_start)
|
||||
{
|
||||
CHECK_LT(offset, end);
|
||||
return IsMatchUnchecked(*offset, key, multibyte_start);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if byte at offset matches first character in key.
|
||||
* This version matches characters last in label.
|
||||
*/
|
||||
|
||||
static int IsEndCharMatch(const unsigned char* offset,
|
||||
const unsigned char* end,
|
||||
const char* key,
|
||||
const char* multibyte_start)
|
||||
{
|
||||
CHECK_LT(offset, end);
|
||||
return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
|
||||
}
|
||||
|
||||
/*
|
||||
* Read return value at offset.
|
||||
* Returns true if a return value could be read, false otherwise.
|
||||
*/
|
||||
|
||||
static int GetReturnValue(const unsigned char* offset,
|
||||
const unsigned char* end,
|
||||
const char* multibyte_start,
|
||||
int* return_value)
|
||||
{
|
||||
CHECK_LT(offset, end);
|
||||
if (!multibyte_start && (*offset & 0xE0) == 0x80) {
|
||||
*return_value = *offset & 0x0F;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Looks up the string |key| with length |key_length| in a fixed set of
|
||||
* strings. The set of strings must be known at compile time. It is converted to
|
||||
* a graph structure named a DAFSA (Deterministic Acyclic Finite State
|
||||
* Automaton) by the script psl-make-dafsa during compilation. This permits
|
||||
* efficient (in time and space) lookup. The graph generated by psl-make-dafsa
|
||||
* takes the form of a constant byte array which should be supplied via the
|
||||
* |graph| and |length| parameters. The return value is kDafsaNotFound,
|
||||
* kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule,
|
||||
* kDafsaWildcardRule and kDafsaPrivateRule ORed together.
|
||||
*
|
||||
* Lookup a domain key in a byte array generated by psl-make-dafsa.
|
||||
*/
|
||||
|
||||
/* prototype to skip warning with -Wmissing-prototypes */
|
||||
int LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t);
|
||||
|
||||
int LookupStringInFixedSet(const unsigned char* graph,
|
||||
size_t length,
|
||||
const char* key,
|
||||
size_t key_length)
|
||||
{
|
||||
const unsigned char* pos = graph;
|
||||
const unsigned char* end = graph + length;
|
||||
const unsigned char* offset = pos;
|
||||
const char* key_end = key + key_length;
|
||||
const char* multibyte_start = 0;
|
||||
|
||||
while (GetNextOffset(&pos, end, &offset)) {
|
||||
/*char <char>+ end_char offsets
|
||||
* char <char>+ return value
|
||||
* char end_char offsets
|
||||
* char return value
|
||||
* end_char offsets
|
||||
* return_value
|
||||
*/
|
||||
int did_consume = 0;
|
||||
|
||||
if (key != key_end && !IsEOL(offset, end)) {
|
||||
/* Leading <char> is not a match. Don't dive into this child */
|
||||
if (!IsMatch(offset, end, key, multibyte_start))
|
||||
continue;
|
||||
did_consume = 1;
|
||||
NextPos(&offset, &key, &multibyte_start);
|
||||
/* Possible matches at this point:
|
||||
* <char>+ end_char offsets
|
||||
* <char>+ return value
|
||||
* end_char offsets
|
||||
* return value
|
||||
*/
|
||||
|
||||
/* Remove all remaining <char> nodes possible */
|
||||
while (!IsEOL(offset, end) && key != key_end) {
|
||||
if (!IsMatch(offset, end, key, multibyte_start))
|
||||
return -1;
|
||||
NextPos(&offset, &key, &multibyte_start);
|
||||
}
|
||||
}
|
||||
/* Possible matches at this point:
|
||||
* end_char offsets
|
||||
* return_value
|
||||
* If one or more <char> elements were consumed, a failure
|
||||
* to match is terminal. Otherwise, try the next node.
|
||||
*/
|
||||
if (key == key_end) {
|
||||
int return_value;
|
||||
|
||||
if (GetReturnValue(offset, end, multibyte_start, &return_value))
|
||||
return return_value;
|
||||
/* The DAFSA guarantees that if the first char is a match, all
|
||||
* remaining char elements MUST match if the key is truly present.
|
||||
*/
|
||||
if (did_consume)
|
||||
return -1;
|
||||
continue;
|
||||
}
|
||||
if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
|
||||
if (did_consume)
|
||||
return -1; /* Unexpected */
|
||||
continue;
|
||||
}
|
||||
NextPos(&offset, &key, &multibyte_start);
|
||||
pos = offset; /* Dive into child */
|
||||
}
|
||||
|
||||
return -1; /* No match */
|
||||
}
|
||||
|
||||
/* prototype to skip warning with -Wmissing-prototypes */
|
||||
int GetUtfMode(const unsigned char *graph, size_t length);
|
||||
|
||||
int GetUtfMode(const unsigned char *graph, size_t length)
|
||||
{
|
||||
return length > 0 && graph[length - 1] < 0x80;
|
||||
}
|
692
src/3rdparty/libpsl/src/psl-make-dafsa
vendored
Executable file
692
src/3rdparty/libpsl/src/psl-make-dafsa
vendored
Executable file
@ -0,0 +1,692 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2014 The Chromium Authors. All rights reserved.
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE.chromium file.
|
||||
|
||||
"""
|
||||
A Deterministic acyclic finite state automaton (DAFSA) is a compact
|
||||
representation of an unordered word list (dictionary).
|
||||
|
||||
https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
|
||||
|
||||
This python program converts a list of strings to a byte array in C++.
|
||||
This python program fetches strings and return values from a gperf file
|
||||
and generates a C++ file with a byte array representing graph that can be
|
||||
used as a memory efficient replacement for the perfect hash table.
|
||||
|
||||
The input strings must consist of printable 7-bit ASCII characters or UTF-8
|
||||
multibyte sequences. Control characters in the range [0x00-0x1F] are not
|
||||
allowed. The return values must be one digit integers. .
|
||||
|
||||
In this program a DAFSA is a diamond shaped graph starting at a common
|
||||
source node and ending at a common sink node. All internal nodes contain
|
||||
a label and each word is represented by the labels in one path from
|
||||
the source node to the sink node.
|
||||
|
||||
The following python represention is used for nodes:
|
||||
|
||||
Source node: [ children ]
|
||||
Internal node: (label, [ children ])
|
||||
Sink node: None
|
||||
|
||||
The graph is first compressed by prefixes like a trie. In the next step
|
||||
suffixes are compressed so that the graph gets diamond shaped. Finally
|
||||
one to one linked nodes are replaced by nodes with the labels joined.
|
||||
|
||||
The order of the operations is crucial since lookups will be performed
|
||||
starting from the source with no backtracking. Thus a node must have at
|
||||
most one child with a label starting by the same character. The output
|
||||
is also arranged so that all jumps are to increasing addresses, thus forward
|
||||
in memory.
|
||||
|
||||
The generated output has suffix free decoding so that the sign of leading
|
||||
bits in a link (a reference to a child node) indicate if it has a size of one,
|
||||
two or three bytes and if it is the last outgoing link from the actual node.
|
||||
A node label is terminated by a byte with the leading bit set.
|
||||
|
||||
The generated byte array can described by the following BNF:
|
||||
|
||||
<byte> ::= < 8-bit value in range [0x00-0xFF] >
|
||||
|
||||
<char> ::= < byte in range [0x1F-0x7F] >
|
||||
<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
|
||||
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
|
||||
|
||||
<offset1> ::= < byte in range [0x00-0x3F] >
|
||||
<offset2> ::= < byte in range [0x40-0x5F] >
|
||||
<offset3> ::= < byte in range [0x60-0x7F] >
|
||||
|
||||
<end_offset1> ::= < byte in range [0x80-0xBF] >
|
||||
<end_offset2> ::= < byte in range [0xC0-0xDF] >
|
||||
<end_offset3> ::= < byte in range [0xE0-0xFF] >
|
||||
|
||||
<prefix> ::= <char>
|
||||
|
||||
<label> ::= <end_char>
|
||||
| <char> <label>
|
||||
|
||||
<end_label> ::= <return_value>
|
||||
| <char> <end_label>
|
||||
|
||||
<offset> ::= <offset1>
|
||||
| <offset2> <byte>
|
||||
| <offset3> <byte> <byte>
|
||||
|
||||
<end_offset> ::= <end_offset1>
|
||||
| <end_offset2> <byte>
|
||||
| <end_offset3> <byte> <byte>
|
||||
|
||||
<offsets> ::= <end_offset>
|
||||
| <offset> <offsets>
|
||||
|
||||
<source> ::= <offsets>
|
||||
|
||||
<node> ::= <label> <offsets>
|
||||
| <prefix> <node>
|
||||
| <end_label>
|
||||
|
||||
<graph> ::= <graph>
|
||||
| <graph> <node>
|
||||
|
||||
<version> ::= <empty> # The DAFSA was generated in ASCII mode.
|
||||
| < byte value 0x01 > # The DAFSA was generated in UTF-8 mode.
|
||||
|
||||
<dafsa> ::= <graph> <version>
|
||||
|
||||
Decoding:
|
||||
|
||||
<char> -> character
|
||||
<end_char> & 0x7F -> character
|
||||
<return value> & 0x0F -> integer
|
||||
<offset1 & 0x3F> -> integer
|
||||
((<offset2> & 0x1F>) << 8) + <byte> -> integer
|
||||
((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer
|
||||
|
||||
end_offset1, end_offset2 and and_offset3 are decoded same as offset1,
|
||||
offset2 and offset3 respectively.
|
||||
|
||||
The first offset in a list of offsets is the distance in bytes between the
|
||||
offset itself and the first child node. Subsequent offsets are the distance
|
||||
between previous child node and next child node. Thus each offset links a node
|
||||
to a child node. The distance is always counted between start addresses, i.e.
|
||||
first byte in decoded offset or first byte in child node.
|
||||
|
||||
Transcoding of UTF-8 multibyte sequences:
|
||||
|
||||
The original DAFSA format was limited to 7-bit printable ASCII characters in
|
||||
range [0x20-0xFF], but has been extended to allow UTF-8 multibyte sequences.
|
||||
By transcoding of such characters the new format preserves compatibility with
|
||||
old parsers, so that a DAFSA in the extended format can be used by an old
|
||||
parser without false positives, although strings containing transcoded
|
||||
characters will never match. Since the format is extended rather than being
|
||||
changed, a parser supporting the new format will automatically support data
|
||||
generated in the old format.
|
||||
|
||||
Transcoding is performed by insertion of a start byte with the special value
|
||||
0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
|
||||
the range of printable ASCII.
|
||||
|
||||
2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
|
||||
|
||||
3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
|
||||
|
||||
4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
|
||||
00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
|
||||
|
||||
Example 1:
|
||||
|
||||
%%
|
||||
aa, 1
|
||||
a, 2
|
||||
%%
|
||||
|
||||
The input is first parsed to a list of words:
|
||||
["aa1", "a2"]
|
||||
|
||||
A fully expanded graph is created from the words:
|
||||
source = [node1, node4]
|
||||
node1 = ("a", [node2])
|
||||
node2 = ("a", [node3])
|
||||
node3 = ("\x01", [sink])
|
||||
node4 = ("a", [node5])
|
||||
node5 = ("\x02", [sink])
|
||||
sink = None
|
||||
|
||||
Compression results in the following graph:
|
||||
source = [node1]
|
||||
node1 = ("a", [node2, node3])
|
||||
node2 = ("\x02", [sink])
|
||||
node3 = ("a\x01", [sink])
|
||||
sink = None
|
||||
|
||||
A C++ representation of the compressed graph is generated:
|
||||
|
||||
const unsigned char dafsa[7] = {
|
||||
0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
|
||||
};
|
||||
|
||||
The bytes in the generated array has the following meaning:
|
||||
|
||||
0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1
|
||||
|
||||
1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a"
|
||||
2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4
|
||||
|
||||
3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5
|
||||
4: 0x82 <return_value> 0x82 & 0x0F -> return 2
|
||||
|
||||
5: 0x61 <char> label character 0x61 -> match "a"
|
||||
6: 0x81 <return_value> 0x81 & 0x0F -> return 1
|
||||
|
||||
Example 2:
|
||||
|
||||
%%
|
||||
aa, 1
|
||||
bbb, 2
|
||||
baa, 1
|
||||
%%
|
||||
|
||||
The input is first parsed to a list of words:
|
||||
["aa1", "bbb2", "baa1"]
|
||||
|
||||
Compression results in the following graph:
|
||||
source = [node1, node2]
|
||||
node1 = ("b", [node2, node3])
|
||||
node2 = ("aa\x01", [sink])
|
||||
node3 = ("bb\x02", [sink])
|
||||
sink = None
|
||||
|
||||
A C++ representation of the compressed graph is generated:
|
||||
|
||||
const unsigned char dafsa[11] = {
|
||||
0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
|
||||
};
|
||||
|
||||
The bytes in the generated array has the following meaning:
|
||||
|
||||
0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2
|
||||
1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5
|
||||
|
||||
2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b"
|
||||
3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5
|
||||
4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8
|
||||
|
||||
5: 0x61 <char> label character 0x61 -> match "a"
|
||||
6: 0x61 <char> label character 0x61 -> match "a"
|
||||
7: 0x81 <return_value> 0x81 & 0x0F -> return 1
|
||||
|
||||
8: 0x62 <char> label character 0x62 -> match "b"
|
||||
9: 0x62 <char> label character 0x62 -> match "b"
|
||||
10: 0x82 <return_value> 0x82 & 0x0F -> return 2
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os.path
|
||||
import hashlib
|
||||
|
||||
class InputError(Exception):
|
||||
"""Exception raised for errors in the input file."""
|
||||
|
||||
# Length of a character starting at a given byte.
|
||||
char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x0F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x10-0x1F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x20-0x2F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x30-x03F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x40-0x4F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x50-x05F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x60-0x6F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x70-x07F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x80-0x8F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x90-0x9F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xA0-0xAF
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xB0-0xBF
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xC0-0xCF
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xD0-0xDF
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
|
||||
|
||||
def to_bytes(n):
|
||||
"""Converts an integer value to a bytes object."""
|
||||
return bytes(bytearray((n,)))
|
||||
|
||||
def to_dafsa(words, utf_mode):
|
||||
"""Generates a DAFSA from a word list and returns the source node.
|
||||
|
||||
Each word is split into characters so that each character is represented by
|
||||
a unique node. It is assumed the word list is not empty.
|
||||
"""
|
||||
if not words:
|
||||
raise InputError('The domain list must not be empty')
|
||||
def to_nodes(word, multibyte_length):
|
||||
"""Split words into characters"""
|
||||
byte = ord(word[:1])
|
||||
if multibyte_length:
|
||||
# Consume next byte in multibyte sequence.
|
||||
if byte & 0xC0 != 0x80:
|
||||
raise InputError('Invalid UTF-8 multibyte sequence')
|
||||
return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
|
||||
char_length = char_length_table[byte]
|
||||
if char_length == 1:
|
||||
# 7-bit printable ASCII.
|
||||
if len(word) == 1:
|
||||
return to_bytes(int(word[:1], 16) & 0x0F), [None]
|
||||
return word[:1], [to_nodes(word[1:], 0)]
|
||||
elif char_length > 1:
|
||||
# Leading byte in multibyte sequence.
|
||||
if not utf_mode:
|
||||
raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
|
||||
if len(word) <= char_length:
|
||||
raise InputError('Unterminated UTF-8 multibyte sequence')
|
||||
return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
|
||||
# Unexpected character.
|
||||
raise InputError('Domain names must be printable ASCII or UTF-8')
|
||||
|
||||
return [to_nodes(word, 0) for word in words]
|
||||
|
||||
def to_words(node):
|
||||
"""Generates a word list from all paths starting from an internal node."""
|
||||
if not node:
|
||||
return [b'']
|
||||
return [(node[0] + word) for child in node[1] for word in to_words(child)]
|
||||
|
||||
|
||||
def reverse(dafsa):
|
||||
"""Generates a new DAFSA that is reversed, so that the old sink node becomes
|
||||
the new source node.
|
||||
"""
|
||||
sink = []
|
||||
nodemap = {}
|
||||
|
||||
def dfs(node, parent):
|
||||
"""Creates reverse nodes.
|
||||
|
||||
A new reverse node will be created for each old node. The new node will
|
||||
get a reversed label and the parents of the old node as children.
|
||||
"""
|
||||
if not node:
|
||||
sink.append(parent)
|
||||
elif id(node) not in nodemap:
|
||||
nodemap[id(node)] = (node[0][::-1], [parent])
|
||||
for child in node[1]:
|
||||
dfs(child, nodemap[id(node)])
|
||||
else:
|
||||
nodemap[id(node)][1].append(parent)
|
||||
|
||||
for node in dafsa:
|
||||
dfs(node, None)
|
||||
return sink
|
||||
|
||||
|
||||
def join_labels(dafsa):
|
||||
"""Generates a new DAFSA where internal nodes are merged if there is a one to
|
||||
one connection.
|
||||
"""
|
||||
parentcount = {id(None): 2}
|
||||
nodemap = {id(None): None}
|
||||
|
||||
def count_parents(node):
|
||||
"""Count incoming references"""
|
||||
if id(node) in parentcount:
|
||||
parentcount[id(node)] += 1
|
||||
else:
|
||||
parentcount[id(node)] = 1
|
||||
for child in node[1]:
|
||||
count_parents(child)
|
||||
|
||||
def join(node):
|
||||
"""Create new nodes"""
|
||||
if id(node) not in nodemap:
|
||||
children = [join(child) for child in node[1]]
|
||||
if len(children) == 1 and parentcount[id(node[1][0])] == 1:
|
||||
child = children[0]
|
||||
nodemap[id(node)] = (node[0] + child[0], child[1])
|
||||
else:
|
||||
nodemap[id(node)] = (node[0], children)
|
||||
return nodemap[id(node)]
|
||||
|
||||
for node in dafsa:
|
||||
count_parents(node)
|
||||
return [join(node) for node in dafsa]
|
||||
|
||||
|
||||
def join_suffixes(dafsa):
|
||||
"""Generates a new DAFSA where nodes that represent the same word lists
|
||||
towards the sink are merged.
|
||||
"""
|
||||
nodemap = {frozenset((b'',)): None}
|
||||
|
||||
def join(node):
|
||||
"""Returns a matching node. A new node is created if no matching node
|
||||
exists. The graph is accessed in dfs order.
|
||||
"""
|
||||
suffixes = frozenset(to_words(node))
|
||||
if suffixes not in nodemap:
|
||||
nodemap[suffixes] = (node[0], [join(child) for child in node[1]])
|
||||
return nodemap[suffixes]
|
||||
|
||||
return [join(node) for node in dafsa]
|
||||
|
||||
|
||||
def top_sort(dafsa):
|
||||
"""Generates list of nodes in topological sort order."""
|
||||
incoming = {}
|
||||
|
||||
def count_incoming(node):
|
||||
"""Counts incoming references."""
|
||||
if node:
|
||||
if id(node) not in incoming:
|
||||
incoming[id(node)] = 1
|
||||
for child in node[1]:
|
||||
count_incoming(child)
|
||||
else:
|
||||
incoming[id(node)] += 1
|
||||
|
||||
for node in dafsa:
|
||||
count_incoming(node)
|
||||
|
||||
for node in dafsa:
|
||||
incoming[id(node)] -= 1
|
||||
|
||||
waiting = [node for node in dafsa if incoming[id(node)] == 0]
|
||||
nodes = []
|
||||
|
||||
while waiting:
|
||||
node = waiting.pop()
|
||||
assert incoming[id(node)] == 0
|
||||
nodes.append(node)
|
||||
for child in node[1]:
|
||||
if child:
|
||||
incoming[id(child)] -= 1
|
||||
if incoming[id(child)] == 0:
|
||||
waiting.append(child)
|
||||
return nodes
|
||||
|
||||
|
||||
def encode_links(children, offsets, current):
|
||||
"""Encodes a list of children as one, two or three byte offsets."""
|
||||
if not children[0]:
|
||||
# This is an <end_label> node and no links follow such nodes
|
||||
assert len(children) == 1
|
||||
return []
|
||||
guess = 3 * len(children)
|
||||
assert children
|
||||
children = sorted(children, key=lambda x: -offsets[id(x)])
|
||||
while True:
|
||||
offset = current + guess
|
||||
buf = []
|
||||
for child in children:
|
||||
last = len(buf)
|
||||
distance = offset - offsets[id(child)]
|
||||
assert distance > 0 and distance < (1 << 21)
|
||||
|
||||
if distance < (1 << 6):
|
||||
# A 6-bit offset: "s0xxxxxx"
|
||||
buf.append(distance)
|
||||
elif distance < (1 << 13):
|
||||
# A 13-bit offset: "s10xxxxxxxxxxxxx"
|
||||
buf.append(0x40 | (distance >> 8))
|
||||
buf.append(distance & 0xFF)
|
||||
else:
|
||||
# A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
|
||||
buf.append(0x60 | (distance >> 16))
|
||||
buf.append((distance >> 8) & 0xFF)
|
||||
buf.append(distance & 0xFF)
|
||||
# Distance in first link is relative to following record.
|
||||
# Distance in other links are relative to previous link.
|
||||
offset -= distance
|
||||
if len(buf) == guess:
|
||||
break
|
||||
guess = len(buf)
|
||||
# Set most significant bit to mark end of links in this node.
|
||||
buf[last] |= (1 << 7)
|
||||
buf.reverse()
|
||||
return buf
|
||||
|
||||
|
||||
def encode_prefix(label):
|
||||
"""Encodes a node label as a list of bytes without a trailing high byte.
|
||||
|
||||
This method encodes a node if there is exactly one child and the
|
||||
child follows immediately after so that no jump is needed. This label
|
||||
will then be a prefix to the label in the child node.
|
||||
"""
|
||||
assert label
|
||||
return [c for c in bytearray(reversed(label))]
|
||||
|
||||
|
||||
def encode_label(label):
|
||||
"""Encodes a node label as a list of bytes with a trailing high byte >0x80.
|
||||
"""
|
||||
buf = encode_prefix(label)
|
||||
# Set most significant bit to mark end of label in this node.
|
||||
buf[0] |= (1 << 7)
|
||||
return buf
|
||||
|
||||
|
||||
def encode(dafsa, utf_mode):
|
||||
"""Encodes a DAFSA to a list of bytes"""
|
||||
output = []
|
||||
offsets = {}
|
||||
|
||||
for node in reversed(top_sort(dafsa)):
|
||||
if (len(node[1]) == 1 and node[1][0] and
|
||||
(offsets[id(node[1][0])] == len(output))):
|
||||
output.extend(encode_prefix(node[0]))
|
||||
else:
|
||||
output.extend(encode_links(node[1], offsets, len(output)))
|
||||
output.extend(encode_label(node[0]))
|
||||
offsets[id(node)] = len(output)
|
||||
|
||||
output.extend(encode_links(dafsa, offsets, len(output)))
|
||||
output.reverse()
|
||||
if utf_mode:
|
||||
output.append(0x01)
|
||||
return output
|
||||
|
||||
|
||||
def to_cxx(data, codecs):
|
||||
"""Generates C++ code from a list of encoded bytes."""
|
||||
text = b'/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
|
||||
text += b'The byte array encodes effective tld names. See psl-make-dafsa source for'
|
||||
text += b' documentation.'
|
||||
text += b'*/\n\n'
|
||||
text += b'static const unsigned char kDafsa['
|
||||
text += bytes(str(len(data)), **codecs)
|
||||
text += b'] = {\n'
|
||||
for i in range(0, len(data), 12):
|
||||
text += b' '
|
||||
text += bytes(', '.join('0x%02x' % byte for byte in data[i:i + 12]), **codecs)
|
||||
text += b',\n'
|
||||
text += b'};\n'
|
||||
return text
|
||||
|
||||
def sha1_file(name):
|
||||
sha1 = hashlib.sha1()
|
||||
with open(name, 'rb') as f:
|
||||
while True:
|
||||
data = f.read(65536)
|
||||
if not data:
|
||||
break
|
||||
sha1.update(data)
|
||||
return sha1.hexdigest()
|
||||
|
||||
def to_cxx_plus(data, codecs):
|
||||
"""Generates C++ code from a word list plus some variable assignments as needed by libpsl"""
|
||||
text = to_cxx(data, codecs)
|
||||
text += b'static time_t _psl_file_time = %d;\n' % os.stat(psl_input_file).st_mtime
|
||||
text += b'static int _psl_nsuffixes = %d;\n' % psl_nsuffixes
|
||||
text += b'static int _psl_nexceptions = %d;\n' % psl_nexceptions
|
||||
text += b'static int _psl_nwildcards = %d;\n' % psl_nwildcards
|
||||
text += b'static const char _psl_sha1_checksum[] = "%s";\n' % bytes(sha1_file(psl_input_file), **codecs)
|
||||
text += b'static const char _psl_filename[] = "%s";\n' % bytes(psl_input_file, **codecs)
|
||||
return text
|
||||
|
||||
def words_to_whatever(words, converter, utf_mode, codecs):
|
||||
"""Generates C++ code from a word list"""
|
||||
dafsa = to_dafsa(words, utf_mode)
|
||||
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
|
||||
dafsa = fun(dafsa)
|
||||
return converter(encode(dafsa, utf_mode), codecs)
|
||||
|
||||
|
||||
def words_to_cxx(words, utf_mode, codecs):
|
||||
"""Generates C++ code from a word list"""
|
||||
return words_to_whatever(words, to_cxx, utf_mode, codecs)
|
||||
|
||||
def words_to_cxx_plus(words, utf_mode, codecs):
|
||||
"""Generates C++ code from a word list plus some variable assignments as needed by libpsl"""
|
||||
return words_to_whatever(words, to_cxx_plus, utf_mode, codecs)
|
||||
|
||||
def words_to_binary(words, utf_mode, codecs):
|
||||
"""Generates C++ code from a word list"""
|
||||
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, lambda x, _: bytearray(x), utf_mode, codecs)
|
||||
|
||||
|
||||
def parse_psl(infile, utf_mode, codecs):
|
||||
"""Parses PSL file and extract strings and return code"""
|
||||
PSL_FLAG_EXCEPTION = (1<<0)
|
||||
PSL_FLAG_WILDCARD = (1<<1)
|
||||
PSL_FLAG_ICANN = (1<<2) # entry of ICANN section
|
||||
PSL_FLAG_PRIVATE = (1<<3) # entry of PRIVATE section
|
||||
PSL_FLAG_PLAIN = (1<<4) #just used for PSL syntax checking
|
||||
|
||||
global psl_nsuffixes, psl_nexceptions, psl_nwildcards
|
||||
|
||||
psl = {}
|
||||
section = 0
|
||||
|
||||
for line in infile:
|
||||
line = bytes(line.strip(), **codecs)
|
||||
if not line:
|
||||
continue
|
||||
|
||||
if line.startswith(b'//'):
|
||||
if section == 0:
|
||||
if b'===BEGIN ICANN DOMAINS===' in line:
|
||||
section = PSL_FLAG_ICANN
|
||||
elif b'===BEGIN PRIVATE DOMAINS===' in line:
|
||||
section = PSL_FLAG_PRIVATE
|
||||
elif section == PSL_FLAG_ICANN and b'===END ICANN DOMAINS===' in line:
|
||||
section = 0
|
||||
elif section == PSL_FLAG_PRIVATE and b'===END PRIVATE DOMAINS===' in line:
|
||||
section = 0
|
||||
continue # skip comments
|
||||
|
||||
if line[:1] == b'!':
|
||||
psl_nexceptions += 1
|
||||
flags = PSL_FLAG_EXCEPTION | section
|
||||
line = line[1:]
|
||||
elif line[:1] == b'*':
|
||||
if line[1:2] != b'.':
|
||||
print('Unsupported kind of rule (ignored): %s' % line)
|
||||
continue
|
||||
psl_nwildcards += 1
|
||||
psl_nsuffixes += 1
|
||||
flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
|
||||
line = line[2:]
|
||||
else:
|
||||
psl_nsuffixes += 1
|
||||
flags = PSL_FLAG_PLAIN | section
|
||||
|
||||
punycode = line.decode('utf-8').encode('idna')
|
||||
|
||||
if punycode in psl:
|
||||
"""Found existing entry:
|
||||
Combination of exception and plain rule is ambiguous
|
||||
!foo.bar
|
||||
foo.bar
|
||||
|
||||
Allowed:
|
||||
!foo.bar + *.foo.bar
|
||||
foo.bar + *.foo.bar
|
||||
"""
|
||||
print('Found %s/%X (now %X)' % punycode, psl[punycode], flags)
|
||||
continue
|
||||
|
||||
if utf_mode:
|
||||
psl[line] = flags
|
||||
psl[punycode] = flags
|
||||
|
||||
# with open("psl.out", 'w') as outfile:
|
||||
# for (domain, flags) in sorted(psl.iteritems()):
|
||||
# outfile.write(domain + "%X" % (flags & 0x0F) + "\n")
|
||||
|
||||
return [domain + bytes('%X' % (flags & 0x0F), **codecs) for (domain, flags) in sorted(psl.items())]
|
||||
|
||||
|
||||
def usage():
|
||||
"""Prints the usage"""
|
||||
print('usage: %s [options] infile outfile' % sys.argv[0])
|
||||
print(' --output-format=cxx Write DAFSA as C/C++ code (default)')
|
||||
print(' --output-format=cxx+ Write DAFSA as C/C++ code plus statistical assignments')
|
||||
print(' --output-format=binary Write DAFSA binary data')
|
||||
print(' --encoding=ascii 7-bit ASCII mode')
|
||||
print(' --encoding=utf-8 UTF-8 mode (default)')
|
||||
exit(1)
|
||||
|
||||
|
||||
def main():
|
||||
"""Convert PSL file into C or binary DAFSA file"""
|
||||
if len(sys.argv) < 3:
|
||||
usage()
|
||||
|
||||
converter = words_to_cxx
|
||||
parser = parse_psl
|
||||
utf_mode = True
|
||||
|
||||
codecs = dict()
|
||||
if sys.version_info.major > 2:
|
||||
codecs['encoding'] = 'utf-8'
|
||||
|
||||
for arg in sys.argv[1:-2]:
|
||||
# Check --input-format for backward compatibility
|
||||
if arg.startswith('--input-format='):
|
||||
value = arg[15:].lower()
|
||||
if value == 'psl':
|
||||
parser = parse_psl
|
||||
else:
|
||||
print("Unknown input format '%s'" % value)
|
||||
return 1
|
||||
elif arg.startswith('--output-format='):
|
||||
value = arg[16:].lower()
|
||||
if value == 'binary':
|
||||
converter = words_to_binary
|
||||
elif value == 'cxx':
|
||||
converter = words_to_cxx
|
||||
elif value == 'cxx+':
|
||||
converter = words_to_cxx_plus
|
||||
else:
|
||||
print("Unknown output format '%s'" % value)
|
||||
return 1
|
||||
elif arg.startswith('--encoding='):
|
||||
value = arg[11:].lower()
|
||||
if value == 'ascii':
|
||||
utf_mode = False
|
||||
elif value == 'utf-8':
|
||||
utf_mode = True
|
||||
else:
|
||||
print("Unknown encoding '%s'" % value)
|
||||
return 1
|
||||
else:
|
||||
usage()
|
||||
|
||||
if sys.argv[-2] == '-':
|
||||
with open(sys.argv[-1], 'wb') as outfile:
|
||||
outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
|
||||
else:
|
||||
"""Some statistical data for --output-format=cxx+"""
|
||||
global psl_input_file, psl_nsuffixes, psl_nexceptions, psl_nwildcards
|
||||
|
||||
psl_input_file = sys.argv[-2]
|
||||
psl_nsuffixes = 0
|
||||
psl_nexceptions = 0
|
||||
psl_nwildcards = 0
|
||||
|
||||
with open(sys.argv[-2], 'r', **codecs) as infile, open(sys.argv[-1], 'wb') as outfile:
|
||||
outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
@ -156,6 +156,7 @@ qt_internal_extend_target(Network CONDITION QT_FEATURE_topleveldomain
|
||||
SOURCES
|
||||
kernel/qtldurl.cpp kernel/qtldurl_p.h
|
||||
kernel/qurltlds_p.h
|
||||
../3rdparty/libpsl/src/lookup_string_in_fixed_set.c
|
||||
)
|
||||
|
||||
qt_internal_extend_target(Network CONDITION QT_FEATURE_dnslookup
|
||||
|
@ -1,7 +1,7 @@
|
||||
{
|
||||
"Id": "psl",
|
||||
"Name": "The Public Suffix List",
|
||||
"QDocModule": "qtcore",
|
||||
"QDocModule": "qtnetwork",
|
||||
"Description": "The Public Suffix List is an initiative of Mozilla,
|
||||
but is maintained as a community resource. It is available for use in any software,
|
||||
but was originally created to meet the needs of browser manufacturers.
|
||||
@ -14,8 +14,7 @@ It allows browsers to, for example:
|
||||
- Accurately sort history entries by site",
|
||||
|
||||
"Files": "qurltlds_p.h",
|
||||
"QtUsage": "See util/publicSuffix/ for code-generator",
|
||||
"QtUsage": "Used in Qt Core to avoid setting \"supercookies\" in the cookie jar
|
||||
"QtUsage": "Used in Qt Network to avoid setting \"supercookies\" in the cookie jar
|
||||
supported by Qt (by the QNetworkCookieJar class).",
|
||||
|
||||
"Homepage": "Consult https://github.com/publicsuffix/list for the sha1 but download from ...",
|
||||
|
@ -43,68 +43,25 @@
|
||||
|
||||
#if QT_CONFIG(topleveldomain)
|
||||
|
||||
#include "qplatformdefs.h"
|
||||
#include "qurl.h"
|
||||
#include "private/qurltlds_p.h"
|
||||
#include "private/qtldurl_p.h"
|
||||
#include "QtCore/qlist.h"
|
||||
#include "QtCore/qstring.h"
|
||||
|
||||
#include "qurltlds_p.h"
|
||||
|
||||
// Defined in src/3rdparty/libpsl/src/lookup_string_in_fixed_set.c
|
||||
extern "C" int LookupStringInFixedSet(const unsigned char *graph, std::size_t length,
|
||||
const char *key, std::size_t key_length);
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
|
||||
enum TLDMatchType {
|
||||
ExactMatch,
|
||||
SuffixMatch,
|
||||
ExceptionMatch,
|
||||
};
|
||||
static constexpr int PSL_NOT_FOUND = -1;
|
||||
static constexpr int PSL_FLAG_EXCEPTION = 1 << 0;
|
||||
static constexpr int PSL_FLAG_WILDCARD = 1 << 1;
|
||||
|
||||
// Scan the auto-generated table of TLDs for an entry. For more details
|
||||
// see comments in file: util/publicSuffix/main.cpp
|
||||
static bool containsTLDEntry(QStringView entry, TLDMatchType match)
|
||||
static int lookupDomain(QByteArrayView domain)
|
||||
{
|
||||
const QStringView matchSymbols[] = {
|
||||
u"",
|
||||
u"*",
|
||||
u"!",
|
||||
};
|
||||
const auto symbol = matchSymbols[match];
|
||||
const int index = qt_hash(entry, qt_hash(symbol)) % tldCount;
|
||||
|
||||
// select the right chunk from the big table
|
||||
short chunk = 0;
|
||||
uint chunkIndex = tldIndices[index], offset = 0;
|
||||
|
||||
// The offset in the big string, of the group that our entry hashes into.
|
||||
const auto tldGroupOffset = tldIndices[index];
|
||||
|
||||
// It should always be inside all chunks' total size.
|
||||
Q_ASSERT(tldGroupOffset < tldChunks[tldChunkCount - 1]);
|
||||
// All offsets are stored in non-decreasing order.
|
||||
// This check is within bounds as tldIndices has length tldCount+1.
|
||||
Q_ASSERT(tldGroupOffset <= tldIndices[index + 1]);
|
||||
// The last extra entry in tldIndices
|
||||
// should be equal to the total of all chunks' lengths.
|
||||
static_assert(tldIndices[tldCount] == tldChunks[tldChunkCount - 1]);
|
||||
|
||||
// Find which chunk contains the tldGroupOffset
|
||||
while (tldGroupOffset >= tldChunks[chunk]) {
|
||||
chunkIndex -= tldChunks[chunk];
|
||||
offset += tldChunks[chunk];
|
||||
chunk++;
|
||||
|
||||
// We can not go above the number of chunks we have, since all our
|
||||
// indices are less than the total chunks' size (see asserts above).
|
||||
Q_ASSERT(chunk < tldChunkCount);
|
||||
}
|
||||
|
||||
// check all the entries from the given offset
|
||||
while (chunkIndex < tldIndices[index+1] - offset) {
|
||||
const auto utf8 = tldData[chunk] + chunkIndex;
|
||||
if ((symbol.isEmpty() || QLatin1Char(*utf8) == symbol) && entry == QString::fromUtf8(utf8 + symbol.size()))
|
||||
return true;
|
||||
chunkIndex += uint(qstrlen(utf8)) + 1; // +1 for the ending \0
|
||||
}
|
||||
return false;
|
||||
return LookupStringInFixedSet(kDafsa, sizeof(kDafsa), domain.data(), domain.size());
|
||||
}
|
||||
|
||||
/*!
|
||||
@ -118,19 +75,28 @@ static bool containsTLDEntry(QStringView entry, TLDMatchType match)
|
||||
Q_NETWORK_EXPORT bool qIsEffectiveTLD(QStringView domain)
|
||||
{
|
||||
// for domain 'foo.bar.com':
|
||||
// 1. return if TLD table contains 'foo.bar.com'
|
||||
// 2. else if table contains '*.bar.com',
|
||||
// 3. test that table does not contain '!foo.bar.com'
|
||||
// 1. return false if TLD table contains '!foo.bar.com'
|
||||
// 2. return true if TLD table contains 'foo.bar.com'
|
||||
// 3. return true if the table contains '*.bar.com'
|
||||
|
||||
if (containsTLDEntry(domain, ExactMatch)) // 1
|
||||
return true;
|
||||
QByteArray decodedDomain = domain.toUtf8();
|
||||
QByteArrayView domainView(decodedDomain);
|
||||
|
||||
const auto dot = domain.indexOf(QLatin1Char('.'));
|
||||
auto ret = lookupDomain(domainView);
|
||||
if (ret != PSL_NOT_FOUND) {
|
||||
if (ret & PSL_FLAG_EXCEPTION) // 1
|
||||
return false;
|
||||
if ((ret & PSL_FLAG_WILDCARD) == 0) // 2
|
||||
return true;
|
||||
}
|
||||
|
||||
const auto dot = domainView.indexOf('.');
|
||||
if (dot < 0) // Actual TLD: may be effective if the subject of a wildcard rule:
|
||||
return containsTLDEntry(QString(QLatin1Char('.') + domain), SuffixMatch);
|
||||
if (containsTLDEntry(domain.mid(dot), SuffixMatch)) // 2
|
||||
return !containsTLDEntry(domain, ExceptionMatch); // 3
|
||||
return false;
|
||||
return ret != PSL_NOT_FOUND;
|
||||
ret = lookupDomain(domainView.sliced(dot + 1)); // 3
|
||||
if (ret == PSL_NOT_FOUND)
|
||||
return false;
|
||||
return (ret & PSL_FLAG_WILDCARD) != 0;
|
||||
}
|
||||
|
||||
QT_END_NAMESPACE
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,14 +1,15 @@
|
||||
The file qurltlds_p.h is generated from the Public Suffix
|
||||
List (see [1] and [2]), by the program residing at
|
||||
qtbase/util/publicSuffix/ in the Qt source tree.
|
||||
qtbase/src/3rdparty/libpsl/src/psl-make-dafsa in the Qt source tree.
|
||||
|
||||
That program generates a character array and an index array from the
|
||||
list to provide fast lookups of elements within C++.
|
||||
To regenerate the file, run the following command from qtbase tree:
|
||||
|
||||
src/3rdparty/libpsl/src/psl-make-dafsa public_suffix_list.dat src/network/kernel/qurltlds_p.h
|
||||
|
||||
Those arrays in qurltlds_p.h are derived from the Public
|
||||
Suffix List ([2]), which was originally provided by
|
||||
Jo Hermans <jo.hermans@gmail.com>.
|
||||
|
||||
----
|
||||
[1] list: http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1
|
||||
[2] homepage: http://publicsuffix.org/
|
||||
[1] list: https://publicsuffix.org/list/public_suffix_list.dat
|
||||
[2] homepage: https://publicsuffix.org/
|
||||
|
@ -82,7 +82,8 @@
|
||||
@qpa_headers = ( qr/^qplatform/, qr/^qwindowsystem/ );
|
||||
my @internal_zlib_headers = ( "crc32.h", "deflate.h", "gzguts.h", "inffast.h", "inffixed.h", "inflate.h", "inftrees.h", "trees.h", "zutil.h" );
|
||||
my @zlib_headers = ( "zconf.h", "zlib.h" );
|
||||
@ignore_headers = ( @internal_zlib_headers );
|
||||
my @internal_qtnetwork_headers = ( "qurltlds_p.h" );
|
||||
@ignore_headers = ( @internal_zlib_headers, @internal_qtnetwork_headers );
|
||||
@ignore_for_include_check = ( "qsystemdetection.h", "qcompilerdetection.h", "qprocessordetection.h", @zlib_headers);
|
||||
@ignore_for_qt_begin_namespace_check = ( "qt_windows.h", @zlib_headers);
|
||||
%inject_headers = (
|
||||
|
Loading…
Reference in New Issue
Block a user