Network: Use public suffix database in DAFSA format

This saves 91kiB of text size and makes it easier to use
a locally installed database when available.

[ChangeLog][Third-Party Code] Moved attribution of
The Public Suffix List from Qt Core to Qt Network.

[ChangeLog][Third-Party Code] Added attribution of new libpsl
library to Qt Network. libpsl is available under
the BSD 3-Clause "New" or "Revised" License.

Task-number: QTBUG-95889
Change-Id: Ibd37c7a94fdf235e75d96fec20d427fb5c2bd2a4
Reviewed-by: Kai Koehne <kai.koehne@qt.io>
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Ievgenii Meshcheriakov 2022-02-21 15:04:05 +01:00
parent d0b22762be
commit cba40055b1
10 changed files with 5446 additions and 14912 deletions

src/3rdparty/libpsl/qt_attribution.json vendored Normal file

@@ -0,0 +1,15 @@
{
"Id": "libpsl",
"Name": "libpsl - C library to handle the Public Suffix List",
"QDocModule": "qtnetwork",
"Description": "Libpsl allows checking domains against the Public Suffix List.",
"Files": "src/lookup_string_in_fixed_set.c src/psl-make-dafsa",
"QtUsage": "Used to compress the embedded copy of publicsuffix list and
to lookup entries in it.",
"Homepage": "https://github.com/rockdaboot/libpsl",
"Version": "664f3dc85259ec65e30248a61fa1c45b7b0e4c3f",
"License": "BSD 3-clause \"New\" or \"Revised\" License",
"LicenseFile": "src/LICENSE.chromium",
"LicenseId": "BSD-3-Clause",
"Copyright": "Copyright 2014-2016 The Chromium Authors. All rights reserved."
}


@@ -0,0 +1,30 @@
* The following License is for the source code files
psl-make-dafsa and lookup_string_in_fixed_set.c.
// Copyright 2015 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


@@ -0,0 +1,273 @@
/* Copyright 2015-2016 The Chromium Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE.chromium file.
*
* Converted to C89 2015 by Tim Rühsen
*/
#include <stddef.h>
#define CHECK_LT(a, b) if ((a) >= b) return 0
static const char multibyte_length_table[16] = {
0, 0, 0, 0, /* 0x00-0x3F */
0, 0, 0, 0, /* 0x40-0x7F */
0, 0, 0, 0, /* 0x80-0xBF */
2, 2, 3, 4, /* 0xC0-0xFF */
};
/*
* Get length of multibyte character sequence starting at a given byte.
* Returns zero if the byte is not a valid leading byte in UTF-8.
*/
static int GetMultibyteLength(char c) {
return multibyte_length_table[((unsigned char)c) >> 4];
}
/*
* Moves pointers one byte forward.
*/
static void NextPos(const unsigned char** pos,
const char** key,
const char** multibyte_start)
{
++*pos;
if (*multibyte_start) {
/* Advance key to next byte in multibyte sequence. */
++*key;
/* Reset multibyte_start if last byte in multibyte sequence was consumed. */
if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
*multibyte_start = 0;
} else {
if (GetMultibyteLength(**key)) {
/* Multibyte prefix was matched in the dafsa, start matching multibyte
* content in next round. */
*multibyte_start = *key;
} else {
/* Advance key as a single byte character was matched. */
++*key;
}
}
}
/*
* Read next offset from pos.
* Returns true if an offset could be read, false otherwise.
*/
static int GetNextOffset(const unsigned char** pos,
const unsigned char* end,
const unsigned char** offset)
{
size_t bytes_consumed;
if (*pos == end)
return 0;
/* When reading an offset the byte array must always contain at least
* three more bytes to consume. First the offset to read, then a node
* to skip over and finally a destination node. No object can be smaller
* than one byte. */
CHECK_LT(*pos + 2, end);
switch (**pos & 0x60) {
case 0x60: /* Read three byte offset */
*offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
bytes_consumed = 3;
break;
case 0x40: /* Read two byte offset */
*offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
bytes_consumed = 2;
break;
default:
*offset += (*pos)[0] & 0x3F;
bytes_consumed = 1;
}
if ((**pos & 0x80) != 0) {
*pos = end;
} else {
*pos += bytes_consumed;
}
return 1;
}
/*
* Check if byte at offset is last in label.
*/
static int IsEOL(const unsigned char* offset, const unsigned char* end)
{
CHECK_LT(offset, end);
return (*offset & 0x80) != 0;
}
/*
* Check if byte at offset matches first character in key.
* This version assumes a range check was already performed by the caller.
*/
static int IsMatchUnchecked(const unsigned char matcher,
const char* key,
const char* multibyte_start)
{
if (multibyte_start) {
/* Multibyte matching mode. */
if (multibyte_start == key) {
/* Match leading byte, which will also match the sequence length. */
return (matcher ^ 0x80) == (const unsigned char)*key;
} else {
/* Match following bytes. */
return (matcher ^ 0xC0) == (const unsigned char)*key;
}
}
/* If key points at a leading byte in a multibyte sequence, but we are not yet
* in multibyte mode, then the dafsa should contain a special byte to indicate
* a mode switch. */
if (GetMultibyteLength(*key)) {
return matcher == 0x1F;
}
/* Normal matching of a single byte character. */
return matcher == (const unsigned char)*key;
}
/*
* Check if byte at offset matches first character in key.
* This version matches characters not last in label.
*/
static int IsMatch(const unsigned char* offset,
const unsigned char* end,
const char* key,
const char* multibyte_start)
{
CHECK_LT(offset, end);
return IsMatchUnchecked(*offset, key, multibyte_start);
}
/*
* Check if byte at offset matches first character in key.
* This version matches characters last in label.
*/
static int IsEndCharMatch(const unsigned char* offset,
const unsigned char* end,
const char* key,
const char* multibyte_start)
{
CHECK_LT(offset, end);
return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
}
/*
* Read return value at offset.
* Returns true if a return value could be read, false otherwise.
*/
static int GetReturnValue(const unsigned char* offset,
const unsigned char* end,
const char* multibyte_start,
int* return_value)
{
CHECK_LT(offset, end);
if (!multibyte_start && (*offset & 0xE0) == 0x80) {
*return_value = *offset & 0x0F;
return 1;
}
return 0;
}
/*
* Looks up the string |key| with length |key_length| in a fixed set of
* strings. The set of strings must be known at compile time. It is converted to
* a graph structure named a DAFSA (Deterministic Acyclic Finite State
* Automaton) by the script psl-make-dafsa during compilation. This permits
* efficient (in time and space) lookup. The graph generated by psl-make-dafsa
* takes the form of a constant byte array which should be supplied via the
* |graph| and |length| parameters. The return value is kDafsaNotFound,
* kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule,
* kDafsaWildcardRule and kDafsaPrivateRule ORed together.
*
* Lookup a domain key in a byte array generated by psl-make-dafsa.
*/
/* prototype to skip warning with -Wmissing-prototypes */
int LookupStringInFixedSet(const unsigned char*, size_t, const char*, size_t);
int LookupStringInFixedSet(const unsigned char* graph,
size_t length,
const char* key,
size_t key_length)
{
const unsigned char* pos = graph;
const unsigned char* end = graph + length;
const unsigned char* offset = pos;
const char* key_end = key + key_length;
const char* multibyte_start = 0;
while (GetNextOffset(&pos, end, &offset)) {
/* char <char>+ end_char offsets
* char <char>+ return value
* char end_char offsets
* char return value
* end_char offsets
* return_value
*/
int did_consume = 0;
if (key != key_end && !IsEOL(offset, end)) {
/* Leading <char> is not a match. Don't dive into this child */
if (!IsMatch(offset, end, key, multibyte_start))
continue;
did_consume = 1;
NextPos(&offset, &key, &multibyte_start);
/* Possible matches at this point:
* <char>+ end_char offsets
* <char>+ return value
* end_char offsets
* return value
*/
/* Remove all remaining <char> nodes possible */
while (!IsEOL(offset, end) && key != key_end) {
if (!IsMatch(offset, end, key, multibyte_start))
return -1;
NextPos(&offset, &key, &multibyte_start);
}
}
/* Possible matches at this point:
* end_char offsets
* return_value
* If one or more <char> elements were consumed, a failure
* to match is terminal. Otherwise, try the next node.
*/
if (key == key_end) {
int return_value;
if (GetReturnValue(offset, end, multibyte_start, &return_value))
return return_value;
/* The DAFSA guarantees that if the first char is a match, all
* remaining char elements MUST match if the key is truly present.
*/
if (did_consume)
return -1;
continue;
}
if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
if (did_consume)
return -1; /* Unexpected */
continue;
}
NextPos(&offset, &key, &multibyte_start);
pos = offset; /* Dive into child */
}
return -1; /* No match */
}
/* prototype to skip warning with -Wmissing-prototypes */
int GetUtfMode(const unsigned char *graph, size_t length);
int GetUtfMode(const unsigned char *graph, size_t length)
{
return length > 0 && graph[length - 1] < 0x80;
}
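
A minimal usage sketch (not part of this patch): it exercises the decoder against the
seven-byte example graph from the psl-make-dafsa documentation below, which encodes
"aa" -> 1 and "a" -> 2, and assumes it is compiled as C++ and linked together with
lookup_string_in_fixed_set.c.

#include <cassert>
#include <cstddef>

/* Declared the same way qtldurl.cpp declares it further down in this commit. */
extern "C" int LookupStringInFixedSet(const unsigned char *graph, std::size_t length,
                                      const char *key, std::size_t key_length);

int main()
{
    /* Example 1 from psl-make-dafsa: words "aa" (return value 1) and "a" (return value 2). */
    static const unsigned char dafsa[7] = {
        0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
    };
    assert(LookupStringInFixedSet(dafsa, sizeof(dafsa), "aa", 2) == 1);
    assert(LookupStringInFixedSet(dafsa, sizeof(dafsa), "a", 1) == 2);
    assert(LookupStringInFixedSet(dafsa, sizeof(dafsa), "b", 1) == -1); /* not in the set */
    return 0;
}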

src/3rdparty/libpsl/src/psl-make-dafsa vendored Executable file

@@ -0,0 +1,692 @@
#!/usr/bin/env python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE.chromium file.
"""
A Deterministic acyclic finite state automaton (DAFSA) is a compact
representation of an unordered word list (dictionary).
https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
This python program converts a list of strings to a byte array in C++.
It fetches strings and return values from a gperf file and generates a C++ file
with a byte array representing a graph that can be used as a memory-efficient
replacement for the perfect hash table.
The input strings must consist of printable 7-bit ASCII characters or UTF-8
multibyte sequences. Control characters in the range [0x00-0x1F] are not
allowed. The return values must be one-digit integers.
In this program a DAFSA is a diamond shaped graph starting at a common
source node and ending at a common sink node. All internal nodes contain
a label and each word is represented by the labels in one path from
the source node to the sink node.
The following python representation is used for nodes:
Source node: [ children ]
Internal node: (label, [ children ])
Sink node: None
The graph is first compressed by prefixes like a trie. In the next step
suffixes are compressed so that the graph gets diamond shaped. Finally
one to one linked nodes are replaced by nodes with the labels joined.
The order of the operations is crucial since lookups will be performed
starting from the source with no backtracking. Thus a node must have at
most one child with a label starting with the same character. The output
is also arranged so that all jumps are to increasing addresses, thus forward
in memory.
The generated output has suffix free decoding so that the sign of leading
bits in a link (a reference to a child node) indicates if it has a size of one,
two or three bytes and if it is the last outgoing link from the actual node.
A node label is terminated by a byte with the leading bit set.
The generated byte array can be described by the following BNF:
<byte> ::= < 8-bit value in range [0x00-0xFF] >
<char> ::= < byte in range [0x1F-0x7F] >
<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
<offset1> ::= < byte in range [0x00-0x3F] >
<offset2> ::= < byte in range [0x40-0x5F] >
<offset3> ::= < byte in range [0x60-0x7F] >
<end_offset1> ::= < byte in range [0x80-0xBF] >
<end_offset2> ::= < byte in range [0xC0-0xDF] >
<end_offset3> ::= < byte in range [0xE0-0xFF] >
<prefix> ::= <char>
<label> ::= <end_char>
| <char> <label>
<end_label> ::= <return_value>
| <char> <end_label>
<offset> ::= <offset1>
| <offset2> <byte>
| <offset3> <byte> <byte>
<end_offset> ::= <end_offset1>
| <end_offset2> <byte>
| <end_offset3> <byte> <byte>
<offsets> ::= <end_offset>
| <offset> <offsets>
<source> ::= <offsets>
<node> ::= <label> <offsets>
| <prefix> <node>
| <end_label>
<graph> ::= <graph>
| <graph> <node>
<version> ::= <empty> # The DAFSA was generated in ASCII mode.
| < byte value 0x01 > # The DAFSA was generated in UTF-8 mode.
<dafsa> ::= <graph> <version>
Decoding:
<char> -> character
<end_char> & 0x7F -> character
<return value> & 0x0F -> integer
<offset1 & 0x3F> -> integer
((<offset2> & 0x1F>) << 8) + <byte> -> integer
((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer
end_offset1, end_offset2 and end_offset3 are decoded the same as offset1,
offset2 and offset3 respectively.
The first offset in a list of offsets is the distance in bytes between the
offset itself and the first child node. Subsequent offsets are the distance
between previous child node and next child node. Thus each offset links a node
to a child node. The distance is always counted between start addresses, i.e.
first byte in decoded offset or first byte in child node.
Transcoding of UTF-8 multibyte sequences:
The original DAFSA format was limited to 7-bit printable ASCII characters in
range [0x20-0xFF], but has been extended to allow UTF-8 multibyte sequences.
By transcoding of such characters the new format preserves compatibility with
old parsers, so that a DAFSA in the extended format can be used by an old
parser without false positives, although strings containing transcoded
characters will never match. Since the format is extended rather than being
changed, a parser supporting the new format will automatically support data
generated in the old format.
Transcoding is performed by insertion of a start byte with the special value
0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
the range of printable ASCII.
2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
Example 1:
%%
aa, 1
a, 2
%%
The input is first parsed to a list of words:
["aa1", "a2"]
A fully expanded graph is created from the words:
source = [node1, node4]
node1 = ("a", [node2])
node2 = ("a", [node3])
node3 = ("\x01", [sink])
node4 = ("a", [node5])
node5 = ("\x02", [sink])
sink = None
Compression results in the following graph:
source = [node1]
node1 = ("a", [node2, node3])
node2 = ("\x02", [sink])
node3 = ("a\x01", [sink])
sink = None
A C++ representation of the compressed graph is generated:
const unsigned char dafsa[7] = {
0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
};
The bytes in the generated array have the following meaning:
0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1
1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a"
2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4
3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5
4: 0x82 <return_value> 0x82 & 0x0F -> return 2
5: 0x61 <char> label character 0x61 -> match "a"
6: 0x81 <return_value> 0x81 & 0x0F -> return 1
Example 2:
%%
aa, 1
bbb, 2
baa, 1
%%
The input is first parsed to a list of words:
["aa1", "bbb2", "baa1"]
Compression results in the following graph:
source = [node1, node2]
node1 = ("b", [node2, node3])
node2 = ("aa\x01", [sink])
node3 = ("bb\x02", [sink])
sink = None
A C++ representation of the compressed graph is generated:
const unsigned char dafsa[11] = {
0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
};
The bytes in the generated array have the following meaning:
0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2
1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5
2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b"
3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5
4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8
5: 0x61 <char> label character 0x61 -> match "a"
6: 0x61 <char> label character 0x61 -> match "a"
7: 0x81 <return_value> 0x81 & 0x0F -> return 1
8: 0x62 <char> label character 0x62 -> match "b"
9: 0x62 <char> label character 0x62 -> match "b"
10: 0x82 <return_value> 0x82 & 0x0F -> return 2
"""
import sys
import os.path
import hashlib
class InputError(Exception):
"""Exception raised for errors in the input file."""
# Length of a character starting at a given byte.
char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x0F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x10-0x1F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x20-0x2F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x30-0x3F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x40-0x4F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x50-0x5F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x60-0x6F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x70-0x7F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x80-0x8F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x90-0x9F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xA0-0xAF
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xB0-0xBF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xC0-0xCF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xD0-0xDF
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
def to_bytes(n):
"""Converts an integer value to a bytes object."""
return bytes(bytearray((n,)))
def to_dafsa(words, utf_mode):
"""Generates a DAFSA from a word list and returns the source node.
Each word is split into characters so that each character is represented by
a unique node. It is assumed the word list is not empty.
"""
if not words:
raise InputError('The domain list must not be empty')
def to_nodes(word, multibyte_length):
"""Split words into characters"""
byte = ord(word[:1])
if multibyte_length:
# Consume next byte in multibyte sequence.
if byte & 0xC0 != 0x80:
raise InputError('Invalid UTF-8 multibyte sequence')
return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
char_length = char_length_table[byte]
if char_length == 1:
# 7-bit printable ASCII.
if len(word) == 1:
return to_bytes(int(word[:1], 16) & 0x0F), [None]
return word[:1], [to_nodes(word[1:], 0)]
elif char_length > 1:
# Leading byte in multibyte sequence.
if not utf_mode:
raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
if len(word) <= char_length:
raise InputError('Unterminated UTF-8 multibyte sequence')
return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
# Unexpected character.
raise InputError('Domain names must be printable ASCII or UTF-8')
return [to_nodes(word, 0) for word in words]
def to_words(node):
"""Generates a word list from all paths starting from an internal node."""
if not node:
return [b'']
return [(node[0] + word) for child in node[1] for word in to_words(child)]
def reverse(dafsa):
"""Generates a new DAFSA that is reversed, so that the old sink node becomes
the new source node.
"""
sink = []
nodemap = {}
def dfs(node, parent):
"""Creates reverse nodes.
A new reverse node will be created for each old node. The new node will
get a reversed label and the parents of the old node as children.
"""
if not node:
sink.append(parent)
elif id(node) not in nodemap:
nodemap[id(node)] = (node[0][::-1], [parent])
for child in node[1]:
dfs(child, nodemap[id(node)])
else:
nodemap[id(node)][1].append(parent)
for node in dafsa:
dfs(node, None)
return sink
def join_labels(dafsa):
"""Generates a new DAFSA where internal nodes are merged if there is a one to
one connection.
"""
parentcount = {id(None): 2}
nodemap = {id(None): None}
def count_parents(node):
"""Count incoming references"""
if id(node) in parentcount:
parentcount[id(node)] += 1
else:
parentcount[id(node)] = 1
for child in node[1]:
count_parents(child)
def join(node):
"""Create new nodes"""
if id(node) not in nodemap:
children = [join(child) for child in node[1]]
if len(children) == 1 and parentcount[id(node[1][0])] == 1:
child = children[0]
nodemap[id(node)] = (node[0] + child[0], child[1])
else:
nodemap[id(node)] = (node[0], children)
return nodemap[id(node)]
for node in dafsa:
count_parents(node)
return [join(node) for node in dafsa]
def join_suffixes(dafsa):
"""Generates a new DAFSA where nodes that represent the same word lists
towards the sink are merged.
"""
nodemap = {frozenset((b'',)): None}
def join(node):
"""Returns a matching node. A new node is created if no matching node
exists. The graph is accessed in dfs order.
"""
suffixes = frozenset(to_words(node))
if suffixes not in nodemap:
nodemap[suffixes] = (node[0], [join(child) for child in node[1]])
return nodemap[suffixes]
return [join(node) for node in dafsa]
def top_sort(dafsa):
"""Generates list of nodes in topological sort order."""
incoming = {}
def count_incoming(node):
"""Counts incoming references."""
if node:
if id(node) not in incoming:
incoming[id(node)] = 1
for child in node[1]:
count_incoming(child)
else:
incoming[id(node)] += 1
for node in dafsa:
count_incoming(node)
for node in dafsa:
incoming[id(node)] -= 1
waiting = [node for node in dafsa if incoming[id(node)] == 0]
nodes = []
while waiting:
node = waiting.pop()
assert incoming[id(node)] == 0
nodes.append(node)
for child in node[1]:
if child:
incoming[id(child)] -= 1
if incoming[id(child)] == 0:
waiting.append(child)
return nodes
def encode_links(children, offsets, current):
"""Encodes a list of children as one, two or three byte offsets."""
if not children[0]:
# This is an <end_label> node and no links follow such nodes
assert len(children) == 1
return []
guess = 3 * len(children)
assert children
children = sorted(children, key=lambda x: -offsets[id(x)])
while True:
offset = current + guess
buf = []
for child in children:
last = len(buf)
distance = offset - offsets[id(child)]
assert distance > 0 and distance < (1 << 21)
if distance < (1 << 6):
# A 6-bit offset: "s0xxxxxx"
buf.append(distance)
elif distance < (1 << 13):
# A 13-bit offset: "s10xxxxxxxxxxxxx"
buf.append(0x40 | (distance >> 8))
buf.append(distance & 0xFF)
else:
# A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
buf.append(0x60 | (distance >> 16))
buf.append((distance >> 8) & 0xFF)
buf.append(distance & 0xFF)
# Distance in first link is relative to following record.
# Distance in other links are relative to previous link.
offset -= distance
if len(buf) == guess:
break
guess = len(buf)
# Set most significant bit to mark end of links in this node.
buf[last] |= (1 << 7)
buf.reverse()
return buf
def encode_prefix(label):
"""Encodes a node label as a list of bytes without a trailing high byte.
This method encodes a node if there is exactly one child and the
child follows immediately after so that no jump is needed. This label
will then be a prefix to the label in the child node.
"""
assert label
return [c for c in bytearray(reversed(label))]
def encode_label(label):
"""Encodes a node label as a list of bytes with a trailing high byte >0x80.
"""
buf = encode_prefix(label)
# Set most significant bit to mark end of label in this node.
buf[0] |= (1 << 7)
return buf
def encode(dafsa, utf_mode):
"""Encodes a DAFSA to a list of bytes"""
output = []
offsets = {}
for node in reversed(top_sort(dafsa)):
if (len(node[1]) == 1 and node[1][0] and
(offsets[id(node[1][0])] == len(output))):
output.extend(encode_prefix(node[0]))
else:
output.extend(encode_links(node[1], offsets, len(output)))
output.extend(encode_label(node[0]))
offsets[id(node)] = len(output)
output.extend(encode_links(dafsa, offsets, len(output)))
output.reverse()
if utf_mode:
output.append(0x01)
return output
def to_cxx(data, codecs):
"""Generates C++ code from a list of encoded bytes."""
text = b'/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
text += b'The byte array encodes effective tld names. See psl-make-dafsa source for'
text += b' documentation.'
text += b'*/\n\n'
text += b'static const unsigned char kDafsa['
text += bytes(str(len(data)), **codecs)
text += b'] = {\n'
for i in range(0, len(data), 12):
text += b' '
text += bytes(', '.join('0x%02x' % byte for byte in data[i:i + 12]), **codecs)
text += b',\n'
text += b'};\n'
return text
def sha1_file(name):
sha1 = hashlib.sha1()
with open(name, 'rb') as f:
while True:
data = f.read(65536)
if not data:
break
sha1.update(data)
return sha1.hexdigest()
def to_cxx_plus(data, codecs):
"""Generates C++ code from a word list plus some variable assignments as needed by libpsl"""
text = to_cxx(data, codecs)
text += b'static time_t _psl_file_time = %d;\n' % os.stat(psl_input_file).st_mtime
text += b'static int _psl_nsuffixes = %d;\n' % psl_nsuffixes
text += b'static int _psl_nexceptions = %d;\n' % psl_nexceptions
text += b'static int _psl_nwildcards = %d;\n' % psl_nwildcards
text += b'static const char _psl_sha1_checksum[] = "%s";\n' % bytes(sha1_file(psl_input_file), **codecs)
text += b'static const char _psl_filename[] = "%s";\n' % bytes(psl_input_file, **codecs)
return text
def words_to_whatever(words, converter, utf_mode, codecs):
"""Generates C++ code from a word list"""
dafsa = to_dafsa(words, utf_mode)
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
dafsa = fun(dafsa)
return converter(encode(dafsa, utf_mode), codecs)
def words_to_cxx(words, utf_mode, codecs):
"""Generates C++ code from a word list"""
return words_to_whatever(words, to_cxx, utf_mode, codecs)
def words_to_cxx_plus(words, utf_mode, codecs):
"""Generates C++ code from a word list plus some variable assignments as needed by libpsl"""
return words_to_whatever(words, to_cxx_plus, utf_mode, codecs)
def words_to_binary(words, utf_mode, codecs):
"""Generates C++ code from a word list"""
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, lambda x, _: bytearray(x), utf_mode, codecs)
def parse_psl(infile, utf_mode, codecs):
"""Parses PSL file and extract strings and return code"""
PSL_FLAG_EXCEPTION = (1<<0)
PSL_FLAG_WILDCARD = (1<<1)
PSL_FLAG_ICANN = (1<<2) # entry of ICANN section
PSL_FLAG_PRIVATE = (1<<3) # entry of PRIVATE section
PSL_FLAG_PLAIN = (1<<4) #just used for PSL syntax checking
global psl_nsuffixes, psl_nexceptions, psl_nwildcards
psl = {}
section = 0
for line in infile:
line = bytes(line.strip(), **codecs)
if not line:
continue
if line.startswith(b'//'):
if section == 0:
if b'===BEGIN ICANN DOMAINS===' in line:
section = PSL_FLAG_ICANN
elif b'===BEGIN PRIVATE DOMAINS===' in line:
section = PSL_FLAG_PRIVATE
elif section == PSL_FLAG_ICANN and b'===END ICANN DOMAINS===' in line:
section = 0
elif section == PSL_FLAG_PRIVATE and b'===END PRIVATE DOMAINS===' in line:
section = 0
continue # skip comments
if line[:1] == b'!':
psl_nexceptions += 1
flags = PSL_FLAG_EXCEPTION | section
line = line[1:]
elif line[:1] == b'*':
if line[1:2] != b'.':
print('Unsupported kind of rule (ignored): %s' % line)
continue
psl_nwildcards += 1
psl_nsuffixes += 1
flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
line = line[2:]
else:
psl_nsuffixes += 1
flags = PSL_FLAG_PLAIN | section
punycode = line.decode('utf-8').encode('idna')
if punycode in psl:
"""Found existing entry:
Combination of exception and plain rule is ambiguous
!foo.bar
foo.bar
Allowed:
!foo.bar + *.foo.bar
foo.bar + *.foo.bar
"""
print('Found %s/%X (now %X)' % (punycode, psl[punycode], flags))
continue
if utf_mode:
psl[line] = flags
psl[punycode] = flags
# with open("psl.out", 'w') as outfile:
# for (domain, flags) in sorted(psl.iteritems()):
# outfile.write(domain + "%X" % (flags & 0x0F) + "\n")
return [domain + bytes('%X' % (flags & 0x0F), **codecs) for (domain, flags) in sorted(psl.items())]
def usage():
"""Prints the usage"""
print('usage: %s [options] infile outfile' % sys.argv[0])
print(' --output-format=cxx Write DAFSA as C/C++ code (default)')
print(' --output-format=cxx+ Write DAFSA as C/C++ code plus statistical assignments')
print(' --output-format=binary Write DAFSA binary data')
print(' --encoding=ascii 7-bit ASCII mode')
print(' --encoding=utf-8 UTF-8 mode (default)')
exit(1)
def main():
"""Convert PSL file into C or binary DAFSA file"""
if len(sys.argv) < 3:
usage()
converter = words_to_cxx
parser = parse_psl
utf_mode = True
codecs = dict()
if sys.version_info.major > 2:
codecs['encoding'] = 'utf-8'
for arg in sys.argv[1:-2]:
# Check --input-format for backward compatibility
if arg.startswith('--input-format='):
value = arg[15:].lower()
if value == 'psl':
parser = parse_psl
else:
print("Unknown input format '%s'" % value)
return 1
elif arg.startswith('--output-format='):
value = arg[16:].lower()
if value == 'binary':
converter = words_to_binary
elif value == 'cxx':
converter = words_to_cxx
elif value == 'cxx+':
converter = words_to_cxx_plus
else:
print("Unknown output format '%s'" % value)
return 1
elif arg.startswith('--encoding='):
value = arg[11:].lower()
if value == 'ascii':
utf_mode = False
elif value == 'utf-8':
utf_mode = True
else:
print("Unknown encoding '%s'" % value)
return 1
else:
usage()
if sys.argv[-2] == '-':
with open(sys.argv[-1], 'wb') as outfile:
outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
else:
"""Some statistical data for --output-format=cxx+"""
global psl_input_file, psl_nsuffixes, psl_nexceptions, psl_nwildcards
psl_input_file = sys.argv[-2]
psl_nsuffixes = 0
psl_nexceptions = 0
psl_nwildcards = 0
with open(sys.argv[-2], 'r', **codecs) as infile, open(sys.argv[-1], 'wb') as outfile:
outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))
return 0
if __name__ == '__main__':
sys.exit(main())
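
For orientation, a header produced with --output-format=cxx has roughly the shape
sketched below (an illustration based on to_cxx() above; the bytes shown are merely
the seven-byte array from Example 1, not real publicsuffix data). The regenerated
qurltlds_p.h consumed by qtldurl.cpp follows this pattern, and kDafsa together with
sizeof(kDafsa) is what gets handed to LookupStringInFixedSet().

/* This file has been generated by psl-make-dafsa. DO NOT EDIT!

The byte array encodes effective tld names. See psl-make-dafsa source for documentation.*/

static const unsigned char kDafsa[7] = {
  0x81, 0xe1, 0x02, 0x81, 0x82, 0x61, 0x81,
};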


@@ -156,6 +156,7 @@ qt_internal_extend_target(Network CONDITION QT_FEATURE_topleveldomain
SOURCES
kernel/qtldurl.cpp kernel/qtldurl_p.h
kernel/qurltlds_p.h
../3rdparty/libpsl/src/lookup_string_in_fixed_set.c
)
qt_internal_extend_target(Network CONDITION QT_FEATURE_dnslookup


@@ -1,7 +1,7 @@
{
"Id": "psl",
"Name": "The Public Suffix List",
"QDocModule": "qtcore",
"QDocModule": "qtnetwork",
"Description": "The Public Suffix List is an initiative of Mozilla,
but is maintained as a community resource. It is available for use in any software,
but was originally created to meet the needs of browser manufacturers.
@@ -14,8 +14,7 @@ It allows browsers to, for example:
- Accurately sort history entries by site",
"Files": "qurltlds_p.h",
"QtUsage": "See util/publicSuffix/ for code-generator",
"QtUsage": "Used in Qt Core to avoid setting \"supercookies\" in the cookie jar
"QtUsage": "Used in Qt Network to avoid setting \"supercookies\" in the cookie jar
supported by Qt (by the QNetworkCookieJar class).",
"Homepage": "Consult https://github.com/publicsuffix/list for the sha1 but download from ...",


@@ -43,68 +43,25 @@
#if QT_CONFIG(topleveldomain)
#include "qplatformdefs.h"
#include "qurl.h"
#include "private/qurltlds_p.h"
#include "private/qtldurl_p.h"
#include "QtCore/qlist.h"
#include "QtCore/qstring.h"
#include "qurltlds_p.h"
// Defined in src/3rdparty/libpsl/src/lookup_string_in_fixed_set.c
extern "C" int LookupStringInFixedSet(const unsigned char *graph, std::size_t length,
const char *key, std::size_t key_length);
QT_BEGIN_NAMESPACE
enum TLDMatchType {
ExactMatch,
SuffixMatch,
ExceptionMatch,
};
static constexpr int PSL_NOT_FOUND = -1;
static constexpr int PSL_FLAG_EXCEPTION = 1 << 0;
static constexpr int PSL_FLAG_WILDCARD = 1 << 1;
// Scan the auto-generated table of TLDs for an entry. For more details
// see comments in file: util/publicSuffix/main.cpp
static bool containsTLDEntry(QStringView entry, TLDMatchType match)
static int lookupDomain(QByteArrayView domain)
{
const QStringView matchSymbols[] = {
u"",
u"*",
u"!",
};
const auto symbol = matchSymbols[match];
const int index = qt_hash(entry, qt_hash(symbol)) % tldCount;
// select the right chunk from the big table
short chunk = 0;
uint chunkIndex = tldIndices[index], offset = 0;
// The offset in the big string, of the group that our entry hashes into.
const auto tldGroupOffset = tldIndices[index];
// It should always be inside all chunks' total size.
Q_ASSERT(tldGroupOffset < tldChunks[tldChunkCount - 1]);
// All offsets are stored in non-decreasing order.
// This check is within bounds as tldIndices has length tldCount+1.
Q_ASSERT(tldGroupOffset <= tldIndices[index + 1]);
// The last extra entry in tldIndices
// should be equal to the total of all chunks' lengths.
static_assert(tldIndices[tldCount] == tldChunks[tldChunkCount - 1]);
// Find which chunk contains the tldGroupOffset
while (tldGroupOffset >= tldChunks[chunk]) {
chunkIndex -= tldChunks[chunk];
offset += tldChunks[chunk];
chunk++;
// We can not go above the number of chunks we have, since all our
// indices are less than the total chunks' size (see asserts above).
Q_ASSERT(chunk < tldChunkCount);
}
// check all the entries from the given offset
while (chunkIndex < tldIndices[index+1] - offset) {
const auto utf8 = tldData[chunk] + chunkIndex;
if ((symbol.isEmpty() || QLatin1Char(*utf8) == symbol) && entry == QString::fromUtf8(utf8 + symbol.size()))
return true;
chunkIndex += uint(qstrlen(utf8)) + 1; // +1 for the ending \0
}
return false;
return LookupStringInFixedSet(kDafsa, sizeof(kDafsa), domain.data(), domain.size());
}
/*!
@@ -118,19 +75,28 @@ static bool containsTLDEntry(QStringView entry, TLDMatchType match)
Q_NETWORK_EXPORT bool qIsEffectiveTLD(QStringView domain)
{
// for domain 'foo.bar.com':
// 1. return if TLD table contains 'foo.bar.com'
// 2. else if table contains '*.bar.com',
// 3. test that table does not contain '!foo.bar.com'
// 1. return false if TLD table contains '!foo.bar.com'
// 2. return true if TLD table contains 'foo.bar.com'
// 3. return true if the table contains '*.bar.com'
if (containsTLDEntry(domain, ExactMatch)) // 1
return true;
QByteArray decodedDomain = domain.toUtf8();
QByteArrayView domainView(decodedDomain);
const auto dot = domain.indexOf(QLatin1Char('.'));
auto ret = lookupDomain(domainView);
if (ret != PSL_NOT_FOUND) {
if (ret & PSL_FLAG_EXCEPTION) // 1
return false;
if ((ret & PSL_FLAG_WILDCARD) == 0) // 2
return true;
}
const auto dot = domainView.indexOf('.');
if (dot < 0) // Actual TLD: may be effective if the subject of a wildcard rule:
return containsTLDEntry(QString(QLatin1Char('.') + domain), SuffixMatch);
if (containsTLDEntry(domain.mid(dot), SuffixMatch)) // 2
return !containsTLDEntry(domain, ExceptionMatch); // 3
return false;
return ret != PSL_NOT_FOUND;
ret = lookupDomain(domainView.sliced(dot + 1)); // 3
if (ret == PSL_NOT_FOUND)
return false;
return (ret & PSL_FLAG_WILDCARD) != 0;
}
QT_END_NAMESPACE
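
To make the new rule handling concrete, a small hypothetical check (not part of the
patch; it assumes the private qtldurl_p.h header is reachable, as it is from Qt's own
autotests) would be expected to behave as follows for some well-known publicsuffix rules:

#include <QtNetwork/private/qtldurl_p.h>
#include <QtCore/qdebug.h>

int main()
{
    // Plain rule "co.uk": the suffix itself is effective, a domain registered under it is not.
    qDebug() << qIsEffectiveTLD(u"co.uk");         // expected: true
    qDebug() << qIsEffectiveTLD(u"example.co.uk"); // expected: false
    // Wildcard rule "*.ck" with exception "!www.ck".
    qDebug() << qIsEffectiveTLD(u"foo.ck");        // expected: true  (covered by *.ck)
    qDebug() << qIsEffectiveTLD(u"www.ck");        // expected: false (exception rule)
    return 0;
}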

File diff suppressed because it is too large.


@@ -1,14 +1,15 @@
The file qurltlds_p.h is generated from the Public Suffix
List (see [1] and [2]), by the program residing at
qtbase/util/publicSuffix/ in the Qt source tree.
qtbase/src/3rdparty/libpsl/src/psl-make-dafsa in the Qt source tree.
That program generates a character array and an index array from the
list to provide fast lookups of elements within C++.
To regenerate the file, run the following command from the qtbase tree:
src/3rdparty/libpsl/src/psl-make-dafsa public_suffix_list.dat src/network/kernel/qurltlds_p.h
Those arrays in qurltlds_p.h are derived from the Public
Suffix List ([2]), which was originally provided by
Jo Hermans <jo.hermans@gmail.com>.
----
[1] list: http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1
[2] homepage: http://publicsuffix.org/
[1] list: https://publicsuffix.org/list/public_suffix_list.dat
[2] homepage: https://publicsuffix.org/


@@ -82,7 +82,8 @@
@qpa_headers = ( qr/^qplatform/, qr/^qwindowsystem/ );
my @internal_zlib_headers = ( "crc32.h", "deflate.h", "gzguts.h", "inffast.h", "inffixed.h", "inflate.h", "inftrees.h", "trees.h", "zutil.h" );
my @zlib_headers = ( "zconf.h", "zlib.h" );
@ignore_headers = ( @internal_zlib_headers );
my @internal_qtnetwork_headers = ( "qurltlds_p.h" );
@ignore_headers = ( @internal_zlib_headers, @internal_qtnetwork_headers );
@ignore_for_include_check = ( "qsystemdetection.h", "qcompilerdetection.h", "qprocessordetection.h", @zlib_headers);
@ignore_for_qt_begin_namespace_check = ( "qt_windows.h", @zlib_headers);
%inject_headers = (