Add a method to determine the encoding for encoded HTML data

This is a replacement for Qt::codecForHtml().

Change-Id: I31f03518fd9c70507cbd210a8bcf405b6a0106b1
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Lars Knoll 2020-04-30 11:43:21 +02:00
parent 13af1312f7
commit b8db123341
3 changed files with 138 additions and 0 deletions

View File

@ -44,6 +44,7 @@
#include "private/qsimd_p.h"
#include "private/qstringiterator_p.h"
#include "qbytearraymatcher.h"
#ifdef Q_OS_WIN
#include <qt_windows.h>
@ -1531,6 +1532,53 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForData(cons
return std::nullopt;
}
/*!
Tries to determine the encoding of the HTML in \a buf by looking at leading byte order marks or
a charset specifier in the HTML meta tag. If the optional is empty, the encoding specified is
not supported by QStringConverter. If no encoding is detected, the method returns Utf8.
*/
std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(const char *buf, qsizetype arraySize)
{
// determine charset
auto encoding = encodingForData(buf, arraySize);
if (encoding)
// trust the initial BOM
return encoding;
QByteArray header = QByteArray(buf, qMin(arraySize, qsizetype(1024))).toLower();
int pos = header.indexOf("meta ");
if (pos != -1) {
pos = header.indexOf("charset=", pos);
if (pos != -1) {
pos += qstrlen("charset=");
if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
++pos;
int pos2 = pos;
// The attribute can be closed with either """, "'", ">" or "/",
// none of which are valid charset characters.
while (++pos2 < header.size()) {
char ch = header.at(pos2);
if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
QByteArray name = header.mid(pos, pos2 - pos);
int colon = name.indexOf(':');
if (colon > 0)
name = name.left(colon);
name = name.simplified();
if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
name = QByteArrayLiteral("UTF-8");
if (!name.isEmpty())
return encodingForName(name);
}
}
}
}
return Utf8;
}
/*!
Returns the canonical name for encoding \a e.
*/
const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
{
return encodingInterfaces[int(e)].name;

View File

@ -168,6 +168,7 @@ public:
Q_CORE_EXPORT static std::optional<Encoding> encodingForName(const char *name);
Q_CORE_EXPORT static const char *nameForEncoding(Encoding e);
Q_CORE_EXPORT static std::optional<Encoding> encodingForData(const char *buf, qsizetype arraySize, char16_t expectedFirstCharacter = 0);
Q_CORE_EXPORT static std::optional<Encoding> encodingForHtml(const char *buf, qsizetype arraySize);
protected:
const Interface *iface;

View File

@ -68,6 +68,9 @@ private slots:
void encodingForData_data();
void encodingForData();
void encodingForHtml_data();
void encodingForHtml();
};
void tst_QStringConverter::constructByName()
@ -1722,6 +1725,92 @@ void tst_QStringConverter::encodingForData()
QCOMPARE(e, encoding);
}
void tst_QStringConverter::encodingForHtml_data()
{
QTest::addColumn<QByteArray>("html");
QTest::addColumn<std::optional<QStringConverter::Encoding>>("encoding");
QByteArray html = "<html><head></head><body>blah</body></html>";
QTest::newRow("no charset") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-15\" /></head></html>";
QTest::newRow("latin 15") << html << std::optional<QStringConverter::Encoding>();
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-1\" /></head></html>";
QTest::newRow("latin 1") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1);
html = "<!DOCTYPE html><html><head><meta charset=\"ISO_8859-1:1987\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>";
QTest::newRow("latin 1 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1);
html = "<!DOCTYPE html><html><head><meta charset=\"utf-8\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>";
QTest::newRow("UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8\"><title>Test</title></head>";
QTest::newRow("UTF-8 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8/></head></html>";
QTest::newRow("UTF-8, no quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset='UTF-8'/></head></html>";
QTest::newRow("UTF-8, single quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
html = "<!DOCTYPE html><html><head><meta charset=utf-8><title>Test</title></head>";
QTest::newRow("UTF-8, > terminator") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
html = "<!DOCTYPE html><html><head><meta charset= utf-8 ><title>Test</title></head>";
QTest::newRow("UTF-8, > terminator with spaces") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
// Test invalid charsets.
html = "<!DOCTYPE html><html><head><meta charset= utf/8 ><title>Test</title></head>";
QTest::newRow("utf/8") << html << std::optional<QStringConverter::Encoding>();
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=invalid-foo\" /></head></html>";
QTest::newRow("invalid charset, no default") << html << std::optional<QStringConverter::Encoding>();
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"";
html.prepend(QByteArray().fill(' ', 512 - html.size()));
QTest::newRow("invalid charset (large header)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8";
QTest::newRow("invalid charset (no closing double quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset='utf-8";
QTest::newRow("invalid charset (no closing single quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
html = "<!DOCTYPE html><html><head><meta charset=utf-8 foo=bar><title>Test</title></head>";
QTest::newRow("invalid (space terminator)") << html << std::optional<QStringConverter::Encoding>();
html = "<!DOCTYPE html><html><head><meta charset=\" utf' 8 /><title>Test</title></head>";
QTest::newRow("invalid charset, early terminator (')") << html << std::optional<QStringConverter::Encoding>();
const char src[] = { char(0xff), char(0xfe), char(0x7a), char(0x03), 0, 0 };
html = src;
QTest::newRow("greek text UTF-16LE") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE);
html = "<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"><span style=\"color: rgb(0, 0, 0); font-family: "
"'Galatia SIL'; font-size: 27px; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; "
"line-height: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: "
"auto; word-spacing: 0px; -webkit-text-size-adjust: auto; -webkit-text-stroke-width: 0px; display: inline !important; float: "
"none;\">&#x37b</span>\000";
QTest::newRow("greek text UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
html = "<!DOCTYPE html><html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=unicode\">"
"<head/><body><p>bla</p></body></html>"; // QTBUG-41998, ICU will return UTF-16.
QTest::newRow("legacy unicode UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
}
// Runs QStringConverter::encodingForHtml() on each row supplied by
// encodingForHtml_data() and compares the result with the expected encoding.
void tst_QStringConverter::encodingForHtml()
{
    QFETCH(QByteArray, html);
    QFETCH(std::optional<QStringConverter::Encoding>, encoding);

    const std::optional<QStringConverter::Encoding> detected =
            QStringConverter::encodingForHtml(html.constData(), html.size());
    QCOMPARE(detected, encoding);
}
class LoadAndConvert: public QRunnable
{
public: