Add a method to determine the encoding for encoded HTML data
This is a replacement for Qt::codecForHtml(). Change-Id: I31f03518fd9c70507cbd210a8bcf405b6a0106b1 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
13af1312f7
commit
b8db123341
@ -44,6 +44,7 @@
|
||||
|
||||
#include "private/qsimd_p.h"
|
||||
#include "private/qstringiterator_p.h"
|
||||
#include "qbytearraymatcher.h"
|
||||
|
||||
#ifdef Q_OS_WIN
|
||||
#include <qt_windows.h>
|
||||
@ -1531,6 +1532,53 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForData(cons
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
/*!
|
||||
Tries to determine the encoding of the HTML in \a buf by looking at leading byte order marks or
|
||||
a charset specifier in the HTML meta tag. If the optional is empty, the encoding specified is
|
||||
not supported by QStringConverter. If no encoding is detected, the method returns Utf8.
|
||||
*/
|
||||
std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(const char *buf, qsizetype arraySize)
|
||||
{
|
||||
// determine charset
|
||||
auto encoding = encodingForData(buf, arraySize);
|
||||
if (encoding)
|
||||
// trust the initial BOM
|
||||
return encoding;
|
||||
|
||||
QByteArray header = QByteArray(buf, qMin(arraySize, qsizetype(1024))).toLower();
|
||||
int pos = header.indexOf("meta ");
|
||||
if (pos != -1) {
|
||||
pos = header.indexOf("charset=", pos);
|
||||
if (pos != -1) {
|
||||
pos += qstrlen("charset=");
|
||||
if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
|
||||
++pos;
|
||||
|
||||
int pos2 = pos;
|
||||
// The attribute can be closed with either """, "'", ">" or "/",
|
||||
// none of which are valid charset characters.
|
||||
while (++pos2 < header.size()) {
|
||||
char ch = header.at(pos2);
|
||||
if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
|
||||
QByteArray name = header.mid(pos, pos2 - pos);
|
||||
int colon = name.indexOf(':');
|
||||
if (colon > 0)
|
||||
name = name.left(colon);
|
||||
name = name.simplified();
|
||||
if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
|
||||
name = QByteArrayLiteral("UTF-8");
|
||||
if (!name.isEmpty())
|
||||
return encodingForName(name);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return Utf8;
|
||||
}
|
||||
|
||||
/*!
|
||||
Returns the canonical name for encoding \a e.
|
||||
*/
|
||||
const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
|
||||
{
|
||||
return encodingInterfaces[int(e)].name;
|
||||
|
@ -168,6 +168,7 @@ public:
|
||||
Q_CORE_EXPORT static std::optional<Encoding> encodingForName(const char *name);
|
||||
Q_CORE_EXPORT static const char *nameForEncoding(Encoding e);
|
||||
Q_CORE_EXPORT static std::optional<Encoding> encodingForData(const char *buf, qsizetype arraySize, char16_t expectedFirstCharacter = 0);
|
||||
Q_CORE_EXPORT static std::optional<Encoding> encodingForHtml(const char *buf, qsizetype arraySize);
|
||||
|
||||
protected:
|
||||
const Interface *iface;
|
||||
|
@ -68,6 +68,9 @@ private slots:
|
||||
|
||||
void encodingForData_data();
|
||||
void encodingForData();
|
||||
|
||||
void encodingForHtml_data();
|
||||
void encodingForHtml();
|
||||
};
|
||||
|
||||
void tst_QStringConverter::constructByName()
|
||||
@ -1722,6 +1725,92 @@ void tst_QStringConverter::encodingForData()
|
||||
QCOMPARE(e, encoding);
|
||||
}
|
||||
|
||||
|
||||
void tst_QStringConverter::encodingForHtml_data()
|
||||
{
|
||||
QTest::addColumn<QByteArray>("html");
|
||||
QTest::addColumn<std::optional<QStringConverter::Encoding>>("encoding");
|
||||
|
||||
QByteArray html = "<html><head></head><body>blah</body></html>";
|
||||
QTest::newRow("no charset") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
|
||||
|
||||
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-15\" /></head></html>";
|
||||
QTest::newRow("latin 15") << html << std::optional<QStringConverter::Encoding>();
|
||||
|
||||
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-1\" /></head></html>";
|
||||
QTest::newRow("latin 1") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1);
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta charset=\"ISO_8859-1:1987\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>";
|
||||
QTest::newRow("latin 1 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1);
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta charset=\"utf-8\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>";
|
||||
QTest::newRow("UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8\"><title>Test</title></head>";
|
||||
QTest::newRow("UTF-8 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
|
||||
|
||||
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8/></head></html>";
|
||||
QTest::newRow("UTF-8, no quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
|
||||
|
||||
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset='UTF-8'/></head></html>";
|
||||
QTest::newRow("UTF-8, single quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta charset=utf-8><title>Test</title></head>";
|
||||
QTest::newRow("UTF-8, > terminator") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta charset= utf-8 ><title>Test</title></head>";
|
||||
QTest::newRow("UTF-8, > terminator with spaces") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
|
||||
|
||||
// Test invalid charsets.
|
||||
html = "<!DOCTYPE html><html><head><meta charset= utf/8 ><title>Test</title></head>";
|
||||
QTest::newRow("utf/8") << html << std::optional<QStringConverter::Encoding>();
|
||||
|
||||
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=invalid-foo\" /></head></html>";
|
||||
QTest::newRow("invalid charset, no default") << html << std::optional<QStringConverter::Encoding>();
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"";
|
||||
html.prepend(QByteArray().fill(' ', 512 - html.size()));
|
||||
QTest::newRow("invalid charset (large header)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
|
||||
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8";
|
||||
QTest::newRow("invalid charset (no closing double quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
|
||||
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset='utf-8";
|
||||
QTest::newRow("invalid charset (no closing single quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta charset=utf-8 foo=bar><title>Test</title></head>";
|
||||
QTest::newRow("invalid (space terminator)") << html << std::optional<QStringConverter::Encoding>();
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta charset=\" utf' 8 /><title>Test</title></head>";
|
||||
QTest::newRow("invalid charset, early terminator (')") << html << std::optional<QStringConverter::Encoding>();
|
||||
|
||||
const char src[] = { char(0xff), char(0xfe), char(0x7a), char(0x03), 0, 0 };
|
||||
html = src;
|
||||
QTest::newRow("greek text UTF-16LE") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE);
|
||||
|
||||
|
||||
html = "<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"><span style=\"color: rgb(0, 0, 0); font-family: "
|
||||
"'Galatia SIL'; font-size: 27px; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; "
|
||||
"line-height: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: "
|
||||
"auto; word-spacing: 0px; -webkit-text-size-adjust: auto; -webkit-text-stroke-width: 0px; display: inline !important; float: "
|
||||
"none;\">ͻ</span>\000";
|
||||
QTest::newRow("greek text UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=unicode\">"
|
||||
"<head/><body><p>bla</p></body></html>"; // QTBUG-41998, ICU will return UTF-16.
|
||||
QTest::newRow("legacy unicode UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
|
||||
}
|
||||
|
||||
void tst_QStringConverter::encodingForHtml()
|
||||
{
|
||||
QFETCH(QByteArray, html);
|
||||
QFETCH(std::optional<QStringConverter::Encoding>, encoding);
|
||||
|
||||
QCOMPARE(QStringConverter::encodingForHtml(html.constData(), html.size()), encoding);
|
||||
}
|
||||
|
||||
class LoadAndConvert: public QRunnable
|
||||
{
|
||||
public:
|
||||
|
Loading…
Reference in New Issue
Block a user