From b8db1233411893aaecb0b6a61b02b0ef3c1520e5 Mon Sep 17 00:00:00 2001 From: Lars Knoll Date: Thu, 30 Apr 2020 11:43:21 +0200 Subject: [PATCH] Add a method to determine the encoding for encoded HTML data This is a replacement for Qt::codecForHtml(). Change-Id: I31f03518fd9c70507cbd210a8bcf405b6a0106b1 Reviewed-by: Thiago Macieira --- src/corelib/text/qstringconverter.cpp | 48 ++++++++++ src/corelib/text/qstringconverter.h | 1 + .../qstringconverter/tst_qstringconverter.cpp | 89 +++++++++++++++++++ 3 files changed, 138 insertions(+) diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index 9a3d92dbaa..bc81b06c0e 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -44,6 +44,7 @@ #include "private/qsimd_p.h" #include "private/qstringiterator_p.h" +#include "qbytearraymatcher.h" #ifdef Q_OS_WIN #include @@ -1531,6 +1532,53 @@ std::optional QStringConverter::encodingForData(cons return std::nullopt; } +/*! + Tries to determine the encoding of the HTML in \a buf (\a arraySize bytes long) by looking at a leading byte order mark or + a charset specifier in an HTML meta tag within the first 1024 bytes. If the returned optional is empty, the encoding specified is + not supported by QStringConverter. If no encoding is detected, the method returns Utf8. + */ +std::optional QStringConverter::encodingForHtml(const char *buf, qsizetype arraySize) +{ + // determine charset: an encoding detected by encodingForData() takes precedence over any meta tag + auto encoding = encodingForData(buf, arraySize); + if (encoding) + // trust the initial BOM + return encoding; + + QByteArray header = QByteArray(buf, qMin(arraySize, qsizetype(1024))).toLower(); + int pos = header.indexOf("meta "); + if (pos != -1) { + pos = header.indexOf("charset=", pos); + if (pos != -1) { + pos += qstrlen("charset="); + if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\'')) + ++pos; + + int pos2 = pos; + // The attribute can be closed with either """, "'", ">" or "/", + // none of which are valid charset characters. 
+ while (++pos2 < header.size()) { + char ch = header.at(pos2); + if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') { + QByteArray name = header.mid(pos, pos2 - pos); + int colon = name.indexOf(':'); + if (colon > 0) + name = name.left(colon); + name = name.simplified(); + if (name == "unicode") // QTBUG-41998, ICU will return UTF-16. + name = QByteArrayLiteral("UTF-8"); + if (!name.isEmpty()) + return encodingForName(name); + } + } + } + } + return Utf8; +} + +/*! + Returns the canonical name for encoding \a e. +*/ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e) { return encodingInterfaces[int(e)].name; diff --git a/src/corelib/text/qstringconverter.h b/src/corelib/text/qstringconverter.h index 6269ace4ac..7443173df6 100644 --- a/src/corelib/text/qstringconverter.h +++ b/src/corelib/text/qstringconverter.h @@ -168,6 +168,7 @@ public: Q_CORE_EXPORT static std::optional encodingForName(const char *name); Q_CORE_EXPORT static const char *nameForEncoding(Encoding e); Q_CORE_EXPORT static std::optional encodingForData(const char *buf, qsizetype arraySize, char16_t expectedFirstCharacter = 0); + Q_CORE_EXPORT static std::optional encodingForHtml(const char *buf, qsizetype arraySize); protected: const Interface *iface; diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp index 3f4bbb413f..78595bc17b 100644 --- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp +++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp @@ -68,6 +68,9 @@ private slots: void encodingForData_data(); void encodingForData(); + + void encodingForHtml_data(); + void encodingForHtml(); }; void tst_QStringConverter::constructByName() @@ -1722,6 +1725,92 @@ void tst_QStringConverter::encodingForData() QCOMPARE(e, encoding); } + +void tst_QStringConverter::encodingForHtml_data() +{ + QTest::addColumn("html"); + QTest::addColumn>("encoding"); + + 
QByteArray html = "blah"; + QTest::newRow("no charset") << html << std::optional(QStringConverter::Utf8); + + html = ""; + QTest::newRow("latin 15") << html << std::optional(); + + html = ""; + QTest::newRow("latin 1") << html << std::optional(QStringConverter::Latin1); + + html = "Test"; + QTest::newRow("latin 1 (#2)") << html << std::optional(QStringConverter::Latin1); + + html = "Test"; + QTest::newRow("UTF-8") << html << std::optional(QStringConverter::Utf8); + + html = "Test"; + QTest::newRow("UTF-8 (#2)") << html << std::optional(QStringConverter::Utf8); + + html = ""; + QTest::newRow("UTF-8, no quotes") << html << std::optional(QStringConverter::Utf8); + + html = ""; + QTest::newRow("UTF-8, single quotes") << html << std::optional(QStringConverter::Utf8); + + html = "Test"; + QTest::newRow("UTF-8, > terminator") << html << std::optional(QStringConverter::Utf8); + + html = "Test"; + QTest::newRow("UTF-8, > terminator with spaces") << html << std::optional(QStringConverter::Utf8); + + // Test invalid charsets. + html = "Test"; + QTest::newRow("utf/8") << html << std::optional(); + + html = ""; + QTest::newRow("invalid charset, no default") << html << std::optional(); + + html = "(QStringConverter::Utf8); + + + html = "(QStringConverter::Utf8); + + + html = "Test"; + QTest::newRow("invalid charset, early terminator (')") << html << std::optional(); + + const char src[] = { char(0xff), char(0xfe), char(0x7a), char(0x03), 0, 0 }; + html = src; + QTest::newRow("greek text UTF-16LE") << html << std::optional(QStringConverter::Utf16LE); + + + html = "ͻ\000"; + QTest::newRow("greek text UTF-8") << html << std::optional(QStringConverter::Utf8); + + html = "" + "

bla

"; // QTBUG-41998, ICU will return UTF-16. + QTest::newRow("legacy unicode UTF-8") << html << std::optional(QStringConverter::Utf8); +} + +void tst_QStringConverter::encodingForHtml() +{ + QFETCH(QByteArray, html); + QFETCH(std::optional, encoding); + + QCOMPARE(QStringConverter::encodingForHtml(html.constData(), html.size()), encoding); +} + class LoadAndConvert: public QRunnable { public: