QDom: Stop treating non-BMP characters as invalid

According to https://www.w3.org/TR/REC-xml/#NT-Char, Unicode characters
within the range [#x10000-#x10FFFF] are considered valid, so fix the
check for valid characters accordingly. This requires changing the loop
over the input QString to iterate over code points (instead of
code units).

Fixes: QTBUG-104362
Pick-to: 6.4 6.3 6.2 5.15
Change-Id: I7dcf5cad05265a54882807a50522d28b647e06ee
Reviewed-by: Marc Mutz <marc.mutz@qt.io>
This commit is contained in:
Sona Kurazyan 2022-06-17 14:43:17 +02:00 committed by Marc Mutz
parent 3d73aa660b
commit da0d7f61c8
4 changed files with 37 additions and 11 deletions

View File

@ -235,13 +235,16 @@ bool QXmlUtils::isLetter(const QChar c)
\sa {http://www.w3.org/TR/REC-xml/#NT-Char},
{Extensible Markup Language (XML) 1.0 (Fourth Edition), [2] Char}
*/
// NOTE(review): this listing appears to show the previous QChar-based lines
// followed by their char32_t replacements (diff markers are not visible) — confirm
// against the actual file before editing.
bool QXmlUtils::isChar(const QChar c)
bool QXmlUtils::isChar(const char32_t c)
{
return (c.unicode() >= 0x0020 && c.unicode() <= 0xD7FF)
|| c.unicode() == 0x0009
|| c.unicode() == 0x000A
|| c.unicode() == 0x000D
|| (c.unicode() >= 0xE000 && c.unicode() <= 0xFFFD);
// The valid range is defined by https://www.w3.org/TR/REC-xml/#NT-Char as follows:
// Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
//
// The gaps between the ranges are deliberate: no alternative below matches the
// surrogate values #xD800-#xDFFF, nor #xFFFE and #xFFFF, nor any control
// character under #x20 other than tab, line feed and carriage return.
// Because c is a full code point (char32_t), the last alternative accepts the
// supplementary planes #x10000-#x10FFFF.
return (c >= 0x0020 && c <= 0xD7FF)
|| c == 0x0009
|| c == 0x000A
|| c == 0x000D
|| (c >= 0xE000 && c <= 0xFFFD)
|| (c >= 0x10000 && c <= 0x10FFFF);
}
/*!

View File

@ -33,7 +33,7 @@ class Q_CORE_EXPORT QXmlUtils
{
public:
static bool isEncName(QStringView encName);
static bool isChar(const QChar c);
static bool isChar(const char32_t c);
static bool isNameChar(const QChar c);
static bool isLetter(const QChar c);
static bool isNCName(QStringView ncName);

View File

@ -22,7 +22,7 @@
#include <qdebug.h>
#include <qxmlstream.h>
#include <private/qduplicatetracker_p.h>
#include <private/qstringiterator_p.h>
#include <stdio.h>
#include <limits>
@ -156,10 +156,11 @@ static QString fixedCharData(const QString &data, bool *ok)
}
QString result;
for (int i = 0; i < data.size(); ++i) {
QChar c = data.at(i);
QStringIterator it(data);
while (it.hasNext()) {
const char32_t c = it.next(QChar::Null);
if (QXmlUtils::isChar(c)) {
result.append(c);
result.append(QChar::fromUcs4(c));
} else if (QDomImplementationPrivate::invalidDataPolicy == QDomImplementation::ReturnNullNode) {
*ok = false;
return QString();

View File

@ -9,6 +9,7 @@
#include <QFile>
#include <QList>
#include <QRegularExpression>
#include <QScopeGuard>
#include <QTextStream>
#include <QTest>
#include <QtXml>
@ -62,6 +63,7 @@ private slots:
void invalidQualifiedName();
void invalidCharData_data();
void invalidCharData();
void nonBMPCharacters();
void roundTripAttributes() const;
void roundTripCDATA() const;
@ -1342,6 +1344,10 @@ void tst_QDom::invalidCharData_data()
QTest::newRow( "f<o&o" ) << QString("f<o&o") << true << true << true << QString("f<o&o");
QTest::newRow( "empty" ) << QString() << true << true << true << QString();
QTest::newRow("f\\x07o\\x02")<< QString("f\x07o\x02")<< true << true << false << QString("fo");
const QChar pair[2] = { QChar(0xdc00), QChar(0xe000) };
QString invalid(pair, 2);
QTest::newRow("\\xdc00\\xe000") << invalid << true << true << false << invalid.last(1);
}
void tst_QDom::invalidCharData()
@ -1385,6 +1391,22 @@ void tst_QDom::invalidCharData()
}
}
void tst_QDom::nonBMPCharacters()
{
const auto invalidDataPolicy = QDomImplementation::invalidDataPolicy();
auto resetInvalidDataPolicy = qScopeGuard(
[invalidDataPolicy] { QDomImplementation::setInvalidDataPolicy(invalidDataPolicy); });
QDomImplementation::setInvalidDataPolicy(QDomImplementation::DropInvalidChars);
const QString input = u"<text>Supplementary Plane: 𝄞 😂 🀄 🀶 🃪 🃋</text>"_qs;
QString errorMsg;
QDomDocument doc;
doc.setContent(input, &errorMsg);
QVERIFY(errorMsg.isEmpty());
QCOMPARE(doc.toString(-1), input);
}
void tst_QDom::roundTripAttributes() const
{
/* Create an attribute via the QDom API with weird whitespace content. */