Introduce QString(View)::isValidUtf16

QString(View)s can be built or manipulated in ways that make them
contain/refer to improperly encoded UTF-16 data. The problem is that
we don't have public APIs to check whether a string contains
valid UTF-16. This knowledge is precious if the string is to be fed into
algorithms, regular expressions, etc. that expect validated input
(e.g. QRegularExpression can be faster if it can assume valid UTF-16,
otherwise it has to employ extra checks).

Add a function that does the validation.

[ChangeLog][QtCore][QStringView] Added QStringView::isValidUtf16.

[ChangeLog][QtCore][QString] Added QString::isValidUtf16.

Change-Id: Idd699183f6ec08013046c76c6a5a7c524b6c6fbc
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Giuseppe D'Angelo 2019-12-18 20:23:11 +01:00
parent 556712f511
commit a2ddd96ac8
6 changed files with 97 additions and 0 deletions

View File

@ -591,6 +591,20 @@ bool QtPrivate::isLatin1(QStringView s) noexcept
return true;
}
/*
    Reports whether \a s can be decoded from start to end as UTF-16.

    The view is walked one code point at a time with QStringIterator; the
    walk stops with \c false as soon as the iterator reports the sentinel
    instead of a decoded code point. An empty view yields \c true because
    the loop body never runs.
*/
bool QtPrivate::isValidUtf16(QStringView s) noexcept
{
    // UINT_MAX is far above the last Unicode code point (0x10FFFF), so it
    // cannot be confused with anything successfully decoded from the input.
    Q_CONSTEXPR uint Sentinel = UINT_MAX;

    for (QStringIterator it(s); it.hasNext(); ) {
        // next() is asked to hand back Sentinel for input it cannot decode.
        if (it.next(Sentinel) == Sentinel)
            return false;
    }

    return true;
}
// conversion between Latin 1 and UTF-16
void qt_from_latin1(ushort *dst, const char *str, size_t size) noexcept
{
@ -9046,6 +9060,21 @@ bool QString::isRightToLeft() const
return QtPrivate::isRightToLeft(QStringView(*this));
}
/*!
\fn bool QString::isValidUtf16() const noexcept
\since 5.15
Returns \c true if the string contains valid UTF-16 encoded data,
or \c false otherwise.
Note that this function does not perform any special validation of the
data; it merely checks if it can be successfully decoded from UTF-16.
The data is assumed to be in host byte order; the presence of a BOM
is meaningless.
\sa QStringView::isValidUtf16()
*/
/*! \fn QChar *QString::data()
Returns a pointer to the data stored in the QString. The pointer

View File

@ -919,6 +919,8 @@ public:
bool isSimpleText() const;
bool isRightToLeft() const;
// Validity check for the whole string; the work is delegated to the
// QStringView overload so both classes share one implementation.
Q_REQUIRED_RESULT bool isValidUtf16() const noexcept
{
    const QStringView view(*this);
    return view.isValidUtf16();
}
QString(int size, Qt::Initialization);
Q_DECL_CONSTEXPR inline QString(QStringDataPtr dd) : d(dd.ptr) {}

View File

@ -99,6 +99,7 @@ Q_REQUIRED_RESULT Q_CORE_EXPORT Q_DECL_PURE_FUNCTION bool isAscii(QLatin1String
Q_REQUIRED_RESULT Q_CORE_EXPORT Q_DECL_PURE_FUNCTION bool isAscii(QStringView s) noexcept;
Q_REQUIRED_RESULT Q_DECL_CONSTEXPR inline bool isLatin1(QLatin1String s) noexcept;
Q_REQUIRED_RESULT Q_CORE_EXPORT Q_DECL_PURE_FUNCTION bool isLatin1(QStringView s) noexcept;
// Returns true if every code unit in \a s can be decoded as UTF-16, and false
// otherwise (for instance when a surrogate is not part of a proper pair).
// The data is taken to be in host byte order; a BOM gets no special treatment.
Q_REQUIRED_RESULT Q_CORE_EXPORT Q_DECL_PURE_FUNCTION bool isValidUtf16(QStringView s) noexcept;
} // namespace QtPrivate

View File

@ -864,6 +864,21 @@ QT_BEGIN_NAMESPACE
\sa QString::isRightToLeft()
*/
/*!
\fn bool QStringView::isValidUtf16() const
\since 5.15
Returns \c true if the string contains valid UTF-16 encoded data,
or \c false otherwise.
Note that this function does not perform any special validation of the
data; it merely checks if it can be successfully decoded from UTF-16.
The data is assumed to be in host byte order; the presence of a BOM
is meaningless.
\sa QString::isValidUtf16()
*/
/*!
\fn QStringView::toWCharArray(wchar_t *array) const
\since 5.14

View File

@ -294,6 +294,8 @@ public:
Q_REQUIRED_RESULT bool isRightToLeft() const noexcept
{ return QtPrivate::isRightToLeft(*this); }
// Validity check for the viewed data; forwards to the out-of-line
// QtPrivate::isValidUtf16() implementation.
Q_REQUIRED_RESULT bool isValidUtf16() const noexcept
{
    return QtPrivate::isValidUtf16(*this);
}
Q_REQUIRED_RESULT inline int toWCharArray(wchar_t *array) const; // defined in qstring.h

View File

@ -596,6 +596,8 @@ private slots:
void assignQChar();
void isRightToLeft_data();
void isRightToLeft();
void isValidUtf16_data();
void isValidUtf16();
void unicodeStrings();
};
@ -7025,6 +7027,52 @@ void tst_QString::isRightToLeft()
QCOMPARE(unicode.isRightToLeft(), rtl);
}
void tst_QString::isValidUtf16_data()
{
QTest::addColumn<QString>("string");
QTest::addColumn<bool>("valid");
int row = 0;
QTest::addRow("valid-%02d", row++) << QString() << true;
QTest::addRow("valid-%02d", row++) << QString("") << true;
QTest::addRow("valid-%02d", row++) << QString("abc def") << true;
QTest::addRow("valid-%02d", row++) << QString("àbç") << true;
QTest::addRow("valid-%02d", row++) << QString("ßẞ") << true;
QTest::addRow("valid-%02d", row++) << QString("𝐀𝐁𝐂abc𝐃𝐄𝐅def") << true;
QTest::addRow("valid-%02d", row++) << QString("abc𝐀𝐁𝐂def𝐃𝐄𝐅") << true;
QTest::addRow("valid-%02d", row++) << (QString("abc") + QChar(0x0000) + QString("def")) << true;
QTest::addRow("valid-%02d", row++) << (QString("abc") + QChar(0xFFFF) + QString("def")) << true;
// check that BOM presence doesn't make any difference
QTest::addRow("valid-%02d", row++) << (QString() + QChar(0xFEFF) + QString("abc𝐀𝐁𝐂def𝐃𝐄𝐅")) << true;
QTest::addRow("valid-%02d", row++) << (QString() + QChar(0xFFFE) + QString("abc𝐀𝐁𝐂def𝐃𝐄𝐅")) << true;
row = 0;
QTest::addRow("stray-high-%02d", row++) << (QString() + QChar(0xD800)) << false;
QTest::addRow("stray-high-%02d", row++) << (QString() + QString("abc") + QChar(0xD800)) << false;
QTest::addRow("stray-high-%02d", row++) << (QString() + QChar(0xD800) + QString("def")) << false;
QTest::addRow("stray-high-%02d", row++) << (QString() + QString("abc") + QChar(0xD800) + QString("def")) << false;
QTest::addRow("stray-high-%02d", row++) << (QString() + QChar(0xD800) + QChar(0xD800)) << false;
QTest::addRow("stray-high-%02d", row++) << (QString() + QString("abc") + QChar(0xD800) + QChar(0xD800)) << false;
QTest::addRow("stray-high-%02d", row++) << (QString() + QChar(0xD800) + QChar(0xD800) + QString("def")) << false;
QTest::addRow("stray-high-%02d", row++) << (QString() + QString("abc") + QChar(0xD800) + QChar(0xD800) + QString("def")) << false;
row = 0;
QTest::addRow("stray-low-%02d", row++) << (QString() + QChar(0xDC00)) << false;
QTest::addRow("stray-low-%02d", row++) << (QString() + QString("abc") + QChar(0xDC00)) << false;
QTest::addRow("stray-low-%02d", row++) << (QString() + QChar(0xDC00) + QString("def")) << false;
QTest::addRow("stray-low-%02d", row++) << (QString() + QString("abc") + QChar(0xDC00) + QString("def")) << false;
QTest::addRow("stray-low-%02d", row++) << (QString() + QChar(0xDC00) + QChar(0xDC00)) << false;
QTest::addRow("stray-low-%02d", row++) << (QString() + QString("abc") + QChar(0xDC00) + QChar(0xDC00)) << false;
QTest::addRow("stray-low-%02d", row++) << (QString() + QChar(0xDC00) + QChar(0xDC00) + QString("def")) << false;
QTest::addRow("stray-low-%02d", row++) << (QString() + QString("abc") + QChar(0xDC00) + QChar(0xDC00) + QString("def")) << false;
}
void tst_QString::isValidUtf16()
{
QFETCH(QString, string);
QTEST(string.isValidUtf16(), "valid");
}
QTEST_APPLESS_MAIN(tst_QString)
#include "tst_qstring.moc"