Fix stateful handling of invalid UTF-8 straddling buffer borders
When a UTF-8 sequence is too short, QUtf8Functions::fromUtf8 returns
EndOfString. If the decoder is stateful, we must save the state and then
restart it when more data is supplied.
The new stateful decoder (8dd47e34b9)
mishandled the Error case by advancing the src pointer by a negative
number, thus causing a buffer overflow (the issue reported in the task).
And it also did not handle the len == 0 case properly, though neither
did the older decoder.
Task-number: QTBUG-38939
Change-Id: Ie03d7c55a04e51ee838ccdb3a01e5b989d8e67aa
Reviewed-by: Kai Koehne <kai.koehne@digia.com>
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
This commit is contained in:
parent
f2619db300
commit
b23e72a772
@ -237,7 +237,20 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
|
|||||||
|
|
||||||
QString QUtf8::convertToUnicode(const char *chars, int len)
|
QString QUtf8::convertToUnicode(const char *chars, int len)
|
||||||
{
|
{
|
||||||
QString result(len + 1, Qt::Uninitialized); // worst case
|
// UTF-8 to UTF-16 always needs the exact same number of words or less:
|
||||||
|
// UTF-8 UTF-16
|
||||||
|
// 1 byte 1 word
|
||||||
|
// 2 bytes 1 word
|
||||||
|
// 3 bytes 1 word
|
||||||
|
// 4 bytes 2 words (one surrogate pair)
|
||||||
|
// That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
|
||||||
|
// half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
|
||||||
|
// non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
|
||||||
|
//
|
||||||
|
// The table holds for invalid sequences too: we'll insert one replacement char
|
||||||
|
// per invalid byte.
|
||||||
|
QString result(len, Qt::Uninitialized);
|
||||||
|
|
||||||
ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
|
ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
|
||||||
const uchar *src = reinterpret_cast<const uchar *>(chars);
|
const uchar *src = reinterpret_cast<const uchar *>(chars);
|
||||||
const uchar *end = src + len;
|
const uchar *end = src + len;
|
||||||
@ -282,7 +295,18 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
|
|||||||
int res;
|
int res;
|
||||||
uchar ch = 0;
|
uchar ch = 0;
|
||||||
|
|
||||||
QString result(need + len + 1, Qt::Uninitialized); // worst case
|
// See above for buffer requirements for stateless decoding. However, that
|
||||||
|
// fails if the state is not empty. The following situations can add to the
|
||||||
|
// requirements:
|
||||||
|
// state contains chars starts with requirement
|
||||||
|
// 1 of 2 bytes valid continuation 0
|
||||||
|
// 2 of 3 bytes same 0
|
||||||
|
// 3 bytes of 4 same +1 (need to insert surrogate pair)
|
||||||
|
// 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
|
||||||
|
// 2 of 3 bytes same +1 (same)
|
||||||
|
// 3 of 4 bytes same +1 (same)
|
||||||
|
QString result(need + len + 1, Qt::Uninitialized);
|
||||||
|
|
||||||
ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
|
ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
|
||||||
const uchar *src = reinterpret_cast<const uchar *>(chars);
|
const uchar *src = reinterpret_cast<const uchar *>(chars);
|
||||||
const uchar *end = src + len;
|
const uchar *end = src + len;
|
||||||
@ -305,15 +329,17 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
|
|||||||
const uchar *begin = &remainingCharsData[1];
|
const uchar *begin = &remainingCharsData[1];
|
||||||
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
|
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
|
||||||
static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
|
static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
|
||||||
if (res == QUtf8BaseTraits::EndOfString) {
|
if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) {
|
||||||
|
// special case for len == 0:
|
||||||
|
// if we were supplied an empty string, terminate the previous, unfinished sequence with error
|
||||||
|
++invalid;
|
||||||
|
*dst++ = replacement;
|
||||||
|
} else if (res == QUtf8BaseTraits::EndOfString) {
|
||||||
// if we got EndOfString again, then there were too few bytes in src;
|
// if we got EndOfString again, then there were too few bytes in src;
|
||||||
// copy to our state and return
|
// copy to our state and return
|
||||||
state->remainingChars = remainingCharsCount + newCharsToCopy;
|
state->remainingChars = remainingCharsCount + newCharsToCopy;
|
||||||
memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
|
memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
|
||||||
return QString();
|
return QString();
|
||||||
} else if (res == QUtf8BaseTraits::Error) {
|
|
||||||
++invalid;
|
|
||||||
*dst++ = replacement;
|
|
||||||
} else if (!headerdone && res >= 0) {
|
} else if (!headerdone && res >= 0) {
|
||||||
// eat the UTF-8 BOM
|
// eat the UTF-8 BOM
|
||||||
headerdone = true;
|
headerdone = true;
|
||||||
@ -322,8 +348,10 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
|
|||||||
}
|
}
|
||||||
|
|
||||||
// adjust src now that we have maybe consumed a few chars
|
// adjust src now that we have maybe consumed a few chars
|
||||||
//Q_ASSERT(res > remainingCharsCount)
|
if (res >= 0) {
|
||||||
src += res - remainingCharsCount;
|
Q_ASSERT(res > remainingCharsCount);
|
||||||
|
src += res - remainingCharsCount;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -80,6 +80,9 @@ private slots:
|
|||||||
void utf8bom_data();
|
void utf8bom_data();
|
||||||
void utf8bom();
|
void utf8bom();
|
||||||
|
|
||||||
|
void utf8stateful_data();
|
||||||
|
void utf8stateful();
|
||||||
|
|
||||||
void utfHeaders_data();
|
void utfHeaders_data();
|
||||||
void utfHeaders();
|
void utfHeaders();
|
||||||
|
|
||||||
@ -1611,6 +1614,99 @@ void tst_QTextCodec::utf8bom()
|
|||||||
QCOMPARE(codec->toUnicode(data.constData(), data.length(), &state), result);
|
QCOMPARE(codec->toUnicode(data.constData(), data.length(), &state), result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void tst_QTextCodec::utf8stateful_data()
|
||||||
|
{
|
||||||
|
QTest::addColumn<QByteArray>("buffer1");
|
||||||
|
QTest::addColumn<QByteArray>("buffer2");
|
||||||
|
QTest::addColumn<QString>("result"); // null QString indicates decoder error
|
||||||
|
|
||||||
|
// valid buffer continuations
|
||||||
|
QTest::newRow("1of2+valid") << QByteArray("\xc2") << QByteArray("\xa0") << "\xc2\xa0";
|
||||||
|
QTest::newRow("1of3+valid") << QByteArray("\xe0") << QByteArray("\xa0\x80") << "\xe0\xa0\x80";
|
||||||
|
QTest::newRow("2of3+valid") << QByteArray("\xe0\xa0") << QByteArray("\x80") << "\xe0\xa0\x80";
|
||||||
|
QTest::newRow("1of4+valid") << QByteArray("\360") << QByteArray("\220\210\203") << "\360\220\210\203";
|
||||||
|
QTest::newRow("2of4+valid") << QByteArray("\360\220") << QByteArray("\210\203") << "\360\220\210\203";
|
||||||
|
QTest::newRow("3of4+valid") << QByteArray("\360\220\210") << QByteArray("\203") << "\360\220\210\203";
|
||||||
|
QTest::newRow("1ofBom+valid") << QByteArray("\xef") << QByteArray("\xbb\xbf") << "";
|
||||||
|
QTest::newRow("2ofBom+valid") << QByteArray("\xef\xbb") << QByteArray("\xbf") << "";
|
||||||
|
|
||||||
|
// invalid continuation
|
||||||
|
QTest::newRow("1of2+invalid") << QByteArray("\xc2") << QByteArray("a") << QString();
|
||||||
|
QTest::newRow("1of3+invalid") << QByteArray("\xe0") << QByteArray("a") << QString();
|
||||||
|
QTest::newRow("2of3+invalid") << QByteArray("\xe0\xa0") << QByteArray("a") << QString();
|
||||||
|
QTest::newRow("1of4+invalid") << QByteArray("\360") << QByteArray("a") << QString();
|
||||||
|
QTest::newRow("2of4+invalid") << QByteArray("\360\220") << QByteArray("a") << QString();
|
||||||
|
QTest::newRow("3of4+invalid") << QByteArray("\360\220\210") << QByteArray("a") << QString();
|
||||||
|
|
||||||
|
// invalid: sequence too short (the empty second buffer causes a state reset)
|
||||||
|
QTest::newRow("1of2+empty") << QByteArray("\xc2") << QByteArray() << QString();
|
||||||
|
QTest::newRow("1of3+empty") << QByteArray("\xe0") << QByteArray() << QString();
|
||||||
|
QTest::newRow("2of3+empty") << QByteArray("\xe0\xa0") << QByteArray() << QString();
|
||||||
|
QTest::newRow("1of4+empty") << QByteArray("\360") << QByteArray() << QString();
|
||||||
|
QTest::newRow("2of4+empty") << QByteArray("\360\220") << QByteArray() << QString();
|
||||||
|
QTest::newRow("3of4+empty") << QByteArray("\360\220\210") << QByteArray() << QString();
|
||||||
|
|
||||||
|
// overlong sequence:
|
||||||
|
QTest::newRow("overlong-1of2") << QByteArray("\xc1") << QByteArray("\x81") << QString();
|
||||||
|
QTest::newRow("overlong-1of3") << QByteArray("\xe0") << QByteArray("\x81\x81") << QString();
|
||||||
|
QTest::newRow("overlong-2of3") << QByteArray("\xe0\x81") << QByteArray("\x81") << QString();
|
||||||
|
QTest::newRow("overlong-1of4") << QByteArray("\xf0") << QByteArray("\x80\x81\x81") << QString();
|
||||||
|
QTest::newRow("overlong-2of4") << QByteArray("\xf0\x80") << QByteArray("\x81\x81") << QString();
|
||||||
|
QTest::newRow("overlong-3of4") << QByteArray("\xf0\x80\x81") << QByteArray("\x81") << QString();
|
||||||
|
|
||||||
|
// out of range:
|
||||||
|
// leading byte 0xF4 can produce codepoints above U+10FFFF, which aren't valid
|
||||||
|
QTest::newRow("outofrange1-1of4") << QByteArray("\xf4") << QByteArray("\x90\x80\x80") << QString();
|
||||||
|
QTest::newRow("outofrange1-2of4") << QByteArray("\xf4\x90") << QByteArray("\x80\x80") << QString();
|
||||||
|
QTest::newRow("outofrange1-3of4") << QByteArray("\xf4\x90\x80") << QByteArray("\x80") << QString();
|
||||||
|
QTest::newRow("outofrange2-1of4") << QByteArray("\xf5") << QByteArray("\x90\x80\x80") << QString();
|
||||||
|
QTest::newRow("outofrange2-2of4") << QByteArray("\xf5\x90") << QByteArray("\x80\x80") << QString();
|
||||||
|
QTest::newRow("outofrange2-3of4") << QByteArray("\xf5\x90\x80") << QByteArray("\x80") << QString();
|
||||||
|
QTest::newRow("outofrange-1of5") << QByteArray("\xf8") << QByteArray("\x88\x80\x80\x80") << QString();
|
||||||
|
QTest::newRow("outofrange-2of5") << QByteArray("\xf8\x88") << QByteArray("\x80\x80\x80") << QString();
|
||||||
|
QTest::newRow("outofrange-3of5") << QByteArray("\xf8\x88\x80") << QByteArray("\x80\x80") << QString();
|
||||||
|
QTest::newRow("outofrange-4of5") << QByteArray("\xf8\x88\x80\x80") << QByteArray("\x80") << QString();
|
||||||
|
QTest::newRow("outofrange-1of6") << QByteArray("\xfc") << QByteArray("\x84\x80\x80\x80\x80") << QString();
|
||||||
|
QTest::newRow("outofrange-2of6") << QByteArray("\xfc\x84") << QByteArray("\x80\x80\x80\x80") << QString();
|
||||||
|
QTest::newRow("outofrange-3of6") << QByteArray("\xfc\x84\x80") << QByteArray("\x80\x80\x80") << QString();
|
||||||
|
QTest::newRow("outofrange-4of6") << QByteArray("\xfc\x84\x80\x80") << QByteArray("\x80\x80") << QString();
|
||||||
|
QTest::newRow("outofrange-5of6") << QByteArray("\xfc\x84\x80\x80\x80") << QByteArray("\x80") << QString();
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_QTextCodec::utf8stateful()
|
||||||
|
{
|
||||||
|
QFETCH(QByteArray, buffer1);
|
||||||
|
QFETCH(QByteArray, buffer2);
|
||||||
|
QFETCH(QString, result);
|
||||||
|
|
||||||
|
QTextCodec *utf8codec = QTextCodec::codecForName("utf-8");
|
||||||
|
QVERIFY(utf8codec);
|
||||||
|
|
||||||
|
QTextCodec::ConverterState state;
|
||||||
|
memset(&state, 0, sizeof state);
|
||||||
|
|
||||||
|
QString decoded1 = utf8codec->toUnicode(buffer1, buffer1.size(), &state);
|
||||||
|
if (result.isNull()) {
|
||||||
|
// the decoder may have found an early error (invalidChars > 0):
|
||||||
|
// if it has, remainingChars == 0;
|
||||||
|
// if it hasn't, then it must have a state
|
||||||
|
QVERIFY2((state.remainingChars == 0) != (state.invalidChars == 0),
|
||||||
|
"remainingChars = " + QByteArray::number(state.remainingChars) +
|
||||||
|
"; invalidChars = " + QByteArray::number(state.invalidChars));
|
||||||
|
} else {
|
||||||
|
QVERIFY(state.remainingChars > 0);
|
||||||
|
QCOMPARE(state.invalidChars, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
QString decoded2 = utf8codec->toUnicode(buffer2, buffer2.size(), &state);
|
||||||
|
QCOMPARE(state.remainingChars, 0);
|
||||||
|
if (result.isNull()) {
|
||||||
|
QVERIFY(state.invalidChars > 0);
|
||||||
|
} else {
|
||||||
|
QCOMPARE(decoded1 + decoded2, result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void tst_QTextCodec::utfHeaders_data()
|
void tst_QTextCodec::utfHeaders_data()
|
||||||
{
|
{
|
||||||
QTest::addColumn<QByteArray>("codecName");
|
QTest::addColumn<QByteArray>("codecName");
|
||||||
|
Loading…
Reference in New Issue
Block a user