Allow non-character codes in utf8 strings

Changed the handling of non-character codes in the UTF-8 codec.
Non-character codes are now accepted in QStrings, QUrls and QJson strings.
Unit tests were adapted accordingly.
For more info about non-character codes,
see: http://www.unicode.org/versions/corrigendum9.html

[ChangeLog][QtCore][QUtf8]
UTF-8 now accepts non-character Unicode code points; these are no longer
replaced by the replacement character.

[ChangeLog][QtCore][QUrl]
QUrl now fully accepts non-character Unicode code points: they are
percent-encoded in fully-encoded form and can also be pretty-decoded.

[ChangeLog][QtCore][QJson]
The Writer and the Parser now fully accept non-character Unicode code points.

Change-Id: I77cf4f0e6210741eac8082912a0b6118eced4f77
Task-number: QTBUG-33229
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Kurt Pattyn 2013-10-06 11:40:47 +02:00 committed by The Qt Project
parent e8853506bf
commit add2bf739a
12 changed files with 78 additions and 96 deletions

View File

@ -106,14 +106,6 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
if (u < 0x0800) { if (u < 0x0800) {
*cursor++ = 0xc0 | ((uchar) (u >> 6)); *cursor++ = 0xc0 | ((uchar) (u >> 6));
} else { } else {
// is it one of the Unicode non-characters?
if (QChar::isNonCharacter(u)) {
*cursor++ = replacement;
++ch;
++invalid;
continue;
}
if (QChar::requiresSurrogates(u)) { if (QChar::requiresSurrogates(u)) {
*cursor++ = 0xf0 | ((uchar) (u >> 18)); *cursor++ = 0xf0 | ((uchar) (u >> 18));
*cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f); *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
@ -180,15 +172,14 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
--need; --need;
if (!need) { if (!need) {
// utf-8 bom composes into 0xfeff code point // utf-8 bom composes into 0xfeff code point
bool nonCharacter;
if (!headerdone && uc == 0xfeff) { if (!headerdone && uc == 0xfeff) {
// don't do anything, just skip the BOM // don't do anything, just skip the BOM
} else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
// surrogate pair // surrogate pair
Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
*qch++ = QChar::highSurrogate(uc); *qch++ = QChar::highSurrogate(uc);
*qch++ = QChar::lowSurrogate(uc); *qch++ = QChar::lowSurrogate(uc);
} else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) { } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
// error: overlong sequence, UTF16 surrogate or non-character // error: overlong sequence, UTF16 surrogate or non-character
*qch++ = replacement; *qch++ = replacement;
++invalid; ++invalid;

View File

@ -304,7 +304,7 @@ static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *b
// we've decoded something; safety-check it // we've decoded something; safety-check it
if (uc < min_uc) if (uc < min_uc)
return false; return false;
if (QChar::isSurrogate(uc) || QChar::isNonCharacter(uc) || uc > QChar::LastValidCodePoint) if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint)
return false; return false;
if (!QChar::requiresSurrogates(uc)) { if (!QChar::requiresSurrogates(uc)) {

View File

@ -853,7 +853,7 @@ static inline bool scanUtf8Char(const char *&json, const char *end, uint *result
uc = (uc << 6) | (ch & 0x3f); uc = (uc << 6) | (ch & 0x3f);
} }
if (uc < min_uc || QChar::isNonCharacter(uc) || if (uc < min_uc ||
QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) { QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
return false; return false;
} }

View File

@ -138,13 +138,6 @@ static QByteArray escapedString(const QString &s)
if (u < 0x0800) { if (u < 0x0800) {
*cursor++ = 0xc0 | ((uchar) (u >> 6)); *cursor++ = 0xc0 | ((uchar) (u >> 6));
} else { } else {
// is it one of the Unicode non-characters?
if (QChar::isNonCharacter(u)) {
*cursor++ = replacement;
++ch;
continue;
}
if (QChar::requiresSurrogates(u)) { if (QChar::requiresSurrogates(u)) {
*cursor++ = 0xf0 | ((uchar) (u >> 18)); *cursor++ = 0xf0 | ((uchar) (u >> 18));
*cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f); *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);

View File

@ -66,9 +66,9 @@ private slots:
void codecForLocale(); void codecForLocale();
void asciiToIscii() const; void asciiToIscii() const;
void flagCodepointFFFF() const; void nonFlaggedCodepointFFFF() const;
void flagF7808080() const; void flagF7808080() const;
void flagEFBFBF() const; void nonFlaggedEFBFBF() const;
void decode0D() const; void decode0D() const;
void aliasForUTF16() const; void aliasForUTF16() const;
void mibForTSCII() const; void mibForTSCII() const;
@ -409,9 +409,9 @@ void tst_QTextCodec::asciiToIscii() const
} }
} }
void tst_QTextCodec::flagCodepointFFFF() const void tst_QTextCodec::nonFlaggedCodepointFFFF() const
{ {
// This is an invalid Unicode codepoint. //Check that the code point 0xFFFF (=non-character code 0xEFBFBF) is not flagged
const QChar ch(0xFFFF); const QChar ch(0xFFFF);
QString input(ch); QString input(ch);
@ -419,12 +419,11 @@ void tst_QTextCodec::flagCodepointFFFF() const
QVERIFY(codec); QVERIFY(codec);
const QByteArray asDecoded(codec->fromUnicode(input)); const QByteArray asDecoded(codec->fromUnicode(input));
QCOMPARE(asDecoded, QByteArray("?")); QCOMPARE(asDecoded, QByteArray("\357\277\277"));
QByteArray ffff("\357\277\277"); QByteArray ffff("\357\277\277");
QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull); QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull);
QVERIFY(codec->toUnicode(ffff.constData(), ffff.length(), &state) == QChar(0)); QVERIFY(codec->toUnicode(ffff.constData(), ffff.length(), &state) == QByteArray::fromHex("EFBFBF"));
QVERIFY(codec->toUnicode(ffff) == QChar(0xfffd));
} }
void tst_QTextCodec::flagF7808080() const void tst_QTextCodec::flagF7808080() const
@ -460,13 +459,16 @@ void tst_QTextCodec::flagF7808080() const
QVERIFY(codec->toUnicode(input.constData(), input.length(), &state) == QChar(0)); QVERIFY(codec->toUnicode(input.constData(), input.length(), &state) == QChar(0));
} }
void tst_QTextCodec::flagEFBFBF() const void tst_QTextCodec::nonFlaggedEFBFBF() const
{ {
QByteArray invalidInput; /* Check that the codec does NOT flag EFBFBF.
invalidInput.resize(3); * This is a regression test; see QTBUG-33229
invalidInput[0] = char(0xEF); */
invalidInput[1] = char(0xBF); QByteArray validInput;
invalidInput[2] = char(0xBF); validInput.resize(3);
validInput[0] = char(0xEF);
validInput[1] = char(0xBF);
validInput[2] = char(0xBF);
const QTextCodec *const codec = QTextCodec::codecForMib(106); // UTF-8 const QTextCodec *const codec = QTextCodec::codecForMib(106); // UTF-8
QVERIFY(codec); QVERIFY(codec);
@ -474,21 +476,20 @@ void tst_QTextCodec::flagEFBFBF() const
{ {
//QVERIFY(!codec->canEncode(QChar(0xFFFF))); //QVERIFY(!codec->canEncode(QChar(0xFFFF)));
QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull); QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull);
QVERIFY(codec->toUnicode(invalidInput.constData(), invalidInput.length(), &state) == QChar(0)); QVERIFY(codec->toUnicode(validInput.constData(), validInput.length(), &state) == QByteArray::fromHex("EFBFBF"));
QByteArray start("<?pi "); QByteArray start("<?pi ");
start.append(invalidInput); start.append(validInput);
start.append("?>"); start.append("?>");
} }
/* When 0xEFBFBF is preceded by what seems to be an arbitrary character, // Check that 0xEFBFBF is correctly decoded when preceded by an arbitrary character
* QTextCodec fails to flag it. */
{ {
QByteArray start("B"); QByteArray start("B");
start.append(invalidInput); start.append(validInput);
QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull); QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull);
QVERIFY(codec->toUnicode(start.constData(), start.length(), &state) == QString::fromLatin1("B\0", 2)); QVERIFY(codec->toUnicode(start.constData(), start.length(), &state) == QByteArray("B").append(QByteArray::fromHex("EFBFBF")));
} }
} }
@ -674,13 +675,12 @@ void tst_QTextCodec::utf8Codec_data()
str = QChar(0x7ff); str = QChar(0x7ff);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.2") << utf8 << str << -1; QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.2") << utf8 << str << -1;
// 2.2.3 U+000FFFF // 2.2.3 U+000FFFF - non-character code
utf8.clear(); utf8.clear();
utf8 += char(0xef); utf8 += char(0xef);
utf8 += char(0xbf); utf8 += char(0xbf);
utf8 += char(0xbf); utf8 += char(0xbf);
str.clear(); str = QString::fromUtf8(utf8);
str += QChar::ReplacementCharacter;
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.3") << utf8 << str << -1; QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.3") << utf8 << str << -1;
// 2.2.4 U+001FFFFF // 2.2.4 U+001FFFFF
@ -1535,20 +1535,22 @@ void tst_QTextCodec::utf8Codec_data()
str += QChar(QChar::ReplacementCharacter); str += QChar(QChar::ReplacementCharacter);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.8") << utf8 << str << -1; QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.8") << utf8 << str << -1;
// 5.3.1 // 5.3.1 - non-character code
utf8.clear(); utf8.clear();
utf8 += char(0xef); utf8 += char(0xef);
utf8 += char(0xbf); utf8 += char(0xbf);
utf8 += char(0xbe); utf8 += char(0xbe);
str = QChar(QChar::ReplacementCharacter); //str = QChar(QChar::ReplacementCharacter);
str = QString::fromUtf8(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.1") << utf8 << str << -1; QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.1") << utf8 << str << -1;
// 5.3.2 // 5.3.2 - non-character code
utf8.clear(); utf8.clear();
utf8 += char(0xef); utf8 += char(0xef);
utf8 += char(0xbf); utf8 += char(0xbf);
utf8 += char(0xbf); utf8 += char(0xbf);
str = QChar(QChar::ReplacementCharacter); //str = QChar(QChar::ReplacementCharacter);
str = QString::fromUtf8(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.2") << utf8 << str << -1; QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.2") << utf8 << str << -1;
} }

View File

@ -233,8 +233,9 @@ void tst_Utf8::nonCharacters_data()
QTest::addColumn<QByteArray>("utf8"); QTest::addColumn<QByteArray>("utf8");
QTest::addColumn<QString>("utf16"); QTest::addColumn<QString>("utf16");
// Unicode has a couple of "non-characters" that one can use internally, // Unicode has a couple of "non-characters" that one can use internally
// but are not allowed to be used for text interchange. // These characters may be used for interchange;
// see: http://www.unicode.org/versions/corrigendum9.html
// //
// Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF, // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF,
// U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and
@ -279,20 +280,17 @@ void tst_Utf8::nonCharacters()
decoder->toUnicode(utf8); decoder->toUnicode(utf8);
// Only enforce correctness on our UTF-8 decoder // Only enforce correctness on our UTF-8 decoder
// The system's UTF-8 codec is sometimes buggy
// GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8
// OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF
if (!useLocale) if (!useLocale)
QVERIFY(decoder->hasFailure()); QVERIFY(!decoder->hasFailure());
else if (!decoder->hasFailure()) else if (decoder->hasFailure())
qWarning("System codec does not report failure when it should. Should report bug upstream."); qWarning("System codec reports failure when it shouldn't. Should report bug upstream.");
QSharedPointer<QTextEncoder> encoder(codec->makeEncoder()); QSharedPointer<QTextEncoder> encoder(codec->makeEncoder());
encoder->fromUnicode(utf16); encoder->fromUnicode(utf16);
if (!useLocale) if (!useLocale)
QVERIFY(encoder->hasFailure()); QVERIFY(!encoder->hasFailure());
else if (!encoder->hasFailure()) else if (encoder->hasFailure())
qWarning("System codec does not report failure when it should. Should report bug upstream."); qWarning("System codec reports failure when it shouldn't. Should report bug upstream.");
} }
QTEST_MAIN(tst_Utf8) QTEST_MAIN(tst_Utf8)

View File

@ -129,8 +129,8 @@ void loadInvalidUtf8Rows()
void loadNonCharactersRows() void loadNonCharactersRows()
{ {
// Unicode has a couple of "non-characters" that one can use internally, // Unicode has a couple of "non-characters" that one can use internally
// but are not allowed to be used for text interchange. // These characters are allowed for text-interchange (see http://www.unicode.org/versions/corrigendum9.html)
// //
// Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF, // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF,
// U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and

View File

@ -964,8 +964,10 @@ void tst_QUrlInternal::encodingRecode_data()
addUtf8Data("utf8-string-2", "\xDF\xBF\xE0\xA0\x80""A"); addUtf8Data("utf8-string-2", "\xDF\xBF\xE0\xA0\x80""A");
addUtf8Data("utf8-string-3", "\xE0\xA0\x80\xDF\xBF..."); addUtf8Data("utf8-string-3", "\xE0\xA0\x80\xDF\xBF...");
QTest::newRow("encode-unicode-noncharacter") << QString(QChar(0xffff)) << F(QUrl::FullyEncoded) << "%EF%BF%BF";
QTest::newRow("decode-unicode-noncharacter") << QString(QChar(0xffff)) << F(QUrl::PrettyDecoded) << QString::fromUtf8("\xEF\xBF\xBF");
// special cases: stuff we can encode, but not decode // special cases: stuff we can encode, but not decode
QTest::newRow("unicode-noncharacter") << QString(QChar(0xffff)) << F(QUrl::FullyEncoded) << "%EF%BF%BF";
QTest::newRow("unicode-lo-surrogate") << QString(QChar(0xD800)) << F(QUrl::FullyEncoded) << "%ED%A0%80"; QTest::newRow("unicode-lo-surrogate") << QString(QChar(0xD800)) << F(QUrl::FullyEncoded) << "%ED%A0%80";
QTest::newRow("unicode-hi-surrogate") << QString(QChar(0xDC00)) << F(QUrl::FullyEncoded) << "%ED%B0%80"; QTest::newRow("unicode-hi-surrogate") << QString(QChar(0xDC00)) << F(QUrl::FullyEncoded) << "%ED%B0%80";
@ -1011,9 +1013,6 @@ void tst_QUrlInternal::encodingRecodeInvalidUtf8_data()
extern void loadInvalidUtf8Rows(); extern void loadInvalidUtf8Rows();
loadInvalidUtf8Rows(); loadInvalidUtf8Rows();
extern void loadNonCharactersRows();
loadNonCharactersRows();
QTest::newRow("utf8-mix-4") << QByteArray("\xE0.A2\x80"); QTest::newRow("utf8-mix-4") << QByteArray("\xE0.A2\x80");
QTest::newRow("utf8-mix-5") << QByteArray("\xE0\xA2.80"); QTest::newRow("utf8-mix-5") << QByteArray("\xE0\xA2.80");
QTest::newRow("utf8-mix-6") << QByteArray("\xE0\xA2\x33"); QTest::newRow("utf8-mix-6") << QByteArray("\xE0\xA2\x33");

View File

@ -47,7 +47,8 @@
#include "qjsondocument.h" #include "qjsondocument.h"
#include <limits> #include <limits>
#define INVALID_UNICODE "\357\277\277" // "\uffff" #define INVALID_UNICODE "\xCE\xBA\xE1"
#define UNICODE_NON_CHARACTER "\xEF\xBF\xBF"
#define UNICODE_DJE "\320\202" // Character from the Serbian Cyrillic alphabet #define UNICODE_DJE "\320\202" // Character from the Serbian Cyrillic alphabet
class tst_QtJson: public QObject class tst_QtJson: public QObject
@ -1305,6 +1306,19 @@ void tst_QtJson::fromJson()
QCOMPARE(array.at(0).toBool(), true); QCOMPARE(array.at(0).toBool(), true);
QCOMPARE(doc.toJson(), json); QCOMPARE(doc.toJson(), json);
} }
{
// regression test: check that Unicode non-characters are correctly decoded
QByteArray json = "[\n \"" UNICODE_NON_CHARACTER "\"\n]\n";
QJsonDocument doc = QJsonDocument::fromJson(json);
QVERIFY(!doc.isEmpty());
QCOMPARE(doc.isArray(), true);
QCOMPARE(doc.isObject(), false);
QJsonArray array = doc.array();
QCOMPARE(array.size(), 1);
QCOMPARE(array.at(0).type(), QJsonValue::String);
QCOMPARE(array.at(0).toString(), QString::fromUtf8(UNICODE_NON_CHARACTER));
QCOMPARE(doc.toJson(), json);
}
{ {
QByteArray json = "[]"; QByteArray json = "[]";
QJsonDocument doc = QJsonDocument::fromJson(json); QJsonDocument doc = QJsonDocument::fromJson(json);
@ -1532,7 +1546,7 @@ void tst_QtJson::fromJsonErrors()
QJsonDocument doc = QJsonDocument::fromJson(json, &error); QJsonDocument doc = QJsonDocument::fromJson(json, &error);
QVERIFY(doc.isEmpty()); QVERIFY(doc.isEmpty());
QCOMPARE(error.error, QJsonParseError::IllegalUTF8String); QCOMPARE(error.error, QJsonParseError::IllegalUTF8String);
QCOMPARE(error.offset, 13); QCOMPARE(error.offset, 14);
} }
{ {
QJsonParseError error; QJsonParseError error;
@ -1556,7 +1570,7 @@ void tst_QtJson::fromJsonErrors()
QJsonDocument doc = QJsonDocument::fromJson(json, &error); QJsonDocument doc = QJsonDocument::fromJson(json, &error);
QVERIFY(doc.isEmpty()); QVERIFY(doc.isEmpty());
QCOMPARE(error.error, QJsonParseError::IllegalUTF8String); QCOMPARE(error.error, QJsonParseError::IllegalUTF8String);
QCOMPARE(error.offset, 14); QCOMPARE(error.offset, 15);
} }
{ {
QJsonParseError error; QJsonParseError error;
@ -1702,6 +1716,7 @@ void tst_QtJson::parseStrings()
"abc\\tabc", "abc\\tabc",
"abc\\u0019abc", "abc\\u0019abc",
"abc" UNICODE_DJE "abc", "abc" UNICODE_DJE "abc",
UNICODE_NON_CHARACTER
}; };
int size = sizeof(strings)/sizeof(const char *); int size = sizeof(strings)/sizeof(const char *);
@ -1728,7 +1743,8 @@ void tst_QtJson::parseStrings()
Pairs pairs [] = { Pairs pairs [] = {
{ "abc\\/abc", "abc/abc" }, { "abc\\/abc", "abc/abc" },
{ "abc\\u0402abc", "abc" UNICODE_DJE "abc" }, { "abc\\u0402abc", "abc" UNICODE_DJE "abc" },
{ "abc\\u0065abc", "abceabc" } { "abc\\u0065abc", "abceabc" },
{ "abc\\uFFFFabc", "abc" UNICODE_NON_CHARACTER "abc" }
}; };
size = sizeof(pairs)/sizeof(Pairs); size = sizeof(pairs)/sizeof(Pairs);

View File

@ -315,8 +315,6 @@ void tst_QXmlSimpleReader::testGoodXmlFile()
QVERIFY(file.open(QIODevice::ReadOnly)); QVERIFY(file.open(QIODevice::ReadOnly));
Parser parser; Parser parser;
// static int i = 0;
// qWarning("Test nr: " + QString::number(i)); ++i;
QEXPECT_FAIL("xmldocs/valid/sa/089.xml", "", Continue); QEXPECT_FAIL("xmldocs/valid/sa/089.xml", "", Continue);
QVERIFY(parser.parseFile(&file)); QVERIFY(parser.parseFile(&file));
@ -326,7 +324,6 @@ void tst_QXmlSimpleReader::testGoodXmlFile()
ref_stream.setCodec("UTF-8"); ref_stream.setCodec("UTF-8");
QString ref_file_contents = ref_stream.readAll(); QString ref_file_contents = ref_stream.readAll();
QEXPECT_FAIL("xmldocs/valid/sa/089.xml", "", Continue);
QCOMPARE(parser.result(), ref_file_contents); QCOMPARE(parser.result(), ref_file_contents);
} }
@ -355,8 +352,6 @@ void tst_QXmlSimpleReader::testBadXmlFile()
QVERIFY(file.open(QIODevice::ReadOnly)); QVERIFY(file.open(QIODevice::ReadOnly));
Parser parser; Parser parser;
// static int i = 0;
// qWarning("Test nr: " + QString::number(++i));
QEXPECT_FAIL("xmldocs/not-wf/sa/030.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/030.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/031.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/031.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/032.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/032.xml", "", Continue);
@ -381,22 +376,17 @@ void tst_QXmlSimpleReader::testBadXmlFile()
QEXPECT_FAIL("xmldocs/not-wf/sa/132.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/132.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/142.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/142.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/143.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/143.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/144.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/144.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/145.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/145.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/146.xml", "", Abort); QEXPECT_FAIL("xmldocs/not-wf/sa/146.xml", "", Abort);
QEXPECT_FAIL("xmldocs/not-wf/sa/160.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/160.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/162.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/162.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/166.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/167.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/168.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/168.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/169.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/169.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/170.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/170.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/171.xml", "", Abort);
QEXPECT_FAIL("xmldocs/not-wf/sa/172.xml", "", Abort);
QEXPECT_FAIL("xmldocs/not-wf/sa/173.xml", "", Abort);
QEXPECT_FAIL("xmldocs/not-wf/sa/174.xml", "", Abort);
QEXPECT_FAIL("xmldocs/not-wf/sa/175.xml", "", Abort);
QEXPECT_FAIL("xmldocs/not-wf/sa/177.xml", "", Abort);
QEXPECT_FAIL("xmldocs/not-wf/sa/180.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/180.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/181.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/181.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/182.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/182.xml", "", Continue);
@ -411,12 +401,7 @@ void tst_QXmlSimpleReader::testBadXmlFile()
ref_stream.setCodec("UTF-8"); ref_stream.setCodec("UTF-8");
QString ref_file_contents = ref_stream.readAll(); QString ref_file_contents = ref_stream.readAll();
QEXPECT_FAIL("xmldocs/not-wf/sa/144.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/145.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/145.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/146.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/167.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/166.xml", "", Continue);
QEXPECT_FAIL("xmldocs/not-wf/sa/170.xml", "", Continue);
QCOMPARE(parser.result(), ref_file_contents); QCOMPARE(parser.result(), ref_file_contents);
} }

View File

@ -1,6 +1,6 @@
setDocumentLocator(locator={columnNumber=1, lineNumber=1}) setDocumentLocator(locator={columnNumber=1, lineNumber=1})
startDocument() startDocument()
startElement(namespaceURI="", localName="doc", qName="doc", atts=[]) startElement(namespaceURI="", localName="doc", qName="doc", atts=[])
characters(ch="<22><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>") characters(ch="<22>")
endElement(namespaceURI="", localName="doc", qName="doc") endElement(namespaceURI="", localName="doc", qName="doc")
endDocument() endDocument()

View File

@ -1980,16 +1980,15 @@ int fromUtf8_qt47(ushort *dst, const char *chars, int len)
--need; --need;
if (!need) { if (!need) {
// utf-8 bom composes into 0xfeff code point // utf-8 bom composes into 0xfeff code point
bool nonCharacter;
if (!headerdone && uc == 0xfeff) { if (!headerdone && uc == 0xfeff) {
// don't do anything, just skip the BOM // don't do anything, just skip the BOM
} else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
// surrogate pair // surrogate pair
//Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
*qch++ = QChar::highSurrogate(uc); *qch++ = QChar::highSurrogate(uc);
*qch++ = QChar::lowSurrogate(uc); *qch++ = QChar::lowSurrogate(uc);
} else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) { } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
// error: overlong sequence, UTF16 surrogate or non-character // error: overlong sequence or UTF16 surrogate
*qch++ = replacement; *qch++ = replacement;
++invalid; ++invalid;
} else { } else {
@ -2086,16 +2085,15 @@ int fromUtf8_qt47_stateless(ushort *dst, const char *chars, int len)
--need; --need;
if (!need) { if (!need) {
// utf-8 bom composes into 0xfeff code point // utf-8 bom composes into 0xfeff code point
bool nonCharacter;
if (!headerdone && uc == 0xfeff) { if (!headerdone && uc == 0xfeff) {
// don't do anything, just skip the BOM // don't do anything, just skip the BOM
} else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
// surrogate pair // surrogate pair
//Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
*qch++ = QChar::highSurrogate(uc); *qch++ = QChar::highSurrogate(uc);
*qch++ = QChar::lowSurrogate(uc); *qch++ = QChar::lowSurrogate(uc);
} else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) { } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
// error: overlong sequence, UTF16 surrogate or non-character // error: overlong sequence or UTF16 surrogate
*qch++ = replacement; *qch++ = replacement;
++invalid; ++invalid;
} else { } else {
@ -2214,7 +2212,7 @@ static inline void extract_utf8_multibyte(ushort *&dst, const char *&chars, qptr
chars += 2; chars += 2;
len -= 2; len -= 2;
if (!trusted && if (!trusted &&
(ucs < 0x800 || QChar::isNonCharacter(ucs) || QChar::isSurrogate(ucs))) (ucs < 0x800 || QChar::isSurrogate(ucs)))
dst[counter] = QChar::ReplacementCharacter; dst[counter] = QChar::ReplacementCharacter;
else else
dst[counter] = ucs; dst[counter] = ucs;
@ -2245,7 +2243,7 @@ static inline void extract_utf8_multibyte(ushort *&dst, const char *&chars, qptr
// dst[counter] will correspond to chars[counter..counter+2], so adjust // dst[counter] will correspond to chars[counter..counter+2], so adjust
chars += 3; chars += 3;
len -= 3; len -= 3;
if (trusted || (QChar::requiresSurrogates(ucs) && ucs <= QChar::LastValidCodePoint && !QChar::isNonCharacter(ucs))) { if (trusted || (QChar::requiresSurrogates(ucs) && ucs <= QChar::LastValidCodePoint)) {
dst[counter + 0] = QChar::highSurrogate(ucs); dst[counter + 0] = QChar::highSurrogate(ucs);
dst[counter + 1] = QChar::lowSurrogate(ucs); dst[counter + 1] = QChar::lowSurrogate(ucs);
counter += 2; counter += 2;