Allow non-character codes in utf8 strings
Changed the handling of non-character code points in the UTF-8 codec. Non-character code points are now accepted in QStrings, QUrls and QJson strings. Unit tests were adapted accordingly. For more info about non-character code points, see: http://www.unicode.org/versions/corrigendum9.html [ChangeLog][QtCore][QUtf8] UTF-8 now accepts non-character Unicode code points; these are no longer replaced by the replacement character [ChangeLog][QtCore][QUrl] QUrl now fully accepts non-character Unicode code points; they are percent-encoded when encoding and can also be pretty-decoded [ChangeLog][QtCore][QJson] The Writer and the Parser now fully accept non-character Unicode code points. Change-Id: I77cf4f0e6210741eac8082912a0b6118eced4f77 Task-number: QTBUG-33229 Reviewed-by: Lars Knoll <lars.knoll@digia.com> Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
e8853506bf
commit
add2bf739a
@ -106,14 +106,6 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
|
||||
if (u < 0x0800) {
|
||||
*cursor++ = 0xc0 | ((uchar) (u >> 6));
|
||||
} else {
|
||||
// is it one of the Unicode non-characters?
|
||||
if (QChar::isNonCharacter(u)) {
|
||||
*cursor++ = replacement;
|
||||
++ch;
|
||||
++invalid;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (QChar::requiresSurrogates(u)) {
|
||||
*cursor++ = 0xf0 | ((uchar) (u >> 18));
|
||||
*cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
|
||||
@ -180,15 +172,14 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
|
||||
--need;
|
||||
if (!need) {
|
||||
// utf-8 bom composes into 0xfeff code point
|
||||
bool nonCharacter;
|
||||
if (!headerdone && uc == 0xfeff) {
|
||||
// don't do anything, just skip the BOM
|
||||
} else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
|
||||
} else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
|
||||
// surrogate pair
|
||||
Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
|
||||
*qch++ = QChar::highSurrogate(uc);
|
||||
*qch++ = QChar::lowSurrogate(uc);
|
||||
} else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) {
|
||||
} else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
|
||||
// error: overlong sequence, UTF16 surrogate or code point above LastValidCodePoint
|
||||
*qch++ = replacement;
|
||||
++invalid;
|
||||
|
@ -304,7 +304,7 @@ static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *b
|
||||
// we've decoded something; safety-check it
|
||||
if (uc < min_uc)
|
||||
return false;
|
||||
if (QChar::isSurrogate(uc) || QChar::isNonCharacter(uc) || uc > QChar::LastValidCodePoint)
|
||||
if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint)
|
||||
return false;
|
||||
|
||||
if (!QChar::requiresSurrogates(uc)) {
|
||||
|
@ -853,7 +853,7 @@ static inline bool scanUtf8Char(const char *&json, const char *end, uint *result
|
||||
uc = (uc << 6) | (ch & 0x3f);
|
||||
}
|
||||
|
||||
if (uc < min_uc || QChar::isNonCharacter(uc) ||
|
||||
if (uc < min_uc ||
|
||||
QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
|
||||
return false;
|
||||
}
|
||||
|
@ -138,13 +138,6 @@ static QByteArray escapedString(const QString &s)
|
||||
if (u < 0x0800) {
|
||||
*cursor++ = 0xc0 | ((uchar) (u >> 6));
|
||||
} else {
|
||||
// is it one of the Unicode non-characters?
|
||||
if (QChar::isNonCharacter(u)) {
|
||||
*cursor++ = replacement;
|
||||
++ch;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (QChar::requiresSurrogates(u)) {
|
||||
*cursor++ = 0xf0 | ((uchar) (u >> 18));
|
||||
*cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
|
||||
|
@ -66,9 +66,9 @@ private slots:
|
||||
void codecForLocale();
|
||||
|
||||
void asciiToIscii() const;
|
||||
void flagCodepointFFFF() const;
|
||||
void nonFlaggedCodepointFFFF() const;
|
||||
void flagF7808080() const;
|
||||
void flagEFBFBF() const;
|
||||
void nonFlaggedEFBFBF() const;
|
||||
void decode0D() const;
|
||||
void aliasForUTF16() const;
|
||||
void mibForTSCII() const;
|
||||
@ -409,9 +409,9 @@ void tst_QTextCodec::asciiToIscii() const
|
||||
}
|
||||
}
|
||||
|
||||
void tst_QTextCodec::flagCodepointFFFF() const
|
||||
void tst_QTextCodec::nonFlaggedCodepointFFFF() const
|
||||
{
|
||||
// This is an invalid Unicode codepoint.
|
||||
// Check that the code point 0xFFFF (a non-character, encoded in UTF-8 as EF BF BF) is not flagged
|
||||
const QChar ch(0xFFFF);
|
||||
QString input(ch);
|
||||
|
||||
@ -419,12 +419,11 @@ void tst_QTextCodec::flagCodepointFFFF() const
|
||||
QVERIFY(codec);
|
||||
|
||||
const QByteArray asDecoded(codec->fromUnicode(input));
|
||||
QCOMPARE(asDecoded, QByteArray("?"));
|
||||
QCOMPARE(asDecoded, QByteArray("\357\277\277"));
|
||||
|
||||
QByteArray ffff("\357\277\277");
|
||||
QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull);
|
||||
QVERIFY(codec->toUnicode(ffff.constData(), ffff.length(), &state) == QChar(0));
|
||||
QVERIFY(codec->toUnicode(ffff) == QChar(0xfffd));
|
||||
QVERIFY(codec->toUnicode(ffff.constData(), ffff.length(), &state) == QByteArray::fromHex("EFBFBF"));
|
||||
}
|
||||
|
||||
void tst_QTextCodec::flagF7808080() const
|
||||
@ -460,13 +459,16 @@ void tst_QTextCodec::flagF7808080() const
|
||||
QVERIFY(codec->toUnicode(input.constData(), input.length(), &state) == QChar(0));
|
||||
}
|
||||
|
||||
void tst_QTextCodec::flagEFBFBF() const
|
||||
void tst_QTextCodec::nonFlaggedEFBFBF() const
|
||||
{
|
||||
QByteArray invalidInput;
|
||||
invalidInput.resize(3);
|
||||
invalidInput[0] = char(0xEF);
|
||||
invalidInput[1] = char(0xBF);
|
||||
invalidInput[2] = char(0xBF);
|
||||
/* Check that the codec does NOT flag EFBFBF.
|
||||
* This is a regression test; see QTBUG-33229
|
||||
*/
|
||||
QByteArray validInput;
|
||||
validInput.resize(3);
|
||||
validInput[0] = char(0xEF);
|
||||
validInput[1] = char(0xBF);
|
||||
validInput[2] = char(0xBF);
|
||||
|
||||
const QTextCodec *const codec = QTextCodec::codecForMib(106); // UTF-8
|
||||
QVERIFY(codec);
|
||||
@ -474,21 +476,20 @@ void tst_QTextCodec::flagEFBFBF() const
|
||||
{
|
||||
//QVERIFY(!codec->canEncode(QChar(0xFFFF)));
|
||||
QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull);
|
||||
QVERIFY(codec->toUnicode(invalidInput.constData(), invalidInput.length(), &state) == QChar(0));
|
||||
QVERIFY(codec->toUnicode(validInput.constData(), validInput.length(), &state) == QByteArray::fromHex("EFBFBF"));
|
||||
|
||||
QByteArray start("<?pi ");
|
||||
start.append(invalidInput);
|
||||
start.append(validInput);
|
||||
start.append("?>");
|
||||
}
|
||||
|
||||
/* When 0xEFBFBF is preceded by what seems to be an arbitrary character,
|
||||
* QTextCodec fails to flag it. */
|
||||
// Check that 0xEFBFBF is correctly decoded when preceded by an arbitrary character
|
||||
{
|
||||
QByteArray start("B");
|
||||
start.append(invalidInput);
|
||||
start.append(validInput);
|
||||
|
||||
QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull);
|
||||
QVERIFY(codec->toUnicode(start.constData(), start.length(), &state) == QString::fromLatin1("B\0", 2));
|
||||
QVERIFY(codec->toUnicode(start.constData(), start.length(), &state) == QByteArray("B").append(QByteArray::fromHex("EFBFBF")));
|
||||
}
|
||||
}
|
||||
|
||||
@ -674,13 +675,12 @@ void tst_QTextCodec::utf8Codec_data()
|
||||
str = QChar(0x7ff);
|
||||
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.2") << utf8 << str << -1;
|
||||
|
||||
// 2.2.3 U+000FFFF
|
||||
// 2.2.3 U+000FFFF - non-character code
|
||||
utf8.clear();
|
||||
utf8 += char(0xef);
|
||||
utf8 += char(0xbf);
|
||||
utf8 += char(0xbf);
|
||||
str.clear();
|
||||
str += QChar::ReplacementCharacter;
|
||||
str = QString::fromUtf8(utf8);
|
||||
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.3") << utf8 << str << -1;
|
||||
|
||||
// 2.2.4 U+001FFFFF
|
||||
@ -1535,20 +1535,22 @@ void tst_QTextCodec::utf8Codec_data()
|
||||
str += QChar(QChar::ReplacementCharacter);
|
||||
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.8") << utf8 << str << -1;
|
||||
|
||||
// 5.3.1
|
||||
// 5.3.1 - non-character code
|
||||
utf8.clear();
|
||||
utf8 += char(0xef);
|
||||
utf8 += char(0xbf);
|
||||
utf8 += char(0xbe);
|
||||
str = QChar(QChar::ReplacementCharacter);
|
||||
//str = QChar(QChar::ReplacementCharacter);
|
||||
str = QString::fromUtf8(utf8);
|
||||
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.1") << utf8 << str << -1;
|
||||
|
||||
// 5.3.2
|
||||
// 5.3.2 - non-character code
|
||||
utf8.clear();
|
||||
utf8 += char(0xef);
|
||||
utf8 += char(0xbf);
|
||||
utf8 += char(0xbf);
|
||||
str = QChar(QChar::ReplacementCharacter);
|
||||
//str = QChar(QChar::ReplacementCharacter);
|
||||
str = QString::fromUtf8(utf8);
|
||||
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.2") << utf8 << str << -1;
|
||||
}
|
||||
|
||||
|
@ -233,8 +233,9 @@ void tst_Utf8::nonCharacters_data()
|
||||
QTest::addColumn<QByteArray>("utf8");
|
||||
QTest::addColumn<QString>("utf16");
|
||||
|
||||
// Unicode has a couple of "non-characters" that one can use internally,
|
||||
// but are not allowed to be used for text interchange.
|
||||
// Unicode has a couple of "non-characters" that one can use internally.
|
||||
// These characters may be used for interchange;
|
||||
// see: http://www.unicode.org/versions/corrigendum9.html
|
||||
//
|
||||
// Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF,
|
||||
// U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and
|
||||
@ -279,20 +280,17 @@ void tst_Utf8::nonCharacters()
|
||||
decoder->toUnicode(utf8);
|
||||
|
||||
// Only enforce correctness on our UTF-8 decoder
|
||||
// The system's UTF-8 codec is sometimes buggy
|
||||
// GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8
|
||||
// OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF
|
||||
if (!useLocale)
|
||||
QVERIFY(decoder->hasFailure());
|
||||
else if (!decoder->hasFailure())
|
||||
qWarning("System codec does not report failure when it should. Should report bug upstream.");
|
||||
QVERIFY(!decoder->hasFailure());
|
||||
else if (decoder->hasFailure())
|
||||
qWarning("System codec reports failure when it shouldn't. Should report bug upstream.");
|
||||
|
||||
QSharedPointer<QTextEncoder> encoder(codec->makeEncoder());
|
||||
encoder->fromUnicode(utf16);
|
||||
if (!useLocale)
|
||||
QVERIFY(encoder->hasFailure());
|
||||
else if (!encoder->hasFailure())
|
||||
qWarning("System codec does not report failure when it should. Should report bug upstream.");
|
||||
QVERIFY(!encoder->hasFailure());
|
||||
else if (encoder->hasFailure())
|
||||
qWarning("System codec reports failure when it shouldn't. Should report bug upstream.");
|
||||
}
|
||||
|
||||
QTEST_MAIN(tst_Utf8)
|
||||
|
@ -129,8 +129,8 @@ void loadInvalidUtf8Rows()
|
||||
|
||||
void loadNonCharactersRows()
|
||||
{
|
||||
// Unicode has a couple of "non-characters" that one can use internally,
|
||||
// but are not allowed to be used for text interchange.
|
||||
// Unicode has a couple of "non-characters" that one can use internally.
|
||||
// These characters are allowed for text-interchange (see http://www.unicode.org/versions/corrigendum9.html)
|
||||
//
|
||||
// Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF,
|
||||
// U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and
|
||||
|
@ -964,8 +964,10 @@ void tst_QUrlInternal::encodingRecode_data()
|
||||
addUtf8Data("utf8-string-2", "\xDF\xBF\xE0\xA0\x80""A");
|
||||
addUtf8Data("utf8-string-3", "\xE0\xA0\x80\xDF\xBF...");
|
||||
|
||||
QTest::newRow("encode-unicode-noncharacter") << QString(QChar(0xffff)) << F(QUrl::FullyEncoded) << "%EF%BF%BF";
|
||||
QTest::newRow("decode-unicode-noncharacter") << QString(QChar(0xffff)) << F(QUrl::PrettyDecoded) << QString::fromUtf8("\xEF\xBF\xBF");
|
||||
|
||||
// special cases: stuff we can encode, but not decode
|
||||
QTest::newRow("unicode-noncharacter") << QString(QChar(0xffff)) << F(QUrl::FullyEncoded) << "%EF%BF%BF";
|
||||
QTest::newRow("unicode-lo-surrogate") << QString(QChar(0xD800)) << F(QUrl::FullyEncoded) << "%ED%A0%80";
|
||||
QTest::newRow("unicode-hi-surrogate") << QString(QChar(0xDC00)) << F(QUrl::FullyEncoded) << "%ED%B0%80";
|
||||
|
||||
@ -1011,9 +1013,6 @@ void tst_QUrlInternal::encodingRecodeInvalidUtf8_data()
|
||||
extern void loadInvalidUtf8Rows();
|
||||
loadInvalidUtf8Rows();
|
||||
|
||||
extern void loadNonCharactersRows();
|
||||
loadNonCharactersRows();
|
||||
|
||||
QTest::newRow("utf8-mix-4") << QByteArray("\xE0.A2\x80");
|
||||
QTest::newRow("utf8-mix-5") << QByteArray("\xE0\xA2.80");
|
||||
QTest::newRow("utf8-mix-6") << QByteArray("\xE0\xA2\x33");
|
||||
|
@ -47,7 +47,8 @@
|
||||
#include "qjsondocument.h"
|
||||
#include <limits>
|
||||
|
||||
#define INVALID_UNICODE "\357\277\277" // "\uffff"
|
||||
#define INVALID_UNICODE "\xCE\xBA\xE1"
|
||||
#define UNICODE_NON_CHARACTER "\xEF\xBF\xBF"
|
||||
#define UNICODE_DJE "\320\202" // Character from the Serbian Cyrillic alphabet
|
||||
|
||||
class tst_QtJson: public QObject
|
||||
@ -1305,6 +1306,19 @@ void tst_QtJson::fromJson()
|
||||
QCOMPARE(array.at(0).toBool(), true);
|
||||
QCOMPARE(doc.toJson(), json);
|
||||
}
|
||||
{
|
||||
// regression test: check that Unicode non-characters are correctly decoded
|
||||
QByteArray json = "[\n \"" UNICODE_NON_CHARACTER "\"\n]\n";
|
||||
QJsonDocument doc = QJsonDocument::fromJson(json);
|
||||
QVERIFY(!doc.isEmpty());
|
||||
QCOMPARE(doc.isArray(), true);
|
||||
QCOMPARE(doc.isObject(), false);
|
||||
QJsonArray array = doc.array();
|
||||
QCOMPARE(array.size(), 1);
|
||||
QCOMPARE(array.at(0).type(), QJsonValue::String);
|
||||
QCOMPARE(array.at(0).toString(), QString::fromUtf8(UNICODE_NON_CHARACTER));
|
||||
QCOMPARE(doc.toJson(), json);
|
||||
}
|
||||
{
|
||||
QByteArray json = "[]";
|
||||
QJsonDocument doc = QJsonDocument::fromJson(json);
|
||||
@ -1532,7 +1546,7 @@ void tst_QtJson::fromJsonErrors()
|
||||
QJsonDocument doc = QJsonDocument::fromJson(json, &error);
|
||||
QVERIFY(doc.isEmpty());
|
||||
QCOMPARE(error.error, QJsonParseError::IllegalUTF8String);
|
||||
QCOMPARE(error.offset, 13);
|
||||
QCOMPARE(error.offset, 14);
|
||||
}
|
||||
{
|
||||
QJsonParseError error;
|
||||
@ -1556,7 +1570,7 @@ void tst_QtJson::fromJsonErrors()
|
||||
QJsonDocument doc = QJsonDocument::fromJson(json, &error);
|
||||
QVERIFY(doc.isEmpty());
|
||||
QCOMPARE(error.error, QJsonParseError::IllegalUTF8String);
|
||||
QCOMPARE(error.offset, 14);
|
||||
QCOMPARE(error.offset, 15);
|
||||
}
|
||||
{
|
||||
QJsonParseError error;
|
||||
@ -1702,6 +1716,7 @@ void tst_QtJson::parseStrings()
|
||||
"abc\\tabc",
|
||||
"abc\\u0019abc",
|
||||
"abc" UNICODE_DJE "abc",
|
||||
UNICODE_NON_CHARACTER
|
||||
};
|
||||
int size = sizeof(strings)/sizeof(const char *);
|
||||
|
||||
@ -1728,7 +1743,8 @@ void tst_QtJson::parseStrings()
|
||||
Pairs pairs [] = {
|
||||
{ "abc\\/abc", "abc/abc" },
|
||||
{ "abc\\u0402abc", "abc" UNICODE_DJE "abc" },
|
||||
{ "abc\\u0065abc", "abceabc" }
|
||||
{ "abc\\u0065abc", "abceabc" },
|
||||
{ "abc\\uFFFFabc", "abc" UNICODE_NON_CHARACTER "abc" }
|
||||
};
|
||||
size = sizeof(pairs)/sizeof(Pairs);
|
||||
|
||||
|
@ -315,8 +315,6 @@ void tst_QXmlSimpleReader::testGoodXmlFile()
|
||||
QVERIFY(file.open(QIODevice::ReadOnly));
|
||||
Parser parser;
|
||||
|
||||
// static int i = 0;
|
||||
// qWarning("Test nr: " + QString::number(i)); ++i;
|
||||
QEXPECT_FAIL("xmldocs/valid/sa/089.xml", "", Continue);
|
||||
QVERIFY(parser.parseFile(&file));
|
||||
|
||||
@ -326,7 +324,6 @@ void tst_QXmlSimpleReader::testGoodXmlFile()
|
||||
ref_stream.setCodec("UTF-8");
|
||||
QString ref_file_contents = ref_stream.readAll();
|
||||
|
||||
QEXPECT_FAIL("xmldocs/valid/sa/089.xml", "", Continue);
|
||||
QCOMPARE(parser.result(), ref_file_contents);
|
||||
}
|
||||
|
||||
@ -355,8 +352,6 @@ void tst_QXmlSimpleReader::testBadXmlFile()
|
||||
QVERIFY(file.open(QIODevice::ReadOnly));
|
||||
Parser parser;
|
||||
|
||||
// static int i = 0;
|
||||
// qWarning("Test nr: " + QString::number(++i));
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/030.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/031.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/032.xml", "", Continue);
|
||||
@ -381,22 +376,17 @@ void tst_QXmlSimpleReader::testBadXmlFile()
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/132.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/142.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/143.xml", "", Continue);
|
||||
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/144.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/145.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/146.xml", "", Abort);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/160.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/162.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/166.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/167.xml", "", Continue);
|
||||
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/168.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/169.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/170.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/171.xml", "", Abort);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/172.xml", "", Abort);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/173.xml", "", Abort);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/174.xml", "", Abort);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/175.xml", "", Abort);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/177.xml", "", Abort);
|
||||
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/180.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/181.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/182.xml", "", Continue);
|
||||
@ -411,12 +401,7 @@ void tst_QXmlSimpleReader::testBadXmlFile()
|
||||
ref_stream.setCodec("UTF-8");
|
||||
QString ref_file_contents = ref_stream.readAll();
|
||||
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/144.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/145.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/146.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/167.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/166.xml", "", Continue);
|
||||
QEXPECT_FAIL("xmldocs/not-wf/sa/170.xml", "", Continue);
|
||||
|
||||
QCOMPARE(parser.result(), ref_file_contents);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
setDocumentLocator(locator={columnNumber=1, lineNumber=1})
|
||||
startDocument()
|
||||
startElement(namespaceURI="", localName="doc", qName="doc", atts=[])
|
||||
characters(ch="<22><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>")
|
||||
characters(ch="<22>")
|
||||
endElement(namespaceURI="", localName="doc", qName="doc")
|
||||
endDocument()
|
||||
|
@ -1980,16 +1980,15 @@ int fromUtf8_qt47(ushort *dst, const char *chars, int len)
|
||||
--need;
|
||||
if (!need) {
|
||||
// utf-8 bom composes into 0xfeff code point
|
||||
bool nonCharacter;
|
||||
if (!headerdone && uc == 0xfeff) {
|
||||
// don't do anything, just skip the BOM
|
||||
} else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
|
||||
} else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
|
||||
// surrogate pair
|
||||
//Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
|
||||
*qch++ = QChar::highSurrogate(uc);
|
||||
*qch++ = QChar::lowSurrogate(uc);
|
||||
} else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) {
|
||||
// error: overlong sequence, UTF16 surrogate or non-character
|
||||
} else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
|
||||
// error: overlong sequence or UTF16 surrogate
|
||||
*qch++ = replacement;
|
||||
++invalid;
|
||||
} else {
|
||||
@ -2086,16 +2085,15 @@ int fromUtf8_qt47_stateless(ushort *dst, const char *chars, int len)
|
||||
--need;
|
||||
if (!need) {
|
||||
// utf-8 bom composes into 0xfeff code point
|
||||
bool nonCharacter;
|
||||
if (!headerdone && uc == 0xfeff) {
|
||||
// don't do anything, just skip the BOM
|
||||
} else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
|
||||
} else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
|
||||
// surrogate pair
|
||||
//Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
|
||||
*qch++ = QChar::highSurrogate(uc);
|
||||
*qch++ = QChar::lowSurrogate(uc);
|
||||
} else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) {
|
||||
// error: overlong sequence, UTF16 surrogate or non-character
|
||||
} else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
|
||||
// error: overlong sequence or UTF16 surrogate
|
||||
*qch++ = replacement;
|
||||
++invalid;
|
||||
} else {
|
||||
@ -2214,7 +2212,7 @@ static inline void extract_utf8_multibyte(ushort *&dst, const char *&chars, qptr
|
||||
chars += 2;
|
||||
len -= 2;
|
||||
if (!trusted &&
|
||||
(ucs < 0x800 || QChar::isNonCharacter(ucs) || QChar::isSurrogate(ucs)))
|
||||
(ucs < 0x800 || QChar::isSurrogate(ucs)))
|
||||
dst[counter] = QChar::ReplacementCharacter;
|
||||
else
|
||||
dst[counter] = ucs;
|
||||
@ -2245,7 +2243,7 @@ static inline void extract_utf8_multibyte(ushort *&dst, const char *&chars, qptr
|
||||
// dst[counter] will correspond to chars[counter..counter+2], so adjust
|
||||
chars += 3;
|
||||
len -= 3;
|
||||
if (trusted || (QChar::requiresSurrogates(ucs) && ucs <= QChar::LastValidCodePoint && !QChar::isNonCharacter(ucs))) {
|
||||
if (trusted || (QChar::requiresSurrogates(ucs) && ucs <= QChar::LastValidCodePoint)) {
|
||||
dst[counter + 0] = QChar::highSurrogate(ucs);
|
||||
dst[counter + 1] = QChar::lowSurrogate(ucs);
|
||||
counter += 2;
|
||||
|
Loading…
Reference in New Issue
Block a user