From add2bf739ae96603cb919b908cbb53c00d0628cc Mon Sep 17 00:00:00 2001 From: Kurt Pattyn Date: Sun, 6 Oct 2013 11:40:47 +0200 Subject: [PATCH] Allow non-character codes in utf8 strings Changed the processing of non-character code handling in the UTF8 codec. Non-character codes are now accepted in QStrings, QUrls and QJson strings. Unit tests were adapted accordingly. For more info about non-character codes, see: http://www.unicode.org/versions/corrigendum9.html [ChangeLog][QtCore][QUtf8] UTF-8 now accepts non-character unicode points; these are not replaced by the replacement character anymore [ChangeLog][QtCore][QUrl] QUrl now fully accepts non-character unicode points; they are encoded as percent characters; they can also be pretty decoded [ChangeLog][QtCore][QJson] The Writer and the Parser now fully accept non-character unicode points. Change-Id: I77cf4f0e6210741eac8082912a0b6118eced4f77 Task-number: QTBUG-33229 Reviewed-by: Lars Knoll Reviewed-by: Thiago Macieira --- src/corelib/codecs/qutfcodec.cpp | 13 +---- src/corelib/io/qurlrecode.cpp | 2 +- src/corelib/json/qjsonparser.cpp | 2 +- src/corelib/json/qjsonwriter.cpp | 7 --- .../codecs/qtextcodec/tst_qtextcodec.cpp | 54 ++++++++++--------- tests/auto/corelib/codecs/utf8/tst_utf8.cpp | 20 ++++--- tests/auto/corelib/codecs/utf8/utf8data.cpp | 4 +- .../io/qurlinternal/tst_qurlinternal.cpp | 7 ++- tests/auto/corelib/json/tst_qtjson.cpp | 24 +++++++-- .../qxmlsimplereader/tst_qxmlsimplereader.cpp | 21 ++------ .../xmldocs/not-wf/sa/170.xml.ref | 2 +- .../benchmarks/corelib/tools/qstring/main.cpp | 18 +++---- 12 files changed, 78 insertions(+), 96 deletions(-) diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index aeedcf1aa1..e425f8634c 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -106,14 +106,6 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve if (u < 0x0800) { *cursor++ = 0xc0 | ((uchar) (u >> 6)); } else { - // is it one of the Unicode non-characters? - if (QChar::isNonCharacter(u)) { - *cursor++ = replacement; - ++ch; - ++invalid; - continue; - } - if (QChar::requiresSurrogates(u)) { *cursor++ = 0xf0 | ((uchar) (u >> 18)); *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f); @@ -180,15 +172,14 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte --need; if (!need) { // utf-8 bom composes into 0xfeff code point - bool nonCharacter; if (!headerdone && uc == 0xfeff) { // don't do anything, just skip the BOM - } else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { + } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { // surrogate pair Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); *qch++ = QChar::highSurrogate(uc); *qch++ = QChar::lowSurrogate(uc); - } else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) { + } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) { // error: overlong sequence, UTF16 surrogate or non-character *qch++ = replacement; ++invalid; diff --git a/src/corelib/io/qurlrecode.cpp b/src/corelib/io/qurlrecode.cpp index 7e77b9c251..ba1a77744c 100644 --- a/src/corelib/io/qurlrecode.cpp +++ b/src/corelib/io/qurlrecode.cpp @@ -304,7 +304,7 @@ static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *b // we've decoded something; safety-check it if (uc < min_uc) return false; - if (QChar::isSurrogate(uc) || QChar::isNonCharacter(uc) || uc > QChar::LastValidCodePoint) + if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) return false; if (!QChar::requiresSurrogates(uc)) { diff --git a/src/corelib/json/qjsonparser.cpp b/src/corelib/json/qjsonparser.cpp index 8721f06064..516c53775c 100644 --- a/src/corelib/json/qjsonparser.cpp +++ b/src/corelib/json/qjsonparser.cpp @@ -853,7 +853,7 @@ static inline bool scanUtf8Char(const char *&json, const char *end, uint *result uc = (uc << 6) | (ch & 0x3f); } - if (uc < min_uc || QChar::isNonCharacter(uc) || + if (uc < min_uc || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) { return false; } diff --git a/src/corelib/json/qjsonwriter.cpp b/src/corelib/json/qjsonwriter.cpp index 8426b351f6..86cca4bb26 100644 --- a/src/corelib/json/qjsonwriter.cpp +++ b/src/corelib/json/qjsonwriter.cpp @@ -138,13 +138,6 @@ static QByteArray escapedString(const QString &s) if (u < 0x0800) { *cursor++ = 0xc0 | ((uchar) (u >> 6)); } else { - // is it one of the Unicode non-characters? - if (QChar::isNonCharacter(u)) { - *cursor++ = replacement; - ++ch; - continue; - } - if (QChar::requiresSurrogates(u)) { *cursor++ = 0xf0 | ((uchar) (u >> 18)); *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f); diff --git a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp index dd557b8d21..8e1b3cf3b2 100644 --- a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp +++ b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp @@ -66,9 +66,9 @@ private slots: void codecForLocale(); void asciiToIscii() const; - void flagCodepointFFFF() const; + void nonFlaggedCodepointFFFF() const; void flagF7808080() const; - void flagEFBFBF() const; + void nonFlaggedEFBFBF() const; void decode0D() const; void aliasForUTF16() const; void mibForTSCII() const; @@ -409,9 +409,9 @@ void tst_QTextCodec::asciiToIscii() const } } -void tst_QTextCodec::flagCodepointFFFF() const +void tst_QTextCodec::nonFlaggedCodepointFFFF() const { - // This is an invalid Unicode codepoint. + //Check that the code point 0xFFFF (=non-character code 0xEFBFBF) is not flagged const QChar ch(0xFFFF); QString input(ch); @@ -419,12 +419,11 @@ void tst_QTextCodec::flagCodepointFFFF() const QVERIFY(codec); const QByteArray asDecoded(codec->fromUnicode(input)); - QCOMPARE(asDecoded, QByteArray("?")); + QCOMPARE(asDecoded, QByteArray("\357\277\277")); QByteArray ffff("\357\277\277"); QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull); - QVERIFY(codec->toUnicode(ffff.constData(), ffff.length(), &state) == QChar(0)); - QVERIFY(codec->toUnicode(ffff) == QChar(0xfffd)); + QVERIFY(codec->toUnicode(ffff.constData(), ffff.length(), &state) == QByteArray::fromHex("EFBFBF")); } void tst_QTextCodec::flagF7808080() const @@ -460,13 +459,16 @@ void tst_QTextCodec::flagF7808080() const QVERIFY(codec->toUnicode(input.constData(), input.length(), &state) == QChar(0)); } -void tst_QTextCodec::flagEFBFBF() const +void tst_QTextCodec::nonFlaggedEFBFBF() const { - QByteArray invalidInput; - invalidInput.resize(3); - invalidInput[0] = char(0xEF); - invalidInput[1] = char(0xBF); - invalidInput[2] = char(0xBF); + /* Check that the codec does NOT flag EFBFBF. + * This is a regression test; see QTBUG-33229 + */ + QByteArray validInput; + validInput.resize(3); + validInput[0] = char(0xEF); + validInput[1] = char(0xBF); + validInput[2] = char(0xBF); const QTextCodec *const codec = QTextCodec::codecForMib(106); // UTF-8 QVERIFY(codec); @@ -474,21 +476,20 @@ void tst_QTextCodec::flagEFBFBF() const { //QVERIFY(!codec->canEncode(QChar(0xFFFF))); QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull); - QVERIFY(codec->toUnicode(invalidInput.constData(), invalidInput.length(), &state) == QChar(0)); + QVERIFY(codec->toUnicode(validInput.constData(), validInput.length(), &state) == QByteArray::fromHex("EFBFBF")); QByteArray start(""); } - /* When 0xEFBFBF is preceded by what seems to be an arbitrary character, - * QTextCodec fails to flag it. */ + // Check that 0xEFBFBF is correctly decoded when preceded by an arbitrary character { QByteArray start("B"); - start.append(invalidInput); + start.append(validInput); QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull); - QVERIFY(codec->toUnicode(start.constData(), start.length(), &state) == QString::fromLatin1("B\0", 2)); + QVERIFY(codec->toUnicode(start.constData(), start.length(), &state) == QByteArray("B").append(QByteArray::fromHex("EFBFBF"))); } } @@ -674,13 +675,12 @@ void tst_QTextCodec::utf8Codec_data() str = QChar(0x7ff); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.2") << utf8 << str << -1; - // 2.2.3 U+000FFFF + // 2.2.3 U+000FFFF - non-character code utf8.clear(); utf8 += char(0xef); utf8 += char(0xbf); utf8 += char(0xbf); - str.clear(); - str += QChar::ReplacementCharacter; + str = QString::fromUtf8(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.3") << utf8 << str << -1; // 2.2.4 U+001FFFFF @@ -1535,20 +1535,22 @@ void tst_QTextCodec::utf8Codec_data() str += QChar(QChar::ReplacementCharacter); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.8") << utf8 << str << -1; - // 5.3.1 + // 5.3.1 - non-character code utf8.clear(); utf8 += char(0xef); utf8 += char(0xbf); utf8 += char(0xbe); - str = QChar(QChar::ReplacementCharacter); + //str = QChar(QChar::ReplacementCharacter); + str = QString::fromUtf8(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.1") << utf8 << str << -1; - // 5.3.2 + // 5.3.2 - non-character code utf8.clear(); utf8 += char(0xef); utf8 += char(0xbf); utf8 += char(0xbf); - str = QChar(QChar::ReplacementCharacter); + //str = QChar(QChar::ReplacementCharacter); + str = QString::fromUtf8(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.2") << utf8 << str << -1; } diff --git a/tests/auto/corelib/codecs/utf8/tst_utf8.cpp b/tests/auto/corelib/codecs/utf8/tst_utf8.cpp index 99147f3aff..e18f6f73b9 100644 --- a/tests/auto/corelib/codecs/utf8/tst_utf8.cpp +++ b/tests/auto/corelib/codecs/utf8/tst_utf8.cpp @@ -233,8 +233,9 @@ void tst_Utf8::nonCharacters_data() QTest::addColumn("utf8"); QTest::addColumn("utf16"); - // Unicode has a couple of "non-characters" that one can use internally, - // but are not allowed to be used for text interchange. + // Unicode has a couple of "non-characters" that one can use internally + // These characters may be used for interchange; + // see: http://www.unicode.org/versions/corrigendum9.html // // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF, // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and @@ -279,20 +280,17 @@ void tst_Utf8::nonCharacters() decoder->toUnicode(utf8); // Only enforce correctness on our UTF-8 decoder - // The system's UTF-8 codec is sometimes buggy - // GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8 - // OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF if (!useLocale) - QVERIFY(decoder->hasFailure()); - else if (!decoder->hasFailure()) - qWarning("System codec does not report failure when it should. Should report bug upstream."); + QVERIFY(!decoder->hasFailure()); + else if (decoder->hasFailure()) + qWarning("System codec reports failure when it shouldn't. Should report bug upstream."); QSharedPointer encoder(codec->makeEncoder()); encoder->fromUnicode(utf16); if (!useLocale) - QVERIFY(encoder->hasFailure()); - else if (!encoder->hasFailure()) - qWarning("System codec does not report failure when it should. Should report bug upstream."); + QVERIFY(!encoder->hasFailure()); + else if (encoder->hasFailure()) + qWarning("System codec reports failure when it shouldn't. Should report bug upstream."); } QTEST_MAIN(tst_Utf8) diff --git a/tests/auto/corelib/codecs/utf8/utf8data.cpp b/tests/auto/corelib/codecs/utf8/utf8data.cpp index 2516cc9734..a41b0772e6 100644 --- a/tests/auto/corelib/codecs/utf8/utf8data.cpp +++ b/tests/auto/corelib/codecs/utf8/utf8data.cpp @@ -129,8 +129,8 @@ void loadInvalidUtf8Rows() void loadNonCharactersRows() { - // Unicode has a couple of "non-characters" that one can use internally, - // but are not allowed to be used for text interchange. + // Unicode has a couple of "non-characters" that one can use internally + // These characters are allowed for text-interchange (see http://www.unicode.org/versions/corrigendum9.html) // // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF, // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and diff --git a/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp b/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp index 75b17df759..d3a8bcfd13 100644 --- a/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp +++ b/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp @@ -964,8 +964,10 @@ void tst_QUrlInternal::encodingRecode_data() addUtf8Data("utf8-string-2", "\xDF\xBF\xE0\xA0\x80""A"); addUtf8Data("utf8-string-3", "\xE0\xA0\x80\xDF\xBF..."); + QTest::newRow("encode-unicode-noncharacter") << QString(QChar(0xffff)) << F(QUrl::FullyEncoded) << "%EF%BF%BF"; + QTest::newRow("decode-unicode-noncharacter") << QString(QChar(0xffff)) << F(QUrl::PrettyDecoded) << QString::fromUtf8("\xEF\xBF\xBF"); + // special cases: stuff we can encode, but not decode - QTest::newRow("unicode-noncharacter") << QString(QChar(0xffff)) << F(QUrl::FullyEncoded) << "%EF%BF%BF"; QTest::newRow("unicode-lo-surrogate") << QString(QChar(0xD800)) << F(QUrl::FullyEncoded) << "%ED%A0%80"; QTest::newRow("unicode-hi-surrogate") << QString(QChar(0xDC00)) << F(QUrl::FullyEncoded) << "%ED%B0%80"; @@ -1011,9 +1013,6 @@ void tst_QUrlInternal::encodingRecodeInvalidUtf8_data() extern void loadInvalidUtf8Rows(); loadInvalidUtf8Rows(); - extern void loadNonCharactersRows(); - loadNonCharactersRows(); - QTest::newRow("utf8-mix-4") << QByteArray("\xE0.A2\x80"); QTest::newRow("utf8-mix-5") << QByteArray("\xE0\xA2.80"); QTest::newRow("utf8-mix-6") << QByteArray("\xE0\xA2\x33"); diff --git a/tests/auto/corelib/json/tst_qtjson.cpp b/tests/auto/corelib/json/tst_qtjson.cpp index 9dbd6414ad..c79e7273c0 100644 --- a/tests/auto/corelib/json/tst_qtjson.cpp +++ b/tests/auto/corelib/json/tst_qtjson.cpp @@ -47,7 +47,8 @@ #include "qjsondocument.h" #include -#define INVALID_UNICODE "\357\277\277" // "\uffff" +#define INVALID_UNICODE "\xCE\xBA\xE1" +#define UNICODE_NON_CHARACTER "\xEF\xBF\xBF" #define UNICODE_DJE "\320\202" // Character from the Serbian Cyrillic alphabet class tst_QtJson: public QObject @@ -1305,6 +1306,19 @@ void tst_QtJson::fromJson() QCOMPARE(array.at(0).toBool(), true); QCOMPARE(doc.toJson(), json); } + { + //regression test: test if unicode_control_characters are correctly decoded + QByteArray json = "[\n \"" UNICODE_NON_CHARACTER "\"\n]\n"; + QJsonDocument doc = QJsonDocument::fromJson(json); + QVERIFY(!doc.isEmpty()); + QCOMPARE(doc.isArray(), true); + QCOMPARE(doc.isObject(), false); + QJsonArray array = doc.array(); + QCOMPARE(array.size(), 1); + QCOMPARE(array.at(0).type(), QJsonValue::String); + QCOMPARE(array.at(0).toString(), QString::fromUtf8(UNICODE_NON_CHARACTER)); + QCOMPARE(doc.toJson(), json); + } { QByteArray json = "[]"; QJsonDocument doc = QJsonDocument::fromJson(json); @@ -1532,7 +1546,7 @@ void tst_QtJson::fromJsonErrors() QJsonDocument doc = QJsonDocument::fromJson(json, &error); QVERIFY(doc.isEmpty()); QCOMPARE(error.error, QJsonParseError::IllegalUTF8String); - QCOMPARE(error.offset, 13); + QCOMPARE(error.offset, 14); } { QJsonParseError error; @@ -1556,7 +1570,7 @@ void tst_QtJson::fromJsonErrors() QJsonDocument doc = QJsonDocument::fromJson(json, &error); QVERIFY(doc.isEmpty()); QCOMPARE(error.error, QJsonParseError::IllegalUTF8String); - QCOMPARE(error.offset, 14); + QCOMPARE(error.offset, 15); } { QJsonParseError error; @@ -1702,6 +1716,7 @@ void tst_QtJson::parseStrings() "abc\\tabc", "abc\\u0019abc", "abc" UNICODE_DJE "abc", + UNICODE_NON_CHARACTER }; int size = sizeof(strings)/sizeof(const char *); @@ -1728,7 +1743,8 @@ void tst_QtJson::parseStrings() Pairs pairs [] = { { "abc\\/abc", "abc/abc" }, { "abc\\u0402abc", "abc" UNICODE_DJE "abc" }, - { "abc\\u0065abc", "abceabc" } + { "abc\\u0065abc", "abceabc" }, + { "abc\\uFFFFabc", "abc" UNICODE_NON_CHARACTER "abc" } }; size = sizeof(pairs)/sizeof(Pairs); diff --git a/tests/auto/xml/sax/qxmlsimplereader/tst_qxmlsimplereader.cpp b/tests/auto/xml/sax/qxmlsimplereader/tst_qxmlsimplereader.cpp index d4c0ff44ca..5be43e2c8f 100644 --- a/tests/auto/xml/sax/qxmlsimplereader/tst_qxmlsimplereader.cpp +++ b/tests/auto/xml/sax/qxmlsimplereader/tst_qxmlsimplereader.cpp @@ -315,8 +315,6 @@ void tst_QXmlSimpleReader::testGoodXmlFile() QVERIFY(file.open(QIODevice::ReadOnly)); Parser parser; -// static int i = 0; -// qWarning("Test nr: " + QString::number(i)); ++i; QEXPECT_FAIL("xmldocs/valid/sa/089.xml", "", Continue); QVERIFY(parser.parseFile(&file)); @@ -326,7 +324,6 @@ void tst_QXmlSimpleReader::testGoodXmlFile() ref_stream.setCodec("UTF-8"); QString ref_file_contents = ref_stream.readAll(); - QEXPECT_FAIL("xmldocs/valid/sa/089.xml", "", Continue); QCOMPARE(parser.result(), ref_file_contents); } @@ -355,8 +352,6 @@ void tst_QXmlSimpleReader::testBadXmlFile() QVERIFY(file.open(QIODevice::ReadOnly)); Parser parser; -// static int i = 0; -// qWarning("Test nr: " + QString::number(++i)); QEXPECT_FAIL("xmldocs/not-wf/sa/030.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/031.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/032.xml", "", Continue); @@ -381,22 +376,17 @@ void tst_QXmlSimpleReader::testBadXmlFile() QEXPECT_FAIL("xmldocs/not-wf/sa/132.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/142.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/143.xml", "", Continue); + QEXPECT_FAIL("xmldocs/not-wf/sa/144.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/145.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/146.xml", "", Abort); QEXPECT_FAIL("xmldocs/not-wf/sa/160.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/162.xml", "", Continue); - QEXPECT_FAIL("xmldocs/not-wf/sa/166.xml", "", Continue); - QEXPECT_FAIL("xmldocs/not-wf/sa/167.xml", "", Continue); + QEXPECT_FAIL("xmldocs/not-wf/sa/168.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/169.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/170.xml", "", Continue); - QEXPECT_FAIL("xmldocs/not-wf/sa/171.xml", "", Abort); - QEXPECT_FAIL("xmldocs/not-wf/sa/172.xml", "", Abort); - QEXPECT_FAIL("xmldocs/not-wf/sa/173.xml", "", Abort); - QEXPECT_FAIL("xmldocs/not-wf/sa/174.xml", "", Abort); - QEXPECT_FAIL("xmldocs/not-wf/sa/175.xml", "", Abort); - QEXPECT_FAIL("xmldocs/not-wf/sa/177.xml", "", Abort); + QEXPECT_FAIL("xmldocs/not-wf/sa/180.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/181.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/182.xml", "", Continue); @@ -411,12 +401,7 @@ void tst_QXmlSimpleReader::testBadXmlFile() ref_stream.setCodec("UTF-8"); QString ref_file_contents = ref_stream.readAll(); - QEXPECT_FAIL("xmldocs/not-wf/sa/144.xml", "", Continue); QEXPECT_FAIL("xmldocs/not-wf/sa/145.xml", "", Continue); - QEXPECT_FAIL("xmldocs/not-wf/sa/146.xml", "", Continue); - QEXPECT_FAIL("xmldocs/not-wf/sa/167.xml", "", Continue); - QEXPECT_FAIL("xmldocs/not-wf/sa/166.xml", "", Continue); - QEXPECT_FAIL("xmldocs/not-wf/sa/170.xml", "", Continue); QCOMPARE(parser.result(), ref_file_contents); } diff --git a/tests/auto/xml/sax/qxmlsimplereader/xmldocs/not-wf/sa/170.xml.ref b/tests/auto/xml/sax/qxmlsimplereader/xmldocs/not-wf/sa/170.xml.ref index 0508ee88c7..eca786f688 100644 --- a/tests/auto/xml/sax/qxmlsimplereader/xmldocs/not-wf/sa/170.xml.ref +++ b/tests/auto/xml/sax/qxmlsimplereader/xmldocs/not-wf/sa/170.xml.ref @@ -1,6 +1,6 @@ setDocumentLocator(locator={columnNumber=1, lineNumber=1}) startDocument() startElement(namespaceURI="", localName="doc", qName="doc", atts=[]) - characters(ch="í»€í°€") + characters(ch="�") endElement(namespaceURI="", localName="doc", qName="doc") endDocument() diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp index 67ed4c32b9..6101cfe8fb 100644 --- a/tests/benchmarks/corelib/tools/qstring/main.cpp +++ b/tests/benchmarks/corelib/tools/qstring/main.cpp @@ -1980,16 +1980,15 @@ int fromUtf8_qt47(ushort *dst, const char *chars, int len) --need; if (!need) { // utf-8 bom composes into 0xfeff code point - bool nonCharacter; if (!headerdone && uc == 0xfeff) { // don't do anything, just skip the BOM - } else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { + } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { // surrogate pair //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); *qch++ = QChar::highSurrogate(uc); *qch++ = QChar::lowSurrogate(uc); - } else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) { - // error: overlong sequence, UTF16 surrogate or non-character + } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) { + // error: overlong sequence or UTF16 surrogate *qch++ = replacement; ++invalid; } else { @@ -2086,16 +2085,15 @@ int fromUtf8_qt47_stateless(ushort *dst, const char *chars, int len) --need; if (!need) { // utf-8 bom composes into 0xfeff code point - bool nonCharacter; if (!headerdone && uc == 0xfeff) { // don't do anything, just skip the BOM - } else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { + } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { // surrogate pair //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); *qch++ = QChar::highSurrogate(uc); *qch++ = QChar::lowSurrogate(uc); - } else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) { - // error: overlong sequence, UTF16 surrogate or non-character + } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) { + // error: overlong sequence or UTF16 surrogate *qch++ = replacement; ++invalid; } else { @@ -2214,7 +2212,7 @@ static inline void extract_utf8_multibyte(ushort *&dst, const char *&chars, qptr chars += 2; len -= 2; if (!trusted && - (ucs < 0x800 || QChar::isNonCharacter(ucs) || QChar::isSurrogate(ucs))) + (ucs < 0x800 || QChar::isSurrogate(ucs))) dst[counter] = QChar::ReplacementCharacter; else dst[counter] = ucs; @@ -2245,7 +2243,7 @@ static inline void extract_utf8_multibyte(ushort *&dst, const char *&chars, qptr // dst[counter] will correspond to chars[counter..counter+2], so adjust chars += 3; len -= 3; - if (trusted || (QChar::requiresSurrogates(ucs) && ucs <= QChar::LastValidCodePoint && !QChar::isNonCharacter(ucs))) { + if (trusted || (QChar::requiresSurrogates(ucs) && ucs <= QChar::LastValidCodePoint)) { dst[counter + 0] = QChar::highSurrogate(ucs); dst[counter + 1] = QChar::lowSurrogate(ucs); counter += 2;