From add2bf739ae96603cb919b908cbb53c00d0628cc Mon Sep 17 00:00:00 2001
From: Kurt Pattyn <pattyn.kurt@gmail.com>
Date: Sun, 6 Oct 2013 11:40:47 +0200
Subject: [PATCH] Allow non-character codes in utf8 strings

Changed the handling of non-character code points in the UTF-8 codec.
Non-character code points are now accepted in QStrings, QUrls and QJson
strings.
Unit tests were adapted accordingly.
For more info about non-character codes,
see: http://www.unicode.org/versions/corrigendum9.html

[ChangeLog][QtCore][QUtf8]
The UTF-8 codec now accepts Unicode non-character code points; these are
no longer replaced by the replacement character.

[ChangeLog][QtCore][QUrl]
QUrl now fully accepts Unicode non-character code points; they are
percent-encoded in fully-encoded form and can also be pretty-decoded.

[ChangeLog][QtCore][QJson]
The Writer and the Parser now fully accept Unicode non-character code points.

Change-Id: I77cf4f0e6210741eac8082912a0b6118eced4f77
Task-number: QTBUG-33229
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
---
 src/corelib/codecs/qutfcodec.cpp              | 13 +----
 src/corelib/io/qurlrecode.cpp                 |  2 +-
 src/corelib/json/qjsonparser.cpp              |  2 +-
 src/corelib/json/qjsonwriter.cpp              |  7 ---
 .../codecs/qtextcodec/tst_qtextcodec.cpp      | 54 ++++++++++---------
 tests/auto/corelib/codecs/utf8/tst_utf8.cpp   | 20 ++++---
 tests/auto/corelib/codecs/utf8/utf8data.cpp   |  4 +-
 .../io/qurlinternal/tst_qurlinternal.cpp      |  7 ++-
 tests/auto/corelib/json/tst_qtjson.cpp        | 24 +++++++--
 .../qxmlsimplereader/tst_qxmlsimplereader.cpp | 21 ++------
 .../xmldocs/not-wf/sa/170.xml.ref             |  2 +-
 .../benchmarks/corelib/tools/qstring/main.cpp | 18 +++----
 12 files changed, 78 insertions(+), 96 deletions(-)

diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index aeedcf1aa1..e425f8634c 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -106,14 +106,6 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
             if (u < 0x0800) {
                 *cursor++ = 0xc0 | ((uchar) (u >> 6));
             } else {
-                // is it one of the Unicode non-characters?
-                if (QChar::isNonCharacter(u)) {
-                    *cursor++ = replacement;
-                    ++ch;
-                    ++invalid;
-                    continue;
-                }
-
                 if (QChar::requiresSurrogates(u)) {
                     *cursor++ = 0xf0 | ((uchar) (u >> 18));
                     *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
@@ -180,15 +172,14 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
                 --need;
                 if (!need) {
                     // utf-8 bom composes into 0xfeff code point
-                    bool nonCharacter;
                     if (!headerdone && uc == 0xfeff) {
                         // don't do anything, just skip the BOM
-                    } else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
+                    } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
                         // surrogate pair
                         Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
                         *qch++ = QChar::highSurrogate(uc);
                         *qch++ = QChar::lowSurrogate(uc);
-                    } else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) {
+                    } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
                         // error: overlong sequence, UTF16 surrogate or non-character
                         *qch++ = replacement;
                         ++invalid;
diff --git a/src/corelib/io/qurlrecode.cpp b/src/corelib/io/qurlrecode.cpp
index 7e77b9c251..ba1a77744c 100644
--- a/src/corelib/io/qurlrecode.cpp
+++ b/src/corelib/io/qurlrecode.cpp
@@ -304,7 +304,7 @@ static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *b
     // we've decoded something; safety-check it
     if (uc < min_uc)
         return false;
-    if (QChar::isSurrogate(uc) || QChar::isNonCharacter(uc) || uc > QChar::LastValidCodePoint)
+    if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint)
         return false;
 
     if (!QChar::requiresSurrogates(uc)) {
diff --git a/src/corelib/json/qjsonparser.cpp b/src/corelib/json/qjsonparser.cpp
index 8721f06064..516c53775c 100644
--- a/src/corelib/json/qjsonparser.cpp
+++ b/src/corelib/json/qjsonparser.cpp
@@ -853,7 +853,7 @@ static inline bool scanUtf8Char(const char *&json, const char *end, uint *result
         uc = (uc << 6) | (ch & 0x3f);
     }
 
-    if (uc < min_uc || QChar::isNonCharacter(uc) ||
+    if (uc < min_uc ||
         QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
         return false;
     }
diff --git a/src/corelib/json/qjsonwriter.cpp b/src/corelib/json/qjsonwriter.cpp
index 8426b351f6..86cca4bb26 100644
--- a/src/corelib/json/qjsonwriter.cpp
+++ b/src/corelib/json/qjsonwriter.cpp
@@ -138,13 +138,6 @@ static QByteArray escapedString(const QString &s)
             if (u < 0x0800) {
                 *cursor++ = 0xc0 | ((uchar) (u >> 6));
             } else {
-                // is it one of the Unicode non-characters?
-                if (QChar::isNonCharacter(u)) {
-                    *cursor++ = replacement;
-                    ++ch;
-                    continue;
-                }
-
                 if (QChar::requiresSurrogates(u)) {
                     *cursor++ = 0xf0 | ((uchar) (u >> 18));
                     *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
diff --git a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
index dd557b8d21..8e1b3cf3b2 100644
--- a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
+++ b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
@@ -66,9 +66,9 @@ private slots:
     void codecForLocale();
 
     void asciiToIscii() const;
-    void flagCodepointFFFF() const;
+    void nonFlaggedCodepointFFFF() const;
     void flagF7808080() const;
-    void flagEFBFBF() const;
+    void nonFlaggedEFBFBF() const;
     void decode0D() const;
     void aliasForUTF16() const;
     void mibForTSCII() const;
@@ -409,9 +409,9 @@ void tst_QTextCodec::asciiToIscii() const
     }
 }
 
-void tst_QTextCodec::flagCodepointFFFF() const
+void tst_QTextCodec::nonFlaggedCodepointFFFF() const
 {
-    // This is an invalid Unicode codepoint.
+    // Check that the non-character code point U+FFFF (UTF-8 bytes EF BF BF) is not flagged
     const QChar ch(0xFFFF);
     QString input(ch);
 
@@ -419,12 +419,11 @@ void tst_QTextCodec::flagCodepointFFFF() const
     QVERIFY(codec);
 
     const QByteArray asDecoded(codec->fromUnicode(input));
-    QCOMPARE(asDecoded, QByteArray("?"));
+    QCOMPARE(asDecoded, QByteArray("\357\277\277"));
 
     QByteArray ffff("\357\277\277");
     QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull);
-    QVERIFY(codec->toUnicode(ffff.constData(), ffff.length(), &state) == QChar(0));
-    QVERIFY(codec->toUnicode(ffff) == QChar(0xfffd));
+    QVERIFY(codec->toUnicode(ffff.constData(), ffff.length(), &state) == QByteArray::fromHex("EFBFBF"));
 }
 
 void tst_QTextCodec::flagF7808080() const
@@ -460,13 +459,16 @@ void tst_QTextCodec::flagF7808080() const
     QVERIFY(codec->toUnicode(input.constData(), input.length(), &state) == QChar(0));
 }
 
-void tst_QTextCodec::flagEFBFBF() const
+void tst_QTextCodec::nonFlaggedEFBFBF() const
 {
-    QByteArray invalidInput;
-    invalidInput.resize(3);
-    invalidInput[0] = char(0xEF);
-    invalidInput[1] = char(0xBF);
-    invalidInput[2] = char(0xBF);
+    /* Check that the codec does NOT flag EFBFBF.
+     * This is a regression test; see QTBUG-33229
+     */
+    QByteArray validInput;
+    validInput.resize(3);
+    validInput[0] = char(0xEF);
+    validInput[1] = char(0xBF);
+    validInput[2] = char(0xBF);
 
     const QTextCodec *const codec = QTextCodec::codecForMib(106); // UTF-8
     QVERIFY(codec);
@@ -474,21 +476,20 @@ void tst_QTextCodec::flagEFBFBF() const
     {
         //QVERIFY(!codec->canEncode(QChar(0xFFFF)));
         QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull);
-        QVERIFY(codec->toUnicode(invalidInput.constData(), invalidInput.length(), &state) == QChar(0));
+        QVERIFY(codec->toUnicode(validInput.constData(), validInput.length(), &state) == QByteArray::fromHex("EFBFBF"));
 
         QByteArray start("<?pi ");
-        start.append(invalidInput);
+        start.append(validInput);
         start.append("?>");
     }
 
-    /* When 0xEFBFBF is preceded by what seems to be an arbitrary character,
-     * QTextCodec fails to flag it. */
+    // Check that 0xEFBFBF is correctly decoded when preceded by an arbitrary character
     {
         QByteArray start("B");
-        start.append(invalidInput);
+        start.append(validInput);
 
         QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull);
-        QVERIFY(codec->toUnicode(start.constData(), start.length(), &state) == QString::fromLatin1("B\0", 2));
+        QVERIFY(codec->toUnicode(start.constData(), start.length(), &state) == QByteArray("B").append(QByteArray::fromHex("EFBFBF")));
     }
 }
 
@@ -674,13 +675,12 @@ void tst_QTextCodec::utf8Codec_data()
     str = QChar(0x7ff);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.2") << utf8 << str << -1;
 
-    // 2.2.3 U+000FFFF
+    // 2.2.3 U+000FFFF - non-character code
     utf8.clear();
     utf8 += char(0xef);
     utf8 += char(0xbf);
     utf8 += char(0xbf);
-    str.clear();
-    str += QChar::ReplacementCharacter;
+    str = QString::fromUtf8(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.3") << utf8 << str << -1;
 
     // 2.2.4 U+001FFFFF
@@ -1535,20 +1535,22 @@ void tst_QTextCodec::utf8Codec_data()
     str += QChar(QChar::ReplacementCharacter);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.8") << utf8 << str << -1;
 
-    // 5.3.1
+    // 5.3.1 - non-character code
     utf8.clear();
     utf8 += char(0xef);
     utf8 += char(0xbf);
     utf8 += char(0xbe);
-    str = QChar(QChar::ReplacementCharacter);
+    //str = QChar(QChar::ReplacementCharacter);
+    str = QString::fromUtf8(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.1") << utf8 << str << -1;
 
-    // 5.3.2
+    // 5.3.2 - non-character code
     utf8.clear();
     utf8 += char(0xef);
     utf8 += char(0xbf);
     utf8 += char(0xbf);
-    str = QChar(QChar::ReplacementCharacter);
+    //str = QChar(QChar::ReplacementCharacter);
+    str = QString::fromUtf8(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.2") << utf8 << str << -1;
 }
 
diff --git a/tests/auto/corelib/codecs/utf8/tst_utf8.cpp b/tests/auto/corelib/codecs/utf8/tst_utf8.cpp
index 99147f3aff..e18f6f73b9 100644
--- a/tests/auto/corelib/codecs/utf8/tst_utf8.cpp
+++ b/tests/auto/corelib/codecs/utf8/tst_utf8.cpp
@@ -233,8 +233,9 @@ void tst_Utf8::nonCharacters_data()
     QTest::addColumn<QByteArray>("utf8");
     QTest::addColumn<QString>("utf16");
 
-    // Unicode has a couple of "non-characters" that one can use internally,
-    // but are not allowed to be used for text interchange.
+    // Unicode has a couple of "non-characters" that one can use internally
+    // These characters may be used for interchange;
+    // see: http://www.unicode.org/versions/corrigendum9.html
     //
     // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF,
     // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and
@@ -279,20 +280,17 @@ void tst_Utf8::nonCharacters()
     decoder->toUnicode(utf8);
 
     // Only enforce correctness on our UTF-8 decoder
-    // The system's UTF-8 codec is sometimes buggy
-    //  GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8
-    //  OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF
     if (!useLocale)
-        QVERIFY(decoder->hasFailure());
-    else if (!decoder->hasFailure())
-        qWarning("System codec does not report failure when it should. Should report bug upstream.");
+        QVERIFY(!decoder->hasFailure());
+    else if (decoder->hasFailure())
+        qWarning("System codec reports failure when it shouldn't. Should report bug upstream.");
 
     QSharedPointer<QTextEncoder> encoder(codec->makeEncoder());
     encoder->fromUnicode(utf16);
     if (!useLocale)
-        QVERIFY(encoder->hasFailure());
-    else if (!encoder->hasFailure())
-        qWarning("System codec does not report failure when it should. Should report bug upstream.");
+        QVERIFY(!encoder->hasFailure());
+    else if (encoder->hasFailure())
+        qWarning("System codec reports failure when it shouldn't. Should report bug upstream.");
 }
 
 QTEST_MAIN(tst_Utf8)
diff --git a/tests/auto/corelib/codecs/utf8/utf8data.cpp b/tests/auto/corelib/codecs/utf8/utf8data.cpp
index 2516cc9734..a41b0772e6 100644
--- a/tests/auto/corelib/codecs/utf8/utf8data.cpp
+++ b/tests/auto/corelib/codecs/utf8/utf8data.cpp
@@ -129,8 +129,8 @@ void loadInvalidUtf8Rows()
 
 void loadNonCharactersRows()
 {
-    // Unicode has a couple of "non-characters" that one can use internally,
-    // but are not allowed to be used for text interchange.
+    // Unicode has a couple of "non-characters" that one can use internally
+    // These characters are allowed for text-interchange (see http://www.unicode.org/versions/corrigendum9.html)
     //
     // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF,
     // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and
diff --git a/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp b/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp
index 75b17df759..d3a8bcfd13 100644
--- a/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp
+++ b/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp
@@ -964,8 +964,10 @@ void tst_QUrlInternal::encodingRecode_data()
     addUtf8Data("utf8-string-2", "\xDF\xBF\xE0\xA0\x80""A");
     addUtf8Data("utf8-string-3", "\xE0\xA0\x80\xDF\xBF...");
 
+    QTest::newRow("encode-unicode-noncharacter") << QString(QChar(0xffff)) << F(QUrl::FullyEncoded) << "%EF%BF%BF";
+    QTest::newRow("decode-unicode-noncharacter") << QString(QChar(0xffff)) << F(QUrl::PrettyDecoded) << QString::fromUtf8("\xEF\xBF\xBF");
+
     // special cases: stuff we can encode, but not decode
-    QTest::newRow("unicode-noncharacter") << QString(QChar(0xffff)) << F(QUrl::FullyEncoded) << "%EF%BF%BF";
     QTest::newRow("unicode-lo-surrogate") << QString(QChar(0xD800)) << F(QUrl::FullyEncoded) << "%ED%A0%80";
     QTest::newRow("unicode-hi-surrogate") << QString(QChar(0xDC00)) << F(QUrl::FullyEncoded) << "%ED%B0%80";
 
@@ -1011,9 +1013,6 @@ void tst_QUrlInternal::encodingRecodeInvalidUtf8_data()
     extern void loadInvalidUtf8Rows();
     loadInvalidUtf8Rows();
 
-    extern void loadNonCharactersRows();
-    loadNonCharactersRows();
-
     QTest::newRow("utf8-mix-4") << QByteArray("\xE0.A2\x80");
     QTest::newRow("utf8-mix-5") << QByteArray("\xE0\xA2.80");
     QTest::newRow("utf8-mix-6") << QByteArray("\xE0\xA2\x33");
diff --git a/tests/auto/corelib/json/tst_qtjson.cpp b/tests/auto/corelib/json/tst_qtjson.cpp
index 9dbd6414ad..c79e7273c0 100644
--- a/tests/auto/corelib/json/tst_qtjson.cpp
+++ b/tests/auto/corelib/json/tst_qtjson.cpp
@@ -47,7 +47,8 @@
 #include "qjsondocument.h"
 #include <limits>
 
-#define INVALID_UNICODE "\357\277\277" // "\uffff"
+#define INVALID_UNICODE "\xCE\xBA\xE1"
+#define UNICODE_NON_CHARACTER "\xEF\xBF\xBF"
 #define UNICODE_DJE "\320\202" // Character from the Serbian Cyrillic alphabet
 
 class tst_QtJson: public QObject
@@ -1305,6 +1306,19 @@ void tst_QtJson::fromJson()
         QCOMPARE(array.at(0).toBool(), true);
         QCOMPARE(doc.toJson(), json);
     }
+    {
+        // regression test: check that Unicode non-characters are correctly decoded
+        QByteArray json = "[\n    \"" UNICODE_NON_CHARACTER "\"\n]\n";
+        QJsonDocument doc = QJsonDocument::fromJson(json);
+        QVERIFY(!doc.isEmpty());
+        QCOMPARE(doc.isArray(), true);
+        QCOMPARE(doc.isObject(), false);
+        QJsonArray array = doc.array();
+        QCOMPARE(array.size(), 1);
+        QCOMPARE(array.at(0).type(), QJsonValue::String);
+        QCOMPARE(array.at(0).toString(), QString::fromUtf8(UNICODE_NON_CHARACTER));
+        QCOMPARE(doc.toJson(), json);
+    }
     {
         QByteArray json = "[]";
         QJsonDocument doc = QJsonDocument::fromJson(json);
@@ -1532,7 +1546,7 @@ void tst_QtJson::fromJsonErrors()
         QJsonDocument doc = QJsonDocument::fromJson(json, &error);
         QVERIFY(doc.isEmpty());
         QCOMPARE(error.error, QJsonParseError::IllegalUTF8String);
-        QCOMPARE(error.offset, 13);
+        QCOMPARE(error.offset, 14);
     }
     {
         QJsonParseError error;
@@ -1556,7 +1570,7 @@ void tst_QtJson::fromJsonErrors()
         QJsonDocument doc = QJsonDocument::fromJson(json, &error);
         QVERIFY(doc.isEmpty());
         QCOMPARE(error.error, QJsonParseError::IllegalUTF8String);
-        QCOMPARE(error.offset, 14);
+        QCOMPARE(error.offset, 15);
     }
     {
         QJsonParseError error;
@@ -1702,6 +1716,7 @@ void tst_QtJson::parseStrings()
         "abc\\tabc",
         "abc\\u0019abc",
         "abc" UNICODE_DJE "abc",
+        UNICODE_NON_CHARACTER
     };
     int size = sizeof(strings)/sizeof(const char *);
 
@@ -1728,7 +1743,8 @@ void tst_QtJson::parseStrings()
     Pairs pairs [] = {
         { "abc\\/abc", "abc/abc" },
         { "abc\\u0402abc", "abc" UNICODE_DJE "abc" },
-        { "abc\\u0065abc", "abceabc" }
+        { "abc\\u0065abc", "abceabc" },
+        { "abc\\uFFFFabc", "abc" UNICODE_NON_CHARACTER "abc" }
     };
     size = sizeof(pairs)/sizeof(Pairs);
 
diff --git a/tests/auto/xml/sax/qxmlsimplereader/tst_qxmlsimplereader.cpp b/tests/auto/xml/sax/qxmlsimplereader/tst_qxmlsimplereader.cpp
index d4c0ff44ca..5be43e2c8f 100644
--- a/tests/auto/xml/sax/qxmlsimplereader/tst_qxmlsimplereader.cpp
+++ b/tests/auto/xml/sax/qxmlsimplereader/tst_qxmlsimplereader.cpp
@@ -315,8 +315,6 @@ void tst_QXmlSimpleReader::testGoodXmlFile()
     QVERIFY(file.open(QIODevice::ReadOnly));
     Parser parser;
 
-//    static int i = 0;
-//    qWarning("Test nr: " + QString::number(i)); ++i;
     QEXPECT_FAIL("xmldocs/valid/sa/089.xml", "", Continue);
     QVERIFY(parser.parseFile(&file));
 
@@ -326,7 +324,6 @@ void tst_QXmlSimpleReader::testGoodXmlFile()
     ref_stream.setCodec("UTF-8");
     QString ref_file_contents = ref_stream.readAll();
 
-    QEXPECT_FAIL("xmldocs/valid/sa/089.xml", "", Continue);
     QCOMPARE(parser.result(), ref_file_contents);
 }
 
@@ -355,8 +352,6 @@ void tst_QXmlSimpleReader::testBadXmlFile()
     QVERIFY(file.open(QIODevice::ReadOnly));
     Parser parser;
 
-//    static int i = 0;
-//    qWarning("Test nr: " + QString::number(++i));
     QEXPECT_FAIL("xmldocs/not-wf/sa/030.xml", "", Continue);
     QEXPECT_FAIL("xmldocs/not-wf/sa/031.xml", "", Continue);
     QEXPECT_FAIL("xmldocs/not-wf/sa/032.xml", "", Continue);
@@ -381,22 +376,17 @@ void tst_QXmlSimpleReader::testBadXmlFile()
     QEXPECT_FAIL("xmldocs/not-wf/sa/132.xml", "", Continue);
     QEXPECT_FAIL("xmldocs/not-wf/sa/142.xml", "", Continue);
     QEXPECT_FAIL("xmldocs/not-wf/sa/143.xml", "", Continue);
+
     QEXPECT_FAIL("xmldocs/not-wf/sa/144.xml", "", Continue);
     QEXPECT_FAIL("xmldocs/not-wf/sa/145.xml", "", Continue);
     QEXPECT_FAIL("xmldocs/not-wf/sa/146.xml", "", Abort);
     QEXPECT_FAIL("xmldocs/not-wf/sa/160.xml", "", Continue);
     QEXPECT_FAIL("xmldocs/not-wf/sa/162.xml", "", Continue);
-    QEXPECT_FAIL("xmldocs/not-wf/sa/166.xml", "", Continue);
-    QEXPECT_FAIL("xmldocs/not-wf/sa/167.xml", "", Continue);
+
     QEXPECT_FAIL("xmldocs/not-wf/sa/168.xml", "", Continue);
     QEXPECT_FAIL("xmldocs/not-wf/sa/169.xml", "", Continue);
     QEXPECT_FAIL("xmldocs/not-wf/sa/170.xml", "", Continue);
-    QEXPECT_FAIL("xmldocs/not-wf/sa/171.xml", "", Abort);
-    QEXPECT_FAIL("xmldocs/not-wf/sa/172.xml", "", Abort);
-    QEXPECT_FAIL("xmldocs/not-wf/sa/173.xml", "", Abort);
-    QEXPECT_FAIL("xmldocs/not-wf/sa/174.xml", "", Abort);
-    QEXPECT_FAIL("xmldocs/not-wf/sa/175.xml", "", Abort);
-    QEXPECT_FAIL("xmldocs/not-wf/sa/177.xml", "", Abort);
+
     QEXPECT_FAIL("xmldocs/not-wf/sa/180.xml", "", Continue);
     QEXPECT_FAIL("xmldocs/not-wf/sa/181.xml", "", Continue);
     QEXPECT_FAIL("xmldocs/not-wf/sa/182.xml", "", Continue);
@@ -411,12 +401,7 @@ void tst_QXmlSimpleReader::testBadXmlFile()
     ref_stream.setCodec("UTF-8");
     QString ref_file_contents = ref_stream.readAll();
 
-    QEXPECT_FAIL("xmldocs/not-wf/sa/144.xml", "", Continue);
     QEXPECT_FAIL("xmldocs/not-wf/sa/145.xml", "", Continue);
-    QEXPECT_FAIL("xmldocs/not-wf/sa/146.xml", "", Continue);
-    QEXPECT_FAIL("xmldocs/not-wf/sa/167.xml", "", Continue);
-    QEXPECT_FAIL("xmldocs/not-wf/sa/166.xml", "", Continue);
-    QEXPECT_FAIL("xmldocs/not-wf/sa/170.xml", "", Continue);
 
     QCOMPARE(parser.result(), ref_file_contents);
 }
diff --git a/tests/auto/xml/sax/qxmlsimplereader/xmldocs/not-wf/sa/170.xml.ref b/tests/auto/xml/sax/qxmlsimplereader/xmldocs/not-wf/sa/170.xml.ref
index 0508ee88c7..eca786f688 100644
--- a/tests/auto/xml/sax/qxmlsimplereader/xmldocs/not-wf/sa/170.xml.ref
+++ b/tests/auto/xml/sax/qxmlsimplereader/xmldocs/not-wf/sa/170.xml.ref
@@ -1,6 +1,6 @@
 setDocumentLocator(locator={columnNumber=1, lineNumber=1})
 startDocument()
    startElement(namespaceURI="", localName="doc", qName="doc", atts=[])
-      characters(ch="������")
+      characters(ch="�")
    endElement(namespaceURI="", localName="doc", qName="doc")
 endDocument()
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp
index 67ed4c32b9..6101cfe8fb 100644
--- a/tests/benchmarks/corelib/tools/qstring/main.cpp
+++ b/tests/benchmarks/corelib/tools/qstring/main.cpp
@@ -1980,16 +1980,15 @@ int fromUtf8_qt47(ushort *dst, const char *chars, int len)
                 --need;
                 if (!need) {
                     // utf-8 bom composes into 0xfeff code point
-                    bool nonCharacter;
                     if (!headerdone && uc == 0xfeff) {
                         // don't do anything, just skip the BOM
-                    } else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
+                    } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
                         // surrogate pair
                         //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
                         *qch++ = QChar::highSurrogate(uc);
                         *qch++ = QChar::lowSurrogate(uc);
-                    } else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) {
-                        // error: overlong sequence, UTF16 surrogate or non-character
+                    } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
+                        // error: overlong sequence or UTF16 surrogate
                         *qch++ = replacement;
                         ++invalid;
                     } else {
@@ -2086,16 +2085,15 @@ int fromUtf8_qt47_stateless(ushort *dst, const char *chars, int len)
                 --need;
                 if (!need) {
                     // utf-8 bom composes into 0xfeff code point
-                    bool nonCharacter;
                     if (!headerdone && uc == 0xfeff) {
                         // don't do anything, just skip the BOM
-                    } else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
+                    } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
                         // surrogate pair
                         //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
                         *qch++ = QChar::highSurrogate(uc);
                         *qch++ = QChar::lowSurrogate(uc);
-                    } else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) {
-                        // error: overlong sequence, UTF16 surrogate or non-character
+                    } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
+                        // error: overlong sequence or UTF16 surrogate
                         *qch++ = replacement;
                         ++invalid;
                     } else {
@@ -2214,7 +2212,7 @@ static inline void extract_utf8_multibyte(ushort *&dst, const char *&chars, qptr
         chars += 2;
         len -= 2;
         if (!trusted &&
-            (ucs < 0x800 || QChar::isNonCharacter(ucs) || QChar::isSurrogate(ucs)))
+            (ucs < 0x800 || QChar::isSurrogate(ucs)))
             dst[counter] = QChar::ReplacementCharacter;
         else
             dst[counter] = ucs;
@@ -2245,7 +2243,7 @@ static inline void extract_utf8_multibyte(ushort *&dst, const char *&chars, qptr
         // dst[counter] will correspond to chars[counter..counter+2], so adjust
         chars += 3;
         len -= 3;
-        if (trusted || (QChar::requiresSurrogates(ucs) && ucs <= QChar::LastValidCodePoint && !QChar::isNonCharacter(ucs))) {
+        if (trusted || (QChar::requiresSurrogates(ucs) && ucs <= QChar::LastValidCodePoint)) {
             dst[counter + 0] = QChar::highSurrogate(ucs);
             dst[counter + 1] = QChar::lowSurrogate(ucs);
             counter += 2;