Remove the tolerant parsing function and make the recoder tolerant

The reason for this change is that the strict parser made little sense to exist. What would the recoder do if it was passed an invalid string? I believe that the tolerant recoder is more efficient than the correcting code followed by the strict recoder. This makes the recoder more complex and probably a little less efficient, but it's better in the common case (tolerant that doesn't need fixes) and in the worst case (needs fixes). Change-Id: I68a0c9fda6765de05914cbd6ba7d3cea560a7cd6 Reviewed-by: Lars Knoll <lars.knoll@nokia.com>
2011-09-06 20:36:28 +02:00 · 2011-09-06 20:36:28 +02:00 · 73e16b15a6
commit 73e16b15a6
parent 6028efa3ff
2 changed files with 110 additions and 115 deletions
--- a/src/corelib/io/qurlrecode.cpp
+++ b/src/corelib/io/qurlrecode.cpp
@ -133,6 +133,18 @@ static inline ushort decodeNibble(ushort c)
           c >= 'A' ? c - 'A' + 0xA : c - '0';
 }

+// if the sequence at input is 2*HEXDIG, returns its decoding
+// returns -1 if it isn't.
+// assumes that the range has been checked already
+static inline ushort decodePercentEncoding(const ushort *input)
+{
+    ushort c1 = input[0];
+    ushort c2 = input[1];
+    if (!isHex(c1) || !isHex(c2))
+        return ushort(-1);
+    return decodeNibble(c1) << 4 | decodeNibble(c2);
+}
+
 static inline ushort encodeNibble(ushort c)
 {
    static const uchar hexnumbers[] = "0123456789ABCDEF";
@ -170,16 +182,15 @@ static inline bool isUnicodeNonCharacter(uint ucs4)
 // returns true if we performed an UTF-8 decoding
 static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&input, const ushort *end, ushort decoded)
 {
+    int charsNeeded;
+    uint min_uc;
+    uint uc;
+
    if (decoded <= 0xC1) {
        // an UTF-8 first character must be at least 0xC0
        // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
        return false;
-    }
-
-    int charsNeeded;
-    uint min_uc;
-    uint uc;
-    if (decoded < 0xe0) {
+    } else if (decoded < 0xe0) {
        charsNeeded = 1;
        min_uc = 0x80;
        uc = decoded & 0x1f;
@ -194,7 +205,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
    } else {
        // the last Unicode character is U+10FFFF
        // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
-        // therefore, a byte outside the range 0xC0..0xF4 is not the UTF-8 first byte
+        // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
        return false;
    }

@ -206,7 +217,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
        return false;

    // first continuation character
-    decoded = (decodeNibble(input[3]) << 4) | decodeNibble(input[4]);
+    decoded = decodePercentEncoding(input + 3);
    if ((decoded & 0xc0) != 0x80)
        return false;
    uc <<= 6;
@ -217,7 +228,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
            return false;

        // second continuation character
-        decoded = (decodeNibble(input[6]) << 4) | decodeNibble(input[7]);
+        decoded = decodePercentEncoding(input + 6);
        if ((decoded & 0xc0) != 0x80)
            return false;
        uc <<= 6;
@ -228,7 +239,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
                return false;

            // third continuation character
-            decoded = (decodeNibble(input[9]) << 4) | decodeNibble(input[10]);
+            decoded = decodePercentEncoding(input + 9);
            if ((decoded & 0xc0) != 0x80)
                return false;
            uc <<= 6;
@ -348,72 +359,82 @@ static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort
    *output++ = encodeNibble(c & 0xf);
 }

-Q_AUTOTEST_EXPORT QString
-qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding,
-             const uchar *tableModifications)
+static QString recode(const QString &component, QUrl::ComponentFormattingOptions encoding,
+                      const uchar *actionTable, bool retryBadEncoding)
 {
-    uchar actionTable[sizeof defaultActionTable];
-    memcpy(actionTable, defaultActionTable, sizeof actionTable);
-    if (encoding & QUrl::DecodeSpaces)
-        actionTable[0] = DecodeCharacter; // decode
-
-    if (tableModifications) {
-        for (const ushort *p = tableModifications; *p; ++p)
-            actionTable[uchar(*p) - ' '] = *p >> 8;
-    }
-
    QString result = component;
    const ushort *input = reinterpret_cast<const ushort *>(component.constData());
    const ushort * const end = input + component.length();
    ushort *output = 0;

    while (input != end) {
-        register ushort c = *input++;
-        register ushort decoded;
-        if (c == '%') {
-            // our input is always valid, so there are two hex characters for us to read here
-            decoded = (decodeNibble(input[0]) << 4) | decodeNibble(input[1]);
+        register ushort c;
+        EncodingAction action;
+
+        // try a run where no change is necessary
+        while (input != end) {
+            c = *input++;
+            if (c < 0x20 || c >= 0x80) // also: (c - 0x20 < 0x60U)
+                goto non_trivial;
+            action = EncodingAction(actionTable[c - ' ']);
+            if (action == EncodeCharacter)
+                goto non_trivial;
+            if (output)
+                *output++ = c;
+        }
+        break;
+
+non_trivial:
+        register uint decoded;
+        if (c == '%' && retryBadEncoding) {
+            // always write "%25"
+            ensureDetached(result, output, input, end);
+            *output++ = '%';
+            *output++ = '2';
+            *output++ = '5';
+            continue;
+        } else if (c == '%') {
+            // check if the input is valid
+            if (input + 1 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) {
+                // not valid, retry
+                result.clear();
+                return recode(component, encoding, actionTable, true);
+            }
+
+            if (decoded >= 0x80) {
+                // decode the UTF-8 sequence
+                if (encoding & QUrl::DecodeUnicode &&
+                        encodedUtf8ToUcs4(result, output, input, end, decoded))
+                    continue;
+
+                // decoding the encoded UTF-8 failed
+                action = LeaveCharacter;
+            } else if (decoded >= 0x20) {
+                action = EncodingAction(actionTable[decoded - ' ']);
+            }
        } else {
            decoded = c;
-        }
-
-        EncodingAction action;
-        if (decoded < 0x20) {
-            // always encode control characters
-            action = EncodeCharacter;
-        } else if (decoded < 0x80) {
-            // use the table
-            action = EncodingAction(actionTable[decoded - ' ']);
-        } else {
-            // non-ASCII
-            bool decodeUnicode = encoding & QUrl::DecodeUnicode;
-
-            // should we leave it like this?
-            if ((c != '%' && decodeUnicode) || (c == '%' && !decodeUnicode)) {
-                action = LeaveCharacter;
-            } else if (decodeUnicode) {
-                // c == '%': decode the UTF-8 sequence
-                if (encodedUtf8ToUcs4(result, output, input, end, decoded))
-                    continue;
-                action = LeaveCharacter;
-            } else {
-                // c != '%': encode the UTF-8 sequence
+            if (decoded >= 0x80 && (encoding & QUrl::DecodeUnicode) == 0) {
+                // encode the UTF-8 sequence
                unicodeToEncodedUtf8(result, output, input, end, decoded);
                continue;
+            } else if (decoded >= 0x80) {
+                if (output)
+                    *output++ = c;
+                continue;
            }
        }

+        if (decoded < 0x20)
+            action = EncodeCharacter;
+
        // there are six possibilities:
        //  current \ action  | DecodeCharacter | LeaveCharacter | EncodeCharacter
        //      decoded       |    1:leave      |    2:leave     |    3:encode
        //      encoded       |    4:decode     |    5:leave     |    6:leave
+        // cases 1 and 2 were handled before this section

-        if (c != '%' && (action == LeaveCharacter || action == DecodeCharacter)) {
-            // cases 1 and 2: it's decoded and we're leaving it as is
-            // there's always enough memory allocated for a single character
-            if (output)
-                *output++ = c;
-        } else if (c == '%' && (action == LeaveCharacter || action == EncodeCharacter)) {
+        if (c == '%' && action != DecodeCharacter) {
            // cases 5 and 6: it's encoded and we're leaving it as it is
            // except we're pedantic and we'll uppercase the hex
            if (output || !isUpperHex(input[0]) || !isUpperHex(input[1])) {
@ -442,63 +463,31 @@ qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding
 }

 Q_AUTOTEST_EXPORT QString
-qt_tolerantParsePercentEncoding(const QString &url)
+qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding,
+             const ushort *tableModifications)
 {
-    // are there any '%'
-    int firstPercent = url.indexOf(QLatin1Char('%'));
-    if (firstPercent == -1) {
-        // none found, the string is fine
-        return url;
+    uchar actionTable[sizeof defaultActionTable];
+    if (encoding & QUrl::DecodeAllDelimiters) {
+        // reset the table
+        memset(actionTable, DecodeCharacter, sizeof actionTable);
+        if (!(encoding & QUrl::DecodeSpaces))
+            actionTable[0] = EncodeCharacter;
+
+        // these are always encoded
+        actionTable['%' - ' '] = EncodeCharacter;
+        actionTable[0x7F - ' '] = EncodeCharacter;
+    } else {
+        memcpy(actionTable, defaultActionTable, sizeof actionTable);
+        if (encoding & QUrl::DecodeSpaces)
+            actionTable[0] = DecodeCharacter; // decode
    }

-    // are there any invalid percents?
-    int nextPercent = firstPercent;
-    int percentCount = 0;
-
-    {
-        int len = url.length();
-        bool ok = true;
-        do {
-            ++percentCount;
-            if (nextPercent + 2 >= len ||
-                    !isHex(url.at(nextPercent + 1).unicode()) ||
-                    !isHex(url.at(nextPercent + 2).unicode())) {
-                ok = false;
-            }
-
-            nextPercent = url.indexOf(QLatin1Char('%'), nextPercent + 1);
-        } while (nextPercent != -1);
-
-        if (ok)
-            return url;
+    if (tableModifications) {
+        for (const ushort *p = tableModifications; *p; ++p)
+            actionTable[uchar(*p) - ' '] = *p >> 8;
    }

-    // we've found at least one invalid percent
-    // that means all of them are invalid
-    QString corrected(url.size() + percentCount * 2, Qt::Uninitialized);
-    ushort *output = reinterpret_cast<ushort *>(corrected.data());
-    const ushort *input = reinterpret_cast<const ushort *>(url.constData());
-    for (int i = 0; i <= firstPercent; ++i)
-        output[i] = input[i];
-
-    const ushort *const end = input + url.length();
-    output += firstPercent + 1;
-    input += firstPercent + 1;
-
-    // we've copied up to the first percent
-    // correct this one and all others
-    *output++ = '2';
-    *output++ = '5';
-    while (input != end) {
-        // copy verbatim until the next percent, inclusive
-        *output++ = *input;
-        if (*input == '%') {
-            *output++ = '2';
-            *output++ = '5';
-        }
-        ++input;
-    }
-    return corrected;
+    return recode(component, encoding, actionTable, false);
 }

 QT_END_NAMESPACE
--- a/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp
+++ b/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp
@ -50,7 +50,6 @@ Q_CORE_EXPORT extern void qt_nameprep(QString *source, int from);
 Q_CORE_EXPORT extern bool qt_check_std3rules(const QChar *, int);
 Q_CORE_EXPORT void qt_punycodeEncoder(const QChar *s, int ucLength, QString *output);
 Q_CORE_EXPORT QString qt_punycodeDecoder(const QString &pc);
-Q_CORE_EXPORT QString qt_tolerantParsePercentEncoding(const QString &url);
 Q_CORE_EXPORT QString qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding,
                                   const ushort *tableModifications = 0);
 QT_END_NAMESPACE
@ -791,6 +790,17 @@ void tst_QUrlInternal::correctEncodedMistakes_data()

    // three percents, one invalid
    QTest::newRow("%01%02%3") << "%01%02%3" << "%2501%2502%253";
+
+    // now mix bad percents with Unicode decoding
+    QTest::newRow("%C2%") << "%C2%" << "%25C2%25";
+    QTest::newRow("%C2%A") << "%C2%A" << "%25C2%25A";
+    QTest::newRow("%C2%Az") << "%C2%Az" << "%25C2%25Az";
+    QTest::newRow("%E2%A0%") << "%E2%A0%" << "%25E2%25A0%25";
+    QTest::newRow("%E2%A0%A") << "%E2%A0%A" << "%25E2%25A0%25A";
+    QTest::newRow("%E2%A0%Az") << "%E2%A0%Az" << "%25E2%25A0%25Az";
+    QTest::newRow("%F2%A0%A0%") << "%F2%A0%A0%" << "%25F2%25A0%25A0%25";
+    QTest::newRow("%F2%A0%A0%A") << "%F2%A0%A0%A" << "%25F2%25A0%25A0%25A";
+    QTest::newRow("%F2%A0%A0%Az") << "%F2%A0%A0%Az" << "%25F2%25A0%25A0%25Az";
 }

 void tst_QUrlInternal::correctEncodedMistakes()
@ -798,7 +808,7 @@ void tst_QUrlInternal::correctEncodedMistakes()
    QFETCH(QString, input);
    QFETCH(QString, expected);

-    QString output = qt_tolerantParsePercentEncoding(input);
+    QString output = qt_urlRecode(input, QUrl::DecodeUnicode);
    QCOMPARE(output, expected);
    QCOMPARE(output.isNull(), expected.isNull());
 }
@ -921,10 +931,6 @@ void tst_QUrlInternal::encodingRecode()
    QFETCH(QString, expected);
    QFETCH(QUrl::ComponentFormattingOptions, encodingMode);

-    // ensure the string is properly percent-encoded
-    QVERIFY2(input == qt_tolerantParsePercentEncoding(input), "Test data is not properly encoded");
-    QVERIFY2(expected == qt_tolerantParsePercentEncoding(expected), "Test data is not properly encoded");
-
    QString output = qt_urlRecode(input, encodingMode);
    QCOMPARE(output, expected);
    QCOMPARE(output.isNull(), expected.isNull());