From 993bbb4d4be524321575668740ea46c6665d6064 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Mon, 1 Jul 2013 17:16:54 -0700 Subject: [PATCH] QUrl: update our understanding of the encoding of delimiters The longer explanation can be found in the comment in qurl.cpp. The short version is as follows: Up to now, we considered that every character could be replaced with its percent-encoding equivalent and vice-versa, so long as the parsing of the URL did not change. For example, x:/path+path and x:/path%2Bpath were the same. However, to do this and yet be compliant with most URL uses in the real world, we had to add exceptions: - "/" and "%2F" were not the same in the path, despite the delimiter being behind (rationale was the complex definition of path) - "+" and "%2B" were not the same in the query, so we ended up not transforming any sub-delim in the query at all Now, we change our understanding based on the following line from RFC 3986 section 2.2: URIs that differ in the replacement of a reserved character with its corresponding percent-encoded octet are not equivalent. From now on, QUrl will not replace any sub-delim or gen-delim ("reserved character"), except where such a character could not exist in the first place. This simplifies the code and removes all exceptions. As a side-effect, this has also changed the behaviour of the "{" and "}" characters, which we previously allowed to remain decoded. [ChangeLog][Important Behavior Changes][QUrl and QUrlQuery] QUrl no longer considers all delimiter characters equivalent to their percent-encoded forms. Now, both classes always keep all delimiters exactly as they were in the original URL text. 
[ChangeLog][Important Behavior Changes][QUrl and QUrlQuery] QUrl no longer decodes %7B and %7D to "{" and "}" in the output of toString() Task-number: QTBUG-31660 Change-Id: Iba0b5b31b269635ac2d0adb2bb0dfb74c139e08c Reviewed-by: David Faure (KDE) --- src/corelib/io/qurl.cpp | 352 +++++++++++------------- src/corelib/io/qurlrecode.cpp | 57 ---- tests/auto/corelib/io/qurl/tst_qurl.cpp | 164 +++++------ 3 files changed, 222 insertions(+), 351 deletions(-) diff --git a/src/corelib/io/qurl.cpp b/src/corelib/io/qurl.cpp index 30933b32a6..ec5d954a6a 100644 --- a/src/corelib/io/qurl.cpp +++ b/src/corelib/io/qurl.cpp @@ -520,7 +520,7 @@ inline void QUrlPrivate::setError(ErrorCode errorCode, const QString &source, in error->position = supplement; } -// From RFC 3896, Appendix A Collected ABNF for URI +// From RFC 3986, Appendix A Collected ABNF for URI // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] //[...] // scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) @@ -548,46 +548,63 @@ inline void QUrlPrivate::setError(ErrorCode errorCode, const QString &source, in // the path component has a complex ABNF that basically boils down to // slash-separated segments of "pchar" -// The above is the strict definition of the URL components and it is what we -// return encoded as FullyEncoded. However, we store the equivalent to -// PrettyDecoded internally, as that is the default formatting mode and most -// likely to be used. PrettyDecoded decodes spaces, unicode sequences and -// unambiguous delimiters. +// The above is the strict definition of the URL components and we mostly +// adhere to it, with few exceptions. 
QUrl obeys the following behavior: +// - percent-encoding sequences always use uppercase HEXDIG; +// - unreserved characters are *always* decoded, no exceptions; +// - the space character and bytes with the high bit set are controlled by +// the EncodeSpaces and EncodeUnicode bits; +// - control characters, the percent sign itself, and bytes with the high +// bit set that don't form valid UTF-8 sequences are always encoded, +// except in FullyDecoded mode; +// - sub-delims are always left alone, except in FullyDecoded mode; +// - gen-delims change behavior depending on which section of the URL (or +// the entire URL) we're looking at; see below; +// - characters not mentioned above, like "<" and ">", are usually +// decoded in individual sections of the URL, but encoded when the full +// URL is put together (this can change depending on the subjective +// definition of "pretty"). // -// An ambiguous delimiter is a delimiter that, if appeared decoded, would be -// interpreted as the beginning of a new component. The exact delimiters that -// match that definition change according to the use. When each field is -// considered in isolation from the rest, there are no ambiguities. In other -// words, we always store the most decoded form (except for the query, see -// below). +// The behavior for the delimiters bears some explanation. The spec says in +// section 2.2: +// URIs that differ in the replacement of a reserved character with its +// corresponding percent-encoded octet are not equivalent. +// (note: QUrl API mistakenly uses the "reserved" term, so we will refer to +// them here as "delimiters"). // -// The ambiguities arise when components are put together. From last to first -// component of a full URL, the ambiguities are: -// - fragment: none, since it's the last. -// - query: the "#" character is ambiguous, as it starts the fragment. In -// addition, the "+" character is treated specially, as should be both -// intra-query delimiters. 
Since we don't know which ones they are, we -// keep all reserved characters untouched. -// - path: the "#" and "?" characters are ambigous. In addition to them, -// the slash itself is considered special. +// For that reason, we cannot encode delimiters found in decoded form and we +// cannot decode the ones found in encoded form if that would change the +// interpretation. Conversely, we *can* perform the transformation if it would +// not change the interpretation. From the last component of a URL to the first, +// here are the gen-delims we can unambiguously transform when the field is +// taken in isolation: +// - fragment: none, since it's the last +// Deviation: the spec says "#" <-> %23 is unambiguous, but we treat it as if it were not +// - query: "#" is unambiguous +// - path: "#" and "?" are unambiguous // - host: completely special but never ambiguous, see setHost() below. -// - password: the "#", "?", "/", "[", "]" and "@" characters are ambiguous -// - username: the "#", "?", "/", "[", "]", "@", and ":" characters are ambiguous +// - password: the "#", "?", "/", "[", "]" and "@" characters are unambiguous +// - username: the "#", "?", "/", "[", "]", "@", and ":" characters are unambiguous // - scheme: doesn't accept any delimiter, see setScheme() below. // -// When the authority component is considered in isolation, the ambiguities of -// its components are: -// - host: special, never ambiguous -// - password: "[", "]", "@" are ambiguous -// - username: "[", "]", "@", ":" are ambiguous +// Internally, QUrl stores each component in the format that corresponds to the +// default mode (PrettyDecoded). 
It deviates from the "strict" FullyEncoded +// mode in the following way: +// - spaces are decoded +// - valid UTF-8 sequences are decoded +// - gen-delims that can be unambiguously transformed are decoded +// - characters controlled by DecodeReserved are often decoded, though this behavior +// can change depending on the subjective definition of "pretty" // -// Finally, when the userinfo is considered in isolation, the ambiguities of its -// components are: -// - password: none, since it's the last -// - username: ":" is ambiguous +// Note that the list of gen-delims that we can transform is different for the +// user info (user name + password) and the authority (user info + host + +// port). + // list the recoding table modifications to be used with the recodeFromUser and -// appendToUser functions, according to the rules above. +// appendToUser functions, according to the rules above. Spaces and UTF-8 +// sequences are handled outside the tables. + // the encodedXXX tables are run with the delimiters set to "leave" by default; // the decodedXXX tables are run with the delimiters set to "decode" by default // (except for the query, which doesn't use these functions) @@ -596,103 +613,88 @@ inline void QUrlPrivate::setError(ErrorCode errorCode, const QString &source, in #define leave(x) ushort(0x100 | (x)) #define encode(x) ushort(0x200 | (x)) -static const ushort encodedUserNameActions[] = { - // first field, everything must be encoded, including the ":" - // userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) - encode('/'), // 0 - encode('?'), // 1 - encode('#'), // 2 +static const ushort userNameInIsolation[] = { + decode(':'), // 0 + decode('@'), // 1 + decode(']'), // 2 + decode('['), // 3 + decode('/'), // 4 + decode('?'), // 5 + decode('#'), // 6 + + decode('"'), // 7 + decode('<'), + decode('>'), + decode('^'), + decode('\\'), + decode('|'), + decode('{'), + decode('}'), + 0 +}; +static const ushort * const passwordInIsolation = userNameInIsolation 
+ 1; +static const ushort * const pathInIsolation = userNameInIsolation + 5; +static const ushort * const queryInIsolation = userNameInIsolation + 6; +static const ushort * const fragmentInIsolation = userNameInIsolation + 7; + +static const ushort userNameInUserInfo[] = { + encode(':'), // 0 + decode('@'), // 1 + decode(']'), // 2 + decode('['), // 3 + decode('/'), // 4 + decode('?'), // 5 + decode('#'), // 6 + + decode('"'), // 7 + decode('<'), + decode('>'), + decode('^'), + decode('\\'), + decode('|'), + decode('{'), + decode('}'), + 0 +}; +static const ushort * const passwordInUserInfo = userNameInUserInfo + 1; + +static const ushort userNameInAuthority[] = { + encode(':'), // 0 + encode('@'), // 1 + encode(']'), // 2 encode('['), // 3 - encode(']'), // 4 - encode('@'), // 5 - encode(':'), // 6 + decode('/'), // 4 + decode('?'), // 5 + decode('#'), // 6 + + decode('"'), // 7 + decode('<'), + decode('>'), + decode('^'), + decode('\\'), + decode('|'), + decode('{'), + decode('}'), 0 }; -static const ushort * const decodedUserNameInAuthorityActions = encodedUserNameActions + 3; -static const ushort * const decodedUserNameInUserInfoActions = encodedUserNameActions + 6; -static const ushort * const decodedUserNameInUrlActions = encodedUserNameActions; -static const ushort * const decodedUserNameInIsolationActions = 0; +static const ushort * const passwordInAuthority = userNameInAuthority + 1; -static const ushort encodedPasswordActions[] = { - // same as encodedUserNameActions, but decode ":" - // userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) - encode('/'), // 0 - encode('?'), // 1 - encode('#'), // 2 +static const ushort userNameInUrl[] = { + encode(':'), // 0 + encode('@'), // 1 + encode(']'), // 2 encode('['), // 3 - encode(']'), // 4 - encode('@'), // 5 - 0 -}; -static const ushort * const decodedPasswordInAuthorityActions = encodedPasswordActions + 3; -static const ushort * const decodedPasswordInUserInfoActions = 0; -static const ushort * const 
decodedPasswordInUrlActions = encodedPasswordActions; -static const ushort * const decodedPasswordInIsolationActions = 0; + encode('/'), // 4 + encode('?'), // 5 + encode('#'), // 6 -static const ushort encodedPathActions[] = { - // pchar = unreserved / pct-encoded / sub-delims / ":" / "@" - encode('['), // 0 - encode(']'), // 1 - encode('?'), // 2 - encode('#'), // 3 - leave('/'), // 4 - 0 -}; -static const ushort decodedPathInUrlActions[] = { - decode('{'), // 0 - decode('}'), // 1 - encode('?'), // 2 - encode('#'), // 3 - leave('/'), // 4 - 0 -}; -static const ushort * const decodedPathInIsolationActions = encodedPathActions + 4; // leave('/') - -static const ushort encodedFragmentActions[] = { - // fragment = *( pchar / "/" / "?" ) - // gen-delims permitted: ":" / "@" / "/" / "?" - // -> must encode: "[" / "]" / "#" - // HOWEVER: we allow "#" to remain decoded - decode('#'), // 0 - decode(':'), // 1 - decode('@'), // 2 - decode('/'), // 3 - decode('?'), // 4 - encode('['), // 5 - encode(']'), // 6 - 0 -}; -//static const ushort * const decodedFragmentInUrlActions = 0; -static const ushort * const decodedFragmentInIsolationActions = 0; - -// the query is handled specially: the decodedQueryXXX tables are run with -// the delimiters set to "leave" by default and the others set to "encode" -static const ushort encodedQueryActions[] = { - // query = *( pchar / "/" / "?" ) - // gen-delims permitted: ":" / "@" / "/" / "?" 
- // HOWEVER: we leave alone them alone, plus "[" and "]" - // -> must encode: "#" - encode('#'), // 0 - 0 -}; -static const ushort decodedQueryInIsolationActions[] = { - decode('"'), // 0 - decode('<'), // 1 - decode('>'), // 2 - decode('^'), // 3 - decode('\\'),// 4 - decode('|'), // 5 - decode('{'), // 6 - decode('}'), // 7 - decode('#'), // 8 - 0 -}; -static const ushort decodedQueryInUrlActions[] = { - decode('{'), // 6 - decode('}'), // 7 - encode('#'), // 8 + // no need to list encode(x) for the other characters 0 }; +static const ushort * const passwordInUrl = userNameInUrl + 1; +static const ushort * const pathInUrl = userNameInUrl + 5; +static const ushort * const queryInUrl = userNameInUrl + 6; +static const ushort * const fragmentInUrl = 0; static inline void parseDecodedComponent(QString &data) { @@ -705,33 +707,24 @@ recodeFromUser(const QString &input, const ushort *actions, int from, int to) QString output; const QChar *begin = input.constData() + from; const QChar *end = input.constData() + to; - if (qt_urlRecode(output, begin, end, - QUrl::DecodeReserved, actions)) + if (qt_urlRecode(output, begin, end, 0, actions)) return output; return input.mid(from, to - from); } -// appendXXXX functions: -// the internal value is stored in its most decoded form, so that case is easy. -// DecodeUnicode and DecodeSpaces are handled by qt_urlRecode. -// That leaves these functions to handle two cases related to delimiters: -// 1) encoded encodedXXXX tables -// 2) decoded decodedXXXX tables +// appendXXXX functions: copy from the internal form to the external, user form. +// the internal value is stored in its PrettyDecoded form, so that case is easy. 
static inline void appendToUser(QString &appendTo, const QString &value, QUrl::FormattingOptions options, - const ushort *encodedActions, const ushort *decodedActions) + const ushort *actions) { + options |= QUrl::EncodeDelimiters; + if (options == QUrl::PrettyDecoded) { appendTo += value; return; } - const ushort *actions = 0; - if (options & QUrl::EncodeDelimiters) - actions = encodedActions; - else - actions = decodedActions; - if (!qt_urlRecode(appendTo, value.constData(), value.constEnd(), options, actions)) appendTo += value; } @@ -758,31 +751,33 @@ inline void QUrlPrivate::appendUserInfo(QString &appendTo, QUrl::FormattingOptio const ushort *userNameActions; const ushort *passwordActions; if (options & QUrl::EncodeDelimiters) { - userNameActions = encodedUserNameActions; - passwordActions = encodedPasswordActions; + userNameActions = userNameInUrl; + passwordActions = passwordInUrl; } else { switch (appendingTo) { case UserInfo: - userNameActions = decodedUserNameInUserInfoActions; - passwordActions = decodedPasswordInUserInfoActions; + userNameActions = userNameInUserInfo; + passwordActions = passwordInUserInfo; break; case Authority: - userNameActions = decodedUserNameInAuthorityActions; - passwordActions = decodedPasswordInAuthorityActions; + userNameActions = userNameInAuthority; + passwordActions = passwordInAuthority; break; case FullUrl: + userNameActions = userNameInUrl; + passwordActions = passwordInUrl; + break; + default: - userNameActions = decodedUserNameInUrlActions; - passwordActions = decodedPasswordInUrlActions; + // can't happen + Q_UNREACHABLE(); break; } } - if ((options & QUrl::EncodeReserved) == 0) - options |= QUrl::DecodeReserved; - + options |= QUrl::EncodeDelimiters; if (!qt_urlRecode(appendTo, userName.constData(), userName.constEnd(), options, userNameActions)) appendTo += userName; if (options & QUrl::RemovePassword || !hasPassword()) { @@ -796,12 +791,16 @@ inline void QUrlPrivate::appendUserInfo(QString &appendTo, 
QUrl::FormattingOptio inline void QUrlPrivate::appendUserName(QString &appendTo, QUrl::FormattingOptions options) const { - appendToUser(appendTo, userName, options, encodedUserNameActions, decodedUserNameInIsolationActions); + // only called from QUrl::userName() + appendToUser(appendTo, userName, options, + options & QUrl::EncodeDelimiters ? userNameInUrl : userNameInIsolation); } inline void QUrlPrivate::appendPassword(QString &appendTo, QUrl::FormattingOptions options) const { - appendToUser(appendTo, password, options, encodedPasswordActions, decodedPasswordInIsolationActions); + // only called from QUrl::password() + appendToUser(appendTo, password, options, + options & QUrl::EncodeDelimiters ? passwordInUrl : passwordInIsolation); } inline void QUrlPrivate::appendPath(QString &appendTo, QUrl::FormattingOptions options, Section appendingTo) const @@ -822,41 +821,21 @@ inline void QUrlPrivate::appendPath(QString &appendTo, QUrl::FormattingOptions o thePath.chop(1); } - if (appendingTo != Path && !(options & QUrl::EncodeDelimiters)) { - if (!qt_urlRecode(appendTo, thePath.constData(), thePath.constEnd(), options, decodedPathInUrlActions)) - appendTo += thePath; + appendToUser(appendTo, thePath, options, + appendingTo == FullUrl || options & QUrl::EncodeDelimiters ? pathInUrl : pathInIsolation); - } else { - appendToUser(appendTo, thePath, options, encodedPathActions, decodedPathInIsolationActions); - } } inline void QUrlPrivate::appendFragment(QString &appendTo, QUrl::FormattingOptions options, Section appendingTo) const { - appendToUser(appendTo, fragment, options, encodedFragmentActions, decodedFragmentInIsolationActions); + appendToUser(appendTo, fragment, options, + appendingTo == FullUrl || options & QUrl::EncodeDelimiters ? 
fragmentInUrl : fragmentInIsolation); } inline void QUrlPrivate::appendQuery(QString &appendTo, QUrl::FormattingOptions options, Section appendingTo) const { - // almost the same code as the previous functions - // except we prefer not to touch the delimiters - if (options == QUrl::PrettyDecoded && appendingTo == Query) { - appendTo += query; - return; - } - - const ushort *actions = 0; - if (options & QUrl::EncodeDelimiters) { - actions = encodedQueryActions; - } else { - // reset to default qt_urlRecode behaviour (leave delimiters alone) - options |= QUrl::EncodeDelimiters; - actions = appendingTo == Query ? decodedQueryInIsolationActions : decodedQueryInUrlActions; - } - - if (!qt_urlRecode(appendTo, query.constData(), query.constData() + query.length(), - options, actions)) - appendTo += query; + appendToUser(appendTo, query, options, + appendingTo == FullUrl || options & QUrl::EncodeDelimiters ? queryInUrl : queryInIsolation); } // setXXX functions @@ -1001,42 +980,31 @@ inline void QUrlPrivate::setUserInfo(const QString &userInfo, int from, int end) inline void QUrlPrivate::setUserName(const QString &value, int from, int end) { sectionIsPresent |= UserName; - userName = recodeFromUser(value, decodedUserNameInIsolationActions, from, end); + userName = recodeFromUser(value, userNameInIsolation, from, end); } inline void QUrlPrivate::setPassword(const QString &value, int from, int end) { sectionIsPresent |= Password; - password = recodeFromUser(value, decodedPasswordInIsolationActions, from, end); + password = recodeFromUser(value, passwordInIsolation, from, end); } inline void QUrlPrivate::setPath(const QString &value, int from, int end) { // sectionIsPresent |= Path; // not used, save some cycles - path = recodeFromUser(value, decodedPathInIsolationActions, from, end); + path = recodeFromUser(value, pathInIsolation, from, end); } inline void QUrlPrivate::setFragment(const QString &value, int from, int end) { sectionIsPresent |= Fragment; - fragment = 
recodeFromUser(value, decodedFragmentInIsolationActions, from, end); + fragment = recodeFromUser(value, fragmentInIsolation, from, end); } inline void QUrlPrivate::setQuery(const QString &value, int from, int iend) { sectionIsPresent |= Query; - - // use the default actions for the query (don't set QUrl::DecodeAllDelimiters) - QString output; - const QChar *begin = value.constData() + from; - const QChar *end = value.constData() + iend; - - // leave delimiters alone but decode the rest - if (qt_urlRecode(output, begin, end, QUrl::EncodeDelimiters, - decodedQueryInIsolationActions)) - query = output; - else - query = value.mid(from, iend - from); + query = recodeFromUser(value, queryInIsolation, from, iend); } // Host handling diff --git a/src/corelib/io/qurlrecode.cpp b/src/corelib/io/qurlrecode.cpp index 509a92d2b0..9189cd294f 100644 --- a/src/corelib/io/qurlrecode.cpp +++ b/src/corelib/io/qurlrecode.cpp @@ -113,59 +113,6 @@ static const uchar defaultActionTable[96] = { // 0x00 if it belongs to this category // 0xff if it doesn't -static const uchar delimsMask[96] = { - 0xff, // space - 0x00, // '!' (sub-delim) - 0xff, // '"' - 0x00, // '#' (gen-delim) - 0x00, // '$' (gen-delim) - 0xff, // '%' (percent) - 0x00, // '&' (gen-delim) - 0x00, // "'" (sub-delim) - 0x00, // '(' (sub-delim) - 0x00, // ')' (sub-delim) - 0x00, // '*' (sub-delim) - 0x00, // '+' (sub-delim) - 0x00, // ',' (sub-delim) - 0xff, // '-' (unreserved) - 0xff, // '.' (unreserved) - 0x00, // '/' (gen-delim) - - 0xff, 0xff, 0xff, 0xff, 0xff, // '0' to '4' (unreserved) - 0xff, 0xff, 0xff, 0xff, 0xff, // '5' to '9' (unreserved) - 0x00, // ':' (gen-delim) - 0x00, // ';' (sub-delim) - 0xff, // '<' - 0x00, // '=' (sub-delim) - 0xff, // '>' - 0x00, // '?' 
(gen-delim) - - 0x00, // '@' (gen-delim) - 0xff, 0xff, 0xff, 0xff, 0xff, // 'A' to 'E' (unreserved) - 0xff, 0xff, 0xff, 0xff, 0xff, // 'F' to 'J' (unreserved) - 0xff, 0xff, 0xff, 0xff, 0xff, // 'K' to 'O' (unreserved) - 0xff, 0xff, 0xff, 0xff, 0xff, // 'P' to 'T' (unreserved) - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 'U' to 'Z' (unreserved) - 0x00, // '[' (gen-delim) - 0xff, // '\' - 0x00, // ']' (gen-delim) - 0xff, // '^' - 0xff, // '_' (unreserved) - - 0xff, // '`' - 0xff, 0xff, 0xff, 0xff, 0xff, // 'a' to 'e' (unreserved) - 0xff, 0xff, 0xff, 0xff, 0xff, // 'f' to 'j' (unreserved) - 0xff, 0xff, 0xff, 0xff, 0xff, // 'k' to 'o' (unreserved) - 0xff, 0xff, 0xff, 0xff, 0xff, // 'p' to 't' (unreserved) - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 'u' to 'z' (unreserved) - 0xff, // '{' - 0xff, // '|' - 0xff, // '}' - 0xff, // '~' (unreserved) - - 0xff // BSKP -}; - static const uchar reservedMask[96] = { 0xff, // space 0xff, // '!' (sub-delim) @@ -617,8 +564,6 @@ static void maskTable(uchar (&table)[N], const uchar (&mask)[N]) The \a encoding option modifies the default behaviour: \list - \li QUrl::EncodeDelimiters: if set, delimiters will be left untransformed (note: not encoded!); - if unset, delimiters will be decoded \li QUrl::DecodeReserved: if set, reserved characters will be decoded; if unset, reserved characters will be encoded \li QUrl::EncodeSpaces: if set, spaces will be encoded to "%20"; if unset, they will be " " @@ -664,8 +609,6 @@ qt_urlRecode(QString &appendTo, const QChar *begin, const QChar *end, actionTable[0x7F - ' '] = EncodeCharacter; } else { memcpy(actionTable, defaultActionTable, sizeof actionTable); - if (!(encoding & QUrl::EncodeDelimiters)) - maskTable(actionTable, delimsMask); if (encoding & QUrl::DecodeReserved) maskTable(actionTable, reservedMask); if (!(encoding & QUrl::EncodeSpaces)) diff --git a/tests/auto/corelib/io/qurl/tst_qurl.cpp b/tests/auto/corelib/io/qurl/tst_qurl.cpp index c66140dfae..a55f87b20f 100644 --- 
a/tests/auto/corelib/io/qurl/tst_qurl.cpp +++ b/tests/auto/corelib/io/qurl/tst_qurl.cpp @@ -758,8 +758,8 @@ void tst_QUrl::setUrl() QVERIFY(url.isValid()); QCOMPARE(url.scheme(), QString("data")); QCOMPARE(url.host(), QString()); - QCOMPARE(url.path(), QString("text/javascript,d5 = 'five\\u0027s';")); - QCOMPARE(url.encodedPath().constData(), "text/javascript,d5%20=%20'five%5Cu0027s';"); + QCOMPARE(url.path(), QString("text/javascript,d5 %3D 'five\\u0027s'%3B")); + QCOMPARE(url.encodedPath().constData(), "text/javascript,d5%20%3D%20'five%5Cu0027s'%3B"); } { @@ -1575,17 +1575,17 @@ void tst_QUrl::relative() void tst_QUrl::percentEncoding_data() { + // This test is limited. It's superseded by componentEncodings below QTest::addColumn("original"); QTest::addColumn("encoded"); QTest::newRow("test_01") << QString::fromLatin1("sdfsdf") << QByteArray("sdfsdf"); QTest::newRow("test_02") << QString::fromUtf8("æss") << QByteArray("%C3%A6ss"); - // not unreserved or reserved - QTest::newRow("test_03") << QString::fromLatin1("{}") << QByteArray("%7B%7D"); } void tst_QUrl::percentEncoding() { + // This test is limited. 
It's superseded by componentEncodings below QFETCH(QString, original); QFETCH(QByteArray, encoded); @@ -1660,21 +1660,23 @@ void tst_QUrl::symmetry() { QString urlString = QString::fromLatin1("http://desktop:33326/upnp/{32f525a6-6f31-426e-91ca-01c2e6c2c57e}"); + QString encodedUrlString = QString("http://desktop:33326/upnp/%7B32f525a6-6f31-426e-91ca-01c2e6c2c57e%7D"); QUrl urlPreviewList(urlString); - QCOMPARE(urlPreviewList.toString(), urlString); + QCOMPARE(urlPreviewList.toString(), encodedUrlString); QByteArray b = urlPreviewList.toEncoded(); - QCOMPARE(b.constData(), "http://desktop:33326/upnp/%7B32f525a6-6f31-426e-91ca-01c2e6c2c57e%7D"); - QCOMPARE(QUrl::fromEncoded(b).toString(), urlString); - QCOMPARE(QUrl(b).toString(), urlString); + QCOMPARE(b.constData(), encodedUrlString.toLatin1().constData()); + QCOMPARE(QUrl::fromEncoded(b).toString(), encodedUrlString); + QCOMPARE(QUrl(b).toString(), encodedUrlString); } { QString urlString = QString::fromLatin1("http://desktop:53423/deviceDescription?uuid={7977c17b-00bf-4af9-894e-fed28573c3a9}"); + QString encodedUrlString = QString("http://desktop:53423/deviceDescription?uuid=%7B7977c17b-00bf-4af9-894e-fed28573c3a9%7D"); QUrl urlPreviewList(urlString); - QCOMPARE(urlPreviewList.toString(), urlString); + QCOMPARE(urlPreviewList.toString(), encodedUrlString); QByteArray b = urlPreviewList.toEncoded(); - QCOMPARE(b.constData(), "http://desktop:53423/deviceDescription?uuid=%7B7977c17b-00bf-4af9-894e-fed28573c3a9%7D"); - QCOMPARE(QUrl::fromEncoded(b).toString(), urlString); - QCOMPARE(QUrl(b).toString(), urlString); + QCOMPARE(b.constData(), encodedUrlString.toLatin1().constData()); + QCOMPARE(QUrl::fromEncoded(b).toString(), encodedUrlString); + QCOMPARE(QUrl(b).toString(), encodedUrlString); } } @@ -2180,35 +2182,22 @@ void tst_QUrl::tolerantParser() url.setUrl("http://foo.bar/[image][1].jpg"); QVERIFY(url.isValid()); QVERIFY(!url.toString().isEmpty()); - QCOMPARE(url.toString(QUrl::FullyEncoded), 
QString("http://foo.bar/%5Bimage%5D%5B1%5D.jpg")); - QCOMPARE(url.toEncoded(), QByteArray("http://foo.bar/%5Bimage%5D%5B1%5D.jpg")); + QCOMPARE(url.toString(QUrl::FullyEncoded), QString("http://foo.bar/[image][1].jpg")); + QCOMPARE(url.toEncoded(), QByteArray("http://foo.bar/[image][1].jpg")); QCOMPARE(url.toString(), QString("http://foo.bar/[image][1].jpg")); - url.setUrl("[].jpg"); - QCOMPARE(url.toString(QUrl::FullyEncoded), QString("%5B%5D.jpg")); - QCOMPARE(url.toEncoded(), QByteArray("%5B%5D.jpg")); - QCOMPARE(url.toString(), QString("[].jpg")); - - url.setUrl("/some/[path]/[]"); - QCOMPARE(url.toString(QUrl::FullyEncoded), QString("/some/%5Bpath%5D/%5B%5D")); - QCOMPARE(url.toEncoded(), QByteArray("/some/%5Bpath%5D/%5B%5D")); - QCOMPARE(url.toString(), QString("/some/[path]/[]")); + url.setUrl("http://foo.bar/%5Bimage%5D%5B1%5D.jpg"); + QVERIFY(url.isValid()); + QVERIFY(!url.toString().isEmpty()); + QCOMPARE(url.toString(QUrl::FullyEncoded), QString("http://foo.bar/%5Bimage%5D%5B1%5D.jpg")); + QCOMPARE(url.toEncoded(), QByteArray("http://foo.bar/%5Bimage%5D%5B1%5D.jpg")); + QCOMPARE(url.toString(), QString("http://foo.bar/%5Bimage%5D%5B1%5D.jpg")); url.setUrl("//[::56:56:56:56:56:56:56]"); QCOMPARE(url.toString(QUrl::FullyEncoded), QString("//[0:56:56:56:56:56:56:56]")); QCOMPARE(url.toEncoded(), QByteArray("//[0:56:56:56:56:56:56:56]")); QCOMPARE(url.toString(), QString("//[0:56:56:56:56:56:56:56]")); - url.setUrl("//[::56:56:56:56:56:56:56]#[]"); - QCOMPARE(url.toString(QUrl::FullyEncoded), QString("//[0:56:56:56:56:56:56:56]#%5B%5D")); - QCOMPARE(url.toEncoded(), QByteArray("//[0:56:56:56:56:56:56:56]#%5B%5D")); - QCOMPARE(url.toString(), QString("//[0:56:56:56:56:56:56:56]#[]")); - - url.setUrl("//[::56:56:56:56:56:56:56]?[]"); - QCOMPARE(url.toString(QUrl::FullyEncoded), QString("//[0:56:56:56:56:56:56:56]?[]")); - QCOMPARE(url.toEncoded(), QByteArray("//[0:56:56:56:56:56:56:56]?[]")); - QCOMPARE(url.toString(), 
QString("//[0:56:56:56:56:56:56:56]?[]")); - // invoke the tolerant parser's error correction url.setUrl("%hello.com/f%"); QCOMPARE(url.toString(QUrl::FullyEncoded), QString("%25hello.com/f%25")); @@ -2221,38 +2210,24 @@ void tst_QUrl::tolerantParser() url.setEncodedUrl("http://foo.bar/[image][1].jpg"); QVERIFY(url.isValid()); - QCOMPARE(url.toString(QUrl::FullyEncoded), QString("http://foo.bar/%5Bimage%5D%5B1%5D.jpg")); - QCOMPARE(url.toEncoded(), QByteArray("http://foo.bar/%5Bimage%5D%5B1%5D.jpg")); + QCOMPARE(url.toString(QUrl::FullyEncoded), QString("http://foo.bar/[image][1].jpg")); + QCOMPARE(url.toEncoded(), QByteArray("http://foo.bar/[image][1].jpg")); QCOMPARE(url.toString(), QString("http://foo.bar/[image][1].jpg")); - url.setEncodedUrl("[].jpg"); - QCOMPARE(url.toString(QUrl::FullyEncoded), QString("%5B%5D.jpg")); - QCOMPARE(url.toEncoded(), QByteArray("%5B%5D.jpg")); - QCOMPARE(url.toString(), QString("[].jpg")); - - url.setEncodedUrl("/some/[path]/[]"); - QCOMPARE(url.toString(QUrl::FullyEncoded), QString("/some/%5Bpath%5D/%5B%5D")); - QCOMPARE(url.toEncoded(), QByteArray("/some/%5Bpath%5D/%5B%5D")); - QCOMPARE(url.toString(), QString("/some/[path]/[]")); + url.setEncodedUrl("http://foo.bar/%5Bimage%5D%5B1%5D.jpg"); + QVERIFY(url.isValid()); + QCOMPARE(url.toString(QUrl::FullyEncoded), QString("http://foo.bar/%5Bimage%5D%5B1%5D.jpg")); + QCOMPARE(url.toEncoded(), QByteArray("http://foo.bar/%5Bimage%5D%5B1%5D.jpg")); + QCOMPARE(url.toString(), QString("http://foo.bar/%5Bimage%5D%5B1%5D.jpg")); url.setEncodedUrl("//[::56:56:56:56:56:56:56]"); QCOMPARE(url.toString(QUrl::FullyEncoded), QString("//[0:56:56:56:56:56:56:56]")); QCOMPARE(url.toEncoded(), QByteArray("//[0:56:56:56:56:56:56:56]")); - url.setEncodedUrl("//[::56:56:56:56:56:56:56]#[]"); - QCOMPARE(url.toString(QUrl::FullyEncoded), QString("//[0:56:56:56:56:56:56:56]#%5B%5D")); - QCOMPARE(url.toEncoded(), QByteArray("//[0:56:56:56:56:56:56:56]#%5B%5D")); - QCOMPARE(url.toString(), 
QString("//[0:56:56:56:56:56:56:56]#[]")); - - url.setEncodedUrl("//[::56:56:56:56:56:56:56]?[]"); - QCOMPARE(url.toString(QUrl::FullyEncoded), QString("//[0:56:56:56:56:56:56:56]?[]")); - QCOMPARE(url.toEncoded(), QByteArray("//[0:56:56:56:56:56:56:56]?[]")); - QCOMPARE(url.toString(), QString("//[0:56:56:56:56:56:56:56]?[]")); - url.setEncodedUrl("data:text/css,div%20{%20border-right:%20solid;%20}"); QCOMPARE(url.toString(QUrl::FullyEncoded), QString("data:text/css,div%20%7B%20border-right:%20solid;%20%7D")); QCOMPARE(url.toEncoded(), QByteArray("data:text/css,div%20%7B%20border-right:%20solid;%20%7D")); - QCOMPARE(url.toString(), QString("data:text/css,div { border-right: solid; }")); + QCOMPARE(url.toString(), QString("data:text/css,div %7B border-right: solid; %7D")); } { @@ -3147,19 +3122,25 @@ void tst_QUrl::componentEncodings_data() // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" // / "*" / "+" / "," / ";" / "=" - // like the unreserved, these are decoded everywhere - // don't test in query because they might remain encoded - QTest::newRow("decoded-subdelims") << QUrl("x://%21%24%26:%27%28%29@host/%2a%2b%2c#%3b%3d") + // these are always left alone + QTest::newRow("decoded-subdelims") << QUrl("x://!$&:'()@host/*+,?$=(+)#;=") << int(QUrl::FullyEncoded) << "!$&" << "'()" << "!$&:'()" << "host" << "!$&:'()@host" - << "/*+," << "" << ";=" - << "x://!$&:'()@host/*+,#;="; + << "/*+," << "$=(+)" << ";=" + << "x://!$&:'()@host/*+,?$=(+)#;="; + QTest::newRow("encoded-subdelims") << QUrl("x://%21%24%26:%27%28%29@host/%2a%2b%2c?%26=%26&%3d=%3d#%3b%3d") + << MostDecoded + << "%21%24%26" << "%27%28%29" << "%21%24%26:%27%28%29" + << "host" << "%21%24%26:%27%28%29@host" + << "/%2A%2B%2C" << "%26=%26&%3D=%3D" << "%3B%3D" + << "x://%21%24%26:%27%28%29@host/%2A%2B%2C?%26=%26&%3D=%3D#%3B%3D"; // gen-delims = ":" / "/" / "?" 
/ "#" / "[" / "]" / "@" // these are the separators between fields - // they must appear encoded in certain positions, no exceptions - // in other positions, they can appear decoded, so they always do + // they must appear encoded in certain positions in the full URL, no exceptions + // when in those positions, they appear decoded in the isolated parts + // in other positions and the other delimiters are always left untransformed // 1) test the delimiters that must appear encoded // (if they were decoded, they'd would change the URL parsing) QTest::newRow("encoded-gendelims-changing") << QUrl("x://%5b%3a%2f%3f%23%40%5d:%5b%2f%3f%23%40%5d@host/%2f%3f%23?%23") @@ -3169,32 +3150,21 @@ void tst_QUrl::componentEncodings_data() << "/%2F?#" << "#" << "" << "x://%5B%3A%2F%3F%23%40%5D:%5B%2F%3F%23%40%5D@host/%2F%3F%23?%23"; - // 2) test the delimiters that may appear decoded and would not change the meaning - // and test that %2f is *not* decoded to a slash in the path - // don't test the query because in this mode it doesn't transform anything - QTest::newRow("decoded-gendelims-unchanging") << QUrl("x://:%3a@host/%2f%3a%40#%23%3a%2f%3f%40") + // 2) test that the other delimiters remain decoded + QTest::newRow("decoded-gendelims-unchanging") << QUrl("x://::@host/:@/[]?:/?@[]?##:/?@[]") << int(QUrl::FullyEncoded) << "" << ":" << "::" << "host" << "::@host" - << "/%2F:@" << "" << "#:/?@" - << "x://::@host/%2F:@##:/?@"; + << "/:@/[]" << ":/?@[]?" << "#:/?@[]" + << "x://::@host/:@/[]?:/?@[]?##:/?@[]"; - // 3) test "[" and "]". Even though they are not ambiguous in the path, query or fragment - // the RFC does not allow them to appear there decoded. 
QUrl adheres strictly in FullyEncoded mode - QTest::newRow("encoded-square-brackets") << QUrl("x:/[]#[]") - << int(QUrl::FullyEncoded) - << "" << "" << "" - << "" << "" - << "/%5B%5D" << "" << "%5B%5D" - << "x:/%5B%5D#%5B%5D"; - - // 4) like above, but now decode them, which is allowed - QTest::newRow("decoded-square-brackets") << QUrl("x:/%5B%5D#%5B%5D") - << MostDecoded - << "" << "" << "" - << "" << "" - << "/[]" << "" << "[]" - << "x:/[]#[]"; + // 3) and test that the same encoded sequences remain encoded + QTest::newRow("encoded-gendelims-unchanging") << QUrl("x://:%3A@host/%3A%40%5B%5D?%3A%2F%3F%40%5B%5D#%23%3A%2F%3F%40%5B%5D") + << MostDecoded + << "" << "%3A" << ":%3A" + << "host" << ":%3A@host" + << "/%3A%40%5B%5D" << "%3A%2F%3F%40%5B%5D" << "%23%3A%2F%3F%40%5B%5D" + << "x://:%3A@host/%3A%40%5B%5D?%3A%2F%3F%40%5B%5D#%23%3A%2F%3F%40%5B%5D"; // test the query // since QUrl doesn't know what chars the user wants to use for the pair and value delimiters, @@ -3248,23 +3218,13 @@ void tst_QUrl::componentEncodings_data() << QString::fromUtf8("é ") << QString::fromUtf8("x:// é:é @smørbrød.example.no/é ? 
é#é "); - // the pretty form re-encodes the subdelims (except in the query, where they are left alone) - QTest::newRow("pretty-subdelims") << QUrl("x://%21%24%26:%27%28%29@host/%2a%2b%2c?%26=%26&%3d=%3d#%3b%3d") + // the pretty form decodes all unambiguous gen-delims in the individual parts + QTest::newRow("pretty-gendelims") << QUrl("x://%5b%3a%40%2f%3f%23%5d:%5b%40%2f%3f%23%5d@host/%3f%23?%23") << int(QUrl::PrettyDecoded) - << "!$&" << "'()" << "!$&:'()" - << "host" << "!$&:'()@host" - << "/*+," << "%26=%26&%3D=%3D" << ";=" - << "x://!$&:'()@host/*+,?%26=%26&%3D=%3D#;="; - - // the pretty form decodes all unambiguous gen-delims - // (except in query, where they are left alone) - QTest::newRow("pretty-gendelims") << QUrl("x://%5b%3a%40%2f%5d:%5b%3a%40%2f%5d@host" - "/%3a%40%5b%3f%23%5d?[?%3f%23]%5b:%3a@%40%5d#%23") - << int(QUrl::PrettyDecoded) - << "[:@/]" << "[:@/]" << "[%3A@/]:[:@/]" - << "host" << "%5B%3A%40/%5D:%5B:%40/%5D@host" - << "/:@[?#]" << "[?%3F#]%5B:%3A@%40%5D" << "#" - << "x://%5B%3A%40%2F%5D:%5B:%40%2F%5D@host/:@[%3F%23]?[?%3F%23]%5B:%3A@%40%5D##"; + << "[:@/?#]" << "[@/?#]" << "[%3A@/?#]:[@/?#]" + << "host" << "%5B%3A%40/?#%5D:%5B%40/?#%5D@host" + << "/?#" << "#" << "" + << "x://%5B%3A%40%2F%3F%23%5D:%5B%40%2F%3F%23%5D@host/%3F%23?%23"; // the pretty form keeps the other characters decoded everywhere // except when rebuilding the full URL, when we only allow "{}" to remain decoded @@ -3273,8 +3233,8 @@ void tst_QUrl::componentEncodings_data() << "\"<>^\\{|}" << "\"<>^\\{|}" << "\"<>^\\{|}:\"<>^\\{|}" << "host" << "\"<>^\\{|}:\"<>^\\{|}@host" << "/\"<>^\\{|}" << "\"<>^\\{|}" << "\"<>^\\{|}" - << "x://%22%3C%3E%5E%5C%7B%7C%7D:%22%3C%3E%5E%5C%7B%7C%7D@host/%22%3C%3E%5E%5C{%7C}" - "?%22%3C%3E%5E%5C{%7C}#%22%3C%3E%5E%5C%7B%7C%7D"; + << "x://%22%3C%3E%5E%5C%7B%7C%7D:%22%3C%3E%5E%5C%7B%7C%7D@host/%22%3C%3E%5E%5C%7B%7C%7D" + "?%22%3C%3E%5E%5C%7B%7C%7D#%22%3C%3E%5E%5C%7B%7C%7D"; } void tst_QUrl::componentEncodings()