Fix wxURI::Unescape() to work with Unicode strings

Such strings are not really URIs, as they would have been encoded if they were, but we can obtain them from e.g. wxFileSystem::FindFirst(), so handle them correctly here, as this is simpler than checking all the places where Unescape() is called. Add a unit test checking that decoding a URI containing both Unicode and percent-encoded Unicode characters works correctly.
2016-02-13 03:59:43 +01:00 · 2016-02-13 03:59:43 +01:00 · 314630945a
commit 314630945a
parent 0a555f3c83
4 changed files with 29 additions and 40 deletions
--- a/docs/changes.txt
+++ b/docs/changes.txt
@ -63,6 +63,7 @@ All:

 - Add UTF-8 and ZIP 64 support to wxZip{Input,Output}Stream (Tobias Taschner).
 - Upgrade libpng to 1.6.21 fixing several security bugs (Paul Kulchenko).
+- Fix handling of Unicode file names in wxFileSystem::FindFirst().
 - Add wxStandardPaths::GetUserDir() (Tobias Taschner).
 - Allow calling wxItemContainer::Add() and similar with std::vector<> argument.
 - Add "%z" support to printf()-like functions like wxString::Format() (RIVDSL).
--- a/include/wx/uri.h
+++ b/include/wx/uri.h
@ -137,11 +137,6 @@ protected:
    static bool ParseIPv6address(const char*& uri);
    static bool ParseIPvFuture(const char*& uri);

-    // should be called with i pointing to '%', returns the encoded character
-    // following it or -1 if invalid and advances i past it (so that it points
-    // to the last character consumed on return)
-    static int DecodeEscape(wxString::const_iterator& i);
-
    // append next character pointer to by p to the string in an escaped form
    // and advance p past it
    //
--- a/src/common/uri.cpp
+++ b/src/common/uri.cpp
@ -100,38 +100,32 @@ int wxURI::CharToHex(char c)
    return -1;
 }

-int wxURI::DecodeEscape(wxString::const_iterator& i)
-{
-    int hi = CharToHex(*++i);
-    if ( hi == -1 )
-        return -1;
-
-    int lo = CharToHex(*++i);
-    if ( lo == -1 )
-        return -1;
-
-    return (hi << 4) | lo;
-}
-
 /* static */
 wxString wxURI::Unescape(const wxString& uri)
 {
+    // URIs can contain escaped 8-bit characters that have to be decoded using
+    // UTF-8 (see RFC 3986), however in our (probably broken...) case we can
+    // also end up with not escaped Unicode characters in the URI string which
+    // can't be decoded as UTF-8. So what we do here is to encode all Unicode
+    // characters as UTF-8 only to decode them back below. This is obviously
+    // inefficient but there doesn't seem to be anything else to do, other than
+    // not allowing to mix Unicode characters with escapes in the first place,
+    // but this seems to be done in a lot of places, unfortunately.
+    const wxScopedCharBuffer& uriU8(uri.utf8_str());
+    const size_t len = uriU8.length();
+
    // the unescaped version can't be longer than the original one
-    wxCharBuffer buf(uri.length());
+    wxCharBuffer buf(uriU8.length());
    char *p = buf.data();

-    for ( wxString::const_iterator i = uri.begin(); i != uri.end(); ++i, ++p )
+    const char* const end = uriU8.data() + len;
+    for ( const char* s = uriU8.data(); s != end; ++s, ++p )
    {
-        char c = *i;
-        if ( c == '%' )
+        char c = *s;
+        if ( c == '%' && s < end - 2 && IsHex(s[1]) && IsHex(s[2]) )
        {
-            int n = wxURI::DecodeEscape(i);
-            if ( n == -1 )
-                return wxString();
-
-            wxASSERT_MSG( n >= 0 && n <= 0xff, "unexpected character value" );
-
-            c = static_cast<char>(n);
+            c = (CharToHex(s[1]) << 4) | CharToHex(s[2]);
+            s += 2;
        }

        *p = c;
@ -139,17 +133,7 @@ wxString wxURI::Unescape(const wxString& uri)

    *p = '\0';

-    // by default assume that the URI is in UTF-8, this is the most common
-    // practice
-    wxString s = wxString::FromUTF8(buf);
-    if ( s.empty() )
-    {
-        // if it isn't, use latin-1 as a fallback -- at least this always
-        // succeeds
-        s = wxCSConv(wxFONTENCODING_ISO8859_1).cMB2WC(buf);
-    }
-
-    return s;
+    return wxString::FromUTF8(buf);
 }

 void wxURI::AppendNextEscaped(wxString& s, const char *& p)
--- a/tests/uris/uris.cpp
+++ b/tests/uris/uris.cpp
@ -338,6 +338,15 @@ void URITestCase::Unescaping()
                            "\xD1\x87\xD0\xB8\xD1\x81\xD0\xBB\xD0\xBE"
                          ),
                          unescaped );
+
+    escaped = L"file://\u043C\u043E\u0439%5C%d1%84%d0%b0%d0%b9%d0%bb";
+    unescaped = wxURI::Unescape(escaped);
+
+    CPPUNIT_ASSERT_EQUAL
+    (
+        L"file://\u043C\u043E\u0439\\\u0444\u0430\u0439\u043B",
+        unescaped
+    );
 #endif // wxUSE_UNICODE
 }