Fix wxTextInputStream for input starting with BOM-like bytes

Contrary to what a comment in wxTextInputStream::GetChar() said, it is actually possible to get more than one wide character from a call to wxMBConv::ToWChar(len+1) even if a previous call to ToWChar(len) failed to decode anything at all. This happens with wxConvAuto because it keeps returning an error while it doesn't have enough data to determine if the input contains a BOM or not, but then returns all the characters examined so far at once if it turns out that there was no BOM, after all. The simplest case in which this created problems was just input starting with a NUL byte as it as this could be a start of UTF-32BE BOM. The fix consists in keeping all the bytes read but not yet decoded in the m_lastBytes buffer and retrying to decode them during the next GetChar() call. This implies keeping track of how much valid data is there in m_lastBytes exactly, as we can't discard the already decoded data immediately, but need to keep it in the buffer too, in order to allow implementing UngetLast(). Incidentally, UngetLast() was totally broken for UTF-16/32 input (containing NUL bytes in the middle of the characters) before and this change fixes this as a side effect. Also add test cases for previously failing inputs.
2017-11-09 02:02:54 +01:00 · 2017-11-09 02:02:54 +01:00 · 4502e7563b
commit 4502e7563b
parent 46ea3cb8c0
3 changed files with 120 additions and 26 deletions
--- a/include/wx/txtstrm.h
+++ b/include/wx/txtstrm.h
@ -84,7 +84,22 @@ public:
 protected:
    wxInputStream &m_input;
    wxString m_separators;
-    char m_lastBytes[10]; // stores the bytes that were read for the last character
+
+    // Data possibly (see m_validXXX) read from the stream but not decoded yet.
+    // This is necessary because GetChar() may only return a single character
+    // but we may get more than one character when decoding raw input bytes.
+    char m_lastBytes[10];
+
+    // The bytes [0, m_validEnd) of m_lastBytes contain the bytes read by the
+    // last GetChar() call (this interval may be empty if GetChar() hasn't been
+    // called yet). The bytes [0, m_validBegin) have been already decoded and
+    // returned to caller or stored in m_lastWChar in the particularly
+    // egregious case of decoding a non-BMP character when using UTF-16 for
+    // wchar_t. Finally, the bytes [m_validBegin, m_validEnd) remain to be
+    // decoded and returned during the next call (again, this interval can, and
+    // usually will, be empty too if m_validBegin == m_validEnd).
+    size_t m_validBegin,
+           m_validEnd;

 #if wxUSE_UNICODE
    wxMBConv *m_conv;
--- a/src/common/txtstrm.cpp
+++ b/src/common/txtstrm.cpp
@ -35,7 +35,8 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s,
                                     const wxMBConv& conv)
  : m_input(s), m_separators(sep), m_conv(conv.Clone())
 {
-    memset((void*)m_lastBytes, 0, 10);
+    m_validBegin =
+    m_validEnd = 0;

 #if SIZEOF_WCHAR_T == 2
    m_lastWChar = 0;
@ -45,7 +46,10 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s,
 wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep)
  : m_input(s), m_separators(sep)
 {
-    memset((void*)m_lastBytes, 0, 10);
+    m_validBegin =
+    m_validEnd = 0;
+
+    m_lastBytes[0] = 0;
 }
 #endif

@ -58,11 +62,13 @@ wxTextInputStream::~wxTextInputStream()

 void wxTextInputStream::UngetLast()
 {
-    size_t byteCount = 0;
-    while(m_lastBytes[byteCount]) // pseudo ANSI strlen (even for Unicode!)
-        byteCount++;
-    m_input.Ungetch(m_lastBytes, byteCount);
-    memset((void*)m_lastBytes, 0, 10);
+    if ( m_validEnd )
+    {
+        m_input.Ungetch(m_lastBytes, m_validEnd);
+
+        m_validBegin =
+        m_validEnd = 0;
+    }
 }

 wxChar wxTextInputStream::GetChar()
@ -77,17 +83,37 @@ wxChar wxTextInputStream::GetChar()
        m_lastWChar = 0;
        return wc;
    }
-#endif // !SWIG_ONLY_SCRIPT_API
+#endif // SIZEOF_WCHAR_T

-    wxChar wbuf[2];
-    memset((void*)m_lastBytes, 0, 10);
-    for(size_t inlen = 0; inlen < 9; inlen++)
+    // If we have any non-decoded bytes left from the last call, shift them to
+    // be at the beginning of the buffer.
+    if ( m_validBegin < m_validEnd )
    {
-        // actually read the next character
-        m_lastBytes[inlen] = m_input.GetC();
+        m_validEnd -= m_validBegin;
+        memmove(m_lastBytes, m_lastBytes + m_validBegin, m_validEnd);
+    }
+    else // All bytes were already decoded and consumed.
+    {
+        m_validEnd = 0;
+    }

-        if(m_input.LastRead() <= 0)
-            return 0;
+    // We may need to decode up to 4 characters if we have input starting with
+    // 3 BOM-like bytes, but not actually containing a BOM, as decoding it will
+    // only succeed when 4 bytes are read -- and will yield 4 wide characters.
+    wxChar wbuf[4];
+    for(size_t inlen = 0; inlen < sizeof(m_lastBytes); inlen++)
+    {
+        if ( inlen >= m_validEnd )
+        {
+            // actually read the next character
+            m_lastBytes[inlen] = m_input.GetC();
+
+            if(m_input.LastRead() <= 0)
+                return 0;
+
+            m_validEnd++;
+        }
+        //else: Retry decoding what we already have in the buffer.

        switch ( m_conv->ToWChar(wbuf, WXSIZEOF(wbuf), m_lastBytes, inlen + 1) )
        {
@ -103,12 +129,17 @@ wxChar wxTextInputStream::GetChar()
                break;

            default:
-                // if we couldn't decode a single character during the last
-                // loop iteration we shouldn't be able to decode 2 or more of
-                // them with an extra single byte, something fishy is going on
-                // (except if we use UTF-16, see below)
-                wxFAIL_MSG("unexpected decoding result");
-                return 0;
+                // If we couldn't decode a single character during the last
+                // loop iteration, but decoded more than one of them with just
+                // one extra byte, the only explanation is that we were using a
+                // wxConvAuto conversion recognizing the initial BOM and that
+                // it couldn't detect the presence or absence of BOM so far,
+                // but now finally has enough data to see that there is none.
+                // As we must have fallen back to Latin-1 in this case, return
+                // just the first byte and keep the other ones for the next
+                // time.
+                m_validBegin = 1;
+                return wbuf[0];

 #if SIZEOF_WCHAR_T == 2
            case 2:
@ -118,25 +149,37 @@ wxChar wxTextInputStream::GetChar()
                // remember the second one for the next call, as there is no
                // way to fit both of them into a single wxChar in this case.
                m_lastWChar = wbuf[1];
-#endif // !SWIG_ONLY_SCRIPT_API
+#endif // SIZEOF_WCHAR_T == 2
                wxFALLTHROUGH;

            case 1:
+                m_validBegin = inlen + 1;

                // we finally decoded a character
                return wbuf[0];
        }
    }

-    // there should be no encoding which requires more than nine bytes for one
-    // character so something must be wrong with our conversion but we have no
-    // way to signal it from here
+    // There should be no encoding which requires more than 10 bytes to decode
+    // at least one character (the most actually seems to be 7: 3 for the
+    // initial BOM, which is ignored, and 4 for the longest possible encoding
+    // of a Unicode character in UTF-8), so something must be wrong with our
+    // conversion but we have no way to signal it from here and just return 0
+    // as if we reached the end of the stream.
+    m_validBegin = 0;
+    m_validEnd = sizeof(m_lastBytes);
+
    return 0;
 #else
    m_lastBytes[0] = m_input.GetC();

    if(m_input.LastRead() <= 0)
+    {
+        m_validEnd = 0;
        return 0;
+    }
+
+    m_validEnd = 1;

    return m_lastBytes[0];
 #endif
--- a/tests/streams/textstreamtest.cpp
+++ b/tests/streams/textstreamtest.cpp
@ -290,4 +290,40 @@ void TextStreamTestCase::TestInput(const wxMBConv& conv,
    CPPUNIT_ASSERT_EQUAL( 0, memcmp(txtWchar, temp.wc_str(), sizeof(txtWchar)) );
 }

+TEST_CASE("wxTextInputStream::GetChar", "[text][input][stream][char]")
+{
+    // This is the simplest possible test that used to trigger assertion in
+    // wxTextInputStream::GetChar().
+    SECTION("starts-with-nul")
+    {
+        const wxUint8 buf[] = { 0x00, 0x01, };
+        wxMemoryInputStream mis(buf, sizeof(buf));
+        wxTextInputStream tis(mis);
+
+        REQUIRE( tis.GetChar() == 0x00 );
+        REQUIRE( tis.GetChar() == 0x01 );
+        REQUIRE( tis.GetChar() == 0x00 );
+        CHECK( tis.GetInputStream().Eof() );
+    }
+
+    // This exercises a problematic path in GetChar() as the first 3 bytes of
+    // this stream look like the start of UTF-32BE BOM, but this is not
+    // actually a BOM because the 4th byte is 0xFE and not 0xFF, so the stream
+    // should decode the buffer as Latin-1 once it gets there.
+    SECTION("almost-UTF-32-BOM")
+    {
+        const wxUint8 buf[] = { 0x00, 0x00, 0xFE, 0xFE, 0x01 };
+        wxMemoryInputStream mis(buf, sizeof(buf));
+        wxTextInputStream tis(mis);
+
+        REQUIRE( tis.GetChar() == 0x00 );
+        REQUIRE( tis.GetChar() == 0x00 );
+        REQUIRE( tis.GetChar() == 0xFE );
+        REQUIRE( tis.GetChar() == 0xFE );
+        REQUIRE( tis.GetChar() == 0x01 );
+        REQUIRE( tis.GetChar() == 0x00 );
+        CHECK( tis.GetInputStream().Eof() );
+    }
+}
+
 #endif // wxUSE_UNICODE