wxWidgets/include/wx/txtstrm.h
Vadim Zeitlin 4502e7563b Fix wxTextInputStream for input starting with BOM-like bytes
Contrary to what a comment in wxTextInputStream::GetChar() said, it is
actually possible to get more than one wide character from a call to
wxMBConv::ToWChar(len+1) even if a previous call to ToWChar(len) failed
to decode anything at all. This happens with wxConvAuto because it keeps
returning an error while it doesn't have enough data to determine if the
input contains a BOM or not, but then returns all the characters
examined so far at once if it turns out that there was no BOM, after
all.

The simplest case in which this created problems was just input starting
with a NUL byte as it as this could be a start of UTF-32BE BOM.

The fix consists in keeping all the bytes read but not yet decoded in
the m_lastBytes buffer and retrying to decode them during the next
GetChar() call. This implies keeping track of how much valid data is
there in m_lastBytes exactly, as we can't discard the already decoded
data immediately, but need to keep it in the buffer too, in order to
allow implementing UngetLast(). Incidentally, UngetLast() was totally
broken for UTF-16/32 input (containing NUL bytes in the middle of the
characters) before and this change fixes this as a side effect.

Also add test cases for previously failing inputs.
2017-11-09 23:49:59 +01:00

207 lines
6.7 KiB
C++

/////////////////////////////////////////////////////////////////////////////
// Name: wx/txtstrm.h
// Purpose: Text stream classes
// Author: Guilhem Lavaux
// Modified by:
// Created: 28/06/1998
// Copyright: (c) Guilhem Lavaux
// Licence: wxWindows licence
/////////////////////////////////////////////////////////////////////////////
#ifndef _WX_TXTSTREAM_H_
#define _WX_TXTSTREAM_H_
#include "wx/stream.h"
#include "wx/convauto.h"
#if wxUSE_STREAMS
class WXDLLIMPEXP_FWD_BASE wxTextInputStream;
class WXDLLIMPEXP_FWD_BASE wxTextOutputStream;
typedef wxTextInputStream& (*__wxTextInputManip)(wxTextInputStream&);
typedef wxTextOutputStream& (*__wxTextOutputManip)(wxTextOutputStream&);
WXDLLIMPEXP_BASE wxTextOutputStream &endl( wxTextOutputStream &stream );
// Obsolete constant defined only for compatibility, not used.
#define wxEOT wxT('\4')
// If you're scanning through a file using wxTextInputStream, you should check for EOF _before_
// reading the next item (word / number), because otherwise the last item may get lost.
// You should however be prepared to receive an empty item (empty string / zero number) at the
// end of file, especially on Windows systems. This is unavoidable because most (but not all) files end
// with whitespace (i.e. usually a newline).
class WXDLLIMPEXP_BASE wxTextInputStream
{
public:
#if wxUSE_UNICODE
wxTextInputStream(wxInputStream& s,
const wxString &sep=wxT(" \t"),
const wxMBConv& conv = wxConvAuto());
#else
wxTextInputStream(wxInputStream& s, const wxString &sep=wxT(" \t"));
#endif
~wxTextInputStream();
const wxInputStream& GetInputStream() const { return m_input; }
// base may be between 2 and 36, inclusive, or the special 0 (= C format)
wxUint64 Read64(int base = 10);
wxUint32 Read32(int base = 10);
wxUint16 Read16(int base = 10);
wxUint8 Read8(int base = 10);
wxInt64 Read64S(int base = 10);
wxInt32 Read32S(int base = 10);
wxInt16 Read16S(int base = 10);
wxInt8 Read8S(int base = 10);
double ReadDouble();
wxString ReadLine();
wxString ReadWord();
wxChar GetChar();
wxString GetStringSeparators() const { return m_separators; }
void SetStringSeparators(const wxString &c) { m_separators = c; }
// Operators
wxTextInputStream& operator>>(wxString& word);
wxTextInputStream& operator>>(char& c);
#if wxUSE_UNICODE && wxWCHAR_T_IS_REAL_TYPE
wxTextInputStream& operator>>(wchar_t& wc);
#endif // wxUSE_UNICODE
wxTextInputStream& operator>>(wxInt16& i);
wxTextInputStream& operator>>(wxInt32& i);
wxTextInputStream& operator>>(wxInt64& i);
wxTextInputStream& operator>>(wxUint16& i);
wxTextInputStream& operator>>(wxUint32& i);
wxTextInputStream& operator>>(wxUint64& i);
wxTextInputStream& operator>>(double& i);
wxTextInputStream& operator>>(float& f);
wxTextInputStream& operator>>( __wxTextInputManip func) { return func(*this); }
protected:
wxInputStream &m_input;
wxString m_separators;
// Data possibly (see m_validXXX) read from the stream but not decoded yet.
// This is necessary because GetChar() may only return a single character
// but we may get more than one character when decoding raw input bytes.
char m_lastBytes[10];
// The bytes [0, m_validEnd) of m_lastBytes contain the bytes read by the
// last GetChar() call (this interval may be empty if GetChar() hasn't been
// called yet). The bytes [0, m_validBegin) have been already decoded and
// returned to caller or stored in m_lastWChar in the particularly
// egregious case of decoding a non-BMP character when using UTF-16 for
// wchar_t. Finally, the bytes [m_validBegin, m_validEnd) remain to be
// decoded and returned during the next call (again, this interval can, and
// usually will, be empty too if m_validBegin == m_validEnd).
size_t m_validBegin,
m_validEnd;
#if wxUSE_UNICODE
wxMBConv *m_conv;
// The second half of a surrogate character when using UTF-16 for wchar_t:
// we can't return it immediately from GetChar() when we read a Unicode
// code point outside of the BMP, but we can't keep it in m_lastBytes
// neither because it can't separately decoded, so we have a separate 1
// wchar_t buffer just for this case.
#if SIZEOF_WCHAR_T == 2
wchar_t m_lastWChar;
#endif // SIZEOF_WCHAR_T == 2
#endif // wxUSE_UNICODE
bool EatEOL(const wxChar &c);
void UngetLast(); // should be used instead of wxInputStream::Ungetch() because of Unicode issues
wxChar NextNonSeparators();
wxDECLARE_NO_COPY_CLASS(wxTextInputStream);
};
enum wxEOL
{
wxEOL_NATIVE,
wxEOL_UNIX,
wxEOL_MAC,
wxEOL_DOS
};
class WXDLLIMPEXP_BASE wxTextOutputStream
{
public:
#if wxUSE_UNICODE
wxTextOutputStream(wxOutputStream& s,
wxEOL mode = wxEOL_NATIVE,
const wxMBConv& conv = wxConvAuto());
#else
wxTextOutputStream(wxOutputStream& s, wxEOL mode = wxEOL_NATIVE);
#endif
virtual ~wxTextOutputStream();
const wxOutputStream& GetOutputStream() const { return m_output; }
void SetMode( wxEOL mode = wxEOL_NATIVE );
wxEOL GetMode() { return m_mode; }
template<typename T>
void Write(const T& i)
{
wxString str;
str << i;
WriteString(str);
}
void Write64(wxUint64 i);
void Write32(wxUint32 i);
void Write16(wxUint16 i);
void Write8(wxUint8 i);
virtual void WriteDouble(double d);
virtual void WriteString(const wxString& string);
wxTextOutputStream& PutChar(wxChar c);
void Flush();
wxTextOutputStream& operator<<(const wxString& string);
wxTextOutputStream& operator<<(char c);
#if wxUSE_UNICODE && wxWCHAR_T_IS_REAL_TYPE
wxTextOutputStream& operator<<(wchar_t wc);
#endif // wxUSE_UNICODE
wxTextOutputStream& operator<<(wxInt16 c);
wxTextOutputStream& operator<<(wxInt32 c);
wxTextOutputStream& operator<<(wxInt64 c);
wxTextOutputStream& operator<<(wxUint16 c);
wxTextOutputStream& operator<<(wxUint32 c);
wxTextOutputStream& operator<<(wxUint64 c);
wxTextOutputStream& operator<<(double f);
wxTextOutputStream& operator<<(float f);
wxTextOutputStream& operator<<( __wxTextOutputManip func) { return func(*this); }
protected:
wxOutputStream &m_output;
wxEOL m_mode;
#if wxUSE_UNICODE
wxMBConv *m_conv;
#if SIZEOF_WCHAR_T == 2
// The first half of a surrogate character if one was passed to PutChar()
// and couldn't be output when it was called the last time.
wchar_t m_lastWChar;
#endif // SIZEOF_WCHAR_T == 2
#endif // wxUSE_UNICODE
wxDECLARE_NO_COPY_CLASS(wxTextOutputStream);
};
#endif
// wxUSE_STREAMS
#endif
// _WX_DATSTREAM_H_