rewrote UTF-7 to work on streams of data to be comaptible with the way wxTextStream uses the converters; also converted a couple off by 1 bugs and unit test finally pass now
git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@53889 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
parent
d4df635acb
commit
9d653e810c
@ -271,6 +271,7 @@ All:
|
||||
- wxString now uses std::[w]string internally by default, meaning that it is
|
||||
now thread-safe if the standard library provided with your compiler is.
|
||||
- Added wxCmdLineParser::AddUsageText() (Marcin 'Malcom' Malich).
|
||||
- Fix reading/writing UTF-7-encoded text streams.
|
||||
|
||||
All (Unix):
|
||||
|
||||
|
@ -249,10 +249,81 @@ private:
|
||||
class WXDLLIMPEXP_BASE wxMBConvUTF7 : public wxMBConv
|
||||
{
|
||||
public:
|
||||
virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
|
||||
virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
|
||||
wxMBConvUTF7() { }
|
||||
|
||||
// compiler-generated copy ctor, assignment operator and dtor are ok
|
||||
// (assuming it's ok to copy the shift state -- not really sure about it)
|
||||
|
||||
virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
|
||||
const char *src, size_t srcLen = wxNO_LEN) const;
|
||||
virtual size_t FromWChar(char *dst, size_t dstLen,
|
||||
const wchar_t *src, size_t srcLen = wxNO_LEN) const;
|
||||
|
||||
virtual wxMBConv *Clone() const { return new wxMBConvUTF7; }
|
||||
|
||||
private:
|
||||
// UTF-7 decoder/encoder may be in direct mode or in shifted mode after a
|
||||
// '+' (and until the '-' or any other non-base64 character)
|
||||
enum Mode
|
||||
{
|
||||
Direct, // pass through state
|
||||
Shifted // after a '+' (and before '-')
|
||||
};
|
||||
|
||||
// the current decoder state: this is only used by ToWChar() if srcLen
|
||||
// parameter is not wxNO_LEN, when working on the entire NUL-terminated
|
||||
// strings we neither update nor use the state
|
||||
class DecoderState
|
||||
{
|
||||
private:
|
||||
// current state: this one is private as we want to enforce the use of
|
||||
// ToDirect/ToShifted() methods below
|
||||
Mode mode;
|
||||
|
||||
public:
|
||||
// the initial state is direct
|
||||
DecoderState() { mode = Direct; }
|
||||
|
||||
// switch to/from shifted mode
|
||||
void ToDirect() { mode = Direct; }
|
||||
void ToShifted() { mode = Shifted; accum = bit = 0; isLSB = false; }
|
||||
|
||||
bool IsDirect() const { return mode == Direct; }
|
||||
bool IsShifted() const { return mode == Shifted; }
|
||||
|
||||
|
||||
// these variables are only used in shifted mode
|
||||
|
||||
unsigned int accum; // accumulator of the bit we've already got
|
||||
unsigned int bit; // the number of bits consumed mod 8
|
||||
unsigned char msb; // the high byte of UTF-16 word
|
||||
bool isLSB; // whether we're decoding LSB or MSB of UTF-16 word
|
||||
};
|
||||
|
||||
DecoderState m_stateDecoder;
|
||||
|
||||
|
||||
// encoder state is simpler as we always receive entire Unicode characters
|
||||
// on input
|
||||
class EncoderState
|
||||
{
|
||||
private:
|
||||
Mode mode;
|
||||
|
||||
public:
|
||||
EncoderState() { mode = Direct; }
|
||||
|
||||
void ToDirect() { mode = Direct; }
|
||||
void ToShifted() { mode = Shifted; accum = bit = 0; }
|
||||
|
||||
bool IsDirect() const { return mode == Direct; }
|
||||
bool IsShifted() const { return mode == Shifted; }
|
||||
|
||||
unsigned int accum;
|
||||
unsigned int bit;
|
||||
};
|
||||
|
||||
EncoderState m_stateEncoder;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
@ -291,6 +291,17 @@ public:
|
||||
This class converts between the UTF-7 encoding and Unicode.
|
||||
It has one predefined instance, @b wxConvUTF7.
|
||||
|
||||
Notice that, unlike all the other conversion objects, this converter is
|
||||
stateful, i.e. it remembers its state from the last call to its ToWChar()
|
||||
or FromWChar() and assumes it is called on the continuation of the same
|
||||
string when the same method is called again. This assumption is only made
|
||||
if an explicit length is specified as parameter to these functions as if an
|
||||
entire @c NUL terminated string is processed the state doesn't need to be
|
||||
remembered.
|
||||
|
||||
This also means that, unlike the other predefined conversion objects,
|
||||
@b wxConvUTF7 is @em not thread-safe.
|
||||
|
||||
@library{wxbase}
|
||||
@category{conv}
|
||||
|
||||
|
@ -484,6 +484,8 @@ wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
// Implementation (C) 2004 Fredrik Roubert
|
||||
//
|
||||
// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
|
||||
|
||||
//
|
||||
// BASE64 decoding table
|
||||
@ -521,72 +523,134 @@ static const unsigned char utf7unb64[] =
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
||||
};
|
||||
|
||||
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
|
||||
size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
|
||||
const char *src, size_t srcLen) const
|
||||
{
|
||||
DecoderState stateOrig,
|
||||
*statePtr;
|
||||
if ( srcLen == wxNO_LEN )
|
||||
{
|
||||
// convert the entire string, up to and including the trailing NUL
|
||||
srcLen = strlen(src) + 1;
|
||||
|
||||
// when working on the entire strings we don't update nor use the shift
|
||||
// state from the previous call
|
||||
statePtr = &stateOrig;
|
||||
}
|
||||
else // when working with partial strings we do use the shift state
|
||||
{
|
||||
statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
|
||||
|
||||
// also save the old state to be able to rollback to it on error
|
||||
stateOrig = m_stateDecoder;
|
||||
}
|
||||
|
||||
// but to simplify the code below we use this variable in both cases
|
||||
DecoderState& state = *statePtr;
|
||||
|
||||
|
||||
// number of characters [which would have been] written to dst [if it were
|
||||
// not NULL]
|
||||
size_t len = 0;
|
||||
|
||||
while ( *psz && (!buf || (len < n)) )
|
||||
const char * const srcEnd = src + srcLen;
|
||||
|
||||
while ( (src < srcEnd) && (!dst || (len < dstLen)) )
|
||||
{
|
||||
unsigned char cc = *psz++;
|
||||
if (cc != '+')
|
||||
const unsigned char cc = *src++;
|
||||
|
||||
if ( state.IsShifted() )
|
||||
{
|
||||
// plain ASCII char
|
||||
if (buf)
|
||||
*buf++ = cc;
|
||||
len++;
|
||||
}
|
||||
else if (*psz == '-')
|
||||
{
|
||||
// encoded plus sign
|
||||
if (buf)
|
||||
*buf++ = cc;
|
||||
len++;
|
||||
psz++;
|
||||
}
|
||||
else // start of BASE64 encoded string
|
||||
{
|
||||
bool lsb, ok;
|
||||
unsigned int d, l;
|
||||
for ( ok = lsb = false, d = 0, l = 0;
|
||||
(cc = utf7unb64[(unsigned char)*psz]) != 0xff;
|
||||
psz++ )
|
||||
const unsigned char dc = utf7unb64[cc];
|
||||
if ( dc == 0xff )
|
||||
{
|
||||
d <<= 6;
|
||||
d += cc;
|
||||
for (l += 6; l >= 8; lsb = !lsb)
|
||||
// end of encoded part
|
||||
state.ToDirect();
|
||||
|
||||
// re-parse this character normally below unless it's '-' which
|
||||
// is consumed by the decoder
|
||||
if ( cc == '-' )
|
||||
continue;
|
||||
}
|
||||
else // valid encoded character
|
||||
{
|
||||
// mini base64 decoder: each character is 6 bits
|
||||
state.bit += 6;
|
||||
state.accum <<= 6;
|
||||
state.accum += dc;
|
||||
|
||||
if ( state.bit >= 8 )
|
||||
{
|
||||
unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
|
||||
if (lsb)
|
||||
// got the full byte, consume it
|
||||
state.bit -= 8;
|
||||
unsigned char b = (state.accum >> state.bit) & 0x00ff;
|
||||
|
||||
if ( state.isLSB )
|
||||
{
|
||||
if (buf)
|
||||
*buf++ |= c;
|
||||
len ++;
|
||||
ok = true;
|
||||
// we've got the full word, output it
|
||||
if ( dst )
|
||||
*dst++ = (state.msb << 8) | b;
|
||||
len++;
|
||||
state.isLSB = false;
|
||||
}
|
||||
else
|
||||
else // MSB
|
||||
{
|
||||
if (buf)
|
||||
*buf = (wchar_t)(c << 8);
|
||||
// just store it while we wait for LSB
|
||||
state.msb = b;
|
||||
state.isLSB = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( !ok )
|
||||
if ( state.IsDirect() )
|
||||
{
|
||||
// start of an encoded segment?
|
||||
if ( cc == '+' )
|
||||
{
|
||||
// in valid UTF7 we should have valid characters after '+'
|
||||
return wxCONV_FAILED;
|
||||
}
|
||||
if ( src == srcEnd )
|
||||
return wxCONV_FAILED; // can't have '+' at the end
|
||||
|
||||
if (*psz == '-')
|
||||
psz++;
|
||||
if ( *src == '-' )
|
||||
{
|
||||
// just the encoded plus sign, don't switch to shifted mode
|
||||
if ( dst )
|
||||
*dst++ = '+';
|
||||
len++;
|
||||
src++;
|
||||
}
|
||||
else
|
||||
{
|
||||
state.ToShifted();
|
||||
}
|
||||
}
|
||||
else // not '+'
|
||||
{
|
||||
// only printable 7 bit ASCII characters (with the exception of
|
||||
// NUL, TAB, CR and LF) can be used directly
|
||||
if ( cc >= 0x7f || (cc < ' ' &&
|
||||
!(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
|
||||
return wxCONV_FAILED;
|
||||
|
||||
if ( dst )
|
||||
*dst++ = cc;
|
||||
len++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( buf && (len < n) )
|
||||
*buf = '\0';
|
||||
if ( !len )
|
||||
{
|
||||
// as we didn't read any characters we should be called with the same
|
||||
// data (followed by some more new data) again later so don't save our
|
||||
// state
|
||||
state = stateOrig;
|
||||
|
||||
return wxCONV_FAILED;
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
@ -616,7 +680,7 @@ static const unsigned char utf7enb64[] =
|
||||
//
|
||||
static const unsigned char utf7encode[128] =
|
||||
{
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
|
||||
0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
|
||||
@ -626,21 +690,72 @@ static const unsigned char utf7encode[128] =
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
|
||||
};
|
||||
|
||||
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
|
||||
static inline bool wxIsUTF7Direct(wchar_t wc)
|
||||
{
|
||||
return wc < 0x80 && utf7encode[wc] < 1;
|
||||
}
|
||||
|
||||
size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
|
||||
const wchar_t *src, size_t srcLen) const
|
||||
{
|
||||
EncoderState stateOrig,
|
||||
*statePtr;
|
||||
if ( srcLen == wxNO_LEN )
|
||||
{
|
||||
// we don't apply the stored state when operating on entire strings at
|
||||
// once
|
||||
statePtr = &stateOrig;
|
||||
|
||||
srcLen = wxWcslen(src) + 1;
|
||||
}
|
||||
else // do use the mode we left the output in previously
|
||||
{
|
||||
stateOrig = m_stateEncoder;
|
||||
statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
|
||||
}
|
||||
|
||||
EncoderState& state = *statePtr;
|
||||
|
||||
|
||||
size_t len = 0;
|
||||
|
||||
while (*psz && ((!buf) || (len < n)))
|
||||
const wchar_t * const srcEnd = src + srcLen;
|
||||
while ( src < srcEnd && (!dst || len < dstLen) )
|
||||
{
|
||||
wchar_t cc = *psz++;
|
||||
if (cc < 0x80 && utf7encode[cc] < 1)
|
||||
wchar_t cc = *src++;
|
||||
if ( wxIsUTF7Direct(cc) )
|
||||
{
|
||||
// plain ASCII char
|
||||
if (buf)
|
||||
*buf++ = (char)cc;
|
||||
if ( state.IsShifted() )
|
||||
{
|
||||
// pad with zeros the last encoded block if necessary
|
||||
if ( state.bit )
|
||||
{
|
||||
if ( dst )
|
||||
*dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
|
||||
len++;
|
||||
}
|
||||
|
||||
state.ToDirect();
|
||||
|
||||
if ( dst )
|
||||
*dst++ = '-';
|
||||
len++;
|
||||
}
|
||||
|
||||
if ( dst )
|
||||
*dst++ = (char)cc;
|
||||
len++;
|
||||
}
|
||||
else if ( cc == '+' && state.IsDirect() )
|
||||
{
|
||||
if ( dst )
|
||||
{
|
||||
*dst++ = '+';
|
||||
*dst++ = '-';
|
||||
}
|
||||
|
||||
len += 2;
|
||||
}
|
||||
#ifndef WC_UTF16
|
||||
else if (((wxUint32)cc) > 0xffff)
|
||||
{
|
||||
@ -650,52 +765,45 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
|
||||
#endif
|
||||
else
|
||||
{
|
||||
if (buf)
|
||||
*buf++ = '+';
|
||||
|
||||
len++;
|
||||
if (cc != '+')
|
||||
if ( state.IsDirect() )
|
||||
{
|
||||
// BASE64 encode string
|
||||
unsigned int lsb, d, l;
|
||||
for (d = 0, l = 0; /*nothing*/; psz++)
|
||||
{
|
||||
for (lsb = 0; lsb < 2; lsb ++)
|
||||
{
|
||||
d <<= 8;
|
||||
d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
|
||||
state.ToShifted();
|
||||
|
||||
for (l += 8; l >= 6; )
|
||||
{
|
||||
l -= 6;
|
||||
if (buf)
|
||||
*buf++ = utf7enb64[(d >> l) % 64];
|
||||
len++;
|
||||
}
|
||||
}
|
||||
|
||||
cc = *psz;
|
||||
if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
|
||||
break;
|
||||
}
|
||||
|
||||
if (l != 0)
|
||||
{
|
||||
if (buf)
|
||||
*buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
|
||||
|
||||
len++;
|
||||
}
|
||||
if ( dst )
|
||||
*dst++ = '+';
|
||||
len++;
|
||||
}
|
||||
|
||||
if (buf)
|
||||
*buf++ = '-';
|
||||
len++;
|
||||
// BASE64 encode string
|
||||
for ( ;; )
|
||||
{
|
||||
for ( unsigned lsb = 0; lsb < 2; lsb++ )
|
||||
{
|
||||
state.accum <<= 8;
|
||||
state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
|
||||
|
||||
for (state.bit += 8; state.bit >= 6; )
|
||||
{
|
||||
state.bit -= 6;
|
||||
if ( dst )
|
||||
*dst++ = utf7enb64[(state.accum >> state.bit) % 64];
|
||||
len++;
|
||||
}
|
||||
}
|
||||
|
||||
if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
|
||||
break;
|
||||
|
||||
src++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (buf && (len < n))
|
||||
*buf = 0;
|
||||
// we need to restore the original encoder state if we were called just to
|
||||
// calculate the amount of space needed as we will presumably be called
|
||||
// again to really convert the data now
|
||||
if ( !dst )
|
||||
state = stateOrig;
|
||||
|
||||
return len;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user