Replaced the recently added wxMBConv::GetMBNul() with a less clever but better

working GetMinMBCharWidth(): the idea is that we can't deal with completely
arbitrary encodings anyhow using the current API, as we get confused by NUL
runs in the middle of the string. So instead just deal correctly with normal
multibyte encodings, UTF-16/UCS-2 and UTF-32/UCS-4, which should cover 99.9% of
the cases.


git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@38523 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
Vadim Zeitlin 2006-04-02 14:59:53 +00:00
parent 11fead7c46
commit c1464d9d10
2 changed files with 145 additions and 107 deletions

View File

@ -85,15 +85,18 @@ public:
virtual ~wxMBConv();
private:
// this function must return the multibyte representation of L'\0'
// this function is used in the implementation of cMB2WC() to distinguish
// between the following cases:
//
// on error, nulLen should be set to -1
virtual const char *GetMBNul(size_t *nulLen) const
{
*nulLen = 1;
return "";
}
// a) var width encoding with strings terminated by a single NUL
// (usual multibyte encodings): return 1 in this case
// b) fixed width encoding with 2 bytes/char and so terminated by
// 2 NULs (UTF-16/UCS-2 and variants): return 2 in this case
// c) fixed width encoding with 4 bytes/char and so terminated by
// 4 NULs (UTF-32/UCS-4 and variants): return 4 in this case
//
// anything else is not supported currently and -1 should be returned
virtual size_t GetMinMBCharWidth() const { return 1; }
};
// ----------------------------------------------------------------------------
@ -134,10 +137,10 @@ public:
}
private:
virtual const char *GetMBNul(size_t *nulLen) const
virtual size_t GetMinMBCharWidth() const
{
// cast needed to call a private function
return ((wxConvBrokenFileNames *)m_conv)->GetMBNul(nulLen);
return ((wxConvBrokenFileNames *)m_conv)->GetMinMBCharWidth();
}
@ -186,11 +189,7 @@ private:
class WXDLLIMPEXP_BASE wxMBConvUTF16Base : public wxMBConv
{
private:
virtual const char *GetMBNul(size_t *nulLen) const
{
*nulLen = 2;
return "\0";
}
virtual size_t GetMinMBCharWidth() const { return 2; }
};
// ----------------------------------------------------------------------------
@ -222,11 +221,7 @@ public:
class WXDLLIMPEXP_BASE wxMBConvUTF32Base : public wxMBConv
{
private:
virtual const char *GetMBNul(size_t *nulLen) const
{
*nulLen = 4;
return "\0\0\0";
}
virtual size_t GetMinMBCharWidth() const { return 4; }
};
// ----------------------------------------------------------------------------
@ -289,7 +284,7 @@ private:
// charset string
void SetName(const wxChar *charset);
virtual const char *GetMBNul(size_t *nulLen) const;
virtual size_t GetMinMBCharWidth() const;
// note that we can't use wxString here because of compilation

View File

@ -187,6 +187,15 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
return buf;
}
// helper of cMB2WC(): returns true if at least one of the n bytes starting at
// p is not NUL; returns false only if all n bytes are NUL (or if n is 0)
//
// the scan stops at the first non-NUL byte, so no bytes after it are read
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
const wxWCharBuffer
wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
{
@ -196,87 +205,108 @@ wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
// the current length of wbuf
size_t lenBuf = 0;
// we need to know the representation of L'\0' for this conversion
size_t nulLen;
const char * const nul = GetMBNul(&nulLen);
if ( nulLen == (size_t)-1 || nulLen == 0 )
return wxWCharBuffer();
// the number of NULs terminating this string
size_t nulLen wxDUMMY_INITIALIZE(0);
// make a copy of the input string unless it is already properly
// NUL-terminated
wxCharBuffer bufTmp;
// now we can compute the input size if we were not given it: notice that
// in this case the string must be properly NUL-terminated, of course, as
// otherwise we have no way of knowing how long it is
if ( inLen == (size_t)-1 )
// if we were not given the input size we just have to assume that the
// string is properly terminated as we have no way of knowing how long it
// is anyhow, but if we do have the size check whether there are enough
// NULs at the end
if ( inLen != (size_t)-1 )
{
// not the most efficient algorithm but it shouldn't matter as normally
// there are not many NULs in the string and so normally memcmp()
// should stop on the first character
const char *p = in;
while ( memcmp(p, nul, nulLen) != 0 )
p++;
// we need to know how to find the end of this string
nulLen = GetMinMBCharWidth();
if ( nulLen == (size_t)-1 )
return wbuf;
inLen = p - in + nulLen;
}
else // we already have the size
{
// check if it's not already NUL-terminated too to avoid the copy
if ( inLen < nulLen || memcmp(in + inLen - nulLen, nul, nulLen) != 0 )
// if there are enough NULs we can avoid the copy
if ( inLen < nulLen || NotAllNULs(in + inLen - nulLen, nulLen) )
{
// make a copy in order to properly NUL-terminate the string
bufTmp = wxCharBuffer(inLen + nulLen - 1 /* 1 will be added */);
memcpy(bufTmp.data(), in, inLen);
memcpy(bufTmp.data() + inLen, nul, nulLen);
char * const p = bufTmp.data();
memcpy(p, in, inLen);
for ( char *s = p + inLen; s < p + inLen + nulLen; s++ )
*s = '\0';
}
}
if ( bufTmp )
in = bufTmp;
size_t lenChunk;
for ( const char * const inEnd = in + inLen;; )
{
// try to convert the current chunk if anything left
size_t lenChunk = in < inEnd ? MB2WC(NULL, in, 0) : 0;
// try to convert the current chunk
lenChunk = MB2WC(NULL, in, 0);
if ( lenChunk == 0 )
{
// nothing left in the input string, conversion succeeded
if ( outLen )
{
// we shouldn't include the last NUL in the result length
*outLen = lenBuf ? lenBuf - 1 : 0;
}
return wbuf;
break;
}
if ( lenChunk == (size_t)-1 )
break;
// if we already have a previous chunk, leave the NUL separating it
// from this one
if ( lenBuf )
lenBuf++;
const size_t lenBufNew = lenBuf + lenChunk;
if ( !wbuf.extend(lenBufNew) )
{
lenChunk = (size_t)-1;
break;
}
lenChunk = MB2WC(wbuf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
if ( lenChunk == (size_t)-1 )
break;
// +1 for the embedded NUL (if something follows)
lenBuf = lenBufNew + 1;
lenBuf = lenBufNew;
if ( inLen == (size_t)-1 )
{
// convert only one chunk in this case, as we suppose that the
// string is NUL-terminated and so inEnd is not used at all
break;
}
// advance the input pointer past the end of this chunk
while ( memcmp(in, nul, nulLen) != 0 )
in++;
while ( NotAllNULs(in, nulLen) )
{
// notice that we must skip over multiple bytes here as we suppose
// that if NUL takes 2 or 4 bytes, then all the other characters do
// too and so if advanced by a single byte we might erroneously
// detect sequences of NUL bytes in the middle of the input
in += nulLen;
}
in += nulLen; // skipping over its terminator as well
// note that ">=" (and not just "==") is needed here as the terminator
// we skipped just above could be inside or just after the buffer
// delimited by inEnd
if ( in >= inEnd )
break;
}
// conversion failed
if ( outLen )
*outLen = 0;
if ( lenChunk == (size_t)-1 )
{
// conversion failed
lenBuf = 0;
wbuf.reset();
}
return wxWCharBuffer();
if ( outLen )
*outLen = lenBuf;
return wbuf;
}
const wxCharBuffer
@ -1352,7 +1382,9 @@ protected:
#endif
private:
virtual const char *GetMBNul(size_t *nulLen) const;
// classify this encoding as explained in wxMBConv::GetMinMBCharWidth()
// comment
virtual size_t GetMinMBCharWidth() const;
// the name (for iconv_open()) of a wide char charset -- if none is
// available on this machine, it will remain NULL
@ -1362,9 +1394,9 @@ private:
// different endian-ness than the native one
static bool ms_wcNeedsSwap;
// NUL representation
size_t m_nulLen;
char m_nulBuf[8];
// cached result of GetMinMBCharWidth(); set to 0 meaning "unknown"
// initially
size_t m_minMBCharWidth;
};
// make the constructor available for unit testing
@ -1384,7 +1416,7 @@ bool wxMBConv_iconv::ms_wcNeedsSwap = false;
wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
{
m_nulLen = (size_t)-2;
m_minMBCharWidth = 0;
// iconv operates with chars, not wxChars, but luckily it uses only ASCII
// names for the charsets
@ -1642,9 +1674,9 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
return res;
}
const char *wxMBConv_iconv::GetMBNul(size_t *nulLen) const
size_t wxMBConv_iconv::GetMinMBCharWidth() const
{
if ( m_nulLen == (size_t)-2 )
if ( m_minMBCharWidth == 0 )
{
wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
@ -1654,22 +1686,22 @@ const char *wxMBConv_iconv::GetMBNul(size_t *nulLen) const
#endif
wchar_t *wnul = L"";
char buf[8]; // should be enough for NUL in any encoding
size_t inLen = sizeof(wchar_t),
outLen = WXSIZEOF(m_nulBuf);
outLen = WXSIZEOF(buf);
const char *in = (char *)wnul;
char *out = self->m_nulBuf;
char *out = buf;
if ( iconv(w2m, &in, &inLen, &out, &outLen) == (size_t)-1 )
{
self->m_nulLen = (size_t)-1;
self->m_minMBCharWidth = (size_t)-1;
}
else // ok
{
self->m_nulLen = out - m_nulBuf;
self->m_minMBCharWidth = out - buf;
}
}
*nulLen = m_nulLen;
return m_nulBuf;
return m_minMBCharWidth;
}
#endif // HAVE_ICONV
@ -1693,20 +1725,20 @@ public:
wxMBConv_win32()
{
m_CodePage = CP_ACP;
m_nulLen = (size_t)-2;
m_minMBCharWidth = 0;
}
#if wxUSE_FONTMAP
wxMBConv_win32(const wxChar* name)
{
m_CodePage = wxCharsetToCodepage(name);
m_nulLen = (size_t)-2;
m_minMBCharWidth = 0;
}
wxMBConv_win32(wxFontEncoding encoding)
{
m_CodePage = wxEncodingToCodepage(encoding);
m_nulLen = (size_t)-2;
m_minMBCharWidth = 0;
}
#endif // wxUSE_FONTMAP
@ -1933,35 +1965,50 @@ private:
#endif
}
virtual const char *GetMBNul(size_t *nulLen) const
virtual size_t GetMinMBCharWidth() const
{
if ( m_nulLen == (size_t)-2 )
if ( m_minMBCharWidth == 0 )
{
int len = ::WideCharToMultiByte
(
m_CodePage, // code page
0, // no flags
L"", // input string
1, // translate just the NUL
NULL, // output buffer
0, // and its size
NULL, // no replacement char
NULL // [out] don't care if it was used
);
wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
switch ( len )
{
default:
wxLogDebug(_T("Unexpected NUL length %d"), len);
// fall through
self->m_nulLen = ::WideCharToMultiByte
(
m_CodePage, // code page
0, // no flags
L"", // input string
1, // translate just NUL
self->m_nulBuf, // output buffer
WXSIZEOF(m_nulBuf), // and its size
NULL, // "replacement" char
NULL // [out] was it used?
);
case 0:
self->m_minMBCharWidth = (size_t)-1;
break;
if ( m_nulLen == 0 )
self->m_nulLen = (size_t)-1;
case 1:
case 2:
case 4:
self->m_minMBCharWidth = len;
break;
}
}
*nulLen = m_nulLen;
return m_nulBuf;
return m_minMBCharWidth;
}
// the code page we're working with
long m_CodePage;
size_t m_nulLen;
char m_nulBuf[8];
// cached result of GetMinMBCharWidth(), set to 0 initially meaning
// "unknown"
size_t m_minMBCharWidth;
};
#endif // wxHAVE_WIN32_MB2WC
@ -2602,23 +2649,20 @@ public:
wxEncodingConverter m2w, w2m;
private:
virtual const char *GetMBNul(size_t *nulLen) const
virtual size_t GetMinMBCharWidth() const
{
switch ( m_enc )
{
case wxFONTENCODING_UTF16BE:
case wxFONTENCODING_UTF16LE:
*nulLen = 2;
return "\0";
return 2;
case wxFONTENCODING_UTF32BE:
case wxFONTENCODING_UTF32LE:
*nulLen = 4;
return "\0\0\0";
return 4;
default:
*nulLen = 1;
return "";
return 1;
}
}
@ -3014,18 +3058,17 @@ size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
return len;
}
const char *wxCSConv::GetMBNul(size_t *nulLen) const
size_t wxCSConv::GetMinMBCharWidth() const
{
CreateConvIfNeeded();
if ( m_convReal )
{
// cast needed just to call private function of m_convReal
return ((wxCSConv *)m_convReal)->GetMBNul(nulLen);
return ((wxCSConv *)m_convReal)->GetMinMBCharWidth();
}
*nulLen = 1;
return "";
return 1;
}
// ----------------------------------------------------------------------------