replaced recently added wxMBConv::GetMBNul() with a less clever but better

working GetMinMBCharWidth(): the idea is that we can't deal with completely arbitrary encodings anyhow using the current API as we get confused by NUL runs in the middle of the string, so instead just deal correctly with normal multibyte encodings, UTF-16/UCS-2 and UTF-32/UCS-4 which should cover 99.9% of the cases git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@38523 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
2006-04-02 14:59:53 +00:00 · 2006-04-02 14:59:53 +00:00 · c1464d9d10
commit c1464d9d10
parent 11fead7c46
2 changed files with 145 additions and 107 deletions
--- a/include/wx/strconv.h
+++ b/include/wx/strconv.h
@ -85,15 +85,18 @@ public:
    virtual ~wxMBConv();

 private:
-    // this function must return the multibyte representation of L'\0'
+    // this function is used in the implementation of cMB2WC() to distinguish
+    // between the following cases:
    //
-    // on error, nulLen should be set to -1
-    virtual const char *GetMBNul(size_t *nulLen) const
-    {
-        *nulLen = 1;
-
-        return "";
-    }
+    //      a) var width encoding with strings terminated by a single NUL
+    //         (usual multibyte encodings): return 1 in this case
+    //      b) fixed width encoding with 2 bytes/char and so terminated by
+    //         2 NULs (UTF-16/UCS-2 and variants): return 2 in this case
+    //      c) fixed width encoding with 4 bytes/char and so terminated by
+    //         4 NULs (UTF-32/UCS-4 and variants): return 4 in this case
+    //
+    // anything else is not supported currently and -1 should be returned
+    virtual size_t GetMinMBCharWidth() const { return 1; }
 };

 // ----------------------------------------------------------------------------
@ -134,10 +137,10 @@ public:
    }

 private:
-    virtual const char *GetMBNul(size_t *nulLen) const
+    virtual size_t GetMinMBCharWidth() const
    {
        // cast needed to call a private function
-        return ((wxConvBrokenFileNames *)m_conv)->GetMBNul(nulLen);
+        return ((wxConvBrokenFileNames *)m_conv)->GetMinMBCharWidth();
    }


@ -186,11 +189,7 @@ private:
 class WXDLLIMPEXP_BASE wxMBConvUTF16Base : public wxMBConv
 {
 private:
-    virtual const char *GetMBNul(size_t *nulLen) const
-    {
-        *nulLen = 2;
-        return "\0";
-    }
+    virtual size_t GetMinMBCharWidth() const { return 2; }
 };

 // ----------------------------------------------------------------------------
@ -222,11 +221,7 @@ public:
 class WXDLLIMPEXP_BASE wxMBConvUTF32Base : public wxMBConv
 {
 private:
-    virtual const char *GetMBNul(size_t *nulLen) const
-    {
-        *nulLen = 4;
-        return "\0\0\0";
-    }
+    virtual size_t GetMinMBCharWidth() const { return 4; }
 };

 // ----------------------------------------------------------------------------
@ -289,7 +284,7 @@ private:
    // charset string
    void SetName(const wxChar *charset);

-    virtual const char *GetMBNul(size_t *nulLen) const;
+    virtual size_t GetMinMBCharWidth() const;


    // note that we can't use wxString here because of compilation
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@ -187,6 +187,15 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
    return buf;
 }

+// helper of cMB2WC(): check if n bytes at this location are all NUL
+static bool NotAllNULs(const char *p, size_t n)
+{
+    while ( n && *p++ == '\0' )
+        n--;
+
+    return n != 0;
+}
+
 const wxWCharBuffer
 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
 {
@ -196,87 +205,108 @@ wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
    // the current length of wbuf
    size_t lenBuf = 0;

-    // we need to know the representation of L'\0' for this conversion
-    size_t nulLen;
-    const char * const nul = GetMBNul(&nulLen);
-    if ( nulLen == (size_t)-1 || nulLen == 0 )
-        return wxWCharBuffer();
+    // the number of NULs terminating this string
+    size_t nulLen   wxDUMMY_INITIALIZE(0);

    // make a copy of the input string unless it is already properly
    // NUL-terminated
    wxCharBuffer bufTmp;

-    // now we can compute the input size if we were not given it: notice that
-    // in this case the string must be properly NUL-terminated, of course, as
-    // otherwise we have no way of knowing how long it is
-    if ( inLen == (size_t)-1 )
+    // if we were not given the input size we just have to assume that the
+    // string is properly terminated as we have no way of knowing how long it
+    // is anyhow, but if we do have the size check whether there are enough
+    // NULs at the end
+    if ( inLen != (size_t)-1 )
    {
-        // not the most efficient algorithm but it shouldn't matter as normally
-        // there are not many NULs in the string and so normally memcmp()
-        // should stop on the first character
-        const char *p = in;
-        while ( memcmp(p, nul, nulLen) != 0 )
-            p++;
+        // we need to know how to find the end of this string
+        nulLen = GetMinMBCharWidth();
+        if ( nulLen == (size_t)-1 )
+            return wbuf;

-        inLen = p - in + nulLen;
-    }
-    else // we already have the size
-    {
-        // check if it's not already NUL-terminated too to avoid the copy
-        if ( inLen < nulLen || memcmp(in + inLen - nulLen, nul, nulLen) != 0 )
+        // if there are enough NULs we can avoid the copy
+        if ( inLen < nulLen || NotAllNULs(in + inLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(inLen + nulLen - 1 /* 1 will be added */);
-            memcpy(bufTmp.data(), in, inLen);
-            memcpy(bufTmp.data() + inLen, nul, nulLen);
+            char * const p = bufTmp.data();
+            memcpy(p, in, inLen);
+            for ( char *s = p + inLen; s < p + inLen + nulLen; s++ )
+                *s = '\0';
        }
    }

    if ( bufTmp )
        in = bufTmp;

+    size_t lenChunk;
    for ( const char * const inEnd = in + inLen;; )
    {
-        // try to convert the current chunk if anything left
-        size_t lenChunk = in < inEnd ? MB2WC(NULL, in, 0) : 0;
+        // try to convert the current chunk
+        lenChunk = MB2WC(NULL, in, 0);
        if ( lenChunk == 0 )
        {
            // nothing left in the input string, conversion succeeded
-            if ( outLen )
-            {
-                // we shouldn't include the last NUL in the result length
-                *outLen = lenBuf ? lenBuf - 1 : 0;
-            }
-
-            return wbuf;
+            break;
        }

        if ( lenChunk == (size_t)-1 )
            break;

+        // if we already have a previous chunk, leave the NUL separating it
+        // from this one
+        if ( lenBuf )
+            lenBuf++;
+
        const size_t lenBufNew = lenBuf + lenChunk;
        if ( !wbuf.extend(lenBufNew) )
+        {
+            lenChunk = (size_t)-1;
            break;
+        }

        lenChunk = MB2WC(wbuf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
        if ( lenChunk == (size_t)-1 )
            break;

-        // +! for the embedded NUL (if something follows)
-        lenBuf = lenBufNew + 1;
+        lenBuf = lenBufNew;
+
+        if ( inLen == (size_t)-1 )
+        {
+            // convert only one chunk in this case, as we suppose that the
+            // string is NUL-terminated and so inEnd is not used at all
+            break;
+        }

        // advance the input pointer past the end of this chunk
-        while ( memcmp(in, nul, nulLen) != 0 )
-            in++;
+        while ( NotAllNULs(in, nulLen) )
+        {
+            // notice that we must skip over multiple bytes here as we suppose
+            // that if NUL takes 2 or 4 bytes, then all the other characters do
+            // too and so if advanced by a single byte we might erroneously
+            // detect sequences of NUL bytes in the middle of the input
+            in += nulLen;
+        }

        in += nulLen; // skipping over its terminator as well
+
+        // note that ">=" (and not just "==") is needed here as the terminator
+        // we skipped just above could be inside or just after the buffer
+        // delimited by inEnd
+        if ( in >= inEnd )
+            break;
    }

-    // conversion failed
-    if ( outLen )
-        *outLen = 0;
+    if ( lenChunk == (size_t)-1 )
+    {
+        // conversion failed
+        lenBuf = 0;
+        wbuf.reset();
+    }

-    return wxWCharBuffer();
+    if ( outLen )
+        *outLen = lenBuf;
+
+    return wbuf;
 }

 const wxCharBuffer
@ -1352,7 +1382,9 @@ protected:
 #endif

 private:
-    virtual const char *GetMBNul(size_t *nulLen) const;
+    // classify this encoding as explained in wxMBConv::GetMinMBCharWidth()
+    // comment
+    virtual size_t GetMinMBCharWidth() const;

    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain NULL
@ -1362,9 +1394,9 @@ private:
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;

-    // NUL representation
-    size_t m_nulLen;
-    char m_nulBuf[8];
+    // cached result of GetMinMBCharWidth(); set to 0 meaning "unknown"
+    // initially
+    size_t m_minMBCharWidth;
 };

 // make the constructor available for unit testing
@ -1384,7 +1416,7 @@ bool wxMBConv_iconv::ms_wcNeedsSwap = false;

 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
 {
-    m_nulLen = (size_t)-2;
+    m_minMBCharWidth = 0;

    // iconv operates with chars, not wxChars, but luckily it uses only ASCII
    // names for the charsets
@ -1642,9 +1674,9 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
    return res;
 }

-const char *wxMBConv_iconv::GetMBNul(size_t *nulLen) const
+size_t wxMBConv_iconv::GetMinMBCharWidth() const
 {
-    if ( m_nulLen == (size_t)-2 )
+    if ( m_minMBCharWidth == 0 )
    {
        wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);

@ -1654,22 +1686,22 @@ const char *wxMBConv_iconv::GetMBNul(size_t *nulLen) const
 #endif

        wchar_t *wnul = L"";
+        char buf[8]; // should be enough for NUL in any encoding
        size_t inLen = sizeof(wchar_t),
-               outLen = WXSIZEOF(m_nulBuf);
+               outLen = WXSIZEOF(buf);
        const char *in = (char *)wnul;
-        char *out = self->m_nulBuf;
+        char *out = buf;
        if ( iconv(w2m, &in, &inLen, &out, &outLen) == (size_t)-1 )
        {
-            self->m_nulLen = (size_t)-1;
+            self->m_minMBCharWidth = (size_t)-1;
        }
        else // ok
        {
-            self->m_nulLen = out - m_nulBuf;
+            self->m_minMBCharWidth = out - buf;
        }
    }

-    *nulLen = m_nulLen;
-    return m_nulBuf;
+    return m_minMBCharWidth;
 }

 #endif // HAVE_ICONV
@ -1693,20 +1725,20 @@ public:
    wxMBConv_win32()
    {
        m_CodePage = CP_ACP;
-        m_nulLen = (size_t)-2;
+        m_minMBCharWidth = 0;
    }

 #if wxUSE_FONTMAP
    wxMBConv_win32(const wxChar* name)
    {
        m_CodePage = wxCharsetToCodepage(name);
-        m_nulLen = (size_t)-2;
+        m_minMBCharWidth = 0;
    }

    wxMBConv_win32(wxFontEncoding encoding)
    {
        m_CodePage = wxEncodingToCodepage(encoding);
-        m_nulLen = (size_t)-2;
+        m_minMBCharWidth = 0;
    }
 #endif // wxUSE_FONTMAP

@ -1933,35 +1965,50 @@ private:
 #endif
    }

-    virtual const char *GetMBNul(size_t *nulLen) const
+    virtual size_t GetMinMBCharWidth() const
    {
-        if ( m_nulLen == (size_t)-2 )
+        if ( m_minMBCharWidth == 0 )
        {
+            int len = ::WideCharToMultiByte
+                        (
+                            m_CodePage,     // code page
+                            0,              // no flags
+                            L"",            // input string
+                            1,              // translate just the NUL
+                            NULL,           // output buffer
+                            0,              // and its size
+                            NULL,           // no replacement char
+                            NULL            // [out] don't care if it was used
+                        );
+
            wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
+            switch ( len )
+            {
+                default:
+                    wxLogDebug(_T("Unexpected NUL length %d"), len);
+                    // fall through

-            self->m_nulLen = ::WideCharToMultiByte
-                               (
-                                    m_CodePage,         // code page
-                                    0,                  // no flags
-                                    L"",                // input string
-                                    1,                  // translate just NUL
-                                    self->m_nulBuf,     // output buffer
-                                    WXSIZEOF(m_nulBuf), // and its size
-                                    NULL,               // "replacement" char
-                                    NULL                // [out] was it used?
-                               );
+                case 0:
+                    self->m_minMBCharWidth = (size_t)-1;
+                    break;

-            if ( m_nulLen == 0 )
-                self->m_nulLen = (size_t)-1;
+                case 1:
+                case 2:
+                case 4:
+                    self->m_minMBCharWidth = len;
+                    break;
+            }
        }

-        *nulLen = m_nulLen;
-        return m_nulBuf;
+        return m_minMBCharWidth;
    }

+    // the code page we're working with
    long m_CodePage;
-    size_t m_nulLen;
-    char m_nulBuf[8];
+
+    // cached result of GetMinMBCharWidth(), set to 0 initially meaning
+    // "unknown"
+    size_t m_minMBCharWidth;
 };

 #endif // wxHAVE_WIN32_MB2WC
@ -2602,23 +2649,20 @@ public:
    wxEncodingConverter m2w, w2m;

 private:
-    virtual const char *GetMBNul(size_t *nulLen) const
+    virtual size_t GetMinMBCharWidth() const
    {
        switch ( m_enc )
        {
            case wxFONTENCODING_UTF16BE:
            case wxFONTENCODING_UTF16LE:
-                *nulLen = 2;
-                return "\0";
+                return 2;

            case wxFONTENCODING_UTF32BE:
            case wxFONTENCODING_UTF32LE:
-                *nulLen = 4;
-                return "\0\0\0";
+                return 4;

            default:
-                *nulLen = 1;
-                return "";
+                return 1;
        }
    }

@ -3014,18 +3058,17 @@ size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
    return len;
 }

-const char *wxCSConv::GetMBNul(size_t *nulLen) const
+size_t wxCSConv::GetMinMBCharWidth() const
 {
    CreateConvIfNeeded();

    if ( m_convReal )
    {
        // cast needed just to call private function of m_convReal
-        return ((wxCSConv *)m_convReal)->GetMBNul(nulLen);
+        return ((wxCSConv *)m_convReal)->GetMinMBCharWidth();
    }

-    *nulLen = 1;
-    return "";
+    return 1;
 }

 // ----------------------------------------------------------------------------