From 58d940690abbb7f59fd6b3e189328afed7f53821 Mon Sep 17 00:00:00 2001 From: ARATA Mizuki Date: Fri, 21 Apr 2017 04:32:32 +0900 Subject: [PATCH] Better handling of supplementary wxUniChar values in some of wxString methods On MSW, the Unicode code point is now properly encoded as UTF-16 when assigned or appended to a wxString. Closes #11827 --- include/wx/string.h | 109 ++++++++++++++++----------------------- include/wx/stringops.h | 34 +++++++++++- src/common/stringops.cpp | 62 ++++++++++++++++++++++ 3 files changed, 139 insertions(+), 66 deletions(-) diff --git a/include/wx/string.h b/include/wx/string.h index 3441a7f7d6..b80998cc44 100644 --- a/include/wx/string.h +++ b/include/wx/string.h @@ -898,9 +898,6 @@ public: wxStringIteratorNode m_node; }; - size_t IterToImplPos(wxString::iterator i) const - { return wxStringImpl::const_iterator(i.impl()) - m_impl.begin(); } - iterator GetIterForNthChar(size_t n) { return iterator(this, m_impl.begin() + PosToImpl(n)); } const_iterator GetIterForNthChar(size_t n) const @@ -975,6 +972,9 @@ public: const_iterator GetIterForNthChar(size_t n) const { return begin() + n; } #endif // wxUSE_UNICODE_UTF8/!wxUSE_UNICODE_UTF8 + size_t IterToImplPos(wxString::iterator i) const + { return wxStringImpl::const_iterator(i.impl()) - m_impl.begin(); } + #undef WX_STR_ITERATOR_TAG #undef WX_STR_ITERATOR_IMPL @@ -1820,12 +1820,11 @@ public: { wxSTRING_INVALIDATE_CACHE(); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - m_impl = wxStringOperations::EncodeChar(ch); - else -#endif // wxUSE_UNICODE_UTF8 + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) m_impl = (wxStringCharType)ch; + else + m_impl = wxStringOperations::EncodeChar(ch); + return *this; } @@ -2410,20 +2409,18 @@ public: // append n copies of ch wxString& append(size_t n, wxUniChar ch) { -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - { - wxSTRING_INVALIDATE_CACHED_LENGTH(); - - m_impl.append(wxStringOperations::EncodeNChars(n, ch)); - } - else // ASCII -#endif + if ( 
wxStringOperations::IsSingleCodeUnitCharacter(ch) ) { wxSTRING_UPDATE_CACHED_LENGTH(n); m_impl.append(n, (wxStringCharType)ch); } + else + { + wxSTRING_INVALIDATE_CACHED_LENGTH(); + + m_impl.append(wxStringOperations::EncodeNChars(n, ch)); + } return *this; } @@ -2556,12 +2553,10 @@ public: { wxSTRING_SET_CACHED_LENGTH(n); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - m_impl.assign(wxStringOperations::EncodeNChars(n, ch)); - else -#endif + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) m_impl.assign(n, (wxStringCharType)ch); + else + m_impl.assign(wxStringOperations::EncodeNChars(n, ch)); return *this; } @@ -2671,12 +2666,11 @@ public: { wxSTRING_UPDATE_CACHED_LENGTH(n); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - m_impl.insert(PosToImpl(nPos), wxStringOperations::EncodeNChars(n, ch)); - else -#endif + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) m_impl.insert(PosToImpl(nPos), n, (wxStringCharType)ch); + else + m_impl.insert(PosToImpl(nPos), wxStringOperations::EncodeNChars(n, ch)); + return *this; } @@ -2684,16 +2678,14 @@ public: { wxSTRING_UPDATE_CACHED_LENGTH(1); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) + return iterator(this, m_impl.insert(it.impl(), (wxStringCharType)ch)); + else { size_t pos = IterToImplPos(it); m_impl.insert(pos, wxStringOperations::EncodeChar(ch)); return iterator(this, m_impl.begin() + pos); } - else -#endif - return iterator(this, m_impl.insert(it.impl(), (wxStringCharType)ch)); } void insert(iterator it, const_iterator first, const_iterator last) @@ -2716,12 +2708,10 @@ public: { wxSTRING_UPDATE_CACHED_LENGTH(n); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - m_impl.insert(IterToImplPos(it), wxStringOperations::EncodeNChars(n, ch)); - else -#endif + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) m_impl.insert(it.impl(), n, (wxStringCharType)ch); + else + m_impl.insert(IterToImplPos(it), wxStringOperations::EncodeNChars(n, ch)); } 
// delete characters from nStart to nStart + nLen @@ -2800,12 +2790,11 @@ public: size_t from, len; PosLenToImpl(nStart, nLen, &from, &len); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - m_impl.replace(from, len, wxStringOperations::EncodeNChars(nCount, ch)); - else -#endif + + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) m_impl.replace(from, len, nCount, (wxStringCharType)ch); + else + m_impl.replace(from, len, wxStringOperations::EncodeNChars(nCount, ch)); return *this; } @@ -2921,13 +2910,11 @@ public: { wxSTRING_INVALIDATE_CACHE(); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) + m_impl.replace(first.impl(), last.impl(), n, (wxStringCharType)ch); + else m_impl.replace(first.impl(), last.impl(), wxStringOperations::EncodeNChars(n, ch)); - else -#endif - m_impl.replace(first.impl(), last.impl(), n, (wxStringCharType)ch); return *this; } @@ -2988,15 +2975,12 @@ public: // find the first occurrence of character ch after nStart size_t find(wxUniChar ch, size_t nStart = 0) const { -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - return PosFromImpl(m_impl.find(wxStringOperations::EncodeChar(ch), - PosToImpl(nStart))); - else -#endif + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) return PosFromImpl(m_impl.find((wxStringCharType)ch, PosToImpl(nStart))); - + else + return PosFromImpl(m_impl.find(wxStringOperations::EncodeChar(ch), + PosToImpl(nStart))); } size_t find(wxUniCharRef ch, size_t nStart = 0) const { return find(wxUniChar(ch), nStart); } @@ -3033,13 +3017,11 @@ public: // as find, but from the end size_t rfind(wxUniChar ch, size_t nStart = npos) const { -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - return PosFromImpl(m_impl.rfind(wxStringOperations::EncodeChar(ch), + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) + return PosFromImpl(m_impl.rfind((wxStringCharType)ch, PosToImpl(nStart))); else -#endif - return PosFromImpl(m_impl.rfind((wxStringCharType)ch, + 
return PosFromImpl(m_impl.rfind(wxStringOperations::EncodeChar(ch), PosToImpl(nStart))); } size_t rfind(wxUniCharRef ch, size_t nStart = npos) const @@ -3301,12 +3283,11 @@ public: { wxSTRING_UPDATE_CACHED_LENGTH(1); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - m_impl += wxStringOperations::EncodeChar(ch); - else -#endif + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) m_impl += (wxStringCharType)ch; + else + m_impl += wxStringOperations::EncodeChar(ch); + return *this; } wxString& operator+=(wxUniCharRef ch) { return *this += wxUniChar(ch); } diff --git a/include/wx/stringops.h b/include/wx/stringops.h index 21c6121787..fd6695116f 100644 --- a/include/wx/stringops.h +++ b/include/wx/stringops.h @@ -44,9 +44,36 @@ struct WXDLLIMPEXP_BASE wxStringOperationsWchar static ptrdiff_t DiffIters(const Iterator& i1, const Iterator& i2) { return i1 - i2; } +#if wxUSE_UNICODE_UTF16 + // encodes the character as UTF-16: + struct Utf16CharBuffer + { + Utf16CharBuffer() : data() {} + wchar_t data[3]; + operator const wchar_t*() const { return data; } + }; + static Utf16CharBuffer EncodeChar(const wxUniChar& ch); + static wxWCharBuffer EncodeNChars(size_t n, const wxUniChar& ch); + static bool IsSingleCodeUnitCharacter(const wxUniChar& ch) + { return !ch.IsSupplementary(); } +#else // encodes the character to a form used to represent it in internal - // representation (returns a string in UTF8 version) - static wxChar EncodeChar(const wxUniChar& ch) { return (wxChar)ch; } + // representation + struct SingleCharBuffer + { + SingleCharBuffer() : data() {} + wxChar data[2]; + operator const wxChar*() const { return data; } + }; + static SingleCharBuffer EncodeChar(const wxUniChar& ch) + { + SingleCharBuffer buf; + buf.data[0] = (wxChar)ch; + return buf; + } + static wxWxCharBuffer EncodeNChars(size_t n, const wxUniChar& ch); + static bool IsSingleCodeUnitCharacter(const wxUniChar&) { return true; } +#endif static wxUniChar DecodeChar(const wxStringImpl::const_iterator&
i) { return *i; } @@ -134,6 +161,9 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8 return dist; } + static bool IsSingleCodeUnitCharacter(const wxUniChar& ch) + { return ch.IsAscii(); } + // encodes the character as UTF-8: typedef wxUniChar::Utf8CharBuffer Utf8CharBuffer; static Utf8CharBuffer EncodeChar(const wxUniChar& ch) diff --git a/src/common/stringops.cpp b/src/common/stringops.cpp index 36ff4045a4..2d8fcaee3e 100644 --- a/src/common/stringops.cpp +++ b/src/common/stringops.cpp @@ -27,6 +27,68 @@ // implementation // =========================================================================== +#if wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE + +#if wxUSE_UNICODE_UTF16 + +wxStringOperationsWchar::Utf16CharBuffer wxStringOperationsWchar::EncodeChar(const wxUniChar& ch) +{ + Utf16CharBuffer buf; + if ( ch.IsSupplementary() ) + { + buf.data[0] = (wchar_t)ch.HighSurrogate(); + buf.data[1] = (wchar_t)ch.LowSurrogate(); + } + else + { + // Assume ch is a BMP character + buf.data[0] = (wchar_t)ch; + } + return buf; +} + +wxWCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar& ch) +{ + if ( ch.IsSupplementary() ) + { + wxWCharBuffer buf(n * 2); + wchar_t s[2] = { + (wchar_t)ch.HighSurrogate(), + (wchar_t)ch.LowSurrogate(), + }; + wchar_t *ptr = buf.data(); + for (size_t i = 0; i < n; i++, ptr += 2) + { + wmemcpy(ptr, s, 2); + } + return buf; + } + else + { + // Assume ch is a BMP character + wxWCharBuffer buf(n); + wmemset(buf.data(), (wchar_t)ch, n); + return buf; + } +} + +#else + +wxWxCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar& ch) +{ + wxWxCharBuffer buf(n); +#if wxUSE_UNICODE_WCHAR + wmemset(buf.data(), (wchar_t)ch, n); +#else // ANSI + memset(buf.data(), (unsigned char)ch, n); +#endif + return buf; +} + +#endif // wxUSE_UNICODE_UTF16 + +#endif // wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE + #if wxUSE_UNICODE_UTF8 // ---------------------------------------------------------------------------