Better handling of supplementary wxUniChar values in some wxString methods

On MSW, a supplementary Unicode code point (one outside the BMP) is now properly encoded as a UTF-16 surrogate pair when assigned or appended to a wxString. Closes #11827
2017-04-21 04:32:32 +09:00 · 2017-04-21 04:32:32 +09:00 · 58d940690a
commit 58d940690a
parent ad47857072
3 changed files with 139 additions and 66 deletions
--- a/include/wx/string.h
+++ b/include/wx/string.h
@ -898,9 +898,6 @@ public:
      wxStringIteratorNode m_node;
  };

-  size_t IterToImplPos(wxString::iterator i) const
-    { return wxStringImpl::const_iterator(i.impl()) - m_impl.begin(); }
-
  iterator GetIterForNthChar(size_t n)
    { return iterator(this, m_impl.begin() + PosToImpl(n)); }
  const_iterator GetIterForNthChar(size_t n) const
@ -975,6 +972,9 @@ public:
  const_iterator GetIterForNthChar(size_t n) const { return begin() + n; }
 #endif // wxUSE_UNICODE_UTF8/!wxUSE_UNICODE_UTF8

+  size_t IterToImplPos(wxString::iterator i) const // converts iterator i into an offset within m_impl
+    { return wxStringImpl::const_iterator(i.impl()) - m_impl.begin(); } // distance of i's underlying iterator from m_impl.begin()
+
  #undef WX_STR_ITERATOR_TAG
  #undef WX_STR_ITERATOR_IMPL

@ -1820,12 +1820,11 @@ public:
  {
    wxSTRING_INVALIDATE_CACHE();

-#if wxUSE_UNICODE_UTF8
-    if ( !ch.IsAscii() )
-        m_impl = wxStringOperations::EncodeChar(ch);
-    else
-#endif // wxUSE_UNICODE_UTF8
+    if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
        m_impl = (wxStringCharType)ch;
+    else
+        m_impl = wxStringOperations::EncodeChar(ch);
+
    return *this;
  }

@ -2410,20 +2409,18 @@ public:
    // append n copies of ch
  wxString& append(size_t n, wxUniChar ch)
  {
-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
-      {
-          wxSTRING_INVALIDATE_CACHED_LENGTH();
-
-          m_impl.append(wxStringOperations::EncodeNChars(n, ch));
-      }
-      else // ASCII
-#endif
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
      {
          wxSTRING_UPDATE_CACHED_LENGTH(n);

          m_impl.append(n, (wxStringCharType)ch);
      }
+      else
+      {
+          wxSTRING_INVALIDATE_CACHED_LENGTH();
+
+          m_impl.append(wxStringOperations::EncodeNChars(n, ch));
+      }

      return *this;
  }
@ -2556,12 +2553,10 @@ public:
  {
      wxSTRING_SET_CACHED_LENGTH(n);

-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
-          m_impl.assign(wxStringOperations::EncodeNChars(n, ch));
-      else
-#endif
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
          m_impl.assign(n, (wxStringCharType)ch);
+      else
+          m_impl.assign(wxStringOperations::EncodeNChars(n, ch));

      return *this;
  }
@ -2671,12 +2666,11 @@ public:
  {
      wxSTRING_UPDATE_CACHED_LENGTH(n);

-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
-          m_impl.insert(PosToImpl(nPos), wxStringOperations::EncodeNChars(n, ch));
-      else
-#endif
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
          m_impl.insert(PosToImpl(nPos), n, (wxStringCharType)ch);
+      else
+          m_impl.insert(PosToImpl(nPos), wxStringOperations::EncodeNChars(n, ch));
+
      return *this;
  }

@ -2684,16 +2678,14 @@ public:
  {
      wxSTRING_UPDATE_CACHED_LENGTH(1);

-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
+          return iterator(this, m_impl.insert(it.impl(), (wxStringCharType)ch));
+      else
      {
          size_t pos = IterToImplPos(it);
          m_impl.insert(pos, wxStringOperations::EncodeChar(ch));
          return iterator(this, m_impl.begin() + pos);
      }
-      else
-#endif
-          return iterator(this, m_impl.insert(it.impl(), (wxStringCharType)ch));
  }

  void insert(iterator it, const_iterator first, const_iterator last)
@ -2716,12 +2708,10 @@ public:
  {
      wxSTRING_UPDATE_CACHED_LENGTH(n);

-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
-          m_impl.insert(IterToImplPos(it), wxStringOperations::EncodeNChars(n, ch));
-      else
-#endif
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
          m_impl.insert(it.impl(), n, (wxStringCharType)ch);
+      else
+          m_impl.insert(IterToImplPos(it), wxStringOperations::EncodeNChars(n, ch));
  }

    // delete characters from nStart to nStart + nLen
@ -2800,12 +2790,11 @@ public:

      size_t from, len;
      PosLenToImpl(nStart, nLen, &from, &len);
-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
-          m_impl.replace(from, len, wxStringOperations::EncodeNChars(nCount, ch));
-      else
-#endif
+
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
          m_impl.replace(from, len, nCount, (wxStringCharType)ch);
+      else
+          m_impl.replace(from, len, wxStringOperations::EncodeNChars(nCount, ch));

      return *this;
  }
@ -2921,13 +2910,11 @@ public:
  {
      wxSTRING_INVALIDATE_CACHE();

-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
+          m_impl.replace(first.impl(), last.impl(), n, (wxStringCharType)ch);
+      else
          m_impl.replace(first.impl(), last.impl(),
                  wxStringOperations::EncodeNChars(n, ch));
-      else
-#endif
-          m_impl.replace(first.impl(), last.impl(), n, (wxStringCharType)ch);

      return *this;
  }
@ -2988,15 +2975,12 @@ public:
    // find the first occurrence of character ch after nStart
  size_t find(wxUniChar ch, size_t nStart = 0) const
  {
-#if wxUSE_UNICODE_UTF8
-    if ( !ch.IsAscii() )
-        return PosFromImpl(m_impl.find(wxStringOperations::EncodeChar(ch),
-                                       PosToImpl(nStart)));
-    else
-#endif
+    if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
        return PosFromImpl(m_impl.find((wxStringCharType)ch,
                                       PosToImpl(nStart)));
-
+    else
+        return PosFromImpl(m_impl.find(wxStringOperations::EncodeChar(ch),
+                                       PosToImpl(nStart)));
  }
  size_t find(wxUniCharRef ch, size_t nStart = 0) const
    {  return find(wxUniChar(ch), nStart); }
@ -3033,13 +3017,11 @@ public:
    // as find, but from the end
  size_t rfind(wxUniChar ch, size_t nStart = npos) const
  {
-#if wxUSE_UNICODE_UTF8
-    if ( !ch.IsAscii() )
-        return PosFromImpl(m_impl.rfind(wxStringOperations::EncodeChar(ch),
+    if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
+        return PosFromImpl(m_impl.rfind((wxStringCharType)ch,
                                        PosToImpl(nStart)));
    else
-#endif
-        return PosFromImpl(m_impl.rfind((wxStringCharType)ch,
+        return PosFromImpl(m_impl.rfind(wxStringOperations::EncodeChar(ch),
                                        PosToImpl(nStart)));
  }
  size_t rfind(wxUniCharRef ch, size_t nStart = npos) const
@ -3301,12 +3283,11 @@ public:
  {
      wxSTRING_UPDATE_CACHED_LENGTH(1);

-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
-          m_impl += wxStringOperations::EncodeChar(ch);
-      else
-#endif
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
          m_impl += (wxStringCharType)ch;
+      else
+          m_impl += wxStringOperations::EncodeChar(ch);
+
      return *this;
  }
  wxString& operator+=(wxUniCharRef ch) { return *this += wxUniChar(ch); }
--- a/include/wx/stringops.h
+++ b/include/wx/stringops.h
@ -44,9 +44,36 @@ struct WXDLLIMPEXP_BASE wxStringOperationsWchar
    static ptrdiff_t DiffIters(const Iterator& i1, const Iterator& i2)
        { return i1 - i2; }

+#if wxUSE_UNICODE_UTF16
+    // encodes the character as UTF-16:
+    struct Utf16CharBuffer // small by-value buffer returned by EncodeChar()
+    {
+        Utf16CharBuffer() : data() {} // value-initializes data[], i.e. all elements start as zero
+        wchar_t data[3]; // EncodeChar() writes one or two code units; the rest stay zero
+        operator const wchar_t*() const { return data; } // lets the buffer be passed where a const wchar_t* is expected
+    };
+    static Utf16CharBuffer EncodeChar(const wxUniChar& ch);
+    static wxWCharBuffer EncodeNChars(size_t n, const wxUniChar& ch);
+    static bool IsSingleCodeUnitCharacter(const wxUniChar& ch) // true if ch can be stored as one wchar_t without going through EncodeChar()
+        { return !ch.IsSupplementary(); } // supplementary characters are encoded as a surrogate pair instead
+#else
    // encodes the character to a form used to represent it in internal
-    // representation (returns a string in UTF8 version)
-    static wxChar EncodeChar(const wxUniChar& ch) { return (wxChar)ch; }
+    // representation
+    struct SingleCharBuffer // small by-value buffer returned by EncodeChar()
+    {
+        SingleCharBuffer() : data() {} // value-initializes data[], i.e. both elements start as zero
+        wxChar data[2]; // EncodeChar() writes data[0] only; data[1] stays zero
+        operator const wxChar*() const { return data; } // lets the buffer be passed where a const wxChar* is expected
+    };
+    static SingleCharBuffer EncodeChar(const wxUniChar& ch) // returns ch as a one-character buffer
+    {
+        SingleCharBuffer buf; // constructor zero-fills data[]
+        buf.data[0] = (wxChar)ch; // plain cast: in this branch every character is one code unit
+        return buf;
+    }
+    static wxWxCharBuffer EncodeNChars(size_t n, const wxUniChar& ch);
+    static bool IsSingleCodeUnitCharacter(const wxUniChar&) { return true; } // non-UTF-16 branch: every character is treated as a single code unit
+#endif

    static wxUniChar DecodeChar(const wxStringImpl::const_iterator& i)
        { return *i; }
@ -134,6 +161,9 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
        return dist;
    }

+    static bool IsSingleCodeUnitCharacter(const wxUniChar& ch) // UTF-8 variant of the single-code-unit test
+        { return ch.IsAscii(); } // only ASCII characters are reported as single code units here
+
    // encodes the character as UTF-8:
    typedef wxUniChar::Utf8CharBuffer Utf8CharBuffer;
    static Utf8CharBuffer EncodeChar(const wxUniChar& ch)
--- a/src/common/stringops.cpp
+++ b/src/common/stringops.cpp
@ -27,6 +27,68 @@
 // implementation
 // ===========================================================================

+#if wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
+
+#if wxUSE_UNICODE_UTF16
+
+wxStringOperationsWchar::Utf16CharBuffer wxStringOperationsWchar::EncodeChar(const wxUniChar& ch) // returns ch as UTF-16 code units in a by-value buffer
+{
+    Utf16CharBuffer buf; // its constructor value-initializes data[], so unwritten elements are zero
+    if ( ch.IsSupplementary() )
+    {
+        buf.data[0] = (wchar_t)ch.HighSurrogate(); // surrogate pair: high half first
+        buf.data[1] = (wchar_t)ch.LowSurrogate(); // then the low half
+    }
+    else
+    {
+        // Assume ch is a BMP character
+        buf.data[0] = (wchar_t)ch; // stored as a single code unit
+    }
+    return buf;
+}
+
+wxWCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar& ch) // returns a buffer holding n copies of ch encoded as UTF-16
+{
+    if ( ch.IsSupplementary() )
+    {
+        wxWCharBuffer buf(n * 2); // two code units per copy; NOTE(review): presumably the buffer adds its own terminator - confirm
+        wchar_t s[2] = {
+            (wchar_t)ch.HighSurrogate(),
+            (wchar_t)ch.LowSurrogate(),
+        };
+        wchar_t *ptr = buf.data();
+        for (size_t i = 0; i < n; i++, ptr += 2)
+        {
+            wmemcpy(ptr, s, 2); // write one high/low surrogate pair per iteration
+        }
+        return buf;
+    }
+    else
+    {
+        // Assume ch is a BMP character
+        wxWCharBuffer buf(n); // one code unit per copy
+        wmemset(buf.data(), (wchar_t)ch, n); // fill all n slots with the same code unit
+        return buf;
+    }
+}
+
+#else
+
+wxWxCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar& ch) // non-UTF-16 variant: returns a buffer holding n copies of ch
+{
+    wxWxCharBuffer buf(n); // one element per copy; NOTE(review): presumably the buffer adds its own terminator - confirm
+#if wxUSE_UNICODE_WCHAR
+    wmemset(buf.data(), (wchar_t)ch, n); // wide build: fill with ch cast to wchar_t
+#else // ANSI
+    memset(buf.data(), (unsigned char)ch, n); // narrow build: fill with ch cast to unsigned char
+#endif
+    return buf;
+}
+
+#endif // wxUSE_UNICODE_UTF16
+
+#endif // wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
+
 #if wxUSE_UNICODE_UTF8

 // ---------------------------------------------------------------------------