From 58d940690abbb7f59fd6b3e189328afed7f53821 Mon Sep 17 00:00:00 2001
From: ARATA Mizuki <minorinoki@gmail.com>
Date: Fri, 21 Apr 2017 04:32:32 +0900
Subject: [PATCH] Better handling of supplementary wxUniChar values in some of
 wxString methods

On MSW, the Unicode code point is now properly encoded as UTF-16 when
assigned or appended to a wxString.

Closes #11827
---
 include/wx/string.h      | 109 ++++++++++++++++-----------------------
 include/wx/stringops.h   |  34 +++++++++++-
 src/common/stringops.cpp |  62 ++++++++++++++++++++++
 3 files changed, 139 insertions(+), 66 deletions(-)

diff --git a/include/wx/string.h b/include/wx/string.h
index 3441a7f7d6..b80998cc44 100644
--- a/include/wx/string.h
+++ b/include/wx/string.h
@@ -898,9 +898,6 @@ public:
       wxStringIteratorNode m_node;
   };
 
-  size_t IterToImplPos(wxString::iterator i) const
-    { return wxStringImpl::const_iterator(i.impl()) - m_impl.begin(); }
-
   iterator GetIterForNthChar(size_t n)
     { return iterator(this, m_impl.begin() + PosToImpl(n)); }
   const_iterator GetIterForNthChar(size_t n) const
@@ -975,6 +972,9 @@ public:
   const_iterator GetIterForNthChar(size_t n) const { return begin() + n; }
 #endif // wxUSE_UNICODE_UTF8/!wxUSE_UNICODE_UTF8
 
+  size_t IterToImplPos(wxString::iterator i) const
+    { return wxStringImpl::const_iterator(i.impl()) - m_impl.begin(); }
+
   #undef WX_STR_ITERATOR_TAG
   #undef WX_STR_ITERATOR_IMPL
 
@@ -1820,12 +1820,11 @@ public:
   {
     wxSTRING_INVALIDATE_CACHE();
 
-#if wxUSE_UNICODE_UTF8
-    if ( !ch.IsAscii() )
-        m_impl = wxStringOperations::EncodeChar(ch);
-    else
-#endif // wxUSE_UNICODE_UTF8
+    if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
         m_impl = (wxStringCharType)ch;
+    else
+        m_impl = wxStringOperations::EncodeChar(ch);
+
     return *this;
   }
 
@@ -2410,20 +2409,18 @@ public:
     // append n copies of ch
   wxString& append(size_t n, wxUniChar ch)
   {
-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
-      {
-          wxSTRING_INVALIDATE_CACHED_LENGTH();
-
-          m_impl.append(wxStringOperations::EncodeNChars(n, ch));
-      }
-      else // ASCII
-#endif
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
       {
           wxSTRING_UPDATE_CACHED_LENGTH(n);
 
           m_impl.append(n, (wxStringCharType)ch);
       }
+      else
+      {
+          wxSTRING_INVALIDATE_CACHED_LENGTH();
+
+          m_impl.append(wxStringOperations::EncodeNChars(n, ch));
+      }
 
       return *this;
   }
@@ -2556,12 +2553,10 @@ public:
   {
       wxSTRING_SET_CACHED_LENGTH(n);
 
-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
-          m_impl.assign(wxStringOperations::EncodeNChars(n, ch));
-      else
-#endif
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
           m_impl.assign(n, (wxStringCharType)ch);
+      else
+          m_impl.assign(wxStringOperations::EncodeNChars(n, ch));
 
       return *this;
   }
@@ -2671,12 +2666,11 @@ public:
   {
       wxSTRING_UPDATE_CACHED_LENGTH(n);
 
-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
-          m_impl.insert(PosToImpl(nPos), wxStringOperations::EncodeNChars(n, ch));
-      else
-#endif
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
           m_impl.insert(PosToImpl(nPos), n, (wxStringCharType)ch);
+      else
+          m_impl.insert(PosToImpl(nPos), wxStringOperations::EncodeNChars(n, ch));
+
       return *this;
   }
 
@@ -2684,16 +2678,14 @@ public:
   {
       wxSTRING_UPDATE_CACHED_LENGTH(1);
 
-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
+          return iterator(this, m_impl.insert(it.impl(), (wxStringCharType)ch));
+      else
       {
           size_t pos = IterToImplPos(it);
           m_impl.insert(pos, wxStringOperations::EncodeChar(ch));
           return iterator(this, m_impl.begin() + pos);
       }
-      else
-#endif
-          return iterator(this, m_impl.insert(it.impl(), (wxStringCharType)ch));
   }
 
   void insert(iterator it, const_iterator first, const_iterator last)
@@ -2716,12 +2708,10 @@ public:
   {
       wxSTRING_UPDATE_CACHED_LENGTH(n);
 
-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
-          m_impl.insert(IterToImplPos(it), wxStringOperations::EncodeNChars(n, ch));
-      else
-#endif
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
           m_impl.insert(it.impl(), n, (wxStringCharType)ch);
+      else
+          m_impl.insert(IterToImplPos(it), wxStringOperations::EncodeNChars(n, ch));
   }
 
     // delete characters from nStart to nStart + nLen
@@ -2800,12 +2790,11 @@ public:
 
       size_t from, len;
       PosLenToImpl(nStart, nLen, &from, &len);
-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
-          m_impl.replace(from, len, wxStringOperations::EncodeNChars(nCount, ch));
-      else
-#endif
+
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
           m_impl.replace(from, len, nCount, (wxStringCharType)ch);
+      else
+          m_impl.replace(from, len, wxStringOperations::EncodeNChars(nCount, ch));
 
       return *this;
   }
@@ -2921,13 +2910,11 @@ public:
   {
       wxSTRING_INVALIDATE_CACHE();
 
-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
+          m_impl.replace(first.impl(), last.impl(), n, (wxStringCharType)ch);
+      else
           m_impl.replace(first.impl(), last.impl(),
                   wxStringOperations::EncodeNChars(n, ch));
-      else
-#endif
-          m_impl.replace(first.impl(), last.impl(), n, (wxStringCharType)ch);
 
       return *this;
   }
@@ -2988,15 +2975,12 @@ public:
     // find the first occurrence of character ch after nStart
   size_t find(wxUniChar ch, size_t nStart = 0) const
   {
-#if wxUSE_UNICODE_UTF8
-    if ( !ch.IsAscii() )
-        return PosFromImpl(m_impl.find(wxStringOperations::EncodeChar(ch),
-                                       PosToImpl(nStart)));
-    else
-#endif
+    if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
         return PosFromImpl(m_impl.find((wxStringCharType)ch,
                                        PosToImpl(nStart)));
-
+    else
+        return PosFromImpl(m_impl.find(wxStringOperations::EncodeChar(ch),
+                                       PosToImpl(nStart)));
   }
   size_t find(wxUniCharRef ch, size_t nStart = 0) const
     {  return find(wxUniChar(ch), nStart); }
@@ -3033,13 +3017,11 @@ public:
     // as find, but from the end
   size_t rfind(wxUniChar ch, size_t nStart = npos) const
   {
-#if wxUSE_UNICODE_UTF8
-    if ( !ch.IsAscii() )
-        return PosFromImpl(m_impl.rfind(wxStringOperations::EncodeChar(ch),
+    if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
+        return PosFromImpl(m_impl.rfind((wxStringCharType)ch,
                                         PosToImpl(nStart)));
     else
-#endif
-        return PosFromImpl(m_impl.rfind((wxStringCharType)ch,
+        return PosFromImpl(m_impl.rfind(wxStringOperations::EncodeChar(ch),
                                         PosToImpl(nStart)));
   }
   size_t rfind(wxUniCharRef ch, size_t nStart = npos) const
@@ -3301,12 +3283,11 @@ public:
   {
       wxSTRING_UPDATE_CACHED_LENGTH(1);
 
-#if wxUSE_UNICODE_UTF8
-      if ( !ch.IsAscii() )
-          m_impl += wxStringOperations::EncodeChar(ch);
-      else
-#endif
+      if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
           m_impl += (wxStringCharType)ch;
+      else
+          m_impl += wxStringOperations::EncodeChar(ch);
+
       return *this;
   }
   wxString& operator+=(wxUniCharRef ch) { return *this += wxUniChar(ch); }
diff --git a/include/wx/stringops.h b/include/wx/stringops.h
index 21c6121787..fd6695116f 100644
--- a/include/wx/stringops.h
+++ b/include/wx/stringops.h
@@ -44,9 +44,36 @@ struct WXDLLIMPEXP_BASE wxStringOperationsWchar
     static ptrdiff_t DiffIters(const Iterator& i1, const Iterator& i2)
         { return i1 - i2; }
 
+#if wxUSE_UNICODE_UTF16
+    // encodes the characters as UTF-16:
+    struct Utf16CharBuffer
+    {
+        Utf16CharBuffer() : data() {}
+        wchar_t data[3];
+        operator const wchar_t*() const { return data; }
+    };
+    static Utf16CharBuffer EncodeChar(const wxUniChar& ch);
+    static wxWCharBuffer EncodeNChars(size_t n, const wxUniChar& ch);
+    static bool IsSingleCodeUnitCharacter(const wxUniChar& ch)
+        { return !ch.IsSupplementary(); }
+#else
     // encodes the character to a form used to represent it in internal
-    // representation (returns a string in UTF8 version)
-    static wxChar EncodeChar(const wxUniChar& ch) { return (wxChar)ch; }
+    // representation
+    struct SingleCharBuffer
+    {
+        SingleCharBuffer() : data() {}
+        wxChar data[2];
+        operator const wxChar*() const { return data; }
+    };
+    static SingleCharBuffer EncodeChar(const wxUniChar& ch)
+    {
+        SingleCharBuffer buf;
+        buf.data[0] = (wxChar)ch;
+        return buf;
+    }
+    static wxWxCharBuffer EncodeNChars(size_t n, const wxUniChar& ch);
+    static bool IsSingleCodeUnitCharacter(const wxUniChar&) { return true; }
+#endif
 
     static wxUniChar DecodeChar(const wxStringImpl::const_iterator& i)
         { return *i; }
@@ -134,6 +161,9 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
         return dist;
     }
 
+    static bool IsSingleCodeUnitCharacter(const wxUniChar& ch)
+        { return ch.IsAscii(); }
+
     // encodes the character as UTF-8:
     typedef wxUniChar::Utf8CharBuffer Utf8CharBuffer;
     static Utf8CharBuffer EncodeChar(const wxUniChar& ch)
diff --git a/src/common/stringops.cpp b/src/common/stringops.cpp
index 36ff4045a4..2d8fcaee3e 100644
--- a/src/common/stringops.cpp
+++ b/src/common/stringops.cpp
@@ -27,6 +27,68 @@
 // implementation
 // ===========================================================================
 
+#if wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
+
+#if wxUSE_UNICODE_UTF16
+
+wxStringOperationsWchar::Utf16CharBuffer wxStringOperationsWchar::EncodeChar(const wxUniChar& ch)
+{
+    Utf16CharBuffer buf;
+    if ( ch.IsSupplementary() )
+    {
+        buf.data[0] = (wchar_t)ch.HighSurrogate();
+        buf.data[1] = (wchar_t)ch.LowSurrogate();
+    }
+    else
+    {
+        // Assume ch is a BMP character
+        buf.data[0] = (wchar_t)ch;
+    }
+    return buf;
+}
+
+wxWCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar& ch)
+{
+    if ( ch.IsSupplementary() )
+    {
+        wxWCharBuffer buf(n * 2);
+        wchar_t s[2] = {
+            (wchar_t)ch.HighSurrogate(),
+            (wchar_t)ch.LowSurrogate(),
+        };
+        wchar_t *ptr = buf.data();
+        for (size_t i = 0; i < n; i++, ptr += 2)
+        {
+            wmemcpy(ptr, s, 2);
+        }
+        return buf;
+    }
+    else
+    {
+        // Assume ch is a BMP character
+        wxWCharBuffer buf(n);
+        wmemset(buf.data(), (wchar_t)ch, n);
+        return buf;
+    }
+}
+
+#else
+
+wxWxCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar& ch)
+{
+    wxWxCharBuffer buf(n);
+#if wxUSE_UNICODE_WCHAR
+    wmemset(buf.data(), (wchar_t)ch, n);
+#else // ANSI
+    memset(buf.data(), (unsigned char)ch, n);
+#endif
+    return buf;
+}
+
+#endif // wxUSE_UNICODE_UTF16
+
+#endif // wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
+
 #if wxUSE_UNICODE_UTF8
 
 // ---------------------------------------------------------------------------