From 016addc201b29696077b8d397c988c3817eaa429 Mon Sep 17 00:00:00 2001 From: Dennis Oberst Date: Wed, 28 Jun 2023 18:47:34 +0200 Subject: [PATCH] QString: assign() [4/4]: (it,it) overload for UTF-8 data types Implement the missing overload to handle UTF-8 specific data types, including char8_t (C++20), char, uchar and signed char. Introduce the helper function 'assign_helper_char8' which handles the non-contiguous_iterator case. The contiguous_iterator case is already handled by the QAnyStringView overload. Include 'qstringconverter.h' at the end of the file, since it can't be included at the top due to diamond dependency conflicts. QStringDecoder is an implementation detail we don't want users to depend on when using assign(it, it). It would be unnatural to not be able to use a function just because we didn't include an apparently unrelated header. [ChangeLog][QtCore][QString] Enabled assign() for UTF-8 data types. Fixes: QTBUG-114208 Change-Id: Ia39bbb70ca105a6bbf1a131b2533f29a919ff66d Reviewed-by: Marc Mutz --- src/corelib/text/qstring.cpp | 4 + src/corelib/text/qstring.h | 9 +++ src/corelib/text/qstringconverter.h | 60 ++++++++++++++ .../auto/corelib/text/qstring/tst_qstring.cpp | 78 ++++++++++++++++++- 4 files changed, 150 insertions(+), 1 deletion(-) diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp index 81891e3cdb..a56d2c064c 100644 --- a/src/corelib/text/qstring.cpp +++ b/src/corelib/text/qstring.cpp @@ -3364,6 +3364,10 @@ QString &QString::append(QChar ch) \list \li QChar \li QLatin1Char + \li \c {char} + \li \c {unsigned char} + \li \c {signed char} + \li \c {char8_t} \li \c char16_t \li (on platforms, such as Windows, where it is a 16-bit type) \c wchar_t \li \c char32_t diff --git a/src/corelib/text/qstring.h b/src/corelib/text/qstring.h index c91dfe1850..078abda361 100644 --- a/src/corelib/text/qstring.h +++ b/src/corelib/text/qstring.h @@ -141,6 +141,7 @@ class Q_CORE_EXPORT QString using is_compatible_char_helper = 
std::disjunction< QtPrivate::IsCompatibleCharType, QtPrivate::IsCompatibleChar32Type, + QtPrivate::IsCompatibleChar8Type, std::is_same // special case >; @@ -451,6 +452,10 @@ public: ++first; } return *this; + } else if constexpr (QtPrivate::IsCompatibleChar8Type::value) { + assign_helper_char8(first, last); + d.data()[d.size] = u'\0'; + return *this; } else { d.assign(first, last, [](QChar ch) -> char16_t { return ch.unicode(); }); d.data()[d.size] = u'\0'; @@ -936,6 +941,9 @@ private: void reallocGrowData(qsizetype n); // ### remove once QAnyStringView supports UTF-32: QString &assign_helper(const char32_t *data, qsizetype len); + // Defined in qstringconverter.h + template + void assign_helper_char8(InputIterator first, InputIterator last); static int compare_helper(const QChar *data1, qsizetype length1, const QChar *data2, qsizetype length2, Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; @@ -1512,6 +1520,7 @@ inline QString operator""_qs(const char16_t *str, size_t size) noexcept QT_END_NAMESPACE #include +#include #ifdef Q_L1S_VIEW_IS_PRIMARY # undef Q_L1S_VIEW_IS_PRIMARY diff --git a/src/corelib/text/qstringconverter.h b/src/corelib/text/qstringconverter.h index e12516966a..148501288c 100644 --- a/src/corelib/text/qstringconverter.h +++ b/src/corelib/text/qstringconverter.h @@ -211,6 +211,66 @@ QByteArray &operator+=(QByteArray &a, const QStringEncoder::DecodedData &b) } #endif +template +void QString::assign_helper_char8(InputIterator first, InputIterator last) +{ + static_assert(!QString::is_contiguous_iterator_v, + "Internal error: Should have been handed over to the QAnyStringView overload." + ); + + using ValueType = typename std::iterator_traits::value_type; + constexpr bool IsFwdIt = std::is_convertible_v< + typename std::iterator_traits::iterator_category, + std::forward_iterator_tag + >; + + resize(0); + // In case of not being shared, there is the possibility of having free space at begin + // even after the resize to zero. 
+ if (const auto offset = d.freeSpaceAtBegin()) + d.setBegin(d.begin() - offset); + + if constexpr (IsFwdIt) + reserve(static_cast(std::distance(first, last))); + + auto toUtf16 = QStringDecoder(QStringDecoder::Utf8); + auto availableCapacity = d.constAllocatedCapacity(); + auto *dst = d.data(); + auto *dend = d.data() + availableCapacity; + + while (true) { + if (first == last) { // ran out of input elements + Q_ASSERT(!std::less<>{}(dend, dst)); + d.size = dst - d.begin(); + return; + } + const ValueType next = *first; // decays proxies, if any + const auto chunk = QUtf8StringView(&next, 1); + // UTF-8 characters can have a maximum size of 4 bytes and may result in a surrogate + // pair of UTF-16 code units. In the input-iterator case, we don't know the size + // and would need to always reserve space for 2 code units. To keep our promise + // of 'not allocating if it fits', we have to pre-check this condition. + // We know that it fits in the forward-iterator case. + if constexpr (!IsFwdIt) { + constexpr qsizetype Pair = 2; + char16_t buf[Pair]; + const qptrdiff n = toUtf16.appendToBuffer(buf, chunk) - buf; + if (dend - dst < n) { // ran out of allocated memory + const auto offset = dst - d.begin(); + reallocData(d.constAllocatedCapacity() + Pair, QArrayData::Grow); + // update the pointers since we've re-allocated + availableCapacity = d.constAllocatedCapacity(); + dst = d.data() + offset; + dend = d.data() + availableCapacity; + } + dst = std::copy_n(buf, n, dst); + } else { // take the fast path + dst = toUtf16.appendToBuffer(dst, chunk); + } + ++first; + } +} + QT_END_NAMESPACE #endif diff --git a/tests/auto/corelib/text/qstring/tst_qstring.cpp b/tests/auto/corelib/text/qstring/tst_qstring.cpp index 2d03fb9d7c..77fb85d80f 100644 --- a/tests/auto/corelib/text/qstring/tst_qstring.cpp +++ b/tests/auto/corelib/text/qstring/tst_qstring.cpp @@ -3467,6 +3467,37 @@ void tst_QString::assign() QCOMPARE_EQ(str.capacity(), oldCap); QCOMPARE_EQ(str.size(), 0); +#ifndef 
QT_NO_CAST_FROM_ASCII + const char c8[] = "a©☻🂤"; // [1, 2, 3, 4] bytes in utf-8 code points + str.assign(std::begin(c8), std::end(c8) - 1); + QCOMPARE(str, c8); + + std::string c8str(c8); + str.assign(c8str.begin(), c8str.end()); + QCOMPARE(str, c8); + QCOMPARE(str.capacity(), qsizetype(std::size(c8) - 1)); + + oldCap = str.capacity(); + str.assign(c8str.begin(), c8str.begin()); // empty range + QCOMPARE_EQ(str.capacity(), oldCap); + QCOMPARE_EQ(str.size(), 0); + + std::forward_list fwd(std::begin(c8), std::end(c8) - 1); + str.assign(fwd.begin(), fwd.end()); + QCOMPARE(str, c8); +#endif +#ifdef __cpp_char8_t + const char8_t c8t[] = u8"🂤🂤🂤🂤🂤🂤🂤🂤🂤🂤"; // 10 x 4 bytes in utf-8 code points + str.assign(std::begin(c8t), std::end(c8t) - 1); + QCOMPARE(str, c8t); + QCOMPARE(str.size(), 20); +#endif +#ifdef __cpp_lib_char8_t + std::u8string c8tstr(c8t); + str.assign(c8tstr.begin(), c8tstr.end()); + QCOMPARE(str, c8t); +#endif + const char16_t c16[] = u"٩(⁎❛ᴗ❛⁎)۶ 🤷"; str.assign(std::begin(c16), std::end(c16) - 1); QCOMPARE(str, c16); @@ -3516,6 +3547,51 @@ void tst_QString::assign() str.assign(std::istream_iterator{}, std::istream_iterator{}); // empty range QCOMPARE_EQ(str.capacity(), oldCap); QCOMPARE_EQ(str.size(), 0); + +#ifndef QT_NO_CAST_FROM_ASCII + str.resize(0); + str.squeeze(); + str.reserve(5); + const char c8cmp[] = "🂤🂤a"; // 2 + 2 + 1 UTF-16 code units + ss.clear(); + ss.str(c8cmp); + str.assign(std::istream_iterator{ss}, std::istream_iterator{}); + QCOMPARE(str, c8cmp); + QCOMPARE(str.size(), 5); + QCOMPARE(str.capacity(), 5); + + // 1 code-point + ill-formed sequence + 1 code-point. 
+ const char c8IllFormed[] = "a\xe0\x9f\x80""a"; + ss.clear(); + ss.str(c8IllFormed); + str.assign(std::istream_iterator{ss}, std::istream_iterator{}); + QEXPECT_FAIL("", "Iconsistent handling of ill-formed sequences, QTBUG-117051", Continue); + QCOMPARE_EQ(str, QString(c8IllFormed)); + + const char c82[] = "ÌşṫһíᶊśꞧɨℼṩuDF49ïľι?"; + ss.clear(); + ss.str(c82); + str.assign(std::istream_iterator{ss}, std::istream_iterator{}); + QCOMPARE(str, c82); + + const char uc8[] = "ẵƽ𝔰ȉ𝚐ꞑ𝒾𝝿𝕘"; + ss.clear(); + ss.str(uc8); + str.assign(std::istream_iterator{ss}, std::istream_iterator{}); + QCOMPARE(str, uc8); + + ss.clear(); + const char sc8[] = "𓁇ख़ॵ௵"; + ss.str(sc8); + str.assign(std::istream_iterator{ss}, std::istream_iterator{}); + QCOMPARE(str, sc8); + + oldCap = str.capacity(); + str.assign(std::istream_iterator{}, // empty range + std::istream_iterator{}); + QCOMPARE_EQ(str.capacity(), oldCap); + QCOMPARE_EQ(str.size(), 0); +#endif } // Test chaining { @@ -3634,7 +3710,7 @@ void tst_QString::assign_uses_prepend_buffer() for (qsizetype i = 0; i < withFreeSpaceAtBegin.d.freeSpaceAtBegin(); ++i) ss << "d "; - withFreeSpaceAtBegin.assign(std::istream_iterator{ss}, std::istream_iterator{}); + withFreeSpaceAtBegin.assign(std::istream_iterator{ss}, std::istream_iterator{}); QCOMPARE_EQ(withFreeSpaceAtBegin.d.freeSpaceAtBegin(), 0); // we used the prepend buffer QCOMPARE_EQ(capBegin(withFreeSpaceAtBegin), oldCapBegin); QCOMPARE_EQ(capEnd(withFreeSpaceAtBegin), oldCapEnd);