From 3fedcd4e4aeb79e432276aaac128910f8de05f25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=98ystein=20Heskestad?= Date: Tue, 4 Oct 2022 17:24:17 +0200 Subject: [PATCH] Add Boyer-Moore Latin-1 string searcher with optional case sensitivity [ChangeLog][QtCore][QString] Added Boyer-Moore Latin-1 string searcher with optional case sensitivity Task-number: QTBUG-100236 Change-Id: I200a0dac7c8012add1ee02511dba791d233115e0 Reviewed-by: Thiago Macieira --- src/corelib/CMakeLists.txt | 1 + src/corelib/text/qlatin1stringmatcher.cpp | 198 ++++++++++++ src/corelib/text/qlatin1stringmatcher.h | 160 +++++++++ src/corelib/text/qstring.cpp | 116 ++----- src/tools/bootstrap/CMakeLists.txt | 1 + tests/auto/corelib/text/CMakeLists.txt | 1 + .../text/qlatin1stringmatcher/CMakeLists.txt | 14 + .../tst_qlatin1stringmatcher.cpp | 306 ++++++++++++++++++ 8 files changed, 699 insertions(+), 98 deletions(-) create mode 100644 src/corelib/text/qlatin1stringmatcher.cpp create mode 100644 src/corelib/text/qlatin1stringmatcher.h create mode 100644 tests/auto/corelib/text/qlatin1stringmatcher/CMakeLists.txt create mode 100644 tests/auto/corelib/text/qlatin1stringmatcher/tst_qlatin1stringmatcher.cpp diff --git a/src/corelib/CMakeLists.txt b/src/corelib/CMakeLists.txt index fdaef27e5f..02854a710a 100644 --- a/src/corelib/CMakeLists.txt +++ b/src/corelib/CMakeLists.txt @@ -218,6 +218,7 @@ qt_internal_add_module(Core text/qchar.h text/qcollator.cpp text/qcollator.h text/qcollator_p.h text/qdoublescanprint_p.h + text/qlatin1stringmatcher.cpp text/qlatin1stringmatcher.h text/qlocale.cpp text/qlocale.h text/qlocale_p.h text/qlocale_data_p.h text/qlocale_tools.cpp text/qlocale_tools_p.h diff --git a/src/corelib/text/qlatin1stringmatcher.cpp b/src/corelib/text/qlatin1stringmatcher.cpp new file mode 100644 index 0000000000..e23b4f20f3 --- /dev/null +++ b/src/corelib/text/qlatin1stringmatcher.cpp @@ -0,0 +1,198 @@ +// Copyright (C) 2022 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only + +#include "qlatin1stringmatcher.h" +#include + +QT_BEGIN_NAMESPACE + +/*! \class QLatin1StringMatcher + \inmodule QtCore + \brief Optimized search for substring in Latin-1 text. + + A QLatin1StringMatcher can search for one QLatin1StringView + as a substring of another, either ignoring case or taking it into + account. + + \since 6.5 + \ingroup tools + \ingroup string-processing + + This class is useful when you have a Latin-1 encoded string that + you want to repeatedly search for in some QLatin1StringViews + (perhaps in a loop), or when you want to search for all + instances of it in a given QLatin1StringView. Using a matcher + object and indexIn() is faster than matching a plain + QLatin1StringView with QLatin1StringView::indexOf() if repeated + matching takes place. This class offers no benefit if you are + doing one-off matches. The string to be searched for must not + be destroyed or changed before the matcher object is destroyed, + as the matcher accesses the string when searching for it. + + Create a QLatin1StringMatcher for the QLatin1StringView + you want to search for and the case sensitivity. Then call + indexIn() with the QLatin1StringView that you want to search + within. + + \sa QLatin1StringView, QStringMatcher, QByteArrayMatcher +*/ + +/*! + Construct an empty Latin-1 string matcher. + This will match at each position in any string. + \sa setPattern(), setCaseSensitivity(), indexIn() +*/ +QLatin1StringMatcher::QLatin1StringMatcher() noexcept + : m_pattern(), + m_cs(Qt::CaseSensitive), + m_caseSensitiveSearcher(m_pattern.data(), m_pattern.data()) +{ +} + +/*! + Constructs a Latin-1 string matcher that searches for the given \a pattern + with given case sensitivity \a cs. The \a pattern argument must + not be destroyed before this matcher object. Call indexIn() + to find the \a pattern in the given QLatin1StringView. +*/ +QLatin1StringMatcher::QLatin1StringMatcher(QLatin1StringView pattern, + Qt::CaseSensitivity cs) noexcept + : m_pattern(pattern), m_cs(cs) +{ + setSearcher(); +} + +/*! + Destroys the Latin-1 string matcher. +*/ +QLatin1StringMatcher::~QLatin1StringMatcher() noexcept +{ + freeSearcher(); +} + +/*! + \internal +*/ +void QLatin1StringMatcher::setSearcher() noexcept +{ + if (m_cs == Qt::CaseSensitive) { + new (&m_caseSensitiveSearcher) CaseSensitiveSearcher(m_pattern.data(), m_pattern.end()); + } else { + QtPrivate::QCaseInsensitiveLatin1Hash foldCase; + qsizetype bufferSize = std::min(m_pattern.size(), qsizetype(sizeof m_foldBuffer)); + for (qsizetype i = 0; i < bufferSize; ++i) + m_foldBuffer[i] = static_cast(foldCase(m_pattern[i].toLatin1())); + + new (&m_caseInsensitiveSearcher) + CaseInsensitiveSearcher(m_foldBuffer, &m_foldBuffer[bufferSize]); + } +} + +/*! + \internal +*/ +void QLatin1StringMatcher::freeSearcher() noexcept +{ + if (m_cs == Qt::CaseSensitive) + m_caseSensitiveSearcher.~CaseSensitiveSearcher(); + else + m_caseInsensitiveSearcher.~CaseInsensitiveSearcher(); +} + +/*! + Sets the \a pattern to search for. The string pointed to by the + QLatin1StringView must not be destroyed before the matcher is + destroyed, unless it is set to point to a different \a pattern + with longer lifetime first. + + \sa pattern(), indexIn() +*/ +void QLatin1StringMatcher::setPattern(QLatin1StringView pattern) noexcept +{ + if (m_pattern.latin1() == pattern.latin1() && m_pattern.size() == pattern.size()) + return; // Same address and size + + freeSearcher(); + m_pattern = pattern; + setSearcher(); +} + +/*! + Returns the Latin-1 pattern that the matcher searches for. + + \sa setPattern(), indexIn() +*/ +QLatin1StringView QLatin1StringMatcher::pattern() const noexcept +{ + return m_pattern; +} + +/*! + Sets the case sensitivity to \a cs. + + \sa caseSensitivity(), indexIn() +*/ +void QLatin1StringMatcher::setCaseSensitivity(Qt::CaseSensitivity cs) noexcept +{ + if (m_cs == cs) + return; + + freeSearcher(); + m_cs = cs; + setSearcher(); +} + +/*! + Returns the case sensitivity the matcher uses. + + \sa setCaseSensitivity(), indexIn() +*/ +Qt::CaseSensitivity QLatin1StringMatcher::caseSensitivity() const noexcept +{ + return m_cs; +} + +/*! + Searches for the pattern in the given \a haystack starting from + \a from. + + \sa caseSensitivity(), pattern() +*/ +qsizetype QLatin1StringMatcher::indexIn(QLatin1StringView haystack, qsizetype from) const noexcept +{ + if (m_pattern.isEmpty() && from == haystack.size()) + return from; + if (from >= haystack.size()) + return -1; + auto begin = haystack.begin() + from; + auto end = haystack.end(); + auto found = begin; + if (m_cs == Qt::CaseSensitive) { + found = m_caseSensitiveSearcher(begin, end, m_pattern.begin(), m_pattern.end()).begin; + if (found == end) + return -1; + } else { + const qsizetype bufferSize = std::min(m_pattern.size(), qsizetype(sizeof m_foldBuffer)); + const QLatin1StringView restNeedle = m_pattern.sliced(bufferSize); + const bool needleLongerThanBuffer = restNeedle.size() > 0; + QLatin1StringView restHaystack = haystack; + do { + found = m_caseInsensitiveSearcher(found, end, m_foldBuffer, &m_foldBuffer[bufferSize]) + .begin; + if (found == end) { + return -1; + } else if (!needleLongerThanBuffer) { + break; + } + restHaystack = haystack.sliced( + qMin(haystack.size(), + bufferSize + qsizetype(std::distance(haystack.begin(), found)))); + if (restHaystack.startsWith(restNeedle, Qt::CaseInsensitive)) + break; + ++found; + } while (true); + } + return std::distance(haystack.begin(), found); +} + +QT_END_NAMESPACE diff --git a/src/corelib/text/qlatin1stringmatcher.h b/src/corelib/text/qlatin1stringmatcher.h new file mode 100644 index 0000000000..a256250e9c --- /dev/null +++ b/src/corelib/text/qlatin1stringmatcher.h @@ -0,0 +1,160 @@ +// Copyright (C) 2022 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only + +#ifndef QLATIN1STRINGMATCHER_H +#define QLATIN1STRINGMATCHER_H + +#include +#include +#include + +#include +#include + +QT_BEGIN_NAMESPACE + +namespace QtPrivate { +template::value_type>, + class BinaryPredicate = std::equal_to<>> +class q_boyer_moore_searcher_hashed_needle +{ +public: + constexpr q_boyer_moore_searcher_hashed_needle(RandomIt1 pat_first, RandomIt1 pat_last) + : m_skiptable{} + { + const size_t n = std::distance(pat_first, pat_last); + constexpr auto uchar_max = (std::numeric_limits::max)(); + uchar max = n > uchar_max ? uchar_max : uchar(n); + q20::fill(std::begin(m_skiptable), std::end(m_skiptable), max); + + RandomIt1 pattern = pat_first; + pattern += n - max; + while (max--) + m_skiptable[uchar(*pattern++)] = max; + } + + template + constexpr auto operator()(RandomIt2 first, RandomIt2 last, RandomIt1 pat_first, + RandomIt1 pat_last) const + { + struct R + { + RandomIt2 begin, end; + }; + Hash hf; + BinaryPredicate pred; + auto pat_length = std::distance(pat_first, pat_last); + if (pat_length == 0) + return R{ first, first }; + + const qsizetype pl_minus_one = qsizetype(pat_length - 1); + RandomIt2 current = first + pl_minus_one; + + while (current < last) { + qsizetype skip = m_skiptable[hf(*current)]; + if (!skip) { + // possible match + while (skip < pat_length) { + if (!pred(hf(*(current - skip)), uchar(pat_first[pl_minus_one - skip]))) + break; + skip++; + } + if (skip > pl_minus_one) { // we have a match + auto match = current - skip + 1; + return R{ match, match + pat_length }; + } + + // If we don't have a match we are a bit inefficient as we only skip by one + // when we have the non matching char in the string. + if (m_skiptable[hf(*(current - skip))] == pat_length) + skip = pat_length - skip; + else + skip = 1; + } + current += skip; + } + + return R{ last, last }; + } + +private: + alignas(16) uchar m_skiptable[256]; +}; + +struct QCaseSensitiveLatin1Hash +{ + constexpr QCaseSensitiveLatin1Hash() = default; + + constexpr std::size_t operator()(char c) const noexcept { return std::size_t(uchar(c)); } +}; + +struct QCaseInsensitiveLatin1Hash +{ + constexpr QCaseInsensitiveLatin1Hash() = default; + + constexpr std::size_t operator()(char c) const noexcept + { + return std::size_t(latin1Lower[uchar(c)]); + } + +private: + static constexpr uchar latin1Lower[256] = { + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, + 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f, + 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f, + 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f, + 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f, + 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x5b,0x5c,0x5d,0x5e,0x5f, + 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f, + 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f, + 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f, + 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f, + 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf, + 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf, + // 0xd7 (multiplication sign) and 0xdf (sz ligature) complicate life + 0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef, + 0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xd7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xdf, + 0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef, + 0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff + }; +}; +} + +class QLatin1StringMatcher +{ +public: + Q_CORE_EXPORT QLatin1StringMatcher() noexcept; + Q_CORE_EXPORT explicit QLatin1StringMatcher( + QLatin1StringView pattern, Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; + Q_CORE_EXPORT ~QLatin1StringMatcher() noexcept; + + Q_CORE_EXPORT void setPattern(QLatin1StringView pattern) noexcept; + Q_CORE_EXPORT QLatin1StringView pattern() const noexcept; + Q_CORE_EXPORT void setCaseSensitivity(Qt::CaseSensitivity cs) noexcept; + Q_CORE_EXPORT Qt::CaseSensitivity caseSensitivity() const noexcept; + + Q_CORE_EXPORT qsizetype indexIn(QLatin1StringView haystack, qsizetype from = 0) const noexcept; + +private: + void setSearcher() noexcept; + void freeSearcher() noexcept; + + QLatin1StringView m_pattern; + Qt::CaseSensitivity m_cs; + typedef QtPrivate::q_boyer_moore_searcher_hashed_needle + CaseSensitiveSearcher; + typedef QtPrivate::q_boyer_moore_searcher_hashed_needle + CaseInsensitiveSearcher; + union { + CaseSensitiveSearcher m_caseSensitiveSearcher; + CaseInsensitiveSearcher m_caseInsensitiveSearcher; + }; + char m_foldBuffer[256]; +}; + +QT_END_NAMESPACE + +#endif // QLATIN1MATCHER_H diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp index 6997ca05a8..030e583ddc 100644 --- a/src/corelib/text/qstring.cpp +++ b/src/corelib/text/qstring.cpp @@ -38,13 +38,12 @@ #include #include "qchar.cpp" +#include "qlatin1stringmatcher.h" #include "qstringmatcher.cpp" #include "qstringiterator_p.h" #include "qstringalgorithms_p.h" #include "qthreadstorage.h" -#include "qbytearraymatcher.h" // Helper for comparison of QLatin1StringView - #include #include @@ -1415,70 +1414,6 @@ static int latin1nicmp(const char *lhsChar, qsizetype lSize, const char *rhsChar return lencmp(lSize, rSize); } -namespace { - -template -inline uchar latin1_fold(const uchar c) -{ - return c; -} - -template<> -inline uchar latin1_fold(const uchar c) -{ - return latin1Lower[c]; -} - -template -inline void bm_latin1_init_skiptable(const uchar *cc, qsizetype len, uchar *skiptable) -{ - int l = int(qMin(len, qsizetype(255))); - memset(skiptable, l, 256 * sizeof(uchar)); - cc += len - l; - while (l--) - skiptable[latin1_fold(*cc++)] = l; -} - -template -inline qsizetype bm_latin1_find(const uchar *cc, qsizetype l, qsizetype index, const uchar *puc, - qsizetype pl, const uchar *skiptable) -{ - if (pl == 0) - return index > l ? -1 : index; - const qsizetype pl_minus_one = pl - 1; - - const uchar *current = cc + index + pl_minus_one; - const uchar *end = cc + l; - - while (current < end) { - qsizetype skip = skiptable[latin1_fold(*current)]; - if (!skip) { - // possible match - while (skip < pl) { - if (latin1_fold(*(current - skip)) != latin1_fold(puc[pl_minus_one - skip])) - break; - skip++; - } - if (skip > pl_minus_one) // we have a match - return (current - cc) - skip + 1; - - // in case we don't have a match we are a bit inefficient as we only skip by one - // when we have the non matching char in the string. - if (skiptable[latin1_fold(*(current - skip))] == pl) - skip = pl - skip; - else - skip = 1; - } - if (current > end - skip) - break; - current += skip; - } - - return -1; // not found -} - -} - bool QtPrivate::equalStrings(QStringView lhs, QStringView rhs) noexcept { return ucstreq(lhs.utf16(), lhs.size(), rhs.utf16(), rhs.size()); @@ -10686,16 +10621,10 @@ qsizetype QtPrivate::count(QLatin1StringView haystack, QLatin1StringView needle, qsizetype num = 0; qsizetype i = -1; - // TODO: use Boyer-Moore searcher for case-insensitive search too - // when QTBUG-100236 is done - if (cs == Qt::CaseSensitive) { - QByteArrayMatcher matcher(needle); - while ((i = matcher.indexIn(haystack, i + 1)) != -1) - ++num; - } else { - while ((i = QtPrivate::findString(haystack, i + 1, needle, cs)) != -1) - ++num; - } + QLatin1StringMatcher matcher(needle, cs); + while ((i = matcher.indexIn(haystack, i + 1)) != -1) + ++num; + return num; } @@ -10710,19 +10639,14 @@ qsizetype QtPrivate::count(QLatin1StringView haystack, QStringView needle, Qt::C qsizetype num = 0; qsizetype i = -1; - // TODO: use Boyer-Moore searcher for case-insensitive search too - // when QTBUG-100236 is done - if (cs == Qt::CaseSensitive) { - QVarLengthArray s(needle.size()); - qt_to_latin1_unchecked(s.data(), needle.utf16(), needle.size()); + QVarLengthArray s(needle.size()); + qt_to_latin1_unchecked(s.data(), needle.utf16(), needle.size()); + + QLatin1StringMatcher matcher(QLatin1StringView(reinterpret_cast(s.data()), s.size()), + cs); + while ((i = matcher.indexIn(haystack, i + 1)) != -1) + ++num; - QByteArrayMatcher matcher(s); - while ((i = matcher.indexIn(haystack, i + 1)) != -1) - ++num; - } else { - while ((i = QtPrivate::findString(haystack, i + 1, needle, cs)) != -1) - ++num; - } return num; } @@ -10743,12 +10667,11 @@ qsizetype QtPrivate::count(QLatin1StringView haystack, QChar needle, Qt::CaseSen if (needle.unicode() > 0xff) return 0; - const char needleL1 = needle.toLatin1(); if (cs == Qt::CaseSensitive) { - return std::count(haystack.cbegin(), haystack.cend(), needleL1); + return std::count(haystack.cbegin(), haystack.cend(), needle.toLatin1()); } else { auto toLower = [](char ch) { return latin1Lower[uchar(ch)]; }; - const uchar ch = toLower(needleL1); + const uchar ch = toLower(needle.toLatin1()); return std::count_if(haystack.cbegin(), haystack.cend(), [&toLower, ch](const char c) { return toLower(c) == ch; }); @@ -10961,7 +10884,7 @@ qsizetype QtPrivate::findString(QLatin1StringView haystack, qsizetype from, QLat return -1; } - const QByteArrayMatcher matcher(needle); + const QLatin1StringMatcher matcher(needle, Qt::CaseSensitivity::CaseSensitive); return matcher.indexIn(haystack, from); } @@ -10995,12 +10918,9 @@ qsizetype QtPrivate::findString(QLatin1StringView haystack, qsizetype from, QLat } return -1; } - uchar skiptable[256]; - bm_latin1_init_skiptable( - reinterpret_cast(needle.begin()), needle.size(), skiptable); - return bm_latin1_find( - reinterpret_cast(haystack.begin()), haystack.size(), from, - reinterpret_cast(needle.begin()), needle.size(), skiptable); + + QLatin1StringMatcher matcher(needle, Qt::CaseSensitivity::CaseInsensitive); + return matcher.indexIn(haystack, from); } qsizetype QtPrivate::lastIndexOf(QStringView haystack, qsizetype from, QStringView needle, Qt::CaseSensitivity cs) noexcept diff --git a/src/tools/bootstrap/CMakeLists.txt b/src/tools/bootstrap/CMakeLists.txt index 5a2299e37c..2018a29b94 100644 --- a/src/tools/bootstrap/CMakeLists.txt +++ b/src/tools/bootstrap/CMakeLists.txt @@ -68,6 +68,7 @@ qt_internal_extend_target(Bootstrap ../../corelib/text/qbytearray.cpp ../../corelib/text/qbytearraylist.cpp ../../corelib/text/qbytearraymatcher.cpp + ../../corelib/text/qlatin1stringmatcher.cpp ../../corelib/text/qlocale.cpp ../../corelib/text/qlocale_tools.cpp ../../corelib/text/qregularexpression.cpp diff --git a/tests/auto/corelib/text/CMakeLists.txt b/tests/auto/corelib/text/CMakeLists.txt index 6bfcadbaaa..27766b6791 100644 --- a/tests/auto/corelib/text/CMakeLists.txt +++ b/tests/auto/corelib/text/CMakeLists.txt @@ -13,6 +13,7 @@ add_subdirectory(qbytearrayview) add_subdirectory(qbytedatabuffer) add_subdirectory(qchar) add_subdirectory(qcollator) +add_subdirectory(qlatin1stringmatcher) add_subdirectory(qlatin1stringview) add_subdirectory(qregularexpression) add_subdirectory(qstring) diff --git a/tests/auto/corelib/text/qlatin1stringmatcher/CMakeLists.txt b/tests/auto/corelib/text/qlatin1stringmatcher/CMakeLists.txt new file mode 100644 index 0000000000..fb4ef9fd8b --- /dev/null +++ b/tests/auto/corelib/text/qlatin1stringmatcher/CMakeLists.txt @@ -0,0 +1,14 @@ +# Copyright (C) 2022 The Qt Company Ltd. +# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 + +##################################################################### +## tst_qlatin1sgtringmatcher Test: +##################################################################### + +qt_internal_add_test(tst_qlatin1stringmatcher + SOURCES + tst_qlatin1stringmatcher.cpp +) + +## Scopes: +##################################################################### diff --git a/tests/auto/corelib/text/qlatin1stringmatcher/tst_qlatin1stringmatcher.cpp b/tests/auto/corelib/text/qlatin1stringmatcher/tst_qlatin1stringmatcher.cpp new file mode 100644 index 0000000000..9572466a7b --- /dev/null +++ b/tests/auto/corelib/text/qlatin1stringmatcher/tst_qlatin1stringmatcher.cpp @@ -0,0 +1,306 @@ +// Copyright (C) 2022 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 + +#include + +#include + +#include +#include + +#if QT_CONFIG(cxx11_future) +# include +#endif + +// COM interface +#if defined(interface) +# undef interface +#endif + +using namespace Qt::Literals::StringLiterals; + +class tst_QLatin1StringMatcher : public QObject +{ + Q_OBJECT + +private slots: + void overloads(); + void interface(); + void indexIn(); + void haystacksWithMoreThan4GiBWork(); +}; + +void tst_QLatin1StringMatcher::overloads() +{ + QLatin1StringView hello = "hello"_L1; + QByteArray hello2B = QByteArrayView(hello).toByteArray().repeated(2); + QLatin1StringView hello2(hello2B); + { + QLatin1StringMatcher m("hello"_L1, Qt::CaseSensitive); + QCOMPARE(m.pattern(), "hello"_L1); + QCOMPARE(m.indexIn("hello"_L1), 0); + QCOMPARE(m.indexIn("Hello"_L1), -1); + QCOMPARE(m.indexIn("Hellohello"_L1), 5); + QCOMPARE(m.indexIn("helloHello"_L1), 0); + QCOMPARE(m.indexIn("helloHello"_L1, 1), -1); + } + { + QLatin1StringMatcher m("Hello"_L1, Qt::CaseSensitive); + QCOMPARE(m.pattern(), "Hello"_L1); + QCOMPARE(m.indexIn("hello"_L1), -1); + QCOMPARE(m.indexIn("Hello"_L1), 0); + QCOMPARE(m.indexIn("Hellohello"_L1), 0); + QCOMPARE(m.indexIn("helloHello"_L1), 5); + QCOMPARE(m.indexIn("helloHello"_L1, 6), -1); + } + { + QLatin1StringMatcher m("hello"_L1, Qt::CaseInsensitive); + QCOMPARE(m.pattern(), "hello"_L1); + QCOMPARE(m.indexIn("hello"_L1), 0); + QCOMPARE(m.indexIn("Hello"_L1), 0); + QCOMPARE(m.indexIn("Hellohello"_L1), 0); + QCOMPARE(m.indexIn("helloHello"_L1), 0); + QCOMPARE(m.indexIn("helloHello"_L1, 1), 5); + QCOMPARE(m.indexIn("helloHello"_L1, 6), -1); + } + { + QLatin1StringMatcher m("Hello"_L1, Qt::CaseInsensitive); + QCOMPARE(m.pattern(), "Hello"_L1); + QCOMPARE(m.indexIn("hello"_L1), 0); + QCOMPARE(m.indexIn("Hello"_L1), 0); + QCOMPARE(m.indexIn("Hellohello"_L1), 0); + QCOMPARE(m.indexIn("helloHello"_L1), 0); + QCOMPARE(m.indexIn("helloHello"_L1, 1), 5); + QCOMPARE(m.indexIn("helloHello"_L1, 6), -1); + } + { + QLatin1StringMatcher m(hello, Qt::CaseSensitive); + QCOMPARE(m.pattern(), "hello"_L1); + QCOMPARE(m.indexIn(hello), 0); + QCOMPARE(m.indexIn(hello, 1), -1); + QCOMPARE(m.indexIn(hello2, 1), hello.size()); + QCOMPARE(m.indexIn(hello2, 6), -1); + } +} + +void tst_QLatin1StringMatcher::interface() +{ + QLatin1StringView needle = "abc123"_L1; + QByteArray haystackT(500, 'a'); + haystackT.insert(6, "123"); + haystackT.insert(31, "abc"); + haystackT.insert(42, "abc123"); + haystackT.insert(84, "abc123"); + QLatin1StringView haystack(haystackT); + + QLatin1StringMatcher matcher1; + + matcher1 = QLatin1StringMatcher(needle, Qt::CaseSensitive); + QLatin1StringMatcher matcher2; + matcher2.setPattern(needle); + + QLatin1StringMatcher matcher3 = QLatin1StringMatcher(needle, Qt::CaseSensitive); + QLatin1StringMatcher matcher4; + matcher4 = matcher3; + + QCOMPARE(matcher1.indexIn(haystack), 42); + QCOMPARE(matcher2.indexIn(haystack), 42); + QCOMPARE(matcher3.indexIn(haystack), 42); + QCOMPARE(matcher4.indexIn(haystack), 42); + + QCOMPARE(matcher1.indexIn(haystack, 43), 84); + QCOMPARE(matcher1.indexIn(haystack, 85), -1); + + QLatin1StringMatcher matcher5("123"_L1, Qt::CaseSensitive); + QCOMPARE(matcher5.indexIn(haystack), 6); + + matcher5 = QLatin1StringMatcher("abc"_L1, Qt::CaseSensitive); + QCOMPARE(matcher5.indexIn(haystack), 31); + + matcher5.setPattern(matcher4.pattern()); + QCOMPARE(matcher5.indexIn(haystack), 42); + + QLatin1StringMatcher matcher6 = matcher5; + QCOMPARE(matcher6.indexIn(haystack), 42); + + QLatin1StringMatcher matcher7 = std::move(matcher5); + QCOMPARE(matcher7.indexIn(haystack), 42); + + matcher1.setPattern("123"_L1); + matcher7 = std::move(matcher1); + QCOMPARE(matcher7.indexIn(haystack), 6); +} + +#define LONG_STRING__32 "abcdefghijklmnopqrstuvwxyz012345" +#define LONG_STRING__64 LONG_STRING__32 LONG_STRING__32 +#define LONG_STRING_128 LONG_STRING__64 LONG_STRING__64 +#define LONG_STRING_256 LONG_STRING_128 LONG_STRING_128 +#define LONG_STRING_512 LONG_STRING_256 LONG_STRING_256 + +void tst_QLatin1StringMatcher::indexIn() +{ + const char p_data[] = { 0x0, 0x0, 0x1 }; + QLatin1StringView pattern(p_data, sizeof(p_data)); + + QByteArray haystackT(8, '\0'); + haystackT[7] = 0x1; + QLatin1StringView haystack(haystackT); + + QLatin1StringMatcher matcher; + + matcher = QLatin1StringMatcher(pattern, Qt::CaseSensitive); + QCOMPARE(matcher.indexIn(haystack, 0), 5); + QCOMPARE(matcher.indexIn(haystack, 1), 5); + QCOMPARE(matcher.indexIn(haystack, 2), 5); + + matcher.setPattern(pattern); + QCOMPARE(matcher.indexIn(haystack, 0), 5); + QCOMPARE(matcher.indexIn(haystack, 1), 5); + QCOMPARE(matcher.indexIn(haystack, 2), 5); + + std::array allChars; + for (int i = 0; i < 256; ++i) + allChars[i] = char(i); + + matcher = QLatin1StringMatcher(QLatin1StringView(allChars), Qt::CaseSensitive); + haystackT = LONG_STRING__32 "x"; + haystackT += matcher.pattern(); + haystack = QLatin1StringView(haystackT); + QCOMPARE(matcher.indexIn(haystack, 0), 33); + QCOMPARE(matcher.indexIn(haystack, 1), 33); + QCOMPARE(matcher.indexIn(haystack, 2), 33); + QCOMPARE(matcher.indexIn(haystack, 33), 33); + QCOMPARE(matcher.indexIn(haystack, 34), -1); + + matcher = QLatin1StringMatcher(QLatin1StringView(LONG_STRING_256), Qt::CaseSensitive); + haystackT = QByteArray(LONG_STRING__32 "x"); + haystackT += matcher.pattern(); + haystackT += QByteArrayView("Just junk at the end"); + haystack = QLatin1StringView(haystackT); + QCOMPARE(matcher.indexIn(haystack, 0), 33); + QCOMPARE(matcher.indexIn(haystack, 1), 33); + QCOMPARE(matcher.indexIn(haystack, 2), 33); + QCOMPARE(matcher.indexIn(haystack, 33), 33); + QCOMPARE(matcher.indexIn(haystack, 34), -1); + matcher.setCaseSensitivity(Qt::CaseInsensitive); + QCOMPARE(matcher.indexIn(haystack, 0), 33); + QCOMPARE(matcher.indexIn(haystack, 1), 33); + QCOMPARE(matcher.indexIn(haystack, 2), 33); + QCOMPARE(matcher.indexIn(haystack, 33), 33); + QCOMPARE(matcher.indexIn(haystack, 34), -1); + + matcher = QLatin1StringMatcher(QLatin1StringView(LONG_STRING_512), Qt::CaseInsensitive); + haystackT = QByteArray(LONG_STRING__32 "x"); + haystackT += matcher.pattern(); + haystackT += QByteArrayView("Just junk at the end"); + haystack = QLatin1StringView(haystackT); + QCOMPARE(matcher.indexIn(haystack, 0), 33); + QCOMPARE(matcher.indexIn(haystack, 1), 33); + QCOMPARE(matcher.indexIn(haystack, 2), 33); + QCOMPARE(matcher.indexIn(haystack, 33), 33); + QCOMPARE(matcher.indexIn(haystack, 34), -1); + matcher.setCaseSensitivity(Qt::CaseSensitive); + QCOMPARE(matcher.indexIn(haystack, 0), 33); + QCOMPARE(matcher.indexIn(haystack, 1), 33); + QCOMPARE(matcher.indexIn(haystack, 2), 33); + QCOMPARE(matcher.indexIn(haystack, 33), 33); + QCOMPARE(matcher.indexIn(haystack, 34), -1); + + matcher = QLatin1StringMatcher(QLatin1StringView(""), Qt::CaseSensitive); + haystackT = QByteArray(LONG_STRING__32 "x"); + haystack = QLatin1StringView(haystackT); + QCOMPARE(matcher.indexIn(haystack, 0), 0); + QCOMPARE(matcher.indexIn(haystack, 1), 1); + QCOMPARE(matcher.indexIn(haystack, 2), 2); + QCOMPARE(matcher.indexIn(haystack, 33), 33); + + matcher = QLatin1StringMatcher(QLatin1StringView(""), Qt::CaseInsensitive); + haystackT = QByteArray(LONG_STRING__32 "x"); + haystack = QLatin1StringView(haystackT); + QCOMPARE(matcher.indexIn(haystack, 0), 0); + QCOMPARE(matcher.indexIn(haystack, 1), 1); + QCOMPARE(matcher.indexIn(haystack, 2), 2); + QCOMPARE(matcher.indexIn(haystack, 33), 33); + + matcher = QLatin1StringMatcher(QLatin1StringView("m\xF8"), Qt::CaseInsensitive); + haystackT = QByteArray("M\xF8m\xF8"); + haystack = QLatin1StringView(haystackT); + QCOMPARE(matcher.indexIn(haystack, 0), 0); + QCOMPARE(matcher.indexIn(haystack, 1), 2); + QCOMPARE(matcher.indexIn(haystack, 2), 2); + QCOMPARE(matcher.indexIn(haystack, 3), -1); + matcher.setCaseSensitivity(Qt::CaseSensitive); + QCOMPARE(matcher.indexIn(haystack, 0), 2); + QCOMPARE(matcher.indexIn(haystack, 1), 2); + QCOMPARE(matcher.indexIn(haystack, 2), 2); + QCOMPARE(matcher.indexIn(haystack, 3), -1); +} + +void tst_QLatin1StringMatcher::haystacksWithMoreThan4GiBWork() +{ +#if QT_POINTER_SIZE > 4 + // use a large needle to trigger long skips in the Boyer-Moore algorithm + // (to speed up the test) + constexpr std::string_view needle = LONG_STRING_256; + + // + // GIVEN: a haystack with more than 4 GiB of data + // + + // don't use QByteArray because freeSpaceAtEnd() may break reserve() + // semantics and a realloc is the last thing we need here + std::string large; + QElapsedTimer timer; + timer.start(); + constexpr size_t GiB = 1024 * 1024 * 1024; + constexpr size_t BaseSize = 4 * GiB + 1; + try { + large.reserve(BaseSize + needle.size()); + large.resize(BaseSize, '\0'); + large.append(needle); + } catch (const std::bad_alloc &) { + QSKIP("Could not allocate 4GiB plus a couple hundred bytes of RAM."); + } + QCOMPARE(large.size(), BaseSize + needle.size()); + qDebug("created dataset in %lld ms", timer.elapsed()); + +# if QT_CONFIG(cxx11_future) + using MaybeThread = std::thread; +# else + struct MaybeThread + { + std::function func; + void join() { func(); } + }; +# endif + + // + // WHEN: trying to match an occurrence past the 4GiB mark + // + + qsizetype dynamicResult; + + auto t = MaybeThread{ [&] { + QLatin1StringMatcher m(QLatin1StringView(needle), Qt::CaseSensitive); + dynamicResult = m.indexIn(QLatin1StringView(large)); + } }; + t.join(); + + // + // THEN: the result index is not trucated + // + + QCOMPARE(dynamicResult, qsizetype(BaseSize)); +#else + QSKIP("This test is 64-bit only."); +#endif +} + +#undef LONG_STRING_512 +#undef LONG_STRING_256 +#undef LONG_STRING_128 +#undef LONG_STRING__64 +#undef LONG_STRING__32 + +QTEST_APPLESS_MAIN(tst_QLatin1StringMatcher) +#include "tst_qlatin1stringmatcher.moc"