Add Boyer-Moore Latin-1 string searcher with optional case sensitivity

[ChangeLog][QtCore][QString] Added Boyer-Moore Latin-1 string searcher with optional case sensitivity

Task-number: QTBUG-100236
Change-Id: I200a0dac7c8012add1ee02511dba791d233115e0
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Øystein Heskestad 2022-10-04 17:24:17 +02:00
parent bb5d4094e0
commit 3fedcd4e4a
8 changed files with 699 additions and 98 deletions

View File

@ -218,6 +218,7 @@ qt_internal_add_module(Core
text/qchar.h
text/qcollator.cpp text/qcollator.h text/qcollator_p.h
text/qdoublescanprint_p.h
text/qlatin1stringmatcher.cpp text/qlatin1stringmatcher.h
text/qlocale.cpp text/qlocale.h text/qlocale_p.h
text/qlocale_data_p.h
text/qlocale_tools.cpp text/qlocale_tools_p.h

View File

@ -0,0 +1,198 @@
// Copyright (C) 2022 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#include "qlatin1stringmatcher.h"
#include <limits.h>
QT_BEGIN_NAMESPACE
/*! \class QLatin1StringMatcher
\inmodule QtCore
\brief Optimized search for substring in Latin-1 text.
A QLatin1StringMatcher can search for one QLatin1StringView
as a substring of another, either ignoring case or taking it into
account.
\since 6.5
\ingroup tools
\ingroup string-processing
This class is useful when you have a Latin-1 encoded string that
you want to repeatedly search for in some QLatin1StringViews
(perhaps in a loop), or when you want to search for all
instances of it in a given QLatin1StringView. Using a matcher
object and indexIn() is faster than matching a plain
QLatin1StringView with QLatin1StringView::indexOf() if repeated
matching takes place. This class offers no benefit if you are
doing one-off matches. The string to be searched for must not
be destroyed or changed before the matcher object is destroyed,
as the matcher accesses the string when searching for it.
Create a QLatin1StringMatcher for the QLatin1StringView
you want to search for and the case sensitivity. Then call
indexIn() with the QLatin1StringView that you want to search
within.
\sa QLatin1StringView, QStringMatcher, QByteArrayMatcher
*/
/*!
Construct an empty Latin-1 string matcher.
This will match at each position in any string.
\sa setPattern(), setCaseSensitivity(), indexIn()
*/
QLatin1StringMatcher::QLatin1StringMatcher() noexcept
: m_pattern(),
m_cs(Qt::CaseSensitive),
m_caseSensitiveSearcher(m_pattern.data(), m_pattern.data())
{
}
/*!
Constructs a Latin-1 string matcher that searches for the given \a pattern
with given case sensitivity \a cs. The \a pattern argument must
not be destroyed before this matcher object. Call indexIn()
to find the \a pattern in the given QLatin1StringView.
*/
QLatin1StringMatcher::QLatin1StringMatcher(QLatin1StringView pattern,
Qt::CaseSensitivity cs) noexcept
: m_pattern(pattern), m_cs(cs)
{
setSearcher();
}
/*!
Destroys the Latin-1 string matcher.
*/
QLatin1StringMatcher::~QLatin1StringMatcher() noexcept
{
freeSearcher();
}
/*!
\internal
*/
void QLatin1StringMatcher::setSearcher() noexcept
{
if (m_cs == Qt::CaseSensitive) {
new (&m_caseSensitiveSearcher) CaseSensitiveSearcher(m_pattern.data(), m_pattern.end());
} else {
QtPrivate::QCaseInsensitiveLatin1Hash foldCase;
qsizetype bufferSize = std::min(m_pattern.size(), qsizetype(sizeof m_foldBuffer));
for (qsizetype i = 0; i < bufferSize; ++i)
m_foldBuffer[i] = static_cast<char>(foldCase(m_pattern[i].toLatin1()));
new (&m_caseInsensitiveSearcher)
CaseInsensitiveSearcher(m_foldBuffer, &m_foldBuffer[bufferSize]);
}
}
/*!
\internal
*/
void QLatin1StringMatcher::freeSearcher() noexcept
{
if (m_cs == Qt::CaseSensitive)
m_caseSensitiveSearcher.~CaseSensitiveSearcher();
else
m_caseInsensitiveSearcher.~CaseInsensitiveSearcher();
}
/*!
Sets the \a pattern to search for. The string pointed to by the
QLatin1StringView must not be destroyed before the matcher is
destroyed, unless it is set to point to a different \a pattern
with longer lifetime first.
\sa pattern(), indexIn()
*/
void QLatin1StringMatcher::setPattern(QLatin1StringView pattern) noexcept
{
if (m_pattern.latin1() == pattern.latin1() && m_pattern.size() == pattern.size())
return; // Same address and size
freeSearcher();
m_pattern = pattern;
setSearcher();
}
/*!
Returns the Latin-1 pattern that the matcher searches for.
\sa setPattern(), indexIn()
*/
QLatin1StringView QLatin1StringMatcher::pattern() const noexcept
{
return m_pattern;
}
/*!
Sets the case sensitivity to \a cs.
\sa caseSensitivity(), indexIn()
*/
void QLatin1StringMatcher::setCaseSensitivity(Qt::CaseSensitivity cs) noexcept
{
if (m_cs == cs)
return;
freeSearcher();
m_cs = cs;
setSearcher();
}
/*!
Returns the case sensitivity the matcher uses.
\sa setCaseSensitivity(), indexIn()
*/
Qt::CaseSensitivity QLatin1StringMatcher::caseSensitivity() const noexcept
{
return m_cs;
}
/*!
Searches for the pattern in the given \a haystack starting from
\a from.
\sa caseSensitivity(), pattern()
*/
qsizetype QLatin1StringMatcher::indexIn(QLatin1StringView haystack, qsizetype from) const noexcept
{
if (m_pattern.isEmpty() && from == haystack.size())
return from;
if (from >= haystack.size())
return -1;
auto begin = haystack.begin() + from;
auto end = haystack.end();
auto found = begin;
if (m_cs == Qt::CaseSensitive) {
found = m_caseSensitiveSearcher(begin, end, m_pattern.begin(), m_pattern.end()).begin;
if (found == end)
return -1;
} else {
const qsizetype bufferSize = std::min(m_pattern.size(), qsizetype(sizeof m_foldBuffer));
const QLatin1StringView restNeedle = m_pattern.sliced(bufferSize);
const bool needleLongerThanBuffer = restNeedle.size() > 0;
QLatin1StringView restHaystack = haystack;
do {
found = m_caseInsensitiveSearcher(found, end, m_foldBuffer, &m_foldBuffer[bufferSize])
.begin;
if (found == end) {
return -1;
} else if (!needleLongerThanBuffer) {
break;
}
restHaystack = haystack.sliced(
qMin(haystack.size(),
bufferSize + qsizetype(std::distance(haystack.begin(), found))));
if (restHaystack.startsWith(restNeedle, Qt::CaseInsensitive))
break;
++found;
} while (true);
}
return std::distance(haystack.begin(), found);
}
QT_END_NAMESPACE

View File

@ -0,0 +1,160 @@
// Copyright (C) 2022 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#ifndef QLATIN1STRINGMATCHER_H
#define QLATIN1STRINGMATCHER_H
#include <functional>
#include <iterator>
#include <limits>
#include <QtCore/q20algorithm.h>
#include <QtCore/qstring.h>
QT_BEGIN_NAMESPACE
namespace QtPrivate {
template<class RandomIt1,
class Hash = std::hash<typename std::iterator_traits<RandomIt1>::value_type>,
class BinaryPredicate = std::equal_to<>>
class q_boyer_moore_searcher_hashed_needle
{
public:
constexpr q_boyer_moore_searcher_hashed_needle(RandomIt1 pat_first, RandomIt1 pat_last)
: m_skiptable{}
{
const size_t n = std::distance(pat_first, pat_last);
constexpr auto uchar_max = (std::numeric_limits<uchar>::max)();
uchar max = n > uchar_max ? uchar_max : uchar(n);
q20::fill(std::begin(m_skiptable), std::end(m_skiptable), max);
RandomIt1 pattern = pat_first;
pattern += n - max;
while (max--)
m_skiptable[uchar(*pattern++)] = max;
}
template<class RandomIt2>
constexpr auto operator()(RandomIt2 first, RandomIt2 last, RandomIt1 pat_first,
RandomIt1 pat_last) const
{
struct R
{
RandomIt2 begin, end;
};
Hash hf;
BinaryPredicate pred;
auto pat_length = std::distance(pat_first, pat_last);
if (pat_length == 0)
return R{ first, first };
const qsizetype pl_minus_one = qsizetype(pat_length - 1);
RandomIt2 current = first + pl_minus_one;
while (current < last) {
qsizetype skip = m_skiptable[hf(*current)];
if (!skip) {
// possible match
while (skip < pat_length) {
if (!pred(hf(*(current - skip)), uchar(pat_first[pl_minus_one - skip])))
break;
skip++;
}
if (skip > pl_minus_one) { // we have a match
auto match = current - skip + 1;
return R{ match, match + pat_length };
}
// If we don't have a match we are a bit inefficient as we only skip by one
// when we have the non matching char in the string.
if (m_skiptable[hf(*(current - skip))] == pat_length)
skip = pat_length - skip;
else
skip = 1;
}
current += skip;
}
return R{ last, last };
}
private:
alignas(16) uchar m_skiptable[256];
};
struct QCaseSensitiveLatin1Hash
{
constexpr QCaseSensitiveLatin1Hash() = default;
constexpr std::size_t operator()(char c) const noexcept { return std::size_t(uchar(c)); }
};
struct QCaseInsensitiveLatin1Hash
{
constexpr QCaseInsensitiveLatin1Hash() = default;
constexpr std::size_t operator()(char c) const noexcept
{
return std::size_t(latin1Lower[uchar(c)]);
}
private:
static constexpr uchar latin1Lower[256] = {
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x5b,0x5c,0x5d,0x5e,0x5f,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
// 0xd7 (multiplication sign) and 0xdf (sz ligature) complicate life
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xd7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xdf,
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
};
};
}
class QLatin1StringMatcher
{
public:
Q_CORE_EXPORT QLatin1StringMatcher() noexcept;
Q_CORE_EXPORT explicit QLatin1StringMatcher(
QLatin1StringView pattern, Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept;
Q_CORE_EXPORT ~QLatin1StringMatcher() noexcept;
Q_CORE_EXPORT void setPattern(QLatin1StringView pattern) noexcept;
Q_CORE_EXPORT QLatin1StringView pattern() const noexcept;
Q_CORE_EXPORT void setCaseSensitivity(Qt::CaseSensitivity cs) noexcept;
Q_CORE_EXPORT Qt::CaseSensitivity caseSensitivity() const noexcept;
Q_CORE_EXPORT qsizetype indexIn(QLatin1StringView haystack, qsizetype from = 0) const noexcept;
private:
void setSearcher() noexcept;
void freeSearcher() noexcept;
QLatin1StringView m_pattern;
Qt::CaseSensitivity m_cs;
typedef QtPrivate::q_boyer_moore_searcher_hashed_needle<const char *,
QtPrivate::QCaseSensitiveLatin1Hash>
CaseSensitiveSearcher;
typedef QtPrivate::q_boyer_moore_searcher_hashed_needle<const char *,
QtPrivate::QCaseInsensitiveLatin1Hash>
CaseInsensitiveSearcher;
union {
CaseSensitiveSearcher m_caseSensitiveSearcher;
CaseInsensitiveSearcher m_caseInsensitiveSearcher;
};
char m_foldBuffer[256];
};
QT_END_NAMESPACE
#endif // QLATIN1MATCHER_H

View File

@ -38,13 +38,12 @@
#include <wchar.h>
#include "qchar.cpp"
#include "qlatin1stringmatcher.h"
#include "qstringmatcher.cpp"
#include "qstringiterator_p.h"
#include "qstringalgorithms_p.h"
#include "qthreadstorage.h"
#include "qbytearraymatcher.h" // Helper for comparison of QLatin1StringView
#include <algorithm>
#include <functional>
@ -1415,70 +1414,6 @@ static int latin1nicmp(const char *lhsChar, qsizetype lSize, const char *rhsChar
return lencmp(lSize, rSize);
}
namespace {
template<Qt::CaseSensitivity cs>
inline uchar latin1_fold(const uchar c)
{
return c;
}
template<>
inline uchar latin1_fold<Qt::CaseSensitivity::CaseInsensitive>(const uchar c)
{
return latin1Lower[c];
}
template<Qt::CaseSensitivity cs>
inline void bm_latin1_init_skiptable(const uchar *cc, qsizetype len, uchar *skiptable)
{
int l = int(qMin(len, qsizetype(255)));
memset(skiptable, l, 256 * sizeof(uchar));
cc += len - l;
while (l--)
skiptable[latin1_fold<cs>(*cc++)] = l;
}
template<Qt::CaseSensitivity cs>
inline qsizetype bm_latin1_find(const uchar *cc, qsizetype l, qsizetype index, const uchar *puc,
qsizetype pl, const uchar *skiptable)
{
if (pl == 0)
return index > l ? -1 : index;
const qsizetype pl_minus_one = pl - 1;
const uchar *current = cc + index + pl_minus_one;
const uchar *end = cc + l;
while (current < end) {
qsizetype skip = skiptable[latin1_fold<cs>(*current)];
if (!skip) {
// possible match
while (skip < pl) {
if (latin1_fold<cs>(*(current - skip)) != latin1_fold<cs>(puc[pl_minus_one - skip]))
break;
skip++;
}
if (skip > pl_minus_one) // we have a match
return (current - cc) - skip + 1;
// in case we don't have a match we are a bit inefficient as we only skip by one
// when we have the non matching char in the string.
if (skiptable[latin1_fold<cs>(*(current - skip))] == pl)
skip = pl - skip;
else
skip = 1;
}
if (current > end - skip)
break;
current += skip;
}
return -1; // not found
}
}
bool QtPrivate::equalStrings(QStringView lhs, QStringView rhs) noexcept
{
return ucstreq(lhs.utf16(), lhs.size(), rhs.utf16(), rhs.size());
@ -10686,16 +10621,10 @@ qsizetype QtPrivate::count(QLatin1StringView haystack, QLatin1StringView needle,
qsizetype num = 0;
qsizetype i = -1;
// TODO: use Boyer-Moore searcher for case-insensitive search too
// when QTBUG-100236 is done
if (cs == Qt::CaseSensitive) {
QByteArrayMatcher matcher(needle);
while ((i = matcher.indexIn(haystack, i + 1)) != -1)
++num;
} else {
while ((i = QtPrivate::findString(haystack, i + 1, needle, cs)) != -1)
++num;
}
QLatin1StringMatcher matcher(needle, cs);
while ((i = matcher.indexIn(haystack, i + 1)) != -1)
++num;
return num;
}
@ -10710,19 +10639,14 @@ qsizetype QtPrivate::count(QLatin1StringView haystack, QStringView needle, Qt::C
qsizetype num = 0;
qsizetype i = -1;
// TODO: use Boyer-Moore searcher for case-insensitive search too
// when QTBUG-100236 is done
if (cs == Qt::CaseSensitive) {
QVarLengthArray<uchar> s(needle.size());
qt_to_latin1_unchecked(s.data(), needle.utf16(), needle.size());
QVarLengthArray<uchar> s(needle.size());
qt_to_latin1_unchecked(s.data(), needle.utf16(), needle.size());
QLatin1StringMatcher matcher(QLatin1StringView(reinterpret_cast<char *>(s.data()), s.size()),
cs);
while ((i = matcher.indexIn(haystack, i + 1)) != -1)
++num;
QByteArrayMatcher matcher(s);
while ((i = matcher.indexIn(haystack, i + 1)) != -1)
++num;
} else {
while ((i = QtPrivate::findString(haystack, i + 1, needle, cs)) != -1)
++num;
}
return num;
}
@ -10743,12 +10667,11 @@ qsizetype QtPrivate::count(QLatin1StringView haystack, QChar needle, Qt::CaseSen
if (needle.unicode() > 0xff)
return 0;
const char needleL1 = needle.toLatin1();
if (cs == Qt::CaseSensitive) {
return std::count(haystack.cbegin(), haystack.cend(), needleL1);
return std::count(haystack.cbegin(), haystack.cend(), needle.toLatin1());
} else {
auto toLower = [](char ch) { return latin1Lower[uchar(ch)]; };
const uchar ch = toLower(needleL1);
const uchar ch = toLower(needle.toLatin1());
return std::count_if(haystack.cbegin(), haystack.cend(), [&toLower, ch](const char c) {
return toLower(c) == ch;
});
@ -10961,7 +10884,7 @@ qsizetype QtPrivate::findString(QLatin1StringView haystack, qsizetype from, QLat
return -1;
}
const QByteArrayMatcher matcher(needle);
const QLatin1StringMatcher matcher(needle, Qt::CaseSensitivity::CaseSensitive);
return matcher.indexIn(haystack, from);
}
@ -10995,12 +10918,9 @@ qsizetype QtPrivate::findString(QLatin1StringView haystack, qsizetype from, QLat
}
return -1;
}
uchar skiptable[256];
bm_latin1_init_skiptable<Qt::CaseSensitivity::CaseInsensitive>(
reinterpret_cast<const unsigned char *>(needle.begin()), needle.size(), skiptable);
return bm_latin1_find<Qt::CaseSensitivity::CaseInsensitive>(
reinterpret_cast<const unsigned char *>(haystack.begin()), haystack.size(), from,
reinterpret_cast<const unsigned char *>(needle.begin()), needle.size(), skiptable);
QLatin1StringMatcher matcher(needle, Qt::CaseSensitivity::CaseInsensitive);
return matcher.indexIn(haystack, from);
}
qsizetype QtPrivate::lastIndexOf(QStringView haystack, qsizetype from, QStringView needle, Qt::CaseSensitivity cs) noexcept

View File

@ -68,6 +68,7 @@ qt_internal_extend_target(Bootstrap
../../corelib/text/qbytearray.cpp
../../corelib/text/qbytearraylist.cpp
../../corelib/text/qbytearraymatcher.cpp
../../corelib/text/qlatin1stringmatcher.cpp
../../corelib/text/qlocale.cpp
../../corelib/text/qlocale_tools.cpp
../../corelib/text/qregularexpression.cpp

View File

@ -13,6 +13,7 @@ add_subdirectory(qbytearrayview)
add_subdirectory(qbytedatabuffer)
add_subdirectory(qchar)
add_subdirectory(qcollator)
add_subdirectory(qlatin1stringmatcher)
add_subdirectory(qlatin1stringview)
add_subdirectory(qregularexpression)
add_subdirectory(qstring)

View File

@ -0,0 +1,14 @@
# Copyright (C) 2022 The Qt Company Ltd.
# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#####################################################################
## tst_qlatin1sgtringmatcher Test:
#####################################################################
qt_internal_add_test(tst_qlatin1stringmatcher
SOURCES
tst_qlatin1stringmatcher.cpp
)
## Scopes:
#####################################################################

View File

@ -0,0 +1,306 @@
// Copyright (C) 2022 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#include <QTest>
#include <QtCore/QLatin1StringMatcher>
#include <numeric>
#include <string>
#if QT_CONFIG(cxx11_future)
# include <thread>
#endif
// COM interface
#if defined(interface)
# undef interface
#endif
using namespace Qt::Literals::StringLiterals;
class tst_QLatin1StringMatcher : public QObject
{
Q_OBJECT
private slots:
void overloads();
void interface();
void indexIn();
void haystacksWithMoreThan4GiBWork();
};
void tst_QLatin1StringMatcher::overloads()
{
QLatin1StringView hello = "hello"_L1;
QByteArray hello2B = QByteArrayView(hello).toByteArray().repeated(2);
QLatin1StringView hello2(hello2B);
{
QLatin1StringMatcher m("hello"_L1, Qt::CaseSensitive);
QCOMPARE(m.pattern(), "hello"_L1);
QCOMPARE(m.indexIn("hello"_L1), 0);
QCOMPARE(m.indexIn("Hello"_L1), -1);
QCOMPARE(m.indexIn("Hellohello"_L1), 5);
QCOMPARE(m.indexIn("helloHello"_L1), 0);
QCOMPARE(m.indexIn("helloHello"_L1, 1), -1);
}
{
QLatin1StringMatcher m("Hello"_L1, Qt::CaseSensitive);
QCOMPARE(m.pattern(), "Hello"_L1);
QCOMPARE(m.indexIn("hello"_L1), -1);
QCOMPARE(m.indexIn("Hello"_L1), 0);
QCOMPARE(m.indexIn("Hellohello"_L1), 0);
QCOMPARE(m.indexIn("helloHello"_L1), 5);
QCOMPARE(m.indexIn("helloHello"_L1, 6), -1);
}
{
QLatin1StringMatcher m("hello"_L1, Qt::CaseInsensitive);
QCOMPARE(m.pattern(), "hello"_L1);
QCOMPARE(m.indexIn("hello"_L1), 0);
QCOMPARE(m.indexIn("Hello"_L1), 0);
QCOMPARE(m.indexIn("Hellohello"_L1), 0);
QCOMPARE(m.indexIn("helloHello"_L1), 0);
QCOMPARE(m.indexIn("helloHello"_L1, 1), 5);
QCOMPARE(m.indexIn("helloHello"_L1, 6), -1);
}
{
QLatin1StringMatcher m("Hello"_L1, Qt::CaseInsensitive);
QCOMPARE(m.pattern(), "Hello"_L1);
QCOMPARE(m.indexIn("hello"_L1), 0);
QCOMPARE(m.indexIn("Hello"_L1), 0);
QCOMPARE(m.indexIn("Hellohello"_L1), 0);
QCOMPARE(m.indexIn("helloHello"_L1), 0);
QCOMPARE(m.indexIn("helloHello"_L1, 1), 5);
QCOMPARE(m.indexIn("helloHello"_L1, 6), -1);
}
{
QLatin1StringMatcher m(hello, Qt::CaseSensitive);
QCOMPARE(m.pattern(), "hello"_L1);
QCOMPARE(m.indexIn(hello), 0);
QCOMPARE(m.indexIn(hello, 1), -1);
QCOMPARE(m.indexIn(hello2, 1), hello.size());
QCOMPARE(m.indexIn(hello2, 6), -1);
}
}
void tst_QLatin1StringMatcher::interface()
{
QLatin1StringView needle = "abc123"_L1;
QByteArray haystackT(500, 'a');
haystackT.insert(6, "123");
haystackT.insert(31, "abc");
haystackT.insert(42, "abc123");
haystackT.insert(84, "abc123");
QLatin1StringView haystack(haystackT);
QLatin1StringMatcher matcher1;
matcher1 = QLatin1StringMatcher(needle, Qt::CaseSensitive);
QLatin1StringMatcher matcher2;
matcher2.setPattern(needle);
QLatin1StringMatcher matcher3 = QLatin1StringMatcher(needle, Qt::CaseSensitive);
QLatin1StringMatcher matcher4;
matcher4 = matcher3;
QCOMPARE(matcher1.indexIn(haystack), 42);
QCOMPARE(matcher2.indexIn(haystack), 42);
QCOMPARE(matcher3.indexIn(haystack), 42);
QCOMPARE(matcher4.indexIn(haystack), 42);
QCOMPARE(matcher1.indexIn(haystack, 43), 84);
QCOMPARE(matcher1.indexIn(haystack, 85), -1);
QLatin1StringMatcher matcher5("123"_L1, Qt::CaseSensitive);
QCOMPARE(matcher5.indexIn(haystack), 6);
matcher5 = QLatin1StringMatcher("abc"_L1, Qt::CaseSensitive);
QCOMPARE(matcher5.indexIn(haystack), 31);
matcher5.setPattern(matcher4.pattern());
QCOMPARE(matcher5.indexIn(haystack), 42);
QLatin1StringMatcher matcher6 = matcher5;
QCOMPARE(matcher6.indexIn(haystack), 42);
QLatin1StringMatcher matcher7 = std::move(matcher5);
QCOMPARE(matcher7.indexIn(haystack), 42);
matcher1.setPattern("123"_L1);
matcher7 = std::move(matcher1);
QCOMPARE(matcher7.indexIn(haystack), 6);
}
#define LONG_STRING__32 "abcdefghijklmnopqrstuvwxyz012345"
#define LONG_STRING__64 LONG_STRING__32 LONG_STRING__32
#define LONG_STRING_128 LONG_STRING__64 LONG_STRING__64
#define LONG_STRING_256 LONG_STRING_128 LONG_STRING_128
#define LONG_STRING_512 LONG_STRING_256 LONG_STRING_256
void tst_QLatin1StringMatcher::indexIn()
{
const char p_data[] = { 0x0, 0x0, 0x1 };
QLatin1StringView pattern(p_data, sizeof(p_data));
QByteArray haystackT(8, '\0');
haystackT[7] = 0x1;
QLatin1StringView haystack(haystackT);
QLatin1StringMatcher matcher;
matcher = QLatin1StringMatcher(pattern, Qt::CaseSensitive);
QCOMPARE(matcher.indexIn(haystack, 0), 5);
QCOMPARE(matcher.indexIn(haystack, 1), 5);
QCOMPARE(matcher.indexIn(haystack, 2), 5);
matcher.setPattern(pattern);
QCOMPARE(matcher.indexIn(haystack, 0), 5);
QCOMPARE(matcher.indexIn(haystack, 1), 5);
QCOMPARE(matcher.indexIn(haystack, 2), 5);
std::array<char, 256> allChars;
for (int i = 0; i < 256; ++i)
allChars[i] = char(i);
matcher = QLatin1StringMatcher(QLatin1StringView(allChars), Qt::CaseSensitive);
haystackT = LONG_STRING__32 "x";
haystackT += matcher.pattern();
haystack = QLatin1StringView(haystackT);
QCOMPARE(matcher.indexIn(haystack, 0), 33);
QCOMPARE(matcher.indexIn(haystack, 1), 33);
QCOMPARE(matcher.indexIn(haystack, 2), 33);
QCOMPARE(matcher.indexIn(haystack, 33), 33);
QCOMPARE(matcher.indexIn(haystack, 34), -1);
matcher = QLatin1StringMatcher(QLatin1StringView(LONG_STRING_256), Qt::CaseSensitive);
haystackT = QByteArray(LONG_STRING__32 "x");
haystackT += matcher.pattern();
haystackT += QByteArrayView("Just junk at the end");
haystack = QLatin1StringView(haystackT);
QCOMPARE(matcher.indexIn(haystack, 0), 33);
QCOMPARE(matcher.indexIn(haystack, 1), 33);
QCOMPARE(matcher.indexIn(haystack, 2), 33);
QCOMPARE(matcher.indexIn(haystack, 33), 33);
QCOMPARE(matcher.indexIn(haystack, 34), -1);
matcher.setCaseSensitivity(Qt::CaseInsensitive);
QCOMPARE(matcher.indexIn(haystack, 0), 33);
QCOMPARE(matcher.indexIn(haystack, 1), 33);
QCOMPARE(matcher.indexIn(haystack, 2), 33);
QCOMPARE(matcher.indexIn(haystack, 33), 33);
QCOMPARE(matcher.indexIn(haystack, 34), -1);
matcher = QLatin1StringMatcher(QLatin1StringView(LONG_STRING_512), Qt::CaseInsensitive);
haystackT = QByteArray(LONG_STRING__32 "x");
haystackT += matcher.pattern();
haystackT += QByteArrayView("Just junk at the end");
haystack = QLatin1StringView(haystackT);
QCOMPARE(matcher.indexIn(haystack, 0), 33);
QCOMPARE(matcher.indexIn(haystack, 1), 33);
QCOMPARE(matcher.indexIn(haystack, 2), 33);
QCOMPARE(matcher.indexIn(haystack, 33), 33);
QCOMPARE(matcher.indexIn(haystack, 34), -1);
matcher.setCaseSensitivity(Qt::CaseSensitive);
QCOMPARE(matcher.indexIn(haystack, 0), 33);
QCOMPARE(matcher.indexIn(haystack, 1), 33);
QCOMPARE(matcher.indexIn(haystack, 2), 33);
QCOMPARE(matcher.indexIn(haystack, 33), 33);
QCOMPARE(matcher.indexIn(haystack, 34), -1);
matcher = QLatin1StringMatcher(QLatin1StringView(""), Qt::CaseSensitive);
haystackT = QByteArray(LONG_STRING__32 "x");
haystack = QLatin1StringView(haystackT);
QCOMPARE(matcher.indexIn(haystack, 0), 0);
QCOMPARE(matcher.indexIn(haystack, 1), 1);
QCOMPARE(matcher.indexIn(haystack, 2), 2);
QCOMPARE(matcher.indexIn(haystack, 33), 33);
matcher = QLatin1StringMatcher(QLatin1StringView(""), Qt::CaseInsensitive);
haystackT = QByteArray(LONG_STRING__32 "x");
haystack = QLatin1StringView(haystackT);
QCOMPARE(matcher.indexIn(haystack, 0), 0);
QCOMPARE(matcher.indexIn(haystack, 1), 1);
QCOMPARE(matcher.indexIn(haystack, 2), 2);
QCOMPARE(matcher.indexIn(haystack, 33), 33);
matcher = QLatin1StringMatcher(QLatin1StringView("m\xF8"), Qt::CaseInsensitive);
haystackT = QByteArray("M\xF8m\xF8");
haystack = QLatin1StringView(haystackT);
QCOMPARE(matcher.indexIn(haystack, 0), 0);
QCOMPARE(matcher.indexIn(haystack, 1), 2);
QCOMPARE(matcher.indexIn(haystack, 2), 2);
QCOMPARE(matcher.indexIn(haystack, 3), -1);
matcher.setCaseSensitivity(Qt::CaseSensitive);
QCOMPARE(matcher.indexIn(haystack, 0), 2);
QCOMPARE(matcher.indexIn(haystack, 1), 2);
QCOMPARE(matcher.indexIn(haystack, 2), 2);
QCOMPARE(matcher.indexIn(haystack, 3), -1);
}
void tst_QLatin1StringMatcher::haystacksWithMoreThan4GiBWork()
{
#if QT_POINTER_SIZE > 4
// use a large needle to trigger long skips in the Boyer-Moore algorithm
// (to speed up the test)
constexpr std::string_view needle = LONG_STRING_256;
//
// GIVEN: a haystack with more than 4 GiB of data
//
// don't use QByteArray because freeSpaceAtEnd() may break reserve()
// semantics and a realloc is the last thing we need here
std::string large;
QElapsedTimer timer;
timer.start();
constexpr size_t GiB = 1024 * 1024 * 1024;
constexpr size_t BaseSize = 4 * GiB + 1;
try {
large.reserve(BaseSize + needle.size());
large.resize(BaseSize, '\0');
large.append(needle);
} catch (const std::bad_alloc &) {
QSKIP("Could not allocate 4GiB plus a couple hundred bytes of RAM.");
}
QCOMPARE(large.size(), BaseSize + needle.size());
qDebug("created dataset in %lld ms", timer.elapsed());
# if QT_CONFIG(cxx11_future)
using MaybeThread = std::thread;
# else
struct MaybeThread
{
std::function<void()> func;
void join() { func(); }
};
# endif
//
// WHEN: trying to match an occurrence past the 4GiB mark
//
qsizetype dynamicResult;
auto t = MaybeThread{ [&] {
QLatin1StringMatcher m(QLatin1StringView(needle), Qt::CaseSensitive);
dynamicResult = m.indexIn(QLatin1StringView(large));
} };
t.join();
//
// THEN: the result index is not trucated
//
QCOMPARE(dynamicResult, qsizetype(BaseSize));
#else
QSKIP("This test is 64-bit only.");
#endif
}
#undef LONG_STRING_512
#undef LONG_STRING_256
#undef LONG_STRING_128
#undef LONG_STRING__64
#undef LONG_STRING__32
QTEST_APPLESS_MAIN(tst_QLatin1StringMatcher)
#include "tst_qlatin1stringmatcher.moc"