Add Boyer-Moore Latin-1 string searcher with optional case sensitivity
[ChangeLog][QtCore][QString] Added Boyer-Moore Latin-1 string searcher with optional case sensitivity Task-number: QTBUG-100236 Change-Id: I200a0dac7c8012add1ee02511dba791d233115e0 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
bb5d4094e0
commit
3fedcd4e4a
@ -218,6 +218,7 @@ qt_internal_add_module(Core
|
||||
text/qchar.h
|
||||
text/qcollator.cpp text/qcollator.h text/qcollator_p.h
|
||||
text/qdoublescanprint_p.h
|
||||
text/qlatin1stringmatcher.cpp text/qlatin1stringmatcher.h
|
||||
text/qlocale.cpp text/qlocale.h text/qlocale_p.h
|
||||
text/qlocale_data_p.h
|
||||
text/qlocale_tools.cpp text/qlocale_tools_p.h
|
||||
|
198
src/corelib/text/qlatin1stringmatcher.cpp
Normal file
198
src/corelib/text/qlatin1stringmatcher.cpp
Normal file
@ -0,0 +1,198 @@
|
||||
// Copyright (C) 2022 The Qt Company Ltd.
|
||||
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
|
||||
|
||||
#include "qlatin1stringmatcher.h"
|
||||
#include <limits.h>
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
|
||||
/*! \class QLatin1StringMatcher
|
||||
\inmodule QtCore
|
||||
\brief Optimized search for substring in Latin-1 text.
|
||||
|
||||
A QLatin1StringMatcher can search for one QLatin1StringView
|
||||
as a substring of another, either ignoring case or taking it into
|
||||
account.
|
||||
|
||||
\since 6.5
|
||||
\ingroup tools
|
||||
\ingroup string-processing
|
||||
|
||||
This class is useful when you have a Latin-1 encoded string that
|
||||
you want to repeatedly search for in some QLatin1StringViews
|
||||
(perhaps in a loop), or when you want to search for all
|
||||
instances of it in a given QLatin1StringView. Using a matcher
|
||||
object and indexIn() is faster than matching a plain
|
||||
QLatin1StringView with QLatin1StringView::indexOf() if repeated
|
||||
matching takes place. This class offers no benefit if you are
|
||||
doing one-off matches. The string to be searched for must not
|
||||
be destroyed or changed before the matcher object is destroyed,
|
||||
as the matcher accesses the string when searching for it.
|
||||
|
||||
Create a QLatin1StringMatcher for the QLatin1StringView
|
||||
you want to search for and the case sensitivity. Then call
|
||||
indexIn() with the QLatin1StringView that you want to search
|
||||
within.
|
||||
|
||||
\sa QLatin1StringView, QStringMatcher, QByteArrayMatcher
|
||||
*/
|
||||
|
||||
/*!
|
||||
Construct an empty Latin-1 string matcher.
|
||||
This will match at each position in any string.
|
||||
\sa setPattern(), setCaseSensitivity(), indexIn()
|
||||
*/
|
||||
QLatin1StringMatcher::QLatin1StringMatcher() noexcept
|
||||
: m_pattern(),
|
||||
m_cs(Qt::CaseSensitive),
|
||||
m_caseSensitiveSearcher(m_pattern.data(), m_pattern.data())
|
||||
{
|
||||
}
|
||||
|
||||
/*!
|
||||
Constructs a Latin-1 string matcher that searches for the given \a pattern
|
||||
with given case sensitivity \a cs. The \a pattern argument must
|
||||
not be destroyed before this matcher object. Call indexIn()
|
||||
to find the \a pattern in the given QLatin1StringView.
|
||||
*/
|
||||
QLatin1StringMatcher::QLatin1StringMatcher(QLatin1StringView pattern,
|
||||
Qt::CaseSensitivity cs) noexcept
|
||||
: m_pattern(pattern), m_cs(cs)
|
||||
{
|
||||
setSearcher();
|
||||
}
|
||||
|
||||
/*!
|
||||
Destroys the Latin-1 string matcher.
|
||||
*/
|
||||
QLatin1StringMatcher::~QLatin1StringMatcher() noexcept
{
    // The searchers live in an anonymous union, so the active one is
    // destroyed explicitly.
    this->freeSearcher();
}
|
||||
|
||||
/*!
|
||||
\internal
|
||||
*/
|
||||
void QLatin1StringMatcher::setSearcher() noexcept
|
||||
{
|
||||
if (m_cs == Qt::CaseSensitive) {
|
||||
new (&m_caseSensitiveSearcher) CaseSensitiveSearcher(m_pattern.data(), m_pattern.end());
|
||||
} else {
|
||||
QtPrivate::QCaseInsensitiveLatin1Hash foldCase;
|
||||
qsizetype bufferSize = std::min(m_pattern.size(), qsizetype(sizeof m_foldBuffer));
|
||||
for (qsizetype i = 0; i < bufferSize; ++i)
|
||||
m_foldBuffer[i] = static_cast<char>(foldCase(m_pattern[i].toLatin1()));
|
||||
|
||||
new (&m_caseInsensitiveSearcher)
|
||||
CaseInsensitiveSearcher(m_foldBuffer, &m_foldBuffer[bufferSize]);
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
\internal
|
||||
*/
|
||||
void QLatin1StringMatcher::freeSearcher() noexcept
|
||||
{
|
||||
if (m_cs == Qt::CaseSensitive)
|
||||
m_caseSensitiveSearcher.~CaseSensitiveSearcher();
|
||||
else
|
||||
m_caseInsensitiveSearcher.~CaseInsensitiveSearcher();
|
||||
}
|
||||
|
||||
/*!
|
||||
Sets the \a pattern to search for. The string pointed to by the
|
||||
QLatin1StringView must not be destroyed before the matcher is
|
||||
destroyed, unless it is set to point to a different \a pattern
|
||||
with longer lifetime first.
|
||||
|
||||
\sa pattern(), indexIn()
|
||||
*/
|
||||
void QLatin1StringMatcher::setPattern(QLatin1StringView pattern) noexcept
{
    // A view with the same address and the same size is treated as the same
    // pattern; the existing searcher is kept.
    const bool sameAddress = m_pattern.latin1() == pattern.latin1();
    const bool sameSize = m_pattern.size() == pattern.size();
    if (sameAddress && sameSize)
        return;

    // Rebuild the searcher for the new pattern; m_cs is unchanged, so the
    // same union member is destroyed and re-created.
    freeSearcher();
    m_pattern = pattern;
    setSearcher();
}
|
||||
|
||||
/*!
|
||||
Returns the Latin-1 pattern that the matcher searches for.
|
||||
|
||||
\sa setPattern(), indexIn()
|
||||
*/
|
||||
QLatin1StringView QLatin1StringMatcher::pattern() const noexcept
{
    // Returns the stored view by value.
    return this->m_pattern;
}
|
||||
|
||||
/*!
|
||||
Sets the case sensitivity to \a cs.
|
||||
|
||||
\sa caseSensitivity(), indexIn()
|
||||
*/
|
||||
void QLatin1StringMatcher::setCaseSensitivity(Qt::CaseSensitivity cs) noexcept
{
    if (m_cs != cs) {
        // Order matters: freeSearcher() destroys the member selected by the
        // old m_cs, setSearcher() constructs the one selected by the new.
        freeSearcher();
        m_cs = cs;
        setSearcher();
    }
}
|
||||
|
||||
/*!
|
||||
Returns the case sensitivity the matcher uses.
|
||||
|
||||
\sa setCaseSensitivity(), indexIn()
|
||||
*/
|
||||
Qt::CaseSensitivity QLatin1StringMatcher::caseSensitivity() const noexcept
{
    // Plain accessor for the mode set by the constructor or setCaseSensitivity().
    return this->m_cs;
}
|
||||
|
||||
/*!
|
||||
Searches for the pattern in the given \a haystack starting from
|
||||
\a from.
|
||||
|
||||
\sa caseSensitivity(), pattern()
|
||||
*/
|
||||
qsizetype QLatin1StringMatcher::indexIn(QLatin1StringView haystack, qsizetype from) const noexcept
|
||||
{
|
||||
if (m_pattern.isEmpty() && from == haystack.size())
|
||||
return from;
|
||||
if (from >= haystack.size())
|
||||
return -1;
|
||||
auto begin = haystack.begin() + from;
|
||||
auto end = haystack.end();
|
||||
auto found = begin;
|
||||
if (m_cs == Qt::CaseSensitive) {
|
||||
found = m_caseSensitiveSearcher(begin, end, m_pattern.begin(), m_pattern.end()).begin;
|
||||
if (found == end)
|
||||
return -1;
|
||||
} else {
|
||||
const qsizetype bufferSize = std::min(m_pattern.size(), qsizetype(sizeof m_foldBuffer));
|
||||
const QLatin1StringView restNeedle = m_pattern.sliced(bufferSize);
|
||||
const bool needleLongerThanBuffer = restNeedle.size() > 0;
|
||||
QLatin1StringView restHaystack = haystack;
|
||||
do {
|
||||
found = m_caseInsensitiveSearcher(found, end, m_foldBuffer, &m_foldBuffer[bufferSize])
|
||||
.begin;
|
||||
if (found == end) {
|
||||
return -1;
|
||||
} else if (!needleLongerThanBuffer) {
|
||||
break;
|
||||
}
|
||||
restHaystack = haystack.sliced(
|
||||
qMin(haystack.size(),
|
||||
bufferSize + qsizetype(std::distance(haystack.begin(), found))));
|
||||
if (restHaystack.startsWith(restNeedle, Qt::CaseInsensitive))
|
||||
break;
|
||||
++found;
|
||||
} while (true);
|
||||
}
|
||||
return std::distance(haystack.begin(), found);
|
||||
}
|
||||
|
||||
QT_END_NAMESPACE
|
160
src/corelib/text/qlatin1stringmatcher.h
Normal file
160
src/corelib/text/qlatin1stringmatcher.h
Normal file
@ -0,0 +1,160 @@
|
||||
// Copyright (C) 2022 The Qt Company Ltd.
|
||||
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
|
||||
|
||||
#ifndef QLATIN1STRINGMATCHER_H
|
||||
#define QLATIN1STRINGMATCHER_H
|
||||
|
||||
#include <functional>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
|
||||
#include <QtCore/q20algorithm.h>
|
||||
#include <QtCore/qstring.h>
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
|
||||
namespace QtPrivate {
|
||||
template<class RandomIt1,
|
||||
class Hash = std::hash<typename std::iterator_traits<RandomIt1>::value_type>,
|
||||
class BinaryPredicate = std::equal_to<>>
|
||||
class q_boyer_moore_searcher_hashed_needle
|
||||
{
|
||||
public:
|
||||
constexpr q_boyer_moore_searcher_hashed_needle(RandomIt1 pat_first, RandomIt1 pat_last)
|
||||
: m_skiptable{}
|
||||
{
|
||||
const size_t n = std::distance(pat_first, pat_last);
|
||||
constexpr auto uchar_max = (std::numeric_limits<uchar>::max)();
|
||||
uchar max = n > uchar_max ? uchar_max : uchar(n);
|
||||
q20::fill(std::begin(m_skiptable), std::end(m_skiptable), max);
|
||||
|
||||
RandomIt1 pattern = pat_first;
|
||||
pattern += n - max;
|
||||
while (max--)
|
||||
m_skiptable[uchar(*pattern++)] = max;
|
||||
}
|
||||
|
||||
template<class RandomIt2>
|
||||
constexpr auto operator()(RandomIt2 first, RandomIt2 last, RandomIt1 pat_first,
|
||||
RandomIt1 pat_last) const
|
||||
{
|
||||
struct R
|
||||
{
|
||||
RandomIt2 begin, end;
|
||||
};
|
||||
Hash hf;
|
||||
BinaryPredicate pred;
|
||||
auto pat_length = std::distance(pat_first, pat_last);
|
||||
if (pat_length == 0)
|
||||
return R{ first, first };
|
||||
|
||||
const qsizetype pl_minus_one = qsizetype(pat_length - 1);
|
||||
RandomIt2 current = first + pl_minus_one;
|
||||
|
||||
while (current < last) {
|
||||
qsizetype skip = m_skiptable[hf(*current)];
|
||||
if (!skip) {
|
||||
// possible match
|
||||
while (skip < pat_length) {
|
||||
if (!pred(hf(*(current - skip)), uchar(pat_first[pl_minus_one - skip])))
|
||||
break;
|
||||
skip++;
|
||||
}
|
||||
if (skip > pl_minus_one) { // we have a match
|
||||
auto match = current - skip + 1;
|
||||
return R{ match, match + pat_length };
|
||||
}
|
||||
|
||||
// If we don't have a match we are a bit inefficient as we only skip by one
|
||||
// when we have the non matching char in the string.
|
||||
if (m_skiptable[hf(*(current - skip))] == pat_length)
|
||||
skip = pat_length - skip;
|
||||
else
|
||||
skip = 1;
|
||||
}
|
||||
current += skip;
|
||||
}
|
||||
|
||||
return R{ last, last };
|
||||
}
|
||||
|
||||
private:
|
||||
alignas(16) uchar m_skiptable[256];
|
||||
};
|
||||
|
||||
struct QCaseSensitiveLatin1Hash
|
||||
{
|
||||
constexpr QCaseSensitiveLatin1Hash() = default;
|
||||
|
||||
constexpr std::size_t operator()(char c) const noexcept { return std::size_t(uchar(c)); }
|
||||
};
|
||||
|
||||
struct QCaseInsensitiveLatin1Hash
|
||||
{
|
||||
constexpr QCaseInsensitiveLatin1Hash() = default;
|
||||
|
||||
constexpr std::size_t operator()(char c) const noexcept
|
||||
{
|
||||
return std::size_t(latin1Lower[uchar(c)]);
|
||||
}
|
||||
|
||||
private:
|
||||
static constexpr uchar latin1Lower[256] = {
|
||||
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
|
||||
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
|
||||
0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x5b,0x5c,0x5d,0x5e,0x5f,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
|
||||
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
|
||||
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
|
||||
0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
|
||||
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
|
||||
// 0xd7 (multiplication sign) and 0xdf (sz ligature) complicate life
|
||||
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
|
||||
0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xd7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xdf,
|
||||
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
|
||||
0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
class QLatin1StringMatcher
|
||||
{
|
||||
public:
|
||||
Q_CORE_EXPORT QLatin1StringMatcher() noexcept;
|
||||
Q_CORE_EXPORT explicit QLatin1StringMatcher(
|
||||
QLatin1StringView pattern, Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept;
|
||||
Q_CORE_EXPORT ~QLatin1StringMatcher() noexcept;
|
||||
|
||||
Q_CORE_EXPORT void setPattern(QLatin1StringView pattern) noexcept;
|
||||
Q_CORE_EXPORT QLatin1StringView pattern() const noexcept;
|
||||
Q_CORE_EXPORT void setCaseSensitivity(Qt::CaseSensitivity cs) noexcept;
|
||||
Q_CORE_EXPORT Qt::CaseSensitivity caseSensitivity() const noexcept;
|
||||
|
||||
Q_CORE_EXPORT qsizetype indexIn(QLatin1StringView haystack, qsizetype from = 0) const noexcept;
|
||||
|
||||
private:
|
||||
void setSearcher() noexcept;
|
||||
void freeSearcher() noexcept;
|
||||
|
||||
QLatin1StringView m_pattern;
|
||||
Qt::CaseSensitivity m_cs;
|
||||
typedef QtPrivate::q_boyer_moore_searcher_hashed_needle<const char *,
|
||||
QtPrivate::QCaseSensitiveLatin1Hash>
|
||||
CaseSensitiveSearcher;
|
||||
typedef QtPrivate::q_boyer_moore_searcher_hashed_needle<const char *,
|
||||
QtPrivate::QCaseInsensitiveLatin1Hash>
|
||||
CaseInsensitiveSearcher;
|
||||
union {
|
||||
CaseSensitiveSearcher m_caseSensitiveSearcher;
|
||||
CaseInsensitiveSearcher m_caseInsensitiveSearcher;
|
||||
};
|
||||
char m_foldBuffer[256];
|
||||
};
|
||||
|
||||
QT_END_NAMESPACE
|
||||
|
||||
#endif // QLATIN1STRINGMATCHER_H
|
@ -38,13 +38,12 @@
|
||||
#include <wchar.h>
|
||||
|
||||
#include "qchar.cpp"
|
||||
#include "qlatin1stringmatcher.h"
|
||||
#include "qstringmatcher.cpp"
|
||||
#include "qstringiterator_p.h"
|
||||
#include "qstringalgorithms_p.h"
|
||||
#include "qthreadstorage.h"
|
||||
|
||||
#include "qbytearraymatcher.h" // Helper for comparison of QLatin1StringView
|
||||
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
|
||||
@ -1415,70 +1414,6 @@ static int latin1nicmp(const char *lhsChar, qsizetype lSize, const char *rhsChar
|
||||
return lencmp(lSize, rSize);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
template<Qt::CaseSensitivity cs>
|
||||
inline uchar latin1_fold(const uchar c)
|
||||
{
|
||||
return c;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline uchar latin1_fold<Qt::CaseSensitivity::CaseInsensitive>(const uchar c)
|
||||
{
|
||||
return latin1Lower[c];
|
||||
}
|
||||
|
||||
template<Qt::CaseSensitivity cs>
|
||||
inline void bm_latin1_init_skiptable(const uchar *cc, qsizetype len, uchar *skiptable)
|
||||
{
|
||||
int l = int(qMin(len, qsizetype(255)));
|
||||
memset(skiptable, l, 256 * sizeof(uchar));
|
||||
cc += len - l;
|
||||
while (l--)
|
||||
skiptable[latin1_fold<cs>(*cc++)] = l;
|
||||
}
|
||||
|
||||
template<Qt::CaseSensitivity cs>
|
||||
inline qsizetype bm_latin1_find(const uchar *cc, qsizetype l, qsizetype index, const uchar *puc,
|
||||
qsizetype pl, const uchar *skiptable)
|
||||
{
|
||||
if (pl == 0)
|
||||
return index > l ? -1 : index;
|
||||
const qsizetype pl_minus_one = pl - 1;
|
||||
|
||||
const uchar *current = cc + index + pl_minus_one;
|
||||
const uchar *end = cc + l;
|
||||
|
||||
while (current < end) {
|
||||
qsizetype skip = skiptable[latin1_fold<cs>(*current)];
|
||||
if (!skip) {
|
||||
// possible match
|
||||
while (skip < pl) {
|
||||
if (latin1_fold<cs>(*(current - skip)) != latin1_fold<cs>(puc[pl_minus_one - skip]))
|
||||
break;
|
||||
skip++;
|
||||
}
|
||||
if (skip > pl_minus_one) // we have a match
|
||||
return (current - cc) - skip + 1;
|
||||
|
||||
// in case we don't have a match we are a bit inefficient as we only skip by one
|
||||
// when we have the non matching char in the string.
|
||||
if (skiptable[latin1_fold<cs>(*(current - skip))] == pl)
|
||||
skip = pl - skip;
|
||||
else
|
||||
skip = 1;
|
||||
}
|
||||
if (current > end - skip)
|
||||
break;
|
||||
current += skip;
|
||||
}
|
||||
|
||||
return -1; // not found
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
bool QtPrivate::equalStrings(QStringView lhs, QStringView rhs) noexcept
|
||||
{
|
||||
return ucstreq(lhs.utf16(), lhs.size(), rhs.utf16(), rhs.size());
|
||||
@ -10686,16 +10621,10 @@ qsizetype QtPrivate::count(QLatin1StringView haystack, QLatin1StringView needle,
|
||||
qsizetype num = 0;
|
||||
qsizetype i = -1;
|
||||
|
||||
// TODO: use Boyer-Moore searcher for case-insensitive search too
|
||||
// when QTBUG-100236 is done
|
||||
if (cs == Qt::CaseSensitive) {
|
||||
QByteArrayMatcher matcher(needle);
|
||||
while ((i = matcher.indexIn(haystack, i + 1)) != -1)
|
||||
++num;
|
||||
} else {
|
||||
while ((i = QtPrivate::findString(haystack, i + 1, needle, cs)) != -1)
|
||||
++num;
|
||||
}
|
||||
QLatin1StringMatcher matcher(needle, cs);
|
||||
while ((i = matcher.indexIn(haystack, i + 1)) != -1)
|
||||
++num;
|
||||
|
||||
return num;
|
||||
}
|
||||
|
||||
@ -10710,19 +10639,14 @@ qsizetype QtPrivate::count(QLatin1StringView haystack, QStringView needle, Qt::C
|
||||
qsizetype num = 0;
|
||||
qsizetype i = -1;
|
||||
|
||||
// TODO: use Boyer-Moore searcher for case-insensitive search too
|
||||
// when QTBUG-100236 is done
|
||||
if (cs == Qt::CaseSensitive) {
|
||||
QVarLengthArray<uchar> s(needle.size());
|
||||
qt_to_latin1_unchecked(s.data(), needle.utf16(), needle.size());
|
||||
QVarLengthArray<uchar> s(needle.size());
|
||||
qt_to_latin1_unchecked(s.data(), needle.utf16(), needle.size());
|
||||
|
||||
QLatin1StringMatcher matcher(QLatin1StringView(reinterpret_cast<char *>(s.data()), s.size()),
|
||||
cs);
|
||||
while ((i = matcher.indexIn(haystack, i + 1)) != -1)
|
||||
++num;
|
||||
|
||||
QByteArrayMatcher matcher(s);
|
||||
while ((i = matcher.indexIn(haystack, i + 1)) != -1)
|
||||
++num;
|
||||
} else {
|
||||
while ((i = QtPrivate::findString(haystack, i + 1, needle, cs)) != -1)
|
||||
++num;
|
||||
}
|
||||
return num;
|
||||
}
|
||||
|
||||
@ -10743,12 +10667,11 @@ qsizetype QtPrivate::count(QLatin1StringView haystack, QChar needle, Qt::CaseSen
|
||||
if (needle.unicode() > 0xff)
|
||||
return 0;
|
||||
|
||||
const char needleL1 = needle.toLatin1();
|
||||
if (cs == Qt::CaseSensitive) {
|
||||
return std::count(haystack.cbegin(), haystack.cend(), needleL1);
|
||||
return std::count(haystack.cbegin(), haystack.cend(), needle.toLatin1());
|
||||
} else {
|
||||
auto toLower = [](char ch) { return latin1Lower[uchar(ch)]; };
|
||||
const uchar ch = toLower(needleL1);
|
||||
const uchar ch = toLower(needle.toLatin1());
|
||||
return std::count_if(haystack.cbegin(), haystack.cend(), [&toLower, ch](const char c) {
|
||||
return toLower(c) == ch;
|
||||
});
|
||||
@ -10961,7 +10884,7 @@ qsizetype QtPrivate::findString(QLatin1StringView haystack, qsizetype from, QLat
|
||||
return -1;
|
||||
}
|
||||
|
||||
const QByteArrayMatcher matcher(needle);
|
||||
const QLatin1StringMatcher matcher(needle, Qt::CaseSensitivity::CaseSensitive);
|
||||
return matcher.indexIn(haystack, from);
|
||||
}
|
||||
|
||||
@ -10995,12 +10918,9 @@ qsizetype QtPrivate::findString(QLatin1StringView haystack, qsizetype from, QLat
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
uchar skiptable[256];
|
||||
bm_latin1_init_skiptable<Qt::CaseSensitivity::CaseInsensitive>(
|
||||
reinterpret_cast<const unsigned char *>(needle.begin()), needle.size(), skiptable);
|
||||
return bm_latin1_find<Qt::CaseSensitivity::CaseInsensitive>(
|
||||
reinterpret_cast<const unsigned char *>(haystack.begin()), haystack.size(), from,
|
||||
reinterpret_cast<const unsigned char *>(needle.begin()), needle.size(), skiptable);
|
||||
|
||||
QLatin1StringMatcher matcher(needle, Qt::CaseSensitivity::CaseInsensitive);
|
||||
return matcher.indexIn(haystack, from);
|
||||
}
|
||||
|
||||
qsizetype QtPrivate::lastIndexOf(QStringView haystack, qsizetype from, QStringView needle, Qt::CaseSensitivity cs) noexcept
|
||||
|
@ -68,6 +68,7 @@ qt_internal_extend_target(Bootstrap
|
||||
../../corelib/text/qbytearray.cpp
|
||||
../../corelib/text/qbytearraylist.cpp
|
||||
../../corelib/text/qbytearraymatcher.cpp
|
||||
../../corelib/text/qlatin1stringmatcher.cpp
|
||||
../../corelib/text/qlocale.cpp
|
||||
../../corelib/text/qlocale_tools.cpp
|
||||
../../corelib/text/qregularexpression.cpp
|
||||
|
@ -13,6 +13,7 @@ add_subdirectory(qbytearrayview)
|
||||
add_subdirectory(qbytedatabuffer)
|
||||
add_subdirectory(qchar)
|
||||
add_subdirectory(qcollator)
|
||||
add_subdirectory(qlatin1stringmatcher)
|
||||
add_subdirectory(qlatin1stringview)
|
||||
add_subdirectory(qregularexpression)
|
||||
add_subdirectory(qstring)
|
||||
|
14
tests/auto/corelib/text/qlatin1stringmatcher/CMakeLists.txt
Normal file
14
tests/auto/corelib/text/qlatin1stringmatcher/CMakeLists.txt
Normal file
@ -0,0 +1,14 @@
|
||||
# Copyright (C) 2022 The Qt Company Ltd.
|
||||
# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
|
||||
|
||||
#####################################################################
|
||||
## tst_qlatin1stringmatcher Test:
|
||||
#####################################################################
|
||||
|
||||
qt_internal_add_test(tst_qlatin1stringmatcher
|
||||
SOURCES
|
||||
tst_qlatin1stringmatcher.cpp
|
||||
)
|
||||
|
||||
## Scopes:
|
||||
#####################################################################
|
@ -0,0 +1,306 @@
|
||||
// Copyright (C) 2022 The Qt Company Ltd.
|
||||
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
|
||||
|
||||
#include <QTest>
|
||||
|
||||
#include <QtCore/QLatin1StringMatcher>
|
||||
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
|
||||
#if QT_CONFIG(cxx11_future)
|
||||
# include <thread>
|
||||
#endif
|
||||
|
||||
// COM interface
|
||||
#if defined(interface)
|
||||
# undef interface
|
||||
#endif
|
||||
|
||||
using namespace Qt::Literals::StringLiterals;
|
||||
|
||||
class tst_QLatin1StringMatcher : public QObject
|
||||
{
|
||||
Q_OBJECT
|
||||
|
||||
private slots:
|
||||
void overloads();
|
||||
void interface();
|
||||
void indexIn();
|
||||
void haystacksWithMoreThan4GiBWork();
|
||||
};
|
||||
|
||||
void tst_QLatin1StringMatcher::overloads()
|
||||
{
|
||||
QLatin1StringView hello = "hello"_L1;
|
||||
QByteArray hello2B = QByteArrayView(hello).toByteArray().repeated(2);
|
||||
QLatin1StringView hello2(hello2B);
|
||||
{
|
||||
QLatin1StringMatcher m("hello"_L1, Qt::CaseSensitive);
|
||||
QCOMPARE(m.pattern(), "hello"_L1);
|
||||
QCOMPARE(m.indexIn("hello"_L1), 0);
|
||||
QCOMPARE(m.indexIn("Hello"_L1), -1);
|
||||
QCOMPARE(m.indexIn("Hellohello"_L1), 5);
|
||||
QCOMPARE(m.indexIn("helloHello"_L1), 0);
|
||||
QCOMPARE(m.indexIn("helloHello"_L1, 1), -1);
|
||||
}
|
||||
{
|
||||
QLatin1StringMatcher m("Hello"_L1, Qt::CaseSensitive);
|
||||
QCOMPARE(m.pattern(), "Hello"_L1);
|
||||
QCOMPARE(m.indexIn("hello"_L1), -1);
|
||||
QCOMPARE(m.indexIn("Hello"_L1), 0);
|
||||
QCOMPARE(m.indexIn("Hellohello"_L1), 0);
|
||||
QCOMPARE(m.indexIn("helloHello"_L1), 5);
|
||||
QCOMPARE(m.indexIn("helloHello"_L1, 6), -1);
|
||||
}
|
||||
{
|
||||
QLatin1StringMatcher m("hello"_L1, Qt::CaseInsensitive);
|
||||
QCOMPARE(m.pattern(), "hello"_L1);
|
||||
QCOMPARE(m.indexIn("hello"_L1), 0);
|
||||
QCOMPARE(m.indexIn("Hello"_L1), 0);
|
||||
QCOMPARE(m.indexIn("Hellohello"_L1), 0);
|
||||
QCOMPARE(m.indexIn("helloHello"_L1), 0);
|
||||
QCOMPARE(m.indexIn("helloHello"_L1, 1), 5);
|
||||
QCOMPARE(m.indexIn("helloHello"_L1, 6), -1);
|
||||
}
|
||||
{
|
||||
QLatin1StringMatcher m("Hello"_L1, Qt::CaseInsensitive);
|
||||
QCOMPARE(m.pattern(), "Hello"_L1);
|
||||
QCOMPARE(m.indexIn("hello"_L1), 0);
|
||||
QCOMPARE(m.indexIn("Hello"_L1), 0);
|
||||
QCOMPARE(m.indexIn("Hellohello"_L1), 0);
|
||||
QCOMPARE(m.indexIn("helloHello"_L1), 0);
|
||||
QCOMPARE(m.indexIn("helloHello"_L1, 1), 5);
|
||||
QCOMPARE(m.indexIn("helloHello"_L1, 6), -1);
|
||||
}
|
||||
{
|
||||
QLatin1StringMatcher m(hello, Qt::CaseSensitive);
|
||||
QCOMPARE(m.pattern(), "hello"_L1);
|
||||
QCOMPARE(m.indexIn(hello), 0);
|
||||
QCOMPARE(m.indexIn(hello, 1), -1);
|
||||
QCOMPARE(m.indexIn(hello2, 1), hello.size());
|
||||
QCOMPARE(m.indexIn(hello2, 6), -1);
|
||||
}
|
||||
}
|
||||
|
||||
void tst_QLatin1StringMatcher::interface()
|
||||
{
|
||||
QLatin1StringView needle = "abc123"_L1;
|
||||
QByteArray haystackT(500, 'a');
|
||||
haystackT.insert(6, "123");
|
||||
haystackT.insert(31, "abc");
|
||||
haystackT.insert(42, "abc123");
|
||||
haystackT.insert(84, "abc123");
|
||||
QLatin1StringView haystack(haystackT);
|
||||
|
||||
QLatin1StringMatcher matcher1;
|
||||
|
||||
matcher1 = QLatin1StringMatcher(needle, Qt::CaseSensitive);
|
||||
QLatin1StringMatcher matcher2;
|
||||
matcher2.setPattern(needle);
|
||||
|
||||
QLatin1StringMatcher matcher3 = QLatin1StringMatcher(needle, Qt::CaseSensitive);
|
||||
QLatin1StringMatcher matcher4;
|
||||
matcher4 = matcher3;
|
||||
|
||||
QCOMPARE(matcher1.indexIn(haystack), 42);
|
||||
QCOMPARE(matcher2.indexIn(haystack), 42);
|
||||
QCOMPARE(matcher3.indexIn(haystack), 42);
|
||||
QCOMPARE(matcher4.indexIn(haystack), 42);
|
||||
|
||||
QCOMPARE(matcher1.indexIn(haystack, 43), 84);
|
||||
QCOMPARE(matcher1.indexIn(haystack, 85), -1);
|
||||
|
||||
QLatin1StringMatcher matcher5("123"_L1, Qt::CaseSensitive);
|
||||
QCOMPARE(matcher5.indexIn(haystack), 6);
|
||||
|
||||
matcher5 = QLatin1StringMatcher("abc"_L1, Qt::CaseSensitive);
|
||||
QCOMPARE(matcher5.indexIn(haystack), 31);
|
||||
|
||||
matcher5.setPattern(matcher4.pattern());
|
||||
QCOMPARE(matcher5.indexIn(haystack), 42);
|
||||
|
||||
QLatin1StringMatcher matcher6 = matcher5;
|
||||
QCOMPARE(matcher6.indexIn(haystack), 42);
|
||||
|
||||
QLatin1StringMatcher matcher7 = std::move(matcher5);
|
||||
QCOMPARE(matcher7.indexIn(haystack), 42);
|
||||
|
||||
matcher1.setPattern("123"_L1);
|
||||
matcher7 = std::move(matcher1);
|
||||
QCOMPARE(matcher7.indexIn(haystack), 6);
|
||||
}
|
||||
|
||||
#define LONG_STRING__32 "abcdefghijklmnopqrstuvwxyz012345"
|
||||
#define LONG_STRING__64 LONG_STRING__32 LONG_STRING__32
|
||||
#define LONG_STRING_128 LONG_STRING__64 LONG_STRING__64
|
||||
#define LONG_STRING_256 LONG_STRING_128 LONG_STRING_128
|
||||
#define LONG_STRING_512 LONG_STRING_256 LONG_STRING_256
|
||||
|
||||
void tst_QLatin1StringMatcher::indexIn()
|
||||
{
|
||||
const char p_data[] = { 0x0, 0x0, 0x1 };
|
||||
QLatin1StringView pattern(p_data, sizeof(p_data));
|
||||
|
||||
QByteArray haystackT(8, '\0');
|
||||
haystackT[7] = 0x1;
|
||||
QLatin1StringView haystack(haystackT);
|
||||
|
||||
QLatin1StringMatcher matcher;
|
||||
|
||||
matcher = QLatin1StringMatcher(pattern, Qt::CaseSensitive);
|
||||
QCOMPARE(matcher.indexIn(haystack, 0), 5);
|
||||
QCOMPARE(matcher.indexIn(haystack, 1), 5);
|
||||
QCOMPARE(matcher.indexIn(haystack, 2), 5);
|
||||
|
||||
matcher.setPattern(pattern);
|
||||
QCOMPARE(matcher.indexIn(haystack, 0), 5);
|
||||
QCOMPARE(matcher.indexIn(haystack, 1), 5);
|
||||
QCOMPARE(matcher.indexIn(haystack, 2), 5);
|
||||
|
||||
std::array<char, 256> allChars;
|
||||
for (int i = 0; i < 256; ++i)
|
||||
allChars[i] = char(i);
|
||||
|
||||
matcher = QLatin1StringMatcher(QLatin1StringView(allChars), Qt::CaseSensitive);
|
||||
haystackT = LONG_STRING__32 "x";
|
||||
haystackT += matcher.pattern();
|
||||
haystack = QLatin1StringView(haystackT);
|
||||
QCOMPARE(matcher.indexIn(haystack, 0), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 1), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 2), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 33), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 34), -1);
|
||||
|
||||
matcher = QLatin1StringMatcher(QLatin1StringView(LONG_STRING_256), Qt::CaseSensitive);
|
||||
haystackT = QByteArray(LONG_STRING__32 "x");
|
||||
haystackT += matcher.pattern();
|
||||
haystackT += QByteArrayView("Just junk at the end");
|
||||
haystack = QLatin1StringView(haystackT);
|
||||
QCOMPARE(matcher.indexIn(haystack, 0), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 1), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 2), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 33), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 34), -1);
|
||||
matcher.setCaseSensitivity(Qt::CaseInsensitive);
|
||||
QCOMPARE(matcher.indexIn(haystack, 0), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 1), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 2), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 33), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 34), -1);
|
||||
|
||||
matcher = QLatin1StringMatcher(QLatin1StringView(LONG_STRING_512), Qt::CaseInsensitive);
|
||||
haystackT = QByteArray(LONG_STRING__32 "x");
|
||||
haystackT += matcher.pattern();
|
||||
haystackT += QByteArrayView("Just junk at the end");
|
||||
haystack = QLatin1StringView(haystackT);
|
||||
QCOMPARE(matcher.indexIn(haystack, 0), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 1), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 2), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 33), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 34), -1);
|
||||
matcher.setCaseSensitivity(Qt::CaseSensitive);
|
||||
QCOMPARE(matcher.indexIn(haystack, 0), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 1), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 2), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 33), 33);
|
||||
QCOMPARE(matcher.indexIn(haystack, 34), -1);
|
||||
|
||||
matcher = QLatin1StringMatcher(QLatin1StringView(""), Qt::CaseSensitive);
|
||||
haystackT = QByteArray(LONG_STRING__32 "x");
|
||||
haystack = QLatin1StringView(haystackT);
|
||||
QCOMPARE(matcher.indexIn(haystack, 0), 0);
|
||||
QCOMPARE(matcher.indexIn(haystack, 1), 1);
|
||||
QCOMPARE(matcher.indexIn(haystack, 2), 2);
|
||||
QCOMPARE(matcher.indexIn(haystack, 33), 33);
|
||||
|
||||
matcher = QLatin1StringMatcher(QLatin1StringView(""), Qt::CaseInsensitive);
|
||||
haystackT = QByteArray(LONG_STRING__32 "x");
|
||||
haystack = QLatin1StringView(haystackT);
|
||||
QCOMPARE(matcher.indexIn(haystack, 0), 0);
|
||||
QCOMPARE(matcher.indexIn(haystack, 1), 1);
|
||||
QCOMPARE(matcher.indexIn(haystack, 2), 2);
|
||||
QCOMPARE(matcher.indexIn(haystack, 33), 33);
|
||||
|
||||
matcher = QLatin1StringMatcher(QLatin1StringView("m\xF8"), Qt::CaseInsensitive);
|
||||
haystackT = QByteArray("M\xF8m\xF8");
|
||||
haystack = QLatin1StringView(haystackT);
|
||||
QCOMPARE(matcher.indexIn(haystack, 0), 0);
|
||||
QCOMPARE(matcher.indexIn(haystack, 1), 2);
|
||||
QCOMPARE(matcher.indexIn(haystack, 2), 2);
|
||||
QCOMPARE(matcher.indexIn(haystack, 3), -1);
|
||||
matcher.setCaseSensitivity(Qt::CaseSensitive);
|
||||
QCOMPARE(matcher.indexIn(haystack, 0), 2);
|
||||
QCOMPARE(matcher.indexIn(haystack, 1), 2);
|
||||
QCOMPARE(matcher.indexIn(haystack, 2), 2);
|
||||
QCOMPARE(matcher.indexIn(haystack, 3), -1);
|
||||
}
|
||||
|
||||
void tst_QLatin1StringMatcher::haystacksWithMoreThan4GiBWork()
|
||||
{
|
||||
#if QT_POINTER_SIZE > 4
|
||||
// use a large needle to trigger long skips in the Boyer-Moore algorithm
|
||||
// (to speed up the test)
|
||||
constexpr std::string_view needle = LONG_STRING_256;
|
||||
|
||||
//
|
||||
// GIVEN: a haystack with more than 4 GiB of data
|
||||
//
|
||||
|
||||
// don't use QByteArray because freeSpaceAtEnd() may break reserve()
|
||||
// semantics and a realloc is the last thing we need here
|
||||
std::string large;
|
||||
QElapsedTimer timer;
|
||||
timer.start();
|
||||
constexpr size_t GiB = 1024 * 1024 * 1024;
|
||||
constexpr size_t BaseSize = 4 * GiB + 1;
|
||||
try {
|
||||
large.reserve(BaseSize + needle.size());
|
||||
large.resize(BaseSize, '\0');
|
||||
large.append(needle);
|
||||
} catch (const std::bad_alloc &) {
|
||||
QSKIP("Could not allocate 4GiB plus a couple hundred bytes of RAM.");
|
||||
}
|
||||
QCOMPARE(large.size(), BaseSize + needle.size());
|
||||
qDebug("created dataset in %lld ms", timer.elapsed());
|
||||
|
||||
# if QT_CONFIG(cxx11_future)
|
||||
using MaybeThread = std::thread;
|
||||
# else
|
||||
struct MaybeThread
|
||||
{
|
||||
std::function<void()> func;
|
||||
void join() { func(); }
|
||||
};
|
||||
# endif
|
||||
|
||||
//
|
||||
// WHEN: trying to match an occurrence past the 4GiB mark
|
||||
//
|
||||
|
||||
qsizetype dynamicResult;
|
||||
|
||||
auto t = MaybeThread{ [&] {
|
||||
QLatin1StringMatcher m(QLatin1StringView(needle), Qt::CaseSensitive);
|
||||
dynamicResult = m.indexIn(QLatin1StringView(large));
|
||||
} };
|
||||
t.join();
|
||||
|
||||
//
|
||||
// THEN: the result index is not trucated
|
||||
//
|
||||
|
||||
QCOMPARE(dynamicResult, qsizetype(BaseSize));
|
||||
#else
|
||||
QSKIP("This test is 64-bit only.");
|
||||
#endif
|
||||
}
|
||||
|
||||
#undef LONG_STRING_512
|
||||
#undef LONG_STRING_256
|
||||
#undef LONG_STRING_128
|
||||
#undef LONG_STRING__64
|
||||
#undef LONG_STRING__32
|
||||
|
||||
QTEST_APPLESS_MAIN(tst_QLatin1StringMatcher)
|
||||
#include "tst_qlatin1stringmatcher.moc"
|
Loading…
Reference in New Issue
Block a user