Add proper collation support to Qt

QString::localeAwareCompare() has always been a broken
way to support collation. The current implementation is
not even thread safe.

This adds a proper collation class that fixes the problems
and finally allows Qt to sort properly according to locale
rules.

The class is private for now, but is intendent to be made
public with 5.1

Change-Id: Idb4e75ff68a398c9813af622af884a90898d2be9
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Lars Knoll 2012-06-01 23:09:31 +02:00 committed by Qt by Nokia
parent 97e177e58d
commit 1e9be1327b
4 changed files with 712 additions and 0 deletions

View File

@ -0,0 +1,585 @@
/****************************************************************************
**
** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
** Contact: http://www.qt-project.org/
**
** This file is part of the QtCore module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** GNU Lesser General Public License Usage
** This file may be used under the terms of the GNU Lesser General Public
** License version 2.1 as published by the Free Software Foundation and
** appearing in the file LICENSE.LGPL included in the packaging of this
** file. Please review the following information to ensure the GNU Lesser
** General Public License version 2.1 requirements will be met:
** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Nokia gives you certain additional
** rights. These rights are described in the Nokia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU General
** Public License version 3.0 as published by the Free Software Foundation
** and appearing in the file LICENSE.GPL included in the packaging of this
** file. Please review the following information to ensure the GNU General
** Public License version 3.0 requirements will be met:
** http://www.gnu.org/copyleft/gpl.html.
**
** Other Usage
** Alternatively, this file may be used in accordance with the terms and
** conditions contained in a signed written agreement between you and Nokia.
**
**
**
**
**
**
** $QT_END_LICENSE$
**
****************************************************************************/
#include "qcollator_p.h"
#include "qstringlist.h"
#include "qstring.h"
#ifdef QT_USE_ICU
#include <unicode/utypes.h>
#include <unicode/ucol.h>
#include <unicode/ustring.h>
#include <unicode/ures.h>
#endif
#include "qdebug.h"
QT_BEGIN_NAMESPACE
class QCollatorPrivate
{
public:
QAtomicInt ref;
QLocale locale;
QCollator::Collation collation;
#ifdef QT_USE_ICU
UCollator *collator;
#else
void *collator;
#endif
QStringList indexCharacters;
void clear() {
#ifdef QT_USE_ICU
if (collator)
ucol_close(collator);
#endif
collator = 0;
indexCharacters.clear();
}
QCollatorPrivate()
: collation(QCollator::Default),
collator(0)
{ ref.store(1); }
~QCollatorPrivate();
private:
Q_DISABLE_COPY(QCollatorPrivate)
};
QCollatorPrivate::~QCollatorPrivate()
{
clear();
}
static const int collationStringsCount = 13;
static const char * const collationStrings[collationStringsCount] = {
"default",
"big5han",
"dictionary",
"direct",
"gb2312han",
"phonebook",
"pinyin",
"phonetic",
"reformed",
"standard",
"stroke",
"traditional",
"unihan"
};
/*!
\class QCollator
\brief The QCollator class compares strings according to a localized collation algorithm.
\reentrant
\ingroup i18n
\ingroup string-processing
\ingroup shared
QCollator is initialized with a QLocale and an optional collation strategy. It tries to
initialize the collator with the specified values. The collator can then be used to compare
and sort strings in a locale dependent fashion.
A QCollator object can be used together with template based sorting algorithms such as qSort
to sort a list of QStrings.
In addition to the locale and collation strategy, several optional flags can be set that influence
the result of the collation.
*/
/*!
Constructs a QCollator from \a locale and \a collation. If \a collation is not
specified the default collation algorithm for the locale is being used. If
\a locale is not specified QLocale::default() is being used.
\sa setLocale setCollation setOptions
*/
QCollator::QCollator(const QLocale &locale, QCollator::Collation collation)
: d(new QCollatorPrivate)
{
d->locale = locale;
if ((int)collation >= 0 && (int)collation < collationStringsCount)
d->collation = collation;
init();
}
/*!
Creates a copy of \a other.
*/
QCollator::QCollator(const QCollator &other)
: d(other.d)
{
d->ref.ref();
}
/*!
Destroys the collator.
*/
QCollator::~QCollator()
{
if (!d->ref.deref())
delete d;
}
/*!
Assigns \a other to this collator.
*/
QCollator &QCollator::operator=(const QCollator &other)
{
if (this != &other) {
if (!d->ref.deref())
delete d;
d = other.d;
d->ref.ref();
}
return *this;
}
/*!
\internal
*/
void QCollator::init()
{
Q_ASSERT((int)d->collation < collationStringsCount);
#ifdef QT_USE_ICU
const char *collationString = collationStrings[(int)d->collation];
UErrorCode status = U_ZERO_ERROR;
QByteArray name = (d->locale.bcp47Name().replace(QLatin1Char('-'), QLatin1Char('_')) + QLatin1String("@collation=") + QLatin1String(collationString)).toLatin1();
d->collator = ucol_open(name.constData(), &status);
if (U_FAILURE(status))
qWarning("Could not create collator: %d", status);
// enable normalization by default
ucol_setAttribute(d->collator, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
#endif
}
/*!
\internal
*/
void QCollator::detach()
{
if (d->ref.load() != 1) {
QCollatorPrivate *x = new QCollatorPrivate;
x->ref.store(1);
x->locale = d->locale;
x->collation = d->collation;
x->collator = 0;
if (!d->ref.deref())
delete d;
d = x;
}
}
/*!
Sets the locale of the collator to \a locale.
*/
void QCollator::setLocale(const QLocale &locale)
{
if (d->ref.load() != 1)
detach();
d->clear();
d->locale = locale;
init();
}
/*!
Returns the locale of the collator.
*/
QLocale QCollator::locale() const
{
return d->locale;
}
/*!
\enum QCollator::collation
This enum can be used to specify an alternate collation algorithm to be used instead
of the default algorithm for the locale.
Possible values are:
\value Default Use the default algorithm for the locale
\value Big5Han
\value Dictionary
\value Direct
\value GB2312Han
\value PhoneBook
\value Pinyin
\value Phonetic
\value Reformed
\value Standard
\value Stroke
\value Traditional
\value UniHan
*/
/*!
Sets the collation algorithm to be used.
\sa QCollator::Collation
*/
void QCollator::setCollation(QCollator::Collation collation)
{
if ((int)collation < 0 || (int)collation >= collationStringsCount)
return;
if (d->ref.load() != 1)
detach();
d->clear();
d->collation = collation;
init();
}
/*!
Returns the currently used collation algorithm.
\sa QCollator::Collation
*/
QCollator::Collation QCollator::collation() const
{
return d->collation;
}
/*!
Returns a unique identifer for this collation object.
This method is helpful to save and restore defined collation
objects.
\sa fromIdentifier
*/
QString QCollator::identifier() const
{
QString id = d->locale.bcp47Name();
if (d->collation != QCollator::Default) {
id += QLatin1String("@collation=");
id += QLatin1String(collationStrings[d->collation]);
}
// this ensures the ID is compatible with ICU
id.replace('-', '_');
return id;
}
/*!
Creates a QCollator from a unique identifier and returns it.
\sa identifier
*/
QCollator QCollator::fromIdentifier(const QString &identifier)
{
QString localeString = identifier;
QString collationString;
int at = identifier.indexOf(QLatin1Char('@'));
if (at >= 0) {
localeString = identifier.left(at);
collationString = identifier.mid(at + strlen("@collation="));
}
QLocale locale(localeString);
Collation collation = Default;
if (!collationString.isEmpty()) {
for (int i = 0; i < collationStringsCount; ++i) {
if (QLatin1String(collationStrings[i]) == collationString) {
collation = Collation(i);
break;
}
}
}
return QCollator(locale, collation);
}
/*!
\enum QCollator::CasePreference
This enum can be used to tailor the case preference during collation.
\value CasePreferenceOff No case preference, use what is the standard for the locale
\value CasePreferenceUpper Sort upper case characters before lower case
\value CasePreferenceLower Sort lower case characters before upper case
*/
/*!
Sets the case preference of the collator.
\sa QCollator::CasePreference
*/
void QCollator::setCasePreference(CasePreference c)
{
if (d->ref.load() != 1)
detach();
#ifdef QT_USE_ICU
UColAttributeValue val = UCOL_OFF;
if (c == QCollator::CasePreferenceUpper)
val = UCOL_UPPER_FIRST;
else if (c == QCollator::CasePreferenceLower)
val = UCOL_LOWER_FIRST;
UErrorCode status = U_ZERO_ERROR;
ucol_setAttribute(d->collator, UCOL_CASE_FIRST, val, &status);
if (U_FAILURE(status))
qWarning("ucol_setAttribute: Case First failed: %d", status);
#else
Q_UNUSED(c);
#endif
}
/*!
Returns case preference of the collator.
\sa QCollator::CasePreference
*/
QCollator::CasePreference QCollator::casePreference() const
{
#ifdef QT_USE_ICU
UErrorCode status = U_ZERO_ERROR;
switch (ucol_getAttribute(d->collator, UCOL_CASE_FIRST, &status)) {
case UCOL_UPPER_FIRST:
return QCollator::CasePreferenceUpper;
case UCOL_LOWER_FIRST:
return QCollator::CasePreferenceLower;
case UCOL_OFF:
default:
break;
}
#endif
return QCollator::CasePreferenceOff;
}
/*!
Enables numeric sorting mode when \a on is set to true.
This will enable proper sorting of numeric digits, so that e.g. 100 sorts after 99.
By default this mode is off.
*/
void QCollator::setNumericMode(bool on)
{
if (d->ref.load() != 1)
detach();
#ifdef QT_USE_ICU
UErrorCode status = U_ZERO_ERROR;
ucol_setAttribute(d->collator, UCOL_NUMERIC_COLLATION, on ? UCOL_ON : UCOL_OFF, &status);
if (U_FAILURE(status))
qWarning("ucol_setAttribute: numeric collation failed: %d", status);
#else
Q_UNUSED(on);
#endif
}
/*!
Returns true if numeric sorting is enabled, false otherwise.
\sa setNumericMode
*/
bool QCollator::numericMode() const
{
#ifdef QT_USE_ICU
UErrorCode status;
if (ucol_getAttribute(d->collator, UCOL_NUMERIC_COLLATION, &status) == UCOL_ON)
return true;
#endif
return false;
}
/*!
If set to true, punctuation characters and symbols are ignored when determining sort order.
The default is locale dependent.
*/
void QCollator::setIgnorePunctuation(bool on)
{
if (d->ref.load() != 1)
detach();
#ifdef QT_USE_ICU
UErrorCode status;
ucol_setAttribute(d->collator, UCOL_ALTERNATE_HANDLING, on ? UCOL_SHIFTED : UCOL_NON_IGNORABLE, &status);
if (U_FAILURE(status))
qWarning("ucol_setAttribute: Alternate handling failed: %d", status);
#else
Q_UNUSED(on);
#endif
}
/*!
Returns true if punctuation characters and symbols are ignored when determining sort order.
\sa setIgnorePunctuation
*/
bool QCollator::ignorePunctuation() const
{
#ifdef QT_USE_ICU
UErrorCode status;
if (ucol_getAttribute(d->collator, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED)
return true;
#endif
return false;
}
/*!
Compares \a s1 with \a s2. Returns -1, 0 or 1 depending on whether \a s1 is
smaller, equal or larger than \a s2.
*/
int QCollator::compare(const QString &s1, const QString &s2) const
{
return compare(s1.constData(), s1.size(), s2.constData(), s2.size());
}
/*!
\overload
Compares \a s1 with \a s2. Returns -1, 0 or 1 depending on whether \a s1 is
smaller, equal or larger than \a s2.
*/
int QCollator::compare(const QStringRef &s1, const QStringRef &s2) const
{
return compare(s1.constData(), s1.size(), s2.constData(), s2.size());
}
/*!
\overload
Compares \a s1 with \a s2. \a len1 and \a len2 specify the length of the
QChar arrays pointer to by \a s1 and \a s2.
Returns -1, 0 or 1 depending on whether \a s1 is smaller, equal or larger than \a s2.
*/
int QCollator::compare(const QChar *s1, int len1, const QChar *s2, int len2) const
{
#ifdef QT_USE_ICU
const UCollationResult result =
ucol_strcoll(d->collator, (const UChar *)s1, len1, (const UChar *)s2, len2);
return result;
#else
return QString::compare_helper((const QChar *)s1, len1, (const QChar *)s2, len2, Qt::CaseInsensitive);
#endif
}
/*!
Returns a sortKey for \a string. The sortkey can be used as a placeholder
for the string that can be then sorted using regular strcmp based sorting.
Creating the sort key is usually somewhat slower, then using the compare()
methods directly. But if the string is compared repeatedly (e.g. when sorting
a whole list of strings), it's usually faster to create the sort keys for each
string and then sort using the keys.
*/
QByteArray QCollator::sortKey(const QString &string) const
{
#ifdef QT_USE_ICU
QByteArray result(16 + string.size() + (string.size() >> 2), Qt::Uninitialized);
int size = ucol_getSortKey(d->collator, (const UChar *)string.constData(),
string.size(), (uint8_t *)result.data(), result.size());
if (size > result.size()) {
result.resize(size);
size = ucol_getSortKey(d->collator, (const UChar *)string.constData(),
string.size(), (uint8_t *)result.data(), result.size());
}
result.truncate(size);
return result;
#else
return string.toLower().toUtf8();
#endif
}
static QStringList englishIndexCharacters()
{
QString chars = QString::fromLatin1("A B C D E F G H I J K L M N O P Q R S T U V W X Y Z");
return chars.split(QLatin1Char(' '), QString::SkipEmptyParts);
}
/*!
Returns a string list of primary index characters. This is useful when presenting the
sorted list in a user interface with section headers.
*/
QStringList QCollator::indexCharacters() const
{
if (!d->indexCharacters.isEmpty())
return d->indexCharacters;
#ifdef QT_USE_ICU
QByteArray id = identifier().toLatin1();
UErrorCode status = U_ZERO_ERROR;
UResourceBundle *res = ures_open(NULL, id, &status);
if (U_FAILURE(status)) {
d->indexCharacters = englishIndexCharacters();
} else {
qint32 len = 0;
status = U_ZERO_ERROR;
const UChar *val = ures_getStringByKey(res, "ExemplarCharactersIndex", &len, &status);
if (U_FAILURE(status)) {
d->indexCharacters = englishIndexCharacters();
} else {
QString chars = QString::fromUtf16(val, len);
chars.remove('[');
chars.remove(']');
chars.remove('{');
chars.remove('}');
d->indexCharacters = chars.split(QLatin1Char(' '), QString::SkipEmptyParts);
}
}
ures_close(res);
#else
d->indexCharacters = englishIndexCharacters();
#endif
return d->indexCharacters;
}
QT_END_NAMESPACE

View File

@ -0,0 +1,124 @@
/****************************************************************************
**
** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
** Contact: http://www.qt-project.org/
**
** This file is part of the QtCore module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** GNU Lesser General Public License Usage
** This file may be used under the terms of the GNU Lesser General Public
** License version 2.1 as published by the Free Software Foundation and
** appearing in the file LICENSE.LGPL included in the packaging of this
** file. Please review the following information to ensure the GNU Lesser
** General Public License version 2.1 requirements will be met:
** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Nokia gives you certain additional
** rights. These rights are described in the Nokia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU General
** Public License version 3.0 as published by the Free Software Foundation
** and appearing in the file LICENSE.GPL included in the packaging of this
** file. Please review the following information to ensure the GNU General
** Public License version 3.0 requirements will be met:
** http://www.gnu.org/copyleft/gpl.html.
**
** Other Usage
** Alternatively, this file may be used in accordance with the terms and
** conditions contained in a signed written agreement between you and Nokia.
**
**
**
**
**
**
** $QT_END_LICENSE$
**
****************************************************************************/
#ifndef QCOLLATOR_H
#define QCOLLATOR_H
#include <QString>
#include <QLocale>
QT_BEGIN_HEADER
QT_BEGIN_NAMESPACE
class QCollatorPrivate;
class Q_CORE_EXPORT QCollator
{
public:
enum Collation {
Default,
Big5Han,
Dictionary,
Direct,
GB2312Han,
PhoneBook,
Pinyin,
Phonetic,
Reformed,
Standard,
Stroke,
Traditional,
UniHan
};
QCollator(const QLocale &locale = QLocale(), QCollator::Collation collation = QCollator::Default);
QCollator(const QCollator &);
~QCollator();
QCollator &operator=(const QCollator &);
void setLocale(const QLocale &locale);
QLocale locale() const;
void setCollation(Collation collation);
Collation collation() const;
QString identifier() const;
static QCollator fromIdentifier(const QString &identifier);
enum CasePreference {
CasePreferenceOff = 0x0,
CasePreferenceUpper = 0x1,
CasePreferenceLower = 0x2
};
CasePreference casePreference() const;
void setCasePreference(CasePreference c);
void setNumericMode(bool on);
bool numericMode() const;
void setIgnorePunctuation(bool on);
bool ignorePunctuation() const;
int compare(const QString &s1, const QString &s2) const;
int compare(const QStringRef &s1, const QStringRef &s2) const;
int compare(const QChar *s1, int len1, const QChar *s2, int len2) const;
bool operator()(const QString &s1, const QString &s2) const
{ return compare(s1, s2) < 0; }
QByteArray sortKey(const QString &string) const;
QStringList indexCharacters() const;
private:
QCollatorPrivate *d;
void detach();
void init();
};
QT_END_NAMESPACE
QT_END_HEADER
#endif // QCOLLATOR_H

View File

@ -728,6 +728,7 @@ private:
friend class QTextCodec;
friend class QStringRef;
friend class QByteArray;
friend class QCollator;
friend struct QAbstractConcatenable;
public:

View File

@ -11,6 +11,7 @@ HEADERS += \
tools/qbytedata_p.h \
tools/qcache.h \
tools/qchar.h \
tools/qcollator_p.h \
tools/qcontainerfwd.h \
tools/qcryptographichash.h \
tools/qdatetime.h \
@ -65,6 +66,7 @@ SOURCES += \
tools/qbitarray.cpp \
tools/qbytearray.cpp \
tools/qbytearraymatcher.cpp \
tools/qcollator.cpp \
tools/qcryptographichash.cpp \
tools/qdatetime.cpp \
tools/qeasingcurve.cpp \