Add a new UTF-8 encoder and use it from QString

This is a new and faster UTF-8 encoder, based on the code from QUrl. This code
specializes for ASCII, which is the most common case anyway, especially since
QString's "ascii" mode is actually UTF-8 now.

In addition, make QString::toUtf8 use a stateless encoder, that is, one that
does not carry state between calls in a QTextCodec::ConverterState. Skipping
that bookkeeping makes it faster than the stateful encoder.

The new code is in the form of a template so that it can be used from
QJsonDocument and QUrl, which have small modifications to how the
encoding is handled.

Change-Id: I305ee0fd8523cc4ec74c2678cb9ea88b75bac7ac
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Thiago Macieira 2013-10-19 18:54:55 -04:00 committed by The Qt Project
parent 86fa8b4fb8
commit d51130cc3a
3 changed files with 157 additions and 46 deletions

View File

@ -1,6 +1,7 @@
/****************************************************************************
**
** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
** Copyright (C) 2013 Intel Corporation
** Contact: http://www.qt-project.org/legal
**
** This file is part of the QtCore module of the Qt Toolkit.
@ -48,6 +49,27 @@ QT_BEGIN_NAMESPACE
// Anonymous enum of two small index constants. Nothing in the code visible
// here uses them.
// NOTE(review): presumably slot indices into ConverterState::state_data for
// the other Unicode codecs in this file - confirm against their use.
enum { Endian = 0, Data = 1 };
QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
{
    // Stateless UTF-16 -> UTF-8 conversion: no BOM is written and nothing is
    // carried over between calls. Any code unit the encoder rejects (a
    // negative return value) is replaced by a single '?'.

    // Size the output for the worst case so the loop never has to grow it:
    // one UTF-16 code unit produces at most three UTF-8 bytes, a surrogate
    // pair (two units) produces four, and a rejected unit produces one.
    QByteArray encoded(len * 3, Qt::Uninitialized);

    // constData() is used so that fetching the write pointer does not detach
    // the freshly created array.
    uchar *const begin = reinterpret_cast<uchar *>(const_cast<char *>(encoded.constData()));
    uchar *out = begin;

    const ushort *in = reinterpret_cast<const ushort *>(uc);
    const ushort *const inEnd = in + len;

    while (in != inEnd) {
        const ushort unit = *in++;
        // toUtf8() consumes the following low surrogate itself when needed
        if (QUtf8Functions::toUtf8<QUtf8BaseTraits>(unit, out, in, inEnd) < 0)
            *out++ = '?';   // encoding error - append '?'
    }

    // drop the part of the worst-case buffer that was not used
    encoded.truncate(out - begin);
    return encoded;
}
QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
{
uchar replacement = '?';
@ -62,61 +84,35 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
surrogate_high = state->state_data[0];
}
QByteArray rstr;
rstr.resize(rlen);
uchar* cursor = (uchar*)rstr.data();
const QChar *ch = uc;
QByteArray rstr(rlen, Qt::Uninitialized);
uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData()));
const ushort *src = reinterpret_cast<const ushort *>(uc);
const ushort *const end = src + len;
int invalid = 0;
if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
// append UTF-8 BOM
*cursor++ = 0xef;
*cursor++ = 0xbb;
*cursor++ = 0xbf;
}
const QChar *end = ch + len;
while (ch < end) {
uint u = ch->unicode();
if (surrogate_high >= 0) {
if (ch->isLowSurrogate()) {
u = QChar::surrogateToUcs4(surrogate_high, u);
surrogate_high = -1;
} else {
// high surrogate without low
*cursor = replacement;
++ch;
++invalid;
surrogate_high = -1;
continue;
}
} else if (ch->isLowSurrogate()) {
// low surrogate without high
*cursor = replacement;
++ch;
++invalid;
while (src != end) {
ushort uc = surrogate_high == -1 ? *src++ : surrogate_high;
surrogate_high = -1;
int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
if (Q_LIKELY(res >= 0))
continue;
} else if (ch->isHighSurrogate()) {
surrogate_high = u;
++ch;
continue;
}
if (u < 0x80) {
*cursor++ = (uchar)u;
} else {
if (u < 0x0800) {
*cursor++ = 0xc0 | ((uchar) (u >> 6));
} else {
if (QChar::requiresSurrogates(u)) {
*cursor++ = 0xf0 | ((uchar) (u >> 18));
*cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
} else {
*cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
}
*cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
}
*cursor++ = 0x80 | ((uchar) (u&0x3f));
if (res == QUtf8BaseTraits::Error) {
// encoding error
++invalid;
*cursor++ = replacement;
} else if (res == QUtf8BaseTraits::EndOfString) {
surrogate_high = uc;
break;
}
++ch;
}
rstr.resize(cursor - (const uchar*)rstr.constData());

View File

@ -1,6 +1,7 @@
/****************************************************************************
**
** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
** Copyright (C) 2013 Intel Corporation
** Contact: http://www.qt-project.org/legal
**
** This file is part of the QtCore module of the Qt Toolkit.
@ -58,6 +59,118 @@
QT_BEGIN_NAMESPACE
/// Default policy/traits class for the QUtf8Functions templates. It bundles
/// compile-time switches, the status codes the templates return, and the
/// primitive read/write/advance operations on the byte and UTF-16 buffers.
struct QUtf8BaseTraits
{
    // Compile-time switches. toUtf8() consults allowNonCharacters and
    // skipAsciiHandling; isTrusted is not read by the code visible here.
    static const bool isTrusted = false;
    static const bool allowNonCharacters = true;
    static const bool skipAsciiHandling = false;

    // Negative status codes returned by the conversion templates.
    static const int Error = -1;
    static const int EndOfString = -2;

    // A code point is accepted when it is non-negative once converted to int.
    static bool isValidCharacter(uint u)
    {
        return static_cast<int>(u) >= 0;
    }

    // --- byte (UTF-8) side ---

    // Stores \a b at the cursor and moves the cursor one byte forward.
    static void appendByte(uchar *&ptr, uchar b)
    {
        *ptr = b;
        ++ptr;
    }

    // Reads the byte \a n positions after \a ptr without moving anything.
    static uchar peekByte(const uchar *ptr, int n = 0)
    {
        return *(ptr + n);
    }

    // Number of bytes between \a ptr and \a end.
    static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
    {
        return end - ptr;
    }

    // Moves the read cursor \a n bytes forward.
    static void advanceByte(const uchar *&ptr, int n = 1)
    {
        ptr += n;
    }

    // --- UTF-16 side ---

    // Stores one UTF-16 code unit and moves the cursor forward.
    static void appendUtf16(ushort *&ptr, ushort uc)
    {
        *ptr = uc;
        ++ptr;
    }

    // Stores a UCS-4 code point as a surrogate pair, high half first.
    static void appendUcs4(ushort *&ptr, uint uc)
    {
        *ptr++ = QChar::highSurrogate(uc);
        *ptr++ = QChar::lowSurrogate(uc);
    }

    // Reads the code unit \a n positions after \a ptr without moving anything.
    static ushort peekUtf16(const ushort *ptr, int n = 0)
    {
        return *(ptr + n);
    }

    // Number of UTF-16 code units between \a ptr and \a end.
    static qptrdiff availableUtf16(const ushort *ptr, const ushort *end)
    {
        return end - ptr;
    }

    // Moves the read cursor \a n code units forward.
    static void advanceUtf16(const ushort *&ptr, int n = 1)
    {
        ptr += n;
    }

    // --- UCS-4 output ---
    // The same two append operations with a uint destination: each call
    // writes exactly one element, so no surrogate splitting happens here.
    static void appendUtf16(uint *&ptr, ushort uc)
    {
        *ptr = uc;
        ++ptr;
    }

    static void appendUcs4(uint *&ptr, uint uc)
    {
        *ptr = uc;
        ++ptr;
    }
};
namespace QUtf8Functions
{
    /// Encodes the UTF-16 code unit \a u as UTF-8, writing the bytes through
    /// Traits::appendByte() to \a dst.
    ///
    /// \a src must point just past \a u in the input and \a end one past the
    /// last input unit. If \a u is a high surrogate, the matching low
    /// surrogate is read from \a src and \a src is advanced past it.
    ///
    /// Returns 0 on success. Failures:
    /// \li Traits::Error if \a u is a low surrogate (regardless of how much
    ///     input is left);
    /// \li Traits::Error if \a u is a high surrogate and the next unit is not
    ///     a low surrogate;
    /// \li Traits::EndOfString if \a u is a high surrogate and the input ends
    ///     right after it;
    /// \li Traits::Error for a non-character when Traits::allowNonCharacters
    ///     is false.
    ///
    /// Nothing is written to \a dst on failure. \a src is left untouched
    /// unless a well-formed surrogate pair was read; a pair rejected as a
    /// non-character has already been consumed.
    ///
    /// When Traits::skipAsciiHandling is true the one-byte branch is compiled
    /// out, so the caller is expected to have dealt with \a u < 0x80 itself
    /// (such a value would otherwise be emitted in two-byte form).
    template <typename Traits, typename OutputPtr, typename InputPtr> inline
    int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
    {
        if (!Traits::skipAsciiHandling && u < 0x80) {
            // U+0000 to U+007F (US-ASCII) - one byte
            Traits::appendByte(dst, uchar(u));
            return 0;
        } else if (u < 0x0800) {
            // U+0080 to U+07FF - two bytes
            // first of two bytes
            Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
        } else {
            if (!QChar::isSurrogate(u)) {
                // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
                if (!Traits::allowNonCharacters && QChar::isNonCharacter(u))
                    return Traits::Error;

                // first of three bytes
                Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
            } else {
                // U+10000 to U+10FFFF - four bytes

                // A low surrogate can never start a pair, whatever follows.
                // Check this before looking at the remaining input so that a
                // stray low surrogate at the very end is reported as Error
                // and not as EndOfString, which callers treat as "incomplete
                // pair, keep the unit for later".
                if (!QChar::isHighSurrogate(u))
                    return Traits::Error;

                // need to get one extra codepoint
                if (Traits::availableUtf16(src, end) == 0)
                    return Traits::EndOfString;

                ushort low = Traits::peekUtf16(src);
                if (!QChar::isLowSurrogate(low))
                    return Traits::Error;

                Traits::advanceUtf16(src);
                uint ucs4 = QChar::surrogateToUcs4(u, low);

                if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
                    return Traits::Error;

                // first byte
                Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));

                // second of four bytes
                Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));

                // for the rest of the bytes: only the low 12 bits are still
                // needed, so truncating to 16 bits loses nothing
                u = ushort(ucs4);
            }

            // second to last byte
            Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
        }

        // last byte
        Traits::appendByte(dst, 0x80 | (u & 0x3f));
        return 0;
    }
}
enum DataEndianness
{
DetectEndianness,
@ -68,6 +181,7 @@ enum DataEndianness
// Static entry points for converting between UTF-8 byte data and UTF-16.
struct QUtf8
{
// UTF-8 -> QString conversion taking an optional converter state (the
// implementation is not part of this view).
static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
// Stateless UTF-16 -> UTF-8: takes a pointer to the characters and their
// count; code units that cannot be encoded come out as '?'.
static QByteArray convertFromUnicode(const QChar *, int);
// Stateful UTF-16 -> UTF-8: same input plus a ConverterState that is
// consulted for header (BOM) handling and for a high surrogate left pending
// by a previous call.
static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);
};

View File

@ -1,6 +1,7 @@
/****************************************************************************
**
** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
** Copyright (C) 2013 Intel Corporation
** Contact: http://www.qt-project.org/legal
**
** This file is part of the QtCore module of the Qt Toolkit.
@ -4128,7 +4129,7 @@ QByteArray QString::toUtf8_helper(const QString &str)
if (str.isNull())
return QByteArray();
return QUtf8::convertFromUnicode(str.constData(), str.length(), 0);
return QUtf8::convertFromUnicode(str.constData(), str.length());
}
/*!