Add a new UTF-8 encoder and use it from QString
This is a new and faster UTF-8 encoder, based on the code from QUrl. This code specializes for ASCII, which is the most common case anyway, especially since QString's "ascii" mode is actually UTF-8 now. In addition, make QString::toUtf8 use a stateless encoder. Stateless means that the function doesn't handle state between calls in the form of QTextCodec::ConverterState. This allows it to be faster than otherwise. The new code is in the form of a template so that it can be used from QJsonDocument and QUrl, which have small modifications to how the encoding is handled. Change-Id: I305ee0fd8523cc4ec74c2678cb9ea88b75bac7ac Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
86fa8b4fb8
commit
d51130cc3a
@ -1,6 +1,7 @@
|
||||
/****************************************************************************
|
||||
**
|
||||
** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
|
||||
** Copyright (C) 2013 Intel Corporation
|
||||
** Contact: http://www.qt-project.org/legal
|
||||
**
|
||||
** This file is part of the QtCore module of the Qt Toolkit.
|
||||
@ -48,6 +49,27 @@ QT_BEGIN_NAMESPACE
|
||||
|
||||
enum { Endian = 0, Data = 1 };
|
||||
|
||||
/*
    Stateless UTF-16 to UTF-8 conversion of the \a len code units starting
    at \a uc. Any code unit that the encoder rejects (a negative return
    value from QUtf8Functions::toUtf8) is replaced by a single '?' byte.
*/
QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
{
    // Reserve the worst case up front: three output bytes per UTF-16 code unit.
    QByteArray encoded(len * 3, Qt::Uninitialized);

    // Write through constData() so that the freshly created array is filled in place.
    uchar *const begin = reinterpret_cast<uchar *>(const_cast<char *>(encoded.constData()));
    uchar *out = begin;

    const ushort *in = reinterpret_cast<const ushort *>(uc);
    const ushort *const inEnd = in + len;

    while (in != inEnd) {
        const ushort unit = *in++;
        if (QUtf8Functions::toUtf8<QUtf8BaseTraits>(unit, out, in, inEnd) >= 0)
            continue;

        // encoding error - append '?'
        *out++ = '?';
    }

    // Drop the part of the worst-case buffer that was not used.
    encoded.truncate(out - begin);
    return encoded;
}
|
||||
|
||||
QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
|
||||
{
|
||||
uchar replacement = '?';
|
||||
@ -62,61 +84,35 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
|
||||
surrogate_high = state->state_data[0];
|
||||
}
|
||||
|
||||
QByteArray rstr;
|
||||
rstr.resize(rlen);
|
||||
uchar* cursor = (uchar*)rstr.data();
|
||||
const QChar *ch = uc;
|
||||
|
||||
QByteArray rstr(rlen, Qt::Uninitialized);
|
||||
uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData()));
|
||||
const ushort *src = reinterpret_cast<const ushort *>(uc);
|
||||
const ushort *const end = src + len;
|
||||
|
||||
int invalid = 0;
|
||||
if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
|
||||
// append UTF-8 BOM
|
||||
*cursor++ = 0xef;
|
||||
*cursor++ = 0xbb;
|
||||
*cursor++ = 0xbf;
|
||||
}
|
||||
|
||||
const QChar *end = ch + len;
|
||||
while (ch < end) {
|
||||
uint u = ch->unicode();
|
||||
if (surrogate_high >= 0) {
|
||||
if (ch->isLowSurrogate()) {
|
||||
u = QChar::surrogateToUcs4(surrogate_high, u);
|
||||
surrogate_high = -1;
|
||||
} else {
|
||||
// high surrogate without low
|
||||
*cursor = replacement;
|
||||
++ch;
|
||||
++invalid;
|
||||
surrogate_high = -1;
|
||||
continue;
|
||||
}
|
||||
} else if (ch->isLowSurrogate()) {
|
||||
// low surrogate without high
|
||||
*cursor = replacement;
|
||||
++ch;
|
||||
++invalid;
|
||||
while (src != end) {
|
||||
ushort uc = surrogate_high == -1 ? *src++ : surrogate_high;
|
||||
surrogate_high = -1;
|
||||
int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
|
||||
if (Q_LIKELY(res >= 0))
|
||||
continue;
|
||||
} else if (ch->isHighSurrogate()) {
|
||||
surrogate_high = u;
|
||||
++ch;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (u < 0x80) {
|
||||
*cursor++ = (uchar)u;
|
||||
} else {
|
||||
if (u < 0x0800) {
|
||||
*cursor++ = 0xc0 | ((uchar) (u >> 6));
|
||||
} else {
|
||||
if (QChar::requiresSurrogates(u)) {
|
||||
*cursor++ = 0xf0 | ((uchar) (u >> 18));
|
||||
*cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
|
||||
} else {
|
||||
*cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
|
||||
}
|
||||
*cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
|
||||
}
|
||||
*cursor++ = 0x80 | ((uchar) (u&0x3f));
|
||||
if (res == QUtf8BaseTraits::Error) {
|
||||
// encoding error
|
||||
++invalid;
|
||||
*cursor++ = replacement;
|
||||
} else if (res == QUtf8BaseTraits::EndOfString) {
|
||||
surrogate_high = uc;
|
||||
break;
|
||||
}
|
||||
++ch;
|
||||
}
|
||||
|
||||
rstr.resize(cursor - (const uchar*)rstr.constData());
|
||||
|
@ -1,6 +1,7 @@
|
||||
/****************************************************************************
|
||||
**
|
||||
** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
|
||||
** Copyright (C) 2013 Intel Corporation
|
||||
** Contact: http://www.qt-project.org/legal
|
||||
**
|
||||
** This file is part of the QtCore module of the Qt Toolkit.
|
||||
@ -58,6 +59,118 @@
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
|
||||
struct QUtf8BaseTraits
|
||||
{
|
||||
static const bool isTrusted = false;
|
||||
static const bool allowNonCharacters = true;
|
||||
static const bool skipAsciiHandling = false;
|
||||
static const int Error = -1;
|
||||
static const int EndOfString = -2;
|
||||
|
||||
static bool isValidCharacter(uint u)
|
||||
{ return int(u) >= 0; }
|
||||
|
||||
static void appendByte(uchar *&ptr, uchar b)
|
||||
{ *ptr++ = b; }
|
||||
|
||||
static uchar peekByte(const uchar *ptr, int n = 0)
|
||||
{ return ptr[n]; }
|
||||
|
||||
static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
|
||||
{ return end - ptr; }
|
||||
|
||||
static void advanceByte(const uchar *&ptr, int n = 1)
|
||||
{ ptr += n; }
|
||||
|
||||
static void appendUtf16(ushort *&ptr, ushort uc)
|
||||
{ *ptr++ = uc; }
|
||||
|
||||
static void appendUcs4(ushort *&ptr, uint uc)
|
||||
{
|
||||
appendUtf16(ptr, QChar::highSurrogate(uc));
|
||||
appendUtf16(ptr, QChar::lowSurrogate(uc));
|
||||
}
|
||||
|
||||
static ushort peekUtf16(const ushort *ptr, int n = 0)
|
||||
{ return ptr[n]; }
|
||||
|
||||
static qptrdiff availableUtf16(const ushort *ptr, const ushort *end)
|
||||
{ return end - ptr; }
|
||||
|
||||
static void advanceUtf16(const ushort *&ptr, int n = 1)
|
||||
{ ptr += n; }
|
||||
|
||||
// it's possible to output to UCS-4 too
|
||||
static void appendUtf16(uint *&ptr, ushort uc)
|
||||
{ *ptr++ = uc; }
|
||||
|
||||
static void appendUcs4(uint *&ptr, uint uc)
|
||||
{ *ptr++ = uc; }
|
||||
};
|
||||
|
||||
namespace QUtf8Functions
|
||||
{
|
||||
/// returns 0 on success; errors can only happen if \a u is a surrogate:
|
||||
/// Error if \a u is a low surrogate;
|
||||
/// if \a u is a high surrogate, Error if the next isn't a low one,
|
||||
/// EndOfString if we run into the end of the string.
|
||||
template <typename Traits, typename OutputPtr, typename InputPtr> inline
|
||||
int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
|
||||
{
|
||||
if (!Traits::skipAsciiHandling && u < 0x80) {
|
||||
// U+0000 to U+007F (US-ASCII) - one byte
|
||||
Traits::appendByte(dst, uchar(u));
|
||||
return 0;
|
||||
} else if (u < 0x0800) {
|
||||
// U+0080 to U+07FF - two bytes
|
||||
// first of two bytes
|
||||
Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
|
||||
} else {
|
||||
if (!QChar::isSurrogate(u)) {
|
||||
// U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
|
||||
if (!Traits::allowNonCharacters && QChar::isNonCharacter(u))
|
||||
return Traits::Error;
|
||||
|
||||
// first of three bytes
|
||||
Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
|
||||
} else {
|
||||
// U+10000 to U+10FFFF - four bytes
|
||||
// need to get one extra codepoint
|
||||
if (Traits::availableUtf16(src, end) == 0)
|
||||
return Traits::EndOfString;
|
||||
|
||||
ushort low = Traits::peekUtf16(src);
|
||||
if (!QChar::isHighSurrogate(u))
|
||||
return Traits::Error;
|
||||
if (!QChar::isLowSurrogate(low))
|
||||
return Traits::Error;
|
||||
|
||||
Traits::advanceUtf16(src);
|
||||
uint ucs4 = QChar::surrogateToUcs4(u, low);
|
||||
|
||||
if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
|
||||
return Traits::Error;
|
||||
|
||||
// first byte
|
||||
Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
|
||||
|
||||
// second of four bytes
|
||||
Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
|
||||
|
||||
// for the rest of the bytes
|
||||
u = ushort(ucs4);
|
||||
}
|
||||
|
||||
// second to last byte
|
||||
Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
|
||||
}
|
||||
|
||||
// last byte
|
||||
Traits::appendByte(dst, 0x80 | (u & 0x3f));
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
enum DataEndianness
|
||||
{
|
||||
DetectEndianness,
|
||||
@ -68,6 +181,7 @@ enum DataEndianness
|
||||
struct QUtf8
|
||||
{
|
||||
static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
|
||||
static QByteArray convertFromUnicode(const QChar *, int);
|
||||
static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);
|
||||
};
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
/****************************************************************************
|
||||
**
|
||||
** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
|
||||
** Copyright (C) 2013 Intel Corporation
|
||||
** Contact: http://www.qt-project.org/legal
|
||||
**
|
||||
** This file is part of the QtCore module of the Qt Toolkit.
|
||||
@ -4128,7 +4129,7 @@ QByteArray QString::toUtf8_helper(const QString &str)
|
||||
if (str.isNull())
|
||||
return QByteArray();
|
||||
|
||||
return QUtf8::convertFromUnicode(str.constData(), str.length(), 0);
|
||||
return QUtf8::convertFromUnicode(str.constData(), str.length());
|
||||
}
|
||||
|
||||
/*!
|
||||
|
Loading…
Reference in New Issue
Block a user