Add a new UTF-8 encoder and use it from QString

This is a new and faster UTF-8 encoder, based on the code from QUrl. This code specializes for ASCII, which is the most common case anyway, especially since QString's "ascii" mode is actually UTF-8 now. In addition, make QString::toUtf8 use a stateless encoder. Stateless means that the function doesn't handle state between calls in the form of QTextCodec::ConverterState. This allows it to be faster than otherwise. The new code is in the form of a template so that it can be used from QJsonDocument and QUrl, which have small modifications to how the encoding is handled. Change-Id: I305ee0fd8523cc4ec74c2678cb9ea88b75bac7ac Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
2013-10-19 18:54:55 -04:00 · 2013-10-19 18:54:55 -04:00 · d51130cc3a
commit d51130cc3a
parent 86fa8b4fb8
3 changed files with 157 additions and 46 deletions
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@ -1,6 +1,7 @@
 /****************************************************************************
 **
 ** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
+** Copyright (C) 2013 Intel Corporation
 ** Contact: http://www.qt-project.org/legal
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@ -48,6 +49,27 @@ QT_BEGIN_NAMESPACE

 enum { Endian = 0, Data = 1 };

+QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
+{
+    // create a QByteArray with the worst case scenario size
+    QByteArray result(len * 3, Qt::Uninitialized);
+    uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
+    const ushort *src = reinterpret_cast<const ushort *>(uc);
+    const ushort *const end = src + len;
+
+    while (src != end) {
+        ushort uc = *src++;
+        int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end);
+        if (res < 0) {
+            // encoding error - append '?'
+            *dst++ = '?';
+        }
+    }
+
+    result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
+    return result;
+}
+
 QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
 {
    uchar replacement = '?';
@ -62,61 +84,35 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
            surrogate_high = state->state_data[0];
    }

-    QByteArray rstr;
-    rstr.resize(rlen);
-    uchar* cursor = (uchar*)rstr.data();
-    const QChar *ch = uc;
+
+    QByteArray rstr(rlen, Qt::Uninitialized);
+    uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData()));
+    const ushort *src = reinterpret_cast<const ushort *>(uc);
+    const ushort *const end = src + len;
+
    int invalid = 0;
    if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
+        // append UTF-8 BOM
        *cursor++ = 0xef;
        *cursor++ = 0xbb;
        *cursor++ = 0xbf;
    }

-    const QChar *end = ch + len;
-    while (ch < end) {
-        uint u = ch->unicode();
-        if (surrogate_high >= 0) {
-            if (ch->isLowSurrogate()) {
-                u = QChar::surrogateToUcs4(surrogate_high, u);
-                surrogate_high = -1;
-            } else {
-                // high surrogate without low
-                *cursor = replacement;
-                ++ch;
-                ++invalid;
-                surrogate_high = -1;
-                continue;
-            }
-        } else if (ch->isLowSurrogate()) {
-            // low surrogate without high
-            *cursor = replacement;
-            ++ch;
-            ++invalid;
+    while (src != end) {
+        ushort uc = surrogate_high == -1 ? *src++ : surrogate_high;
+        surrogate_high = -1;
+        int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
+        if (Q_LIKELY(res >= 0))
            continue;
-        } else if (ch->isHighSurrogate()) {
-            surrogate_high = u;
-            ++ch;
-            continue;
-        }

-        if (u < 0x80) {
-            *cursor++ = (uchar)u;
-        } else {
-            if (u < 0x0800) {
-                *cursor++ = 0xc0 | ((uchar) (u >> 6));
-            } else {
-                if (QChar::requiresSurrogates(u)) {
-                    *cursor++ = 0xf0 | ((uchar) (u >> 18));
-                    *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
-                } else {
-                    *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
-                }
-                *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
-            }
-            *cursor++ = 0x80 | ((uchar) (u&0x3f));
+        if (res == QUtf8BaseTraits::Error) {
+            // encoding error
+            ++invalid;
+            *cursor++ = replacement;
+        } else if (res == QUtf8BaseTraits::EndOfString) {
+            surrogate_high = uc;
+            break;
        }
-        ++ch;
    }

    rstr.resize(cursor - (const uchar*)rstr.constData());
--- a/src/corelib/codecs/qutfcodec_p.h
+++ b/src/corelib/codecs/qutfcodec_p.h
@ -1,6 +1,7 @@
 /****************************************************************************
 **
 ** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
+** Copyright (C) 2013 Intel Corporation
 ** Contact: http://www.qt-project.org/legal
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@ -58,6 +59,118 @@

 QT_BEGIN_NAMESPACE

+struct QUtf8BaseTraits
+{
+    static const bool isTrusted = false;
+    static const bool allowNonCharacters = true;
+    static const bool skipAsciiHandling = false;
+    static const int Error = -1;
+    static const int EndOfString = -2;
+
+    static bool isValidCharacter(uint u)
+    { return int(u) >= 0; }
+
+    static void appendByte(uchar *&ptr, uchar b)
+    { *ptr++ = b; }
+
+    static uchar peekByte(const uchar *ptr, int n = 0)
+    { return ptr[n]; }
+
+    static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
+    { return end - ptr; }
+
+    static void advanceByte(const uchar *&ptr, int n = 1)
+    { ptr += n; }
+
+    static void appendUtf16(ushort *&ptr, ushort uc)
+    { *ptr++ = uc; }
+
+    static void appendUcs4(ushort *&ptr, uint uc)
+    {
+        appendUtf16(ptr, QChar::highSurrogate(uc));
+        appendUtf16(ptr, QChar::lowSurrogate(uc));
+    }
+
+    static ushort peekUtf16(const ushort *ptr, int n = 0)
+    { return ptr[n]; }
+
+    static qptrdiff availableUtf16(const ushort *ptr, const ushort *end)
+    { return end - ptr; }
+
+    static void advanceUtf16(const ushort *&ptr, int n = 1)
+    { ptr += n; }
+
+    // it's possible to output to UCS-4 too
+    static void appendUtf16(uint *&ptr, ushort uc)
+    { *ptr++ = uc; }
+
+    static void appendUcs4(uint *&ptr, uint uc)
+    { *ptr++ = uc; }
+};
+
+namespace QUtf8Functions
+{
+    /// returns 0 on success; errors can only happen if \a u is a surrogate:
+    /// Error if \a u is a low surrogate;
+    /// if \a u is a high surrogate, Error if the next isn't a low one,
+    /// EndOfString if we run into the end of the string.
+    template <typename Traits, typename OutputPtr, typename InputPtr> inline
+    int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
+    {
+        if (!Traits::skipAsciiHandling && u < 0x80) {
+            // U+0000 to U+007F (US-ASCII) - one byte
+            Traits::appendByte(dst, uchar(u));
+            return 0;
+        } else if (u < 0x0800) {
+            // U+0080 to U+07FF - two bytes
+            // first of two bytes
+            Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
+        } else {
+            if (!QChar::isSurrogate(u)) {
+                // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
+                if (!Traits::allowNonCharacters && QChar::isNonCharacter(u))
+                    return Traits::Error;
+
+                // first of three bytes
+                Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
+            } else {
+                // U+10000 to U+10FFFF - four bytes
+                // need to get one extra codepoint
+                if (Traits::availableUtf16(src, end) == 0)
+                    return Traits::EndOfString;
+
+                ushort low = Traits::peekUtf16(src);
+                if (!QChar::isHighSurrogate(u))
+                    return Traits::Error;
+                if (!QChar::isLowSurrogate(low))
+                    return Traits::Error;
+
+                Traits::advanceUtf16(src);
+                uint ucs4 = QChar::surrogateToUcs4(u, low);
+
+                if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
+                    return Traits::Error;
+
+                // first byte
+                Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
+
+                // second of four bytes
+                Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
+
+                // for the rest of the bytes
+                u = ushort(ucs4);
+            }
+
+            // second to last byte
+            Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
+        }
+
+        // last byte
+        Traits::appendByte(dst, 0x80 | (u & 0x3f));
+        return 0;
+    }
+}
+
 enum DataEndianness
 {
    DetectEndianness,
@ -68,6 +181,7 @@ enum DataEndianness
 struct QUtf8
 {
    static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
+    static QByteArray convertFromUnicode(const QChar *, int);
    static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);
 };

--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@ -1,6 +1,7 @@
 /****************************************************************************
 **
 ** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
+** Copyright (C) 2013 Intel Corporation
 ** Contact: http://www.qt-project.org/legal
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@ -4128,7 +4129,7 @@ QByteArray QString::toUtf8_helper(const QString &str)
    if (str.isNull())
        return QByteArray();

-    return QUtf8::convertFromUnicode(str.constData(), str.length(), 0);
+    return QUtf8::convertFromUnicode(str.constData(), str.length());
 }

 /*!