Streamline the code in the conversion to and from utf8
Move pre- and post-condition handling out of the main loop to make the loop as fast as possible. Remove the special handling of a corner case when the input length is zero, where the UTF-8 decoder behaved differently from all other decoders. Change-Id: I94992767ea15405b38f7953adadaa6ff98b20b6f Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org> Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
751a003d1e
commit
57037145f5
@ -416,6 +416,8 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta
|
|||||||
Q_ASSERT(state);
|
Q_ASSERT(state);
|
||||||
const QChar *uc = in.data();
|
const QChar *uc = in.data();
|
||||||
qsizetype len = in.length();
|
qsizetype len = in.length();
|
||||||
|
if (!len)
|
||||||
|
return out;
|
||||||
|
|
||||||
auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
|
auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
|
||||||
if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
|
if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
|
||||||
@ -433,56 +435,50 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta
|
|||||||
const ushort *src = reinterpret_cast<const ushort *>(uc);
|
const ushort *src = reinterpret_cast<const ushort *>(uc);
|
||||||
const ushort *const end = src + len;
|
const ushort *const end = src + len;
|
||||||
|
|
||||||
int surrogate_high = -1;
|
if (!(state->flags & QStringDecoder::Flag::Stateless)) {
|
||||||
if (state->remainingChars) {
|
if (state->remainingChars) {
|
||||||
surrogate_high = state->state_data[0];
|
int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(state->state_data[0], cursor, src, end);
|
||||||
} else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
|
if (res < 0)
|
||||||
// append UTF-8 BOM
|
cursor = appendReplacementChar(cursor);
|
||||||
*cursor++ = utf8bom[0];
|
state->state_data[0] = 0;
|
||||||
*cursor++ = utf8bom[1];
|
state->remainingChars = 0;
|
||||||
*cursor++ = utf8bom[2];
|
} else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
|
||||||
state->internalState |= HeaderDone;
|
// append UTF-8 BOM
|
||||||
|
*cursor++ = utf8bom[0];
|
||||||
|
*cursor++ = utf8bom[1];
|
||||||
|
*cursor++ = utf8bom[2];
|
||||||
|
state->internalState |= HeaderDone;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const ushort *nextAscii = src;
|
|
||||||
while (src != end) {
|
while (src != end) {
|
||||||
int res;
|
const ushort *nextAscii = end;
|
||||||
ushort uc;
|
if (simdEncodeAscii(cursor, nextAscii, src, end))
|
||||||
if (surrogate_high != -1) {
|
|
||||||
uc = surrogate_high;
|
|
||||||
surrogate_high = -1;
|
|
||||||
res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
|
|
||||||
} else {
|
|
||||||
if (src >= nextAscii && simdEncodeAscii(cursor, nextAscii, src, end))
|
|
||||||
break;
|
|
||||||
|
|
||||||
uc = *src++;
|
|
||||||
res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
|
|
||||||
}
|
|
||||||
if (Q_LIKELY(res >= 0))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (res == QUtf8BaseTraits::Error) {
|
|
||||||
// encoding error
|
|
||||||
++state->invalidChars;
|
|
||||||
cursor = appendReplacementChar(cursor);
|
|
||||||
} else if (res == QUtf8BaseTraits::EndOfString) {
|
|
||||||
surrogate_high = uc;
|
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
|
do {
|
||||||
|
ushort uc = *src++;
|
||||||
|
int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
|
||||||
|
if (Q_LIKELY(res >= 0))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (res == QUtf8BaseTraits::Error) {
|
||||||
|
// encoding error
|
||||||
|
++state->invalidChars;
|
||||||
|
cursor = appendReplacementChar(cursor);
|
||||||
|
} else if (res == QUtf8BaseTraits::EndOfString) {
|
||||||
|
if (state->flags & QStringConverter::Flag::Stateless) {
|
||||||
|
++state->invalidChars;
|
||||||
|
cursor = appendReplacementChar(cursor);
|
||||||
|
} else {
|
||||||
|
state->remainingChars = 1;
|
||||||
|
state->state_data[0] = uc;
|
||||||
|
}
|
||||||
|
return reinterpret_cast<char *>(cursor);
|
||||||
|
}
|
||||||
|
} while (src < nextAscii);
|
||||||
}
|
}
|
||||||
|
|
||||||
state->internalState |= HeaderDone;
|
|
||||||
state->remainingChars = 0;
|
|
||||||
if (surrogate_high >= 0) {
|
|
||||||
if (state->flags & QStringConverter::Flag::Stateless) {
|
|
||||||
++state->invalidChars;
|
|
||||||
cursor = appendReplacementChar(cursor);
|
|
||||||
} else {
|
|
||||||
state->remainingChars = 1;
|
|
||||||
state->state_data[0] = surrogate_high;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return reinterpret_cast<char *>(cursor);
|
return reinterpret_cast<char *>(cursor);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -581,8 +577,9 @@ QString QUtf8::convertToUnicode(const char *chars, qsizetype len, QStringConvert
|
|||||||
QChar *QUtf8::convertToUnicode(QChar *out, const char *chars, qsizetype len, QStringConverter::State *state)
|
QChar *QUtf8::convertToUnicode(QChar *out, const char *chars, qsizetype len, QStringConverter::State *state)
|
||||||
{
|
{
|
||||||
Q_ASSERT(state);
|
Q_ASSERT(state);
|
||||||
|
if (!len)
|
||||||
|
return out;
|
||||||
|
|
||||||
bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
|
|
||||||
|
|
||||||
ushort replacement = QChar::ReplacementCharacter;
|
ushort replacement = QChar::ReplacementCharacter;
|
||||||
if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
|
if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
|
||||||
@ -595,62 +592,60 @@ QChar *QUtf8::convertToUnicode(QChar *out, const char *chars, qsizetype len, QSt
|
|||||||
const uchar *src = reinterpret_cast<const uchar *>(chars);
|
const uchar *src = reinterpret_cast<const uchar *>(chars);
|
||||||
const uchar *end = src + len;
|
const uchar *end = src + len;
|
||||||
|
|
||||||
if (state->remainingChars) {
|
if (!(state->flags & QStringConverter::Flag::Stateless)) {
|
||||||
// handle incoming state first
|
bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
|
||||||
uchar remainingCharsData[4]; // longest UTF-8 sequence possible
|
if (state->remainingChars || !headerdone) {
|
||||||
qsizetype remainingCharsCount = state->remainingChars;
|
// handle incoming state first
|
||||||
qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
|
uchar remainingCharsData[4]; // longest UTF-8 sequence possible
|
||||||
|
qsizetype remainingCharsCount = state->remainingChars;
|
||||||
|
qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
|
||||||
|
|
||||||
memset(remainingCharsData, 0, sizeof(remainingCharsData));
|
memset(remainingCharsData, 0, sizeof(remainingCharsData));
|
||||||
memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
|
memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
|
||||||
memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
|
memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
|
||||||
|
|
||||||
const uchar *begin = &remainingCharsData[1];
|
const uchar *begin = &remainingCharsData[1];
|
||||||
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
|
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
|
||||||
static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
|
static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
|
||||||
if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) {
|
if (res == QUtf8BaseTraits::Error) {
|
||||||
// special case for len == 0:
|
++state->invalidChars;
|
||||||
// if we were supplied an empty string, terminate the previous, unfinished sequence with error
|
*dst++ = replacement;
|
||||||
++state->invalidChars;
|
++src;
|
||||||
*dst++ = replacement;
|
} else if (res == QUtf8BaseTraits::EndOfString) {
|
||||||
} else if (res == QUtf8BaseTraits::EndOfString) {
|
// if we got EndOfString again, then there were too few bytes in src;
|
||||||
// if we got EndOfString again, then there were too few bytes in src;
|
// copy to our state and return
|
||||||
// copy to our state and return
|
state->remainingChars = remainingCharsCount + newCharsToCopy;
|
||||||
state->remainingChars = remainingCharsCount + newCharsToCopy;
|
memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
|
||||||
memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
|
return out;
|
||||||
return out;
|
} else if (!headerdone) {
|
||||||
} else if (!headerdone && res >= 0) {
|
// eat the UTF-8 BOM
|
||||||
// eat the UTF-8 BOM
|
if (dst[-1] == 0xfeff)
|
||||||
headerdone = true;
|
--dst;
|
||||||
if (dst[-1] == 0xfeff)
|
}
|
||||||
--dst;
|
state->internalState |= HeaderDone;
|
||||||
}
|
|
||||||
|
// adjust src now that we have maybe consumed a few chars
|
||||||
// adjust src now that we have maybe consumed a few chars
|
if (res >= 0) {
|
||||||
if (res >= 0) {
|
Q_ASSERT(res > remainingCharsCount);
|
||||||
Q_ASSERT(res > remainingCharsCount);
|
src += res - remainingCharsCount;
|
||||||
src += res - remainingCharsCount;
|
}
|
||||||
}
|
}
|
||||||
|
} else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
|
||||||
|
// stateless, remove initial BOM
|
||||||
|
if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])
|
||||||
|
// skip BOM
|
||||||
|
src += 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
// main body, stateless decoding
|
// main body, stateless decoding
|
||||||
res = 0;
|
res = 0;
|
||||||
const uchar *nextAscii = src;
|
const uchar *nextAscii = src;
|
||||||
const uchar *start = src;
|
|
||||||
while (res >= 0 && src < end) {
|
while (res >= 0 && src < end) {
|
||||||
if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
|
if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
|
||||||
break;
|
break;
|
||||||
|
|
||||||
ch = *src++;
|
ch = *src++;
|
||||||
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
|
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
|
||||||
if (!headerdone && res >= 0) {
|
|
||||||
headerdone = true;
|
|
||||||
if (src == start + 3) { // 3 == sizeof(utf8-bom)
|
|
||||||
// eat the UTF-8 BOM (it can only appear at the beginning of the string).
|
|
||||||
if (dst[-1] == 0xfeff)
|
|
||||||
--dst;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (res == QUtf8BaseTraits::Error) {
|
if (res == QUtf8BaseTraits::Error) {
|
||||||
res = 0;
|
res = 0;
|
||||||
++state->invalidChars;
|
++state->invalidChars;
|
||||||
@ -677,9 +672,6 @@ QChar *QUtf8::convertToUnicode(QChar *out, const char *chars, qsizetype len, QSt
|
|||||||
state->remainingChars = 0;
|
state->remainingChars = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (headerdone)
|
|
||||||
state->internalState |= HeaderDone;
|
|
||||||
|
|
||||||
return reinterpret_cast<QChar *>(dst);
|
return reinterpret_cast<QChar *>(dst);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1641,14 +1641,6 @@ void tst_QTextCodec::utf8stateful_data()
|
|||||||
QTest::newRow("2of4+invalid") << QByteArray("\360\220") << QByteArray("a") << QString();
|
QTest::newRow("2of4+invalid") << QByteArray("\360\220") << QByteArray("a") << QString();
|
||||||
QTest::newRow("3of4+invalid") << QByteArray("\360\220\210") << QByteArray("a") << QString();
|
QTest::newRow("3of4+invalid") << QByteArray("\360\220\210") << QByteArray("a") << QString();
|
||||||
|
|
||||||
// invalid: sequence too short (the empty second buffer causes a state reset)
|
|
||||||
QTest::newRow("1of2+empty") << QByteArray("\xc2") << QByteArray() << QString();
|
|
||||||
QTest::newRow("1of3+empty") << QByteArray("\xe0") << QByteArray() << QString();
|
|
||||||
QTest::newRow("2of3+empty") << QByteArray("\xe0\xa0") << QByteArray() << QString();
|
|
||||||
QTest::newRow("1of4+empty") << QByteArray("\360") << QByteArray() << QString();
|
|
||||||
QTest::newRow("2of4+empty") << QByteArray("\360\220") << QByteArray() << QString();
|
|
||||||
QTest::newRow("3of4+empty") << QByteArray("\360\220\210") << QByteArray() << QString();
|
|
||||||
|
|
||||||
// overlong sequence:
|
// overlong sequence:
|
||||||
QTest::newRow("overlong-1of2") << QByteArray("\xc1") << QByteArray("\x81") << QString();
|
QTest::newRow("overlong-1of2") << QByteArray("\xc1") << QByteArray("\x81") << QString();
|
||||||
QTest::newRow("overlong-1of3") << QByteArray("\xe0") << QByteArray("\x81\x81") << QString();
|
QTest::newRow("overlong-1of3") << QByteArray("\xe0") << QByteArray("\x81\x81") << QString();
|
||||||
|
@ -1335,14 +1335,6 @@ void tst_QStringConverter::utf8stateful_data()
|
|||||||
QTest::newRow("2of4+invalid") << QByteArray("\360\220") << QByteArray("a") << QString();
|
QTest::newRow("2of4+invalid") << QByteArray("\360\220") << QByteArray("a") << QString();
|
||||||
QTest::newRow("3of4+invalid") << QByteArray("\360\220\210") << QByteArray("a") << QString();
|
QTest::newRow("3of4+invalid") << QByteArray("\360\220\210") << QByteArray("a") << QString();
|
||||||
|
|
||||||
// invalid: sequence too short (the empty second buffer causes a state reset)
|
|
||||||
QTest::newRow("1of2+empty") << QByteArray("\xc2") << QByteArray() << QString();
|
|
||||||
QTest::newRow("1of3+empty") << QByteArray("\xe0") << QByteArray() << QString();
|
|
||||||
QTest::newRow("2of3+empty") << QByteArray("\xe0\xa0") << QByteArray() << QString();
|
|
||||||
QTest::newRow("1of4+empty") << QByteArray("\360") << QByteArray() << QString();
|
|
||||||
QTest::newRow("2of4+empty") << QByteArray("\360\220") << QByteArray() << QString();
|
|
||||||
QTest::newRow("3of4+empty") << QByteArray("\360\220\210") << QByteArray() << QString();
|
|
||||||
|
|
||||||
// overlong sequence:
|
// overlong sequence:
|
||||||
QTest::newRow("overlong-1of2") << QByteArray("\xc1") << QByteArray("\x81") << QString();
|
QTest::newRow("overlong-1of2") << QByteArray("\xc1") << QByteArray("\x81") << QString();
|
||||||
QTest::newRow("overlong-1of3") << QByteArray("\xe0") << QByteArray("\x81\x81") << QString();
|
QTest::newRow("overlong-1of3") << QByteArray("\xe0") << QByteArray("\x81\x81") << QString();
|
||||||
|
Loading…
Reference in New Issue
Block a user