Cleanup state handling in QUtf8::convertFromUnicode

And optimize the method so we can avoid a copy of
the data.

Change-Id: Ic267150db80358dbc4010bb1db2af5c0eb97dc65
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Lars Knoll 2020-04-22 22:40:56 +02:00
parent 618620bc5d
commit 5dcfd0ac2f
2 changed files with 108 additions and 109 deletions

View File

@ -402,38 +402,45 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, qsizetype len)
return result;
}
QByteArray QUtf8::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverter::State *state)
QByteArray QUtf8::convertFromUnicode(const QChar *uc, qsizetype len, QStringConverterBase::State *state)
{
QByteArray ba(3*len +3, Qt::Uninitialized);
char *end = convertFromUnicode(ba.data(), QStringView(uc, len), state);
ba.truncate(end - ba.data());
return ba;
}
char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state)
{
Q_ASSERT(state);
if (state->flags & QStringConverter::Flag::Stateless) // temporary
state = nullptr;
const QChar *uc = in.data();
qsizetype len = in.length();
uchar replacement = '?';
qsizetype rlen = 3*len;
int surrogate_high = -1;
bool writeBom = false;
if (state) {
if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
replacement = 0;
if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
rlen += 3;
writeBom = true;
auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
*cursor++ = 0;
} else {
// QChar::replacement encoded in utf8
*cursor++ = 0xef;
*cursor++ = 0xbf;
*cursor++ = 0xbd;
}
if (state->remainingChars)
surrogate_high = state->state_data[0];
}
return cursor;
};
QByteArray rstr(rlen, Qt::Uninitialized);
uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData()));
uchar *cursor = reinterpret_cast<uchar *>(out);
const ushort *src = reinterpret_cast<const ushort *>(uc);
const ushort *const end = src + len;
int invalid = 0;
if (writeBom) {
int surrogate_high = -1;
if (state->remainingChars) {
surrogate_high = state->state_data[0];
} else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
// append UTF-8 BOM
*cursor++ = utf8bom[0];
*cursor++ = utf8bom[1];
*cursor++ = utf8bom[2];
state->internalState |= HeaderDone;
}
const ushort *nextAscii = src;
@ -456,25 +463,26 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, qsizetype len, QStringConv
if (res == QUtf8BaseTraits::Error) {
// encoding error
++invalid;
*cursor++ = replacement;
++state->invalidChars;
cursor = appendReplacementChar(cursor);
} else if (res == QUtf8BaseTraits::EndOfString) {
surrogate_high = uc;
break;
}
}
rstr.resize(cursor - (const uchar*)rstr.constData());
if (state) {
state->invalidChars += invalid;
state->internalState |= HeaderDone;
state->remainingChars = 0;
if (surrogate_high >= 0) {
state->internalState |= HeaderDone;
state->remainingChars = 0;
if (surrogate_high >= 0) {
if (state->flags & QStringConverter::Flag::Stateless) {
++state->invalidChars;
cursor = appendReplacementChar(cursor);
} else {
state->remainingChars = 1;
state->state_data[0] = surrogate_high;
}
}
return rstr;
return reinterpret_cast<char *>(cursor);
}
QString QUtf8::convertToUnicode(const char *chars, qsizetype len)
@ -553,16 +561,6 @@ QChar *QUtf8::convertToUnicode(QChar *buffer, const char *chars, qsizetype len)
QString QUtf8::convertToUnicode(const char *chars, qsizetype len, QStringConverter::State *state)
{
Q_ASSERT(state);
if (state->flags & QStringConverter::Flag::Stateless) // temporary
state = nullptr;
bool headerdone = state && state->internalState & HeaderDone;
ushort replacement = QChar::ReplacementCharacter;
int invalid = 0;
int res;
uchar ch = 0;
// See above for buffer requirements for stateless decoding. However, that
// fails if the state is not empty. The following situations can add to the
// requirements:
@ -574,52 +572,63 @@ QString QUtf8::convertToUnicode(const char *chars, qsizetype len, QStringConvert
// 2 of 3 bytes same +1 (same)
// 3 of 4 bytes same +1 (same)
QString result(len + 1, Qt::Uninitialized);
QChar *end = convertToUnicode(result.data(), chars, len, state);
result.truncate(end - result.constData());
return result;
}
ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
QChar *QUtf8::convertToUnicode(QChar *out, const char *chars, qsizetype len, QStringConverter::State *state)
{
Q_ASSERT(state);
bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::DontSkipInitialBom;
ushort replacement = QChar::ReplacementCharacter;
if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
replacement = QChar::Null;
int res;
uchar ch = 0;
ushort *dst = reinterpret_cast<ushort *>(out);
const uchar *src = reinterpret_cast<const uchar *>(chars);
const uchar *end = src + len;
if (state) {
if (state->flags & QStringConverter::Flag::DontSkipInitialBom)
if (state->remainingChars) {
// handle incoming state first
uchar remainingCharsData[4]; // longest UTF-8 sequence possible
qsizetype remainingCharsCount = state->remainingChars;
qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
memset(remainingCharsData, 0, sizeof(remainingCharsData));
memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
const uchar *begin = &remainingCharsData[1];
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) {
// special case for len == 0:
// if we were supplied an empty string, terminate the previous, unfinished sequence with error
++state->invalidChars;
*dst++ = replacement;
} else if (res == QUtf8BaseTraits::EndOfString) {
// if we got EndOfString again, then there were too few bytes in src;
// copy to our state and return
state->remainingChars = remainingCharsCount + newCharsToCopy;
memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
return out;
} else if (!headerdone && res >= 0) {
// eat the UTF-8 BOM
headerdone = true;
if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
replacement = QChar::Null;
if (state->remainingChars) {
// handle incoming state first
uchar remainingCharsData[4]; // longest UTF-8 sequence possible
qsizetype remainingCharsCount = state->remainingChars;
qsizetype newCharsToCopy = qMin<int>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
if (dst[-1] == 0xfeff)
--dst;
}
memset(remainingCharsData, 0, sizeof(remainingCharsData));
memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
const uchar *begin = &remainingCharsData[1];
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) {
// special case for len == 0:
// if we were supplied an empty string, terminate the previous, unfinished sequence with error
++invalid;
*dst++ = replacement;
} else if (res == QUtf8BaseTraits::EndOfString) {
// if we got EndOfString again, then there were too few bytes in src;
// copy to our state and return
state->remainingChars = remainingCharsCount + newCharsToCopy;
memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
return QString();
} else if (!headerdone && res >= 0) {
// eat the UTF-8 BOM
headerdone = true;
if (dst[-1] == 0xfeff)
--dst;
}
// adjust src now that we have maybe consumed a few chars
if (res >= 0) {
Q_ASSERT(res > remainingCharsCount);
src += res - remainingCharsCount;
}
// adjust src now that we have maybe consumed a few chars
if (res >= 0) {
Q_ASSERT(res > remainingCharsCount);
src += res - remainingCharsCount;
}
}
@ -643,32 +652,34 @@ QString QUtf8::convertToUnicode(const char *chars, qsizetype len, QStringConvert
}
if (res == QUtf8BaseTraits::Error) {
res = 0;
++invalid;
++state->invalidChars;
*dst++ = replacement;
}
}
if ((!state || state->flags & QStringConverter::Flag::Stateless) && res == QUtf8BaseTraits::EndOfString) {
if (res == QUtf8BaseTraits::EndOfString) {
// unterminated UTF sequence
*dst++ = QChar::ReplacementCharacter;
while (src++ < end)
if (state->flags & QStringConverter::Flag::Stateless) {
*dst++ = QChar::ReplacementCharacter;
}
result.truncate(dst - (const ushort *)result.unicode());
if (state) {
state->invalidChars += invalid;
if (headerdone)
state->internalState |= HeaderDone;
if (res == QUtf8BaseTraits::EndOfString) {
++state->invalidChars;
while (src++ < end) {
*dst++ = QChar::ReplacementCharacter;
++state->invalidChars;
}
state->remainingChars = 0;
} else {
--src; // unread the byte in ch
state->remainingChars = end - src;
memcpy(&state->state_data[0], src, end - src);
} else {
state->remainingChars = 0;
}
} else {
state->remainingChars = 0;
}
return result;
if (headerdone)
state->internalState |= HeaderDone;
return reinterpret_cast<QChar *>(dst);
}
struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
@ -1240,20 +1251,6 @@ void QStringConverter::State::clear()
internalState = 0;
}
static QChar *fromUtf8(QChar *out, const char *in, qsizetype length, QStringConverter::State *state)
{
QString s = QUtf8::convertToUnicode(in, length, state);
memcpy(out, s.constData(), s.length()*sizeof(QChar));
return out + s.length();
}
static char *toUtf8(char *out, QStringView in, QStringConverter::State *state)
{
QByteArray s = QUtf8::convertFromUnicode(in.data(), in.length(), state);
memcpy(out, s.constData(), s.length());
return out + s.length();
}
static QChar *fromUtf16(QChar *out, const char *in, qsizetype length, QStringConverter::State *state)
{
QString s = QUtf16::convertToUnicode(in, length, state);
@ -1401,7 +1398,7 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
{
{ fromUtf8, fromUtf8Len, toUtf8, toUtf8Len },
{ QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len },
{ fromUtf16, fromUtf16Len, toUtf16, toUtf16Len },
{ fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len },
{ fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len },

View File

@ -289,8 +289,10 @@ struct QUtf8
static QChar *convertToUnicode(QChar *, const char *, qsizetype) noexcept;
static QString convertToUnicode(const char *, qsizetype);
static QString convertToUnicode(const char *, qsizetype, QStringConverter::State *);
static QChar *convertToUnicode(QChar *out, const char *in, qsizetype length, QStringConverter::State *state);
static QByteArray convertFromUnicode(const QChar *, qsizetype);
static QByteArray convertFromUnicode(const QChar *, qsizetype, QStringConverter::State *);
static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state);
struct ValidUtf8Result {
bool isValidUtf8;
bool isValidAscii;