Always write XML documents as UTF-8

Remove support for setting a codec different from UTF-8
for writing XML files.

All XML readers today can handle UTF-8, and there is no
longer any reason to write a file in a different encoding.

Change-Id: If89fb2d2474a2b55644d9bed7473c11ad91033eb
Reviewed-by: Simon Hausmann <hausmann@gmail.com>
This commit is contained in:
Lars Knoll 2020-04-06 15:17:04 +02:00
parent 99632c2217
commit 2c7f9565ed
3 changed files with 20 additions and 235 deletions

View File

@ -48,6 +48,7 @@
#if QT_CONFIG(textcodec)
#include <qtextcodec.h>
#endif
#include <qstringconverter.h>
#include <qstack.h>
#include <qbuffer.h>
#include <qscopeguard.h>
@ -3009,8 +3010,7 @@ QStringRef QXmlStreamReader::documentEncoding() const
writeProcessingInstruction(), and writeDTD(). Chaining of XML
streams is supported with writeCurrentToken().
By default, QXmlStreamWriter encodes XML in UTF-8. Different
encodings can be enforced using setCodec().
QXmlStreamWriter always encodes XML in UTF-8.
If an error occurs while writing to the underlying device, hasError()
starts returning true and subsequent writes are ignored.
@ -3031,9 +3031,6 @@ public:
// Releases what the private object holds: the device only when deleteDevice
// is set, and (when built with textcodec support) the encoder unconditionally.
~QXmlStreamWriterPrivate() {
if (deleteDevice)
delete device;
#if QT_CONFIG(textcodec)
delete encoder;
#endif
}
void write(const QStringRef &);
@ -3053,16 +3050,10 @@ public:
uint hasIoError :1;
uint hasEncodingError :1;
uint autoFormatting :1;
uint isCodecASCIICompatible :1;
QByteArray autoFormattingIndent;
NamespaceDeclaration emptyNamespace;
qsizetype lastNamespaceDeclaration;
#if QT_CONFIG(textcodec)
QTextCodec *codec;
QTextEncoder *encoder;
#endif
void checkIfASCIICompatibleCodec();
QStringEncoder toUtf8;
NamespaceDeclaration &findNamespace(const QString &namespaceUri, bool writeDeclaration = false, bool noDefault = false);
void writeNamespaceDeclaration(const NamespaceDeclaration &namespaceDeclaration);
@ -3074,17 +3065,13 @@ public:
QXmlStreamWriterPrivate::QXmlStreamWriterPrivate(QXmlStreamWriter *q)
:autoFormattingIndent(4, ' ')
: autoFormattingIndent(4, ' '),
toUtf8(QStringEncoder::Utf8, QStringEncoder::Flag::Stateless)
{
q_ptr = q;
device = nullptr;
stringDevice = nullptr;
deleteDevice = false;
#if QT_CONFIG(textcodec)
codec = QTextCodec::codecForMib(106); // utf8
encoder = codec->makeEncoder(QTextCodec::IgnoreHeader); // no byte order mark for utf8
#endif
checkIfASCIICompatibleCodec();
inStartElement = inEmptyElement = false;
wroteSomething = false;
hasIoError = false;
@ -3095,37 +3082,16 @@ QXmlStreamWriterPrivate::QXmlStreamWriterPrivate(QXmlStreamWriter *q)
namespacePrefixCount = 0;
}
// Probes the current encoder and stores the result in isCodecASCIICompatible:
// the flag is true only if both 'a' and '<' are encoded as the single bytes
// 0x61 and 0x3C. Without textcodec support the flag is always set to true.
void QXmlStreamWriterPrivate::checkIfASCIICompatibleCodec()
{
#if QT_CONFIG(textcodec)
Q_ASSERT(encoder);
// test ASCII-compatibility using the letter 'a'
QChar letterA = QLatin1Char('a');
const QByteArray bytesA = encoder->fromUnicode(&letterA, 1);
const bool isCodecASCIICompatibleA = (bytesA.count() == 1) && (bytesA[0] == 0x61) ;
// repeat the same single-byte check for the markup character '<'
QChar letterLess = QLatin1Char('<');
const QByteArray bytesLess = encoder->fromUnicode(&letterLess, 1);
const bool isCodecASCIICompatibleLess = (bytesLess.count() == 1) && (bytesLess[0] == 0x3C) ;
// both probes must pass for the codec to count as ASCII compatible
isCodecASCIICompatible = isCodecASCIICompatibleA && isCodecASCIICompatibleLess ;
#else
isCodecASCIICompatible = true;
#endif
}
void QXmlStreamWriterPrivate::write(const QStringRef &s)
{
if (device) {
if (hasIoError)
return;
#if !QT_CONFIG(textcodec)
QByteArray bytes = s.toLatin1();
#else
QByteArray bytes = encoder->fromUnicode(s.constData(), s.size());
if (encoder->hasFailure()) {
QByteArray bytes = toUtf8(s);
if (toUtf8.hasError()) {
hasEncodingError = true;
return;
}
#endif
if (device->write(bytes) != bytes.size())
hasIoError = true;
}
@ -3140,15 +3106,11 @@ void QXmlStreamWriterPrivate::write(const QString &s)
if (device) {
if (hasIoError)
return;
#if !QT_CONFIG(textcodec)
QByteArray bytes = s.toLatin1();
#else
QByteArray bytes = encoder->fromUnicode(s);
if (encoder->hasFailure()) {
QByteArray bytes = toUtf8(s);
if (toUtf8.hasError()) {
hasEncodingError = true;
return;
}
#endif
if (device->write(bytes) != bytes.size())
hasIoError = true;
}
@ -3210,20 +3172,18 @@ void QXmlStreamWriterPrivate::writeEscaped(const QString &s, bool escapeWhitespa
write(escaped);
}
// Converts from ASCII to output encoding
// Writes utf8
void QXmlStreamWriterPrivate::write(const char *s, int len)
{
if (device) {
if (hasIoError)
return;
if (isCodecASCIICompatible) {
if (device->write(s, len) != len)
hasIoError = true;
return;
}
if (device->write(s, len) != len)
hasIoError = true;
return;
}
write(QString::fromLatin1(s, len));
write(QString::fromUtf8(s, len));
}
void QXmlStreamWriterPrivate::writeNamespaceDeclaration(const NamespaceDeclaration &namespaceDeclaration) {
@ -3338,8 +3298,6 @@ QXmlStreamWriter::QXmlStreamWriter(QByteArray *array)
/*! Constructs a stream writer that writes into \a string.
*
* Note that when writing to QString, QXmlStreamWriter ignores the codec set
* with setCodec(). See that function for more information.
*/
QXmlStreamWriter::QXmlStreamWriter(QString *string)
: d_ptr(new QXmlStreamWriterPrivate(this))
@ -3387,67 +3345,6 @@ QIODevice *QXmlStreamWriter::device() const
return d->device;
}
#if QT_CONFIG(textcodec)
/*!
Sets the codec for this stream to \a codec. The codec is used for
encoding any data that is written. By default, QXmlStreamWriter
uses UTF-8.
The encoding information is stored in the initial xml tag which
gets written when you call writeStartDocument(). Call this
function before calling writeStartDocument().
\note When writing the XML to a QString, the codec information is ignored
and the XML header will not include any encoding information, since all
QStrings are UTF-16. If you later convert the QString to an 8-bit format,
you must arrange for the encoding information to be transmitted
out-of-band.
\sa codec()
*/
void QXmlStreamWriter::setCodec(QTextCodec *codec)
{
Q_D(QXmlStreamWriter);
// A null codec is ignored: the current codec and encoder are left untouched.
if (codec) {
d->codec = codec;
// Drop the encoder that belonged to the previous codec before replacing it.
delete d->encoder;
d->encoder = codec->makeEncoder(QTextCodec::IgnoreHeader); // no byte order mark for utf8
// Recompute isCodecASCIICompatible for the newly created encoder.
d->checkIfASCIICompatibleCodec();
}
}
/*!
Sets the codec for this stream to the QTextCodec for the encoding
specified by \a codecName. Common values for \c codecName include
"ISO 8859-1", "UTF-8", and "UTF-16". If the encoding isn't
recognized, nothing happens.
\note When writing the XML to a QString, the codec information is ignored
and the XML header will not include any encoding information, since all
QStrings are UTF-16. If you later convert the QString to an 8-bit format,
you must arrange for the encoding information to be transmitted
out-of-band.
\sa QTextCodec::codecForName()
*/
void QXmlStreamWriter::setCodec(const char *codecName)
{
// Resolve the name to a codec and forward to the QTextCodec* overload,
// which ignores a null pointer.
setCodec(QTextCodec::codecForName(codecName));
}
/*!
Returns the codec that is currently assigned to the stream.
\sa setCodec()
*/
QTextCodec *QXmlStreamWriter::codec() const
{
Q_D(const QXmlStreamWriter);
// Returns the stored pointer as-is.
return d->codec;
}
#endif // textcodec
/*!
\property QXmlStreamWriter::autoFormatting
\since 4.4
@ -3886,10 +3783,9 @@ void QXmlStreamWriter::writeProcessingInstruction(const QString &target, const Q
/*!\overload
Writes a document start with XML version number "1.0". This also
writes the encoding information.
Writes a document start with XML version number "1.0".
\sa writeEndDocument(), setCodec()
\sa writeEndDocument()
\since 4.5
*/
void QXmlStreamWriter::writeStartDocument()
@ -3909,15 +3805,8 @@ void QXmlStreamWriter::writeStartDocument(const QString &version)
d->finishStartElement(false);
d->write("<?xml version=\"");
d->write(version);
if (d->device) { // stringDevice does not get any encoding
d->write("\" encoding=\"");
#if !QT_CONFIG(textcodec)
d->write("iso-8859-1");
#else
const QByteArray name = d->codec->name();
d->write(name.constData(), name.length());
#endif
}
if (d->device) // stringDevice does not get any encoding
d->write("\" encoding=\"UTF-8");
d->write("\"?>");
}
@ -3933,15 +3822,8 @@ void QXmlStreamWriter::writeStartDocument(const QString &version, bool standalon
d->finishStartElement(false);
d->write("<?xml version=\"");
d->write(version);
if (d->device) { // stringDevice does not get any encoding
d->write("\" encoding=\"");
#if !QT_CONFIG(textcodec)
d->write("iso-8859-1");
#else
const QByteArray name = d->codec->name();
d->write(name.constData(), name.length());
#endif
}
if (d->device) // stringDevice does not get any encoding
d->write("\" encoding=\"UTF-8");
if (standalone)
d->write("\" standalone=\"yes\"?>");
else

View File

@ -474,12 +474,6 @@ public:
void setDevice(QIODevice *device);
QIODevice *device() const;
#if QT_CONFIG(textcodec)
void setCodec(QTextCodec *codec);
void setCodec(const char *codecName);
QTextCodec *codec() const;
#endif
void setAutoFormatting(bool);
bool autoFormatting() const;

View File

@ -560,9 +560,7 @@ private slots:
void crashInUTF16Codec() const;
void hasAttributeSignature() const;
void hasAttribute() const;
void writeWithCodec() const;
void writeWithUtf8Codec() const;
void writeWithUtf16Codec() const;
void writeWithStandalone() const;
void entitiesAndWhitespace_1() const;
void entitiesAndWhitespace_2() const;
@ -573,7 +571,6 @@ private slots:
void checkCommentIndentation() const;
void checkCommentIndentation_data() const;
void crashInXmlStreamReader() const;
void write8bitCodec() const;
void invalidStringCharacters_data() const;
void invalidStringCharacters() const;
void hasError() const;
@ -1258,66 +1255,16 @@ void tst_QXmlStream::hasAttribute() const
QVERIFY(!reader.hasError());
}
// Writes a small document through an "ISO 8859-15" codec and checks that the
// output byte array contains both the 8-bit encoded text and the codec name.
void tst_QXmlStream::writeWithCodec() const
{
QByteArray outarray;
QXmlStreamWriter writer(&outarray);
writer.setAutoFormatting(true);
QTextCodec *codec = QTextCodec::codecForName("ISO 8859-15");
QVERIFY(codec);
writer.setCodec(codec);
// Despite the variable name, these bytes are decoded with the
// ISO 8859-15 codec obtained above.
const char *latin2 = "h\xe9 h\xe9";
const QString string = codec->toUnicode(latin2);
writer.writeStartDocument("1.0");
writer.writeTextElement("foo", string);
writer.writeEndElement();
writer.writeEndDocument();
// The text must come back out as the same raw 8-bit bytes it was decoded from.
QVERIFY(outarray.contains(latin2));
// The codec name must appear somewhere in the output.
QVERIFY(outarray.contains(codec->name()));
}
// Sets the codec with MIB 106 explicitly and checks that the output starts
// with an XML declaration naming UTF-8 as the encoding.
void tst_QXmlStream::writeWithUtf8Codec() const
{
QByteArray outarray;
QXmlStreamWriter writer(&outarray);
QTextCodec *codec = QTextCodec::codecForMib(106); // utf-8
QVERIFY(codec);
writer.setCodec(codec);
writer.writeStartDocument("1.0");
// Expected first bytes of the output, compared as-is.
static const char begin[] = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
QVERIFY(outarray.startsWith(begin));
}
// Sets the codec with MIB 1014 and checks that the output starts with the
// XML declaration written as two bytes per character.
void tst_QXmlStream::writeWithUtf16Codec() const
{
QByteArray outarray;
QXmlStreamWriter writer(&outarray);
QTextCodec *codec = QTextCodec::codecForMib(1014); // utf-16LE
QVERIFY(codec);
writer.setCodec(codec);
writer.writeStartDocument("1.0");
static const char begin[] = "<?xml version=\"1.0\" encoding=\"UTF-16"; // skip potential "LE" suffix
const int count = sizeof(begin) - 1; // don't include 0 terminator
// Build the expected prefix by following every byte of the literal with a
// zero byte, i.e. low byte first for each 16-bit unit.
QByteArray begin_UTF16;
begin_UTF16.reserve(2*(count));
for (int i = 0; i < count; ++i) {
begin_UTF16.append(begin[i]);
begin_UTF16.append((char)'\0');
}
QVERIFY(outarray.startsWith(begin_UTF16));
}
void tst_QXmlStream::writeWithStandalone() const
{
{
@ -1413,7 +1360,6 @@ void tst_QXmlStream::garbageInXMLPrologUTF8Explicitly() const
QVERIFY(out.open(QIODevice::ReadWrite));
QXmlStreamWriter writer (&out);
writer.setCodec("UTF-8");
writer.writeStartDocument();
writer.writeEmptyElement("Foo");
writer.writeEndDocument();
@ -1602,43 +1548,6 @@ void tst_QXmlStream::hasError() const
}
// Writes a minimal document through the "IBM500" codec (skipped when that
// codec is unavailable) and checks the output twice: the first two raw bytes
// against fixed values, and the decoded text against the expected XML
// declaration.
void tst_QXmlStream::write8bitCodec() const
{
QBuffer outBuffer;
QVERIFY(outBuffer.open(QIODevice::WriteOnly));
QXmlStreamWriter writer(&outBuffer);
writer.setAutoFormatting(false);
QTextCodec *codec = QTextCodec::codecForName("IBM500");
if (!codec) {
QSKIP("Encoding IBM500 not available.");
}
writer.setCodec(codec);
writer.writeStartDocument();
writer.writeStartElement("root");
writer.writeAttribute("attrib", "1");
writer.writeEndElement();
writer.writeEndDocument();
outBuffer.close();
// test 8 bit encoding
QByteArray values = outBuffer.data();
QVERIFY(values.size() > 1);
// check '<'
// (expected as byte 0x4c here rather than ASCII 0x3C)
QCOMPARE(values[0] & 0x00FF, 0x4c);
// check '?'
QCOMPARE(values[1] & 0x00FF, 0x6F);
// convert the start of the XML
const QString expected = ("<?xml version=\"1.0\" encoding=\"IBM500\"?>");
// Decode the raw output with the same codec and compare it as text.
QTextDecoder *decoder = codec->makeDecoder();
QVERIFY(decoder);
QString decodedText = decoder->toUnicode(values);
delete decoder;
QVERIFY(decodedText.startsWith(expected));
}
void tst_QXmlStream::invalidStringCharacters() const
{
// test scan in attributes