Correctly detect HTML 5 charset attribute in QTextCodec::codecForHtml()
QTextCodec::codecForHtml currently fails to detect the charset for this HTML: <!DOCTYPE html> <html> <head> <meta charset="utf-8"> <meta http-equiv="X-UA-Compatible" content="IE=9,chrome=1"> <title>Test</title> </head> This patch makes the detection of charsets more flexible, allowing for the use of the HTML 5 charset attribute as well as more terminator characters ("'" and ">"). I also added a *_data function for the unit tests. Task-number: QTBUG-5451 Change-Id: I69fe4a04582f0d845cbbe9140a86a950fb7dc861 Reviewed-by: Olivier Goffart <ogoffart@woboq.com> Reviewed-by: Denis Dzyubenko <denis@ddenis.info>
This commit is contained in:
parent
7abc1a6a82
commit
86115848b5
@ -1043,28 +1043,30 @@ QString QTextDecoder::toUnicode(const QByteArray &ba)
|
||||
// Picks the codec for the HTML document held in ba.
//
// A codec recognized by codecForUtfText() takes precedence. Otherwise the
// first 512 bytes are lower-cased and searched for the first "charset="
// that follows the first "meta ", which covers both
//   <meta http-equiv="content-type" content="text/html; charset=X">
// and the HTML 5 form
//   <meta charset="X">
//
// Returns defaultCodec when no terminated charset value is found or when
// codecForName() does not know the extracted name.
QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
{
    // A codec detected from the data itself must not be replaced by the
    // default; return it right away.
    if (QTextCodec *detected = QTextCodec::codecForUtfText(ba, 0))
        return detected;

    const QByteArray header = ba.left(512).toLower();

    int pos = header.indexOf("meta ");
    if (pos == -1)
        return defaultCodec;

    pos = header.indexOf("charset=", pos);
    if (pos == -1)
        return defaultCodec;

    pos += int(qstrlen("charset="));

    // The value is terminated by '"', '\'' or '>'. The scan starts one past
    // pos so that an opening quote sitting at pos is not taken for the
    // terminator; that quote, and any '/' or blanks before the terminator,
    // stay in the name handed to codecForName().
    // NOTE(review): presumably codecForName() ignores such punctuation when
    // matching names - confirm.
    for (int end = pos + 1; end < header.size(); ++end) {
        const char ch = header.at(end);
        if (ch == '\"' || ch == '\'' || ch == '>') {
            QTextCodec *named = QTextCodec::codecForName(header.mid(pos, end - pos));
            return named ? named : defaultCodec;
        }
    }

    // The charset value was never terminated within the inspected header.
    return defaultCodec;
}
|
||||
|
||||
/*!
|
||||
|
@ -84,6 +84,7 @@ private slots:
|
||||
void utfHeaders_data();
|
||||
void utfHeaders();
|
||||
|
||||
void codecForHtml_data();
|
||||
void codecForHtml();
|
||||
|
||||
void codecForUtfText_data();
|
||||
@ -1853,23 +1854,81 @@ void tst_QTextCodec::utfHeaders()
|
||||
}
|
||||
}
|
||||
|
||||
void tst_QTextCodec::codecForHtml()
|
||||
void tst_QTextCodec::codecForHtml_data()
|
||||
{
|
||||
QByteArray html("<html><head></head><body>blah</body></html>");
|
||||
QTest::addColumn<QByteArray>("html");
|
||||
QTest::addColumn<int>("defaultCodecMib");
|
||||
QTest::addColumn<int>("expectedMibEnum");
|
||||
|
||||
QCOMPARE(QTextCodec::codecForHtml(html)->mibEnum(), 4); // latin 1
|
||||
int noDefault = -1;
|
||||
int fallback = 4; // latin 1
|
||||
QByteArray html = "<html><head></head><body>blah</body></html>";
|
||||
QTest::newRow("no charset, latin 1") << html << noDefault << fallback;
|
||||
|
||||
QCOMPARE(QTextCodec::codecForHtml(html, QTextCodec::codecForMib(106))->mibEnum(), 106); // UTF-8
|
||||
QTest::newRow("no charset, default UTF-8") << html << 106 << 106;
|
||||
|
||||
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-15\" /></head></html>";
|
||||
QCOMPARE(QTextCodec::codecForHtml(html, QTextCodec::codecForMib(106))->mibEnum(), 111); // latin 15
|
||||
QTest::newRow("latin 15, default UTF-8") << html << 106 << 111;
|
||||
|
||||
html = "<html><head><meta content=\"text/html; charset=ISO-8859-15\" http-equiv=\"content-type\" /></head></html>";
|
||||
QCOMPARE(QTextCodec::codecForHtml(html, QTextCodec::codecForMib(106))->mibEnum(), 111); // latin 15
|
||||
QTest::newRow("latin 15, default UTF-8 (#2)") << html << 106 << 111;
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta charset=\"utf-8\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>";
|
||||
QTest::newRow("UTF-8, no default") << html << noDefault << 106;
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta charset=\"ISO_8859-1:1987\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>";
|
||||
QTest::newRow("latin 1, no default") << html << noDefault << 4;
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8\"><title>Test</title></head>";
|
||||
QTest::newRow("UTF-8, no default (#2)") << html << noDefault << 106;
|
||||
|
||||
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8/></head></html>";
|
||||
QTest::newRow("UTF-8, no quotes") << html << noDefault << 106;
|
||||
|
||||
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset='UTF-8'/></head></html>";
|
||||
QTest::newRow("UTF-8, single quotes") << html << noDefault << 106;
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta charset=utf-8><title>Test</title></head>";
|
||||
QTest::newRow("UTF-8, > terminator") << html << noDefault << 106;
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta charset= utf-8 ><title>Test</title></head>";
|
||||
QTest::newRow("UTF-8, > terminator with spaces") << html << noDefault << 106;
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta charset= utf/8 ><title>Test</title></head>";
|
||||
QTest::newRow("UTF-8, > teminator with early backslash)") << html << noDefault << 106;
|
||||
|
||||
// Test invalid charsets.
|
||||
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=invalid-foo\" /></head></html>";
|
||||
QCOMPARE(QTextCodec::codecForHtml(html, QTextCodec::codecForMib(106))->mibEnum(), 106); // UTF-8
|
||||
QCOMPARE(QTextCodec::codecForHtml(html)->mibEnum(), 4); // latin 1
|
||||
QTest::newRow("invalid charset, no default") << html << noDefault << fallback;
|
||||
QTest::newRow("invalid charset, default UTF-8") << html << 106 << 106;
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"";
|
||||
html.prepend(QByteArray().fill(' ', 512 - html.size()));
|
||||
QTest::newRow("invalid charset (large header)") << html << noDefault << fallback;
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8";
|
||||
QTest::newRow("invalid charset (no closing double quote)") << html << noDefault << fallback;
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset='utf-8";
|
||||
QTest::newRow("invalid charset (no closing single quote)") << html << noDefault << fallback;
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta charset=utf-8 foo=bar><title>Test</title></head>";
|
||||
QTest::newRow("invalid (space terminator)") << html << noDefault << fallback;
|
||||
|
||||
html = "<!DOCTYPE html><html><head><meta charset=\" utf' 8 /><title>Test</title></head>";
|
||||
QTest::newRow("invalid charset, early terminator (')") << html << noDefault << fallback;
|
||||
}
|
||||
|
||||
void tst_QTextCodec::codecForHtml()
|
||||
{
|
||||
QFETCH(QByteArray, html);
|
||||
QFETCH(int, defaultCodecMib);
|
||||
QFETCH(int, expectedMibEnum);
|
||||
|
||||
if (defaultCodecMib != -1)
|
||||
QCOMPARE(QTextCodec::codecForHtml(html, QTextCodec::codecForMib(defaultCodecMib))->mibEnum(), expectedMibEnum);
|
||||
else // Test one parameter version when there is no default codec.
|
||||
QCOMPARE(QTextCodec::codecForHtml(html)->mibEnum(), expectedMibEnum);
|
||||
}
|
||||
|
||||
void tst_QTextCodec::codecForUtfText_data()
|
||||
|
Loading…
Reference in New Issue
Block a user