Remove the tolerant parsing function and make the recoder tolerant
The reason for this change is that there was little point in having a strict parser: what would the recoder do if it were passed an invalid string? I believe that the tolerant recoder is more efficient than running the correcting code followed by the strict recoder. This makes the recoder itself more complex and probably a little less efficient than the strict one, but overall it's better both in the common case (tolerant input that doesn't need fixes) and in the worst case (input that needs fixes). Change-Id: I68a0c9fda6765de05914cbd6ba7d3cea560a7cd6 Reviewed-by: Lars Knoll <lars.knoll@nokia.com>
This commit is contained in:
parent
6028efa3ff
commit
73e16b15a6
@ -133,6 +133,18 @@ static inline ushort decodeNibble(ushort c)
|
|||||||
c >= 'A' ? c - 'A' + 0xA : c - '0';
|
c >= 'A' ? c - 'A' + 0xA : c - '0';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// if the sequence at input is 2*HEXDIG, returns its decoding
|
||||||
|
// returns ushort(-1) if it isn't.
|
||||||
|
// assumes that the range has been checked already
|
||||||
|
static inline ushort decodePercentEncoding(const ushort *input)
|
||||||
|
{
|
||||||
|
ushort c1 = input[0];
|
||||||
|
ushort c2 = input[1];
|
||||||
|
if (!isHex(c1) || !isHex(c2))
|
||||||
|
return ushort(-1);
|
||||||
|
return decodeNibble(c1) << 4 | decodeNibble(c2);
|
||||||
|
}
|
||||||
|
|
||||||
static inline ushort encodeNibble(ushort c)
|
static inline ushort encodeNibble(ushort c)
|
||||||
{
|
{
|
||||||
static const uchar hexnumbers[] = "0123456789ABCDEF";
|
static const uchar hexnumbers[] = "0123456789ABCDEF";
|
||||||
@ -170,16 +182,15 @@ static inline bool isUnicodeNonCharacter(uint ucs4)
|
|||||||
// returns true if we performed an UTF-8 decoding
|
// returns true if we performed an UTF-8 decoding
|
||||||
static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&input, const ushort *end, ushort decoded)
|
static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&input, const ushort *end, ushort decoded)
|
||||||
{
|
{
|
||||||
|
int charsNeeded;
|
||||||
|
uint min_uc;
|
||||||
|
uint uc;
|
||||||
|
|
||||||
if (decoded <= 0xC1) {
|
if (decoded <= 0xC1) {
|
||||||
// an UTF-8 first character must be at least 0xC0
|
// an UTF-8 first character must be at least 0xC0
|
||||||
// however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
|
// however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
|
||||||
return false;
|
return false;
|
||||||
}
|
} else if (decoded < 0xe0) {
|
||||||
|
|
||||||
int charsNeeded;
|
|
||||||
uint min_uc;
|
|
||||||
uint uc;
|
|
||||||
if (decoded < 0xe0) {
|
|
||||||
charsNeeded = 1;
|
charsNeeded = 1;
|
||||||
min_uc = 0x80;
|
min_uc = 0x80;
|
||||||
uc = decoded & 0x1f;
|
uc = decoded & 0x1f;
|
||||||
@ -194,7 +205,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
|
|||||||
} else {
|
} else {
|
||||||
// the last Unicode character is U+10FFFF
|
// the last Unicode character is U+10FFFF
|
||||||
// it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
|
// it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
|
||||||
// therefore, a byte outside the range 0xC0..0xF4 is not the UTF-8 first byte
|
// therefore, a byte higher than 0xF4 is not the UTF-8 first byte
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -206,7 +217,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
// first continuation character
|
// first continuation character
|
||||||
decoded = (decodeNibble(input[3]) << 4) | decodeNibble(input[4]);
|
decoded = decodePercentEncoding(input + 3);
|
||||||
if ((decoded & 0xc0) != 0x80)
|
if ((decoded & 0xc0) != 0x80)
|
||||||
return false;
|
return false;
|
||||||
uc <<= 6;
|
uc <<= 6;
|
||||||
@ -217,7 +228,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
// second continuation character
|
// second continuation character
|
||||||
decoded = (decodeNibble(input[6]) << 4) | decodeNibble(input[7]);
|
decoded = decodePercentEncoding(input + 6);
|
||||||
if ((decoded & 0xc0) != 0x80)
|
if ((decoded & 0xc0) != 0x80)
|
||||||
return false;
|
return false;
|
||||||
uc <<= 6;
|
uc <<= 6;
|
||||||
@ -228,7 +239,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
// third continuation character
|
// third continuation character
|
||||||
decoded = (decodeNibble(input[9]) << 4) | decodeNibble(input[10]);
|
decoded = decodePercentEncoding(input + 9);
|
||||||
if ((decoded & 0xc0) != 0x80)
|
if ((decoded & 0xc0) != 0x80)
|
||||||
return false;
|
return false;
|
||||||
uc <<= 6;
|
uc <<= 6;
|
||||||
@ -348,72 +359,82 @@ static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort
|
|||||||
*output++ = encodeNibble(c & 0xf);
|
*output++ = encodeNibble(c & 0xf);
|
||||||
}
|
}
|
||||||
|
|
||||||
Q_AUTOTEST_EXPORT QString
|
static QString recode(const QString &component, QUrl::ComponentFormattingOptions encoding,
|
||||||
qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding,
|
const uchar *actionTable, bool retryBadEncoding)
|
||||||
const uchar *tableModifications)
|
|
||||||
{
|
{
|
||||||
uchar actionTable[sizeof defaultActionTable];
|
|
||||||
memcpy(actionTable, defaultActionTable, sizeof actionTable);
|
|
||||||
if (encoding & QUrl::DecodeSpaces)
|
|
||||||
actionTable[0] = DecodeCharacter; // decode
|
|
||||||
|
|
||||||
if (tableModifications) {
|
|
||||||
for (const ushort *p = tableModifications; *p; ++p)
|
|
||||||
actionTable[uchar(*p) - ' '] = *p >> 8;
|
|
||||||
}
|
|
||||||
|
|
||||||
QString result = component;
|
QString result = component;
|
||||||
const ushort *input = reinterpret_cast<const ushort *>(component.constData());
|
const ushort *input = reinterpret_cast<const ushort *>(component.constData());
|
||||||
const ushort * const end = input + component.length();
|
const ushort * const end = input + component.length();
|
||||||
ushort *output = 0;
|
ushort *output = 0;
|
||||||
|
|
||||||
while (input != end) {
|
while (input != end) {
|
||||||
register ushort c = *input++;
|
register ushort c;
|
||||||
register ushort decoded;
|
EncodingAction action;
|
||||||
if (c == '%') {
|
|
||||||
// our input is always valid, so there are two hex characters for us to read here
|
// try a run where no change is necessary
|
||||||
decoded = (decodeNibble(input[0]) << 4) | decodeNibble(input[1]);
|
while (input != end) {
|
||||||
|
c = *input++;
|
||||||
|
if (c < 0x20 || c >= 0x80) // also: (c - 0x20 < 0x60U)
|
||||||
|
goto non_trivial;
|
||||||
|
action = EncodingAction(actionTable[c - ' ']);
|
||||||
|
if (action == EncodeCharacter)
|
||||||
|
goto non_trivial;
|
||||||
|
if (output)
|
||||||
|
*output++ = c;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
non_trivial:
|
||||||
|
register uint decoded;
|
||||||
|
if (c == '%' && retryBadEncoding) {
|
||||||
|
// always write "%25"
|
||||||
|
ensureDetached(result, output, input, end);
|
||||||
|
*output++ = '%';
|
||||||
|
*output++ = '2';
|
||||||
|
*output++ = '5';
|
||||||
|
continue;
|
||||||
|
} else if (c == '%') {
|
||||||
|
// check if the input is valid
|
||||||
|
if (input + 1 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) {
|
||||||
|
// not valid, retry
|
||||||
|
result.clear();
|
||||||
|
return recode(component, encoding, actionTable, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (decoded >= 0x80) {
|
||||||
|
// decode the UTF-8 sequence
|
||||||
|
if (encoding & QUrl::DecodeUnicode &&
|
||||||
|
encodedUtf8ToUcs4(result, output, input, end, decoded))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// decoding the encoded UTF-8 failed
|
||||||
|
action = LeaveCharacter;
|
||||||
|
} else if (decoded >= 0x20) {
|
||||||
|
action = EncodingAction(actionTable[decoded - ' ']);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
decoded = c;
|
decoded = c;
|
||||||
}
|
if (decoded >= 0x80 && (encoding & QUrl::DecodeUnicode) == 0) {
|
||||||
|
// encode the UTF-8 sequence
|
||||||
EncodingAction action;
|
|
||||||
if (decoded < 0x20) {
|
|
||||||
// always encode control characters
|
|
||||||
action = EncodeCharacter;
|
|
||||||
} else if (decoded < 0x80) {
|
|
||||||
// use the table
|
|
||||||
action = EncodingAction(actionTable[decoded - ' ']);
|
|
||||||
} else {
|
|
||||||
// non-ASCII
|
|
||||||
bool decodeUnicode = encoding & QUrl::DecodeUnicode;
|
|
||||||
|
|
||||||
// should we leave it like this?
|
|
||||||
if ((c != '%' && decodeUnicode) || (c == '%' && !decodeUnicode)) {
|
|
||||||
action = LeaveCharacter;
|
|
||||||
} else if (decodeUnicode) {
|
|
||||||
// c == '%': decode the UTF-8 sequence
|
|
||||||
if (encodedUtf8ToUcs4(result, output, input, end, decoded))
|
|
||||||
continue;
|
|
||||||
action = LeaveCharacter;
|
|
||||||
} else {
|
|
||||||
// c != '%': encode the UTF-8 sequence
|
|
||||||
unicodeToEncodedUtf8(result, output, input, end, decoded);
|
unicodeToEncodedUtf8(result, output, input, end, decoded);
|
||||||
continue;
|
continue;
|
||||||
|
} else if (decoded >= 0x80) {
|
||||||
|
if (output)
|
||||||
|
*output++ = c;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (decoded < 0x20)
|
||||||
|
action = EncodeCharacter;
|
||||||
|
|
||||||
// there are six possibilities:
|
// there are six possibilities:
|
||||||
// current \ action | DecodeCharacter | LeaveCharacter | EncodeCharacter
|
// current \ action | DecodeCharacter | LeaveCharacter | EncodeCharacter
|
||||||
// decoded | 1:leave | 2:leave | 3:encode
|
// decoded | 1:leave | 2:leave | 3:encode
|
||||||
// encoded | 4:decode | 5:leave | 6:leave
|
// encoded | 4:decode | 5:leave | 6:leave
|
||||||
|
// cases 1 and 2 were handled before this section
|
||||||
|
|
||||||
if (c != '%' && (action == LeaveCharacter || action == DecodeCharacter)) {
|
if (c == '%' && action != DecodeCharacter) {
|
||||||
// cases 1 and 2: it's decoded and we're leaving it as is
|
|
||||||
// there's always enough memory allocated for a single character
|
|
||||||
if (output)
|
|
||||||
*output++ = c;
|
|
||||||
} else if (c == '%' && (action == LeaveCharacter || action == EncodeCharacter)) {
|
|
||||||
// cases 5 and 6: it's encoded and we're leaving it as it is
|
// cases 5 and 6: it's encoded and we're leaving it as it is
|
||||||
// except we're pedantic and we'll uppercase the hex
|
// except we're pedantic and we'll uppercase the hex
|
||||||
if (output || !isUpperHex(input[0]) || !isUpperHex(input[1])) {
|
if (output || !isUpperHex(input[0]) || !isUpperHex(input[1])) {
|
||||||
@ -442,63 +463,31 @@ qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding
|
|||||||
}
|
}
|
||||||
|
|
||||||
Q_AUTOTEST_EXPORT QString
|
Q_AUTOTEST_EXPORT QString
|
||||||
qt_tolerantParsePercentEncoding(const QString &url)
|
qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding,
|
||||||
|
const ushort *tableModifications)
|
||||||
{
|
{
|
||||||
// are there any '%'
|
uchar actionTable[sizeof defaultActionTable];
|
||||||
int firstPercent = url.indexOf(QLatin1Char('%'));
|
if (encoding & QUrl::DecodeAllDelimiters) {
|
||||||
if (firstPercent == -1) {
|
// reset the table
|
||||||
// none found, the string is fine
|
memset(actionTable, DecodeCharacter, sizeof actionTable);
|
||||||
return url;
|
if (!(encoding & QUrl::DecodeSpaces))
|
||||||
|
actionTable[0] = EncodeCharacter;
|
||||||
|
|
||||||
|
// these are always encoded
|
||||||
|
actionTable['%' - ' '] = EncodeCharacter;
|
||||||
|
actionTable[0x7F - ' '] = EncodeCharacter;
|
||||||
|
} else {
|
||||||
|
memcpy(actionTable, defaultActionTable, sizeof actionTable);
|
||||||
|
if (encoding & QUrl::DecodeSpaces)
|
||||||
|
actionTable[0] = DecodeCharacter; // decode
|
||||||
}
|
}
|
||||||
|
|
||||||
// are there any invalid percents?
|
if (tableModifications) {
|
||||||
int nextPercent = firstPercent;
|
for (const ushort *p = tableModifications; *p; ++p)
|
||||||
int percentCount = 0;
|
actionTable[uchar(*p) - ' '] = *p >> 8;
|
||||||
|
|
||||||
{
|
|
||||||
int len = url.length();
|
|
||||||
bool ok = true;
|
|
||||||
do {
|
|
||||||
++percentCount;
|
|
||||||
if (nextPercent + 2 >= len ||
|
|
||||||
!isHex(url.at(nextPercent + 1).unicode()) ||
|
|
||||||
!isHex(url.at(nextPercent + 2).unicode())) {
|
|
||||||
ok = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
nextPercent = url.indexOf(QLatin1Char('%'), nextPercent + 1);
|
|
||||||
} while (nextPercent != -1);
|
|
||||||
|
|
||||||
if (ok)
|
|
||||||
return url;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// we've found at least one invalid percent
|
return recode(component, encoding, actionTable, false);
|
||||||
// that means all of them are invalid
|
|
||||||
QString corrected(url.size() + percentCount * 2, Qt::Uninitialized);
|
|
||||||
ushort *output = reinterpret_cast<ushort *>(corrected.data());
|
|
||||||
const ushort *input = reinterpret_cast<const ushort *>(url.constData());
|
|
||||||
for (int i = 0; i <= firstPercent; ++i)
|
|
||||||
output[i] = input[i];
|
|
||||||
|
|
||||||
const ushort *const end = input + url.length();
|
|
||||||
output += firstPercent + 1;
|
|
||||||
input += firstPercent + 1;
|
|
||||||
|
|
||||||
// we've copied up to the first percent
|
|
||||||
// correct this one and all others
|
|
||||||
*output++ = '2';
|
|
||||||
*output++ = '5';
|
|
||||||
while (input != end) {
|
|
||||||
// copy verbatim until the next percent, inclusive
|
|
||||||
*output++ = *input;
|
|
||||||
if (*input == '%') {
|
|
||||||
*output++ = '2';
|
|
||||||
*output++ = '5';
|
|
||||||
}
|
|
||||||
++input;
|
|
||||||
}
|
|
||||||
return corrected;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
QT_END_NAMESPACE
|
QT_END_NAMESPACE
|
||||||
|
@ -50,7 +50,6 @@ Q_CORE_EXPORT extern void qt_nameprep(QString *source, int from);
|
|||||||
Q_CORE_EXPORT extern bool qt_check_std3rules(const QChar *, int);
|
Q_CORE_EXPORT extern bool qt_check_std3rules(const QChar *, int);
|
||||||
Q_CORE_EXPORT void qt_punycodeEncoder(const QChar *s, int ucLength, QString *output);
|
Q_CORE_EXPORT void qt_punycodeEncoder(const QChar *s, int ucLength, QString *output);
|
||||||
Q_CORE_EXPORT QString qt_punycodeDecoder(const QString &pc);
|
Q_CORE_EXPORT QString qt_punycodeDecoder(const QString &pc);
|
||||||
Q_CORE_EXPORT QString qt_tolerantParsePercentEncoding(const QString &url);
|
|
||||||
Q_CORE_EXPORT QString qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding,
|
Q_CORE_EXPORT QString qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding,
|
||||||
const ushort *tableModifications = 0);
|
const ushort *tableModifications = 0);
|
||||||
QT_END_NAMESPACE
|
QT_END_NAMESPACE
|
||||||
@ -791,6 +790,17 @@ void tst_QUrlInternal::correctEncodedMistakes_data()
|
|||||||
|
|
||||||
// three percents, one invalid
|
// three percents, one invalid
|
||||||
QTest::newRow("%01%02%3") << "%01%02%3" << "%2501%2502%253";
|
QTest::newRow("%01%02%3") << "%01%02%3" << "%2501%2502%253";
|
||||||
|
|
||||||
|
// now mix bad percents with Unicode decoding
|
||||||
|
QTest::newRow("%C2%") << "%C2%" << "%25C2%25";
|
||||||
|
QTest::newRow("%C2%A") << "%C2%A" << "%25C2%25A";
|
||||||
|
QTest::newRow("%C2%Az") << "%C2%Az" << "%25C2%25Az";
|
||||||
|
QTest::newRow("%E2%A0%") << "%E2%A0%" << "%25E2%25A0%25";
|
||||||
|
QTest::newRow("%E2%A0%A") << "%E2%A0%A" << "%25E2%25A0%25A";
|
||||||
|
QTest::newRow("%E2%A0%Az") << "%E2%A0%Az" << "%25E2%25A0%25Az";
|
||||||
|
QTest::newRow("%F2%A0%A0%") << "%F2%A0%A0%" << "%25F2%25A0%25A0%25";
|
||||||
|
QTest::newRow("%F2%A0%A0%A") << "%F2%A0%A0%A" << "%25F2%25A0%25A0%25A";
|
||||||
|
QTest::newRow("%F2%A0%A0%Az") << "%F2%A0%A0%Az" << "%25F2%25A0%25A0%25Az";
|
||||||
}
|
}
|
||||||
|
|
||||||
void tst_QUrlInternal::correctEncodedMistakes()
|
void tst_QUrlInternal::correctEncodedMistakes()
|
||||||
@ -798,7 +808,7 @@ void tst_QUrlInternal::correctEncodedMistakes()
|
|||||||
QFETCH(QString, input);
|
QFETCH(QString, input);
|
||||||
QFETCH(QString, expected);
|
QFETCH(QString, expected);
|
||||||
|
|
||||||
QString output = qt_tolerantParsePercentEncoding(input);
|
QString output = qt_urlRecode(input, QUrl::DecodeUnicode);
|
||||||
QCOMPARE(output, expected);
|
QCOMPARE(output, expected);
|
||||||
QCOMPARE(output.isNull(), expected.isNull());
|
QCOMPARE(output.isNull(), expected.isNull());
|
||||||
}
|
}
|
||||||
@ -921,10 +931,6 @@ void tst_QUrlInternal::encodingRecode()
|
|||||||
QFETCH(QString, expected);
|
QFETCH(QString, expected);
|
||||||
QFETCH(QUrl::ComponentFormattingOptions, encodingMode);
|
QFETCH(QUrl::ComponentFormattingOptions, encodingMode);
|
||||||
|
|
||||||
// ensure the string is properly percent-encoded
|
|
||||||
QVERIFY2(input == qt_tolerantParsePercentEncoding(input), "Test data is not properly encoded");
|
|
||||||
QVERIFY2(expected == qt_tolerantParsePercentEncoding(expected), "Test data is not properly encoded");
|
|
||||||
|
|
||||||
QString output = qt_urlRecode(input, encodingMode);
|
QString output = qt_urlRecode(input, encodingMode);
|
||||||
QCOMPARE(output, expected);
|
QCOMPARE(output, expected);
|
||||||
QCOMPARE(output.isNull(), expected.isNull());
|
QCOMPARE(output.isNull(), expected.isNull());
|
||||||
|
Loading…
Reference in New Issue
Block a user