QUrl: Implement UTS #46

UTS #46 (https://unicode.org/reports/tr46/) is Unicode's successor to
the IDNA 2003/2008 standards.

The current implementation uses nontransitional processing by default.
An optional argument is added to QUrl::toAce() and QUrl::fromAce() to
allow using transitional processing and to ignore the IDN whitelist.

[ChangeLog][QtCore][QUrl] ACE processing is now performed according
to the UTS #46 standard based on IDNA 2008 instead of IDNA 2003.

Task-number: QTBUG-85371
Change-Id: I46b2e86792bc9699cb6961bae8e283fbff72f874
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
This commit is contained in:
Ievgenii Meshcheriakov 2021-08-02 16:33:44 +02:00
parent f5360b7c72
commit 4bf3010378
4 changed files with 596 additions and 2248 deletions

View File

@ -389,6 +389,25 @@
\sa fromUserInput()
*/
/*!
\enum QUrl::AceProcessingOption
\since 6.3
The ACE processing options control the way URLs are transformed to and from
ASCII-Compatible Encoding.
\value IgnoreIDNWhitelist Ignore the IDN whitelist when converting URLs
to Unicode.
\value AceTransitionalProcessing Use transitional processing described in UTS #46.
This allows better compatibility with the IDNA 2003
specification.
The default is to use nontransitional processing and to allow non-ASCII
characters only inside URLs whose top-level domains are listed in the IDN whitelist.
\sa toAce(), fromAce(), idnWhitelist()
*/
/*!
\fn QUrl::QUrl(QUrl &&other)
@ -1177,7 +1196,7 @@ inline void QUrlPrivate::appendHost(QString &appendTo, QUrl::FormattingOptions o
// this is either an IPv4Address or a reg-name
// if it is a reg-name, it is already stored in Unicode form
if (options & QUrl::EncodeUnicode && !(options & 0x4000000))
appendTo += qt_ACE_do(host, ToAceOnly, AllowLeadingDot);
appendTo += qt_ACE_do(host, ToAceOnly, AllowLeadingDot, {});
else
appendTo += host;
}
@ -1339,7 +1358,7 @@ inline bool QUrlPrivate::setHost(const QString &value, int from, int iend, QUrl:
// Unicode encoding (some non-ASCII characters case-fold to digits
// when nameprepping is done)
//
// The qt_ACE_do function below applies nameprepping and the STD3 check.
// The qt_ACE_do function below does IDNA normalization and the STD3 check.
// That means a Unicode string may become an IPv4 address, but it cannot
// produce a '[' or a '%'.
@ -1358,7 +1377,7 @@ inline bool QUrlPrivate::setHost(const QString &value, int from, int iend, QUrl:
return setHost(s, 0, s.length(), QUrl::StrictMode);
}
s = qt_ACE_do(QStringView(begin, len), NormalizeAce, ForbidLeadingDot);
s = qt_ACE_do(value.mid(from, iend - from), NormalizeAce, ForbidLeadingDot, {});
if (s.isEmpty()) {
setError(InvalidRegNameError, value);
return false;
@ -3013,50 +3032,72 @@ QByteArray QUrl::toPercentEncoding(const QString &input, const QByteArray &exclu
}
/*!
\since 4.2
\since 6.3
Returns the Unicode form of the given domain name
\a domain, which is encoded in the ASCII-Compatible Encoding (ACE).
The output can be customized by passing flags with \a options.
The result of this function is considered equivalent to \a domain.
If the value in \a domain cannot be encoded, it will be converted
to QString and returned.
The ASCII Compatible Encoding (ACE) is defined by RFC 3490, RFC 3491
and RFC 3492. It is part of the Internationalizing Domain Names in
Applications (IDNA) specification, which allows for domain names
(like \c "example.com") to be written using international
characters.
The ASCII-Compatible Encoding (ACE) is defined by RFC 3490, RFC 3491
and RFC 3492 and updated by the Unicode Technical Standard #46. It is part
of the Internationalizing Domain Names in Applications (IDNA) specification,
which allows for domain names (like \c "example.com") to be written using
non-US-ASCII characters.
*/
QString QUrl::fromAce(const QByteArray &domain, QUrl::AceProcessingOptions options)
{
// Widen the Latin-1 bytes of \a domain to a QString and pass it to
// qt_ACE_do() in NormalizeAce mode, forwarding \a options unchanged.
// ForbidLeadingDot is hard-coded for now (see the FIXME below).
return qt_ACE_do(QString::fromLatin1(domain), NormalizeAce,
ForbidLeadingDot /*FIXME: make configurable*/, options);
}
#if QT_VERSION < QT_VERSION_CHECK(7, 0, 0)
/*!
\since 4.2
\overload
*/
QString QUrl::fromAce(const QByteArray &domain)
{
QVarLengthArray<char16_t> buffer;
buffer.resize(domain.size());
qt_from_latin1(buffer.data(), domain.data(), domain.size());
return qt_ACE_do(QStringView{buffer.data(), buffer.size()},
NormalizeAce, ForbidLeadingDot /*FIXME: make configurable*/);
return fromAce(domain, {});
}
#endif
/*!
\since 4.2
\since 6.3
Returns the ASCII-Compatible Encoding of the given domain name \a domain.
The output can be customized by passing flags with \a options.
The result of this function is considered equivalent to \a domain.
The ASCII-Compatible Encoding (ACE) is defined by RFC 3490, RFC 3491
and RFC 3492. It is part of the Internationalizing Domain Names in
Applications (IDNA) specification, which allows for domain names
(like \c "example.com") to be written using international
characters.
and RFC 3492 and updated by the Unicode Technical Standard #46. It is part
of the Internationalizing Domain Names in Applications (IDNA) specification,
which allows for domain names (like \c "example.com") to be written using
non-US-ASCII characters.
This function returns an empty QByteArray if \a domain is not a valid
hostname. Note, in particular, that IPv6 literals are not valid domain
names.
*/
QByteArray QUrl::toAce(const QString &domain, AceProcessingOptions options)
{
// Run qt_ACE_do() in ToAceOnly mode with the caller's \a options, then
// narrow the resulting QString to Latin-1 for the QByteArray return value.
// ForbidLeadingDot is hard-coded for now (see the FIXME below).
return qt_ACE_do(domain, ToAceOnly, ForbidLeadingDot /*FIXME: make configurable*/, options)
.toLatin1();
}
#if QT_VERSION < QT_VERSION_CHECK(7, 0, 0)
/*!
\since 4.2
\overload
*/
QByteArray QUrl::toAce(const QString &domain)
{
return qt_ACE_do(domain, ToAceOnly, ForbidLeadingDot /*FIXME: make configurable*/).toLatin1();
return toAce(domain, {});
}
#endif
/*!
\internal

View File

@ -281,8 +281,22 @@ public:
NSURL *toNSURL() const Q_DECL_NS_RETURNS_AUTORELEASED;
#endif
// Options controlling ACE conversion in toAce()/fromAce(). Each enumerator
// is a distinct bit so that values can be OR-ed together.
enum AceProcessingOption : unsigned int {
// Ignore the IDN whitelist when converting URLs to Unicode.
IgnoreIDNWhitelist = 0x1,
// Use the transitional processing described in UTS #46.
AceTransitionalProcessing = 0x2,
};
Q_DECLARE_FLAGS(AceProcessingOptions, AceProcessingOption)
#if QT_VERSION >= QT_VERSION_CHECK(7, 0, 0)
static QString fromAce(const QByteArray &, AceProcessingOptions options = {});
static QByteArray toAce(const QString &, AceProcessingOptions options = {});
#else
static QString fromAce(const QByteArray &);
static QByteArray toAce(const QString &);
static QString fromAce(const QByteArray &, AceProcessingOptions options);
static QByteArray toAce(const QString &, AceProcessingOptions options);
#endif
static QStringList idnWhitelist();
static QStringList toStringList(const QList<QUrl> &uris, FormattingOptions options = FormattingOptions(PrettyDecoded));
static QList<QUrl> fromStringList(const QStringList &uris, ParsingMode mode = TolerantMode);
@ -302,6 +316,7 @@ public:
Q_DECLARE_SHARED(QUrl)
Q_DECLARE_OPERATORS_FOR_FLAGS(QUrl::ComponentFormattingOptions)
//Q_DECLARE_OPERATORS_FOR_FLAGS(QUrl::FormattingOptions)
Q_DECLARE_OPERATORS_FOR_FLAGS(QUrl::AceProcessingOptions)
#ifndef Q_QDOC
constexpr inline QUrl::FormattingOptions operator|(QUrl::UrlFormattingOption f1, QUrl::UrlFormattingOption f2)

View File

@ -65,7 +65,8 @@ extern Q_AUTOTEST_EXPORT qsizetype qt_urlRecode(QString &appendTo, QStringView u
// in qurlidna.cpp
enum AceLeadingDot { AllowLeadingDot, ForbidLeadingDot };
enum AceOperation { ToAceOnly, NormalizeAce };
extern QString qt_ACE_do(QStringView domain, AceOperation op, AceLeadingDot dot);
extern QString qt_ACE_do(const QString &domain, AceOperation op, AceLeadingDot dot,
QUrl::AceProcessingOptions options);
extern Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output);
extern Q_AUTOTEST_EXPORT QString qt_punycodeDecoder(const QString &pc);

File diff suppressed because it is too large Load Diff