Round float->qfloat16 to even

Calibrated to match F16C and ARM-FP16 hardware conversions.

Change-Id: I3bdd4d3db3046fee4aeb24e4ce8b9bc9a06e0397
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Allan Sandfeld Jensen 2020-07-29 12:11:35 +02:00
parent a0e0b51001
commit d3ff95dcb8
4 changed files with 629 additions and 14 deletions

View File

@ -107,8 +107,9 @@ private:
Q_CORE_EXPORT static const quint32 mantissatable[]; Q_CORE_EXPORT static const quint32 mantissatable[];
Q_CORE_EXPORT static const quint32 exponenttable[]; Q_CORE_EXPORT static const quint32 exponenttable[];
Q_CORE_EXPORT static const quint32 offsettable[]; Q_CORE_EXPORT static const quint32 offsettable[];
Q_CORE_EXPORT static const quint32 basetable[]; Q_CORE_EXPORT static const quint16 basetable[];
Q_CORE_EXPORT static const quint32 shifttable[]; Q_CORE_EXPORT static const quint16 shifttable[];
Q_CORE_EXPORT static const quint32 roundtable[];
friend bool qIsNull(qfloat16 f) noexcept; friend bool qIsNull(qfloat16 f) noexcept;
#if !defined(QT_NO_FLOAT16_OPERATORS) #if !defined(QT_NO_FLOAT16_OPERATORS)
@ -173,14 +174,18 @@ inline qfloat16::qfloat16(float f) noexcept
quint32 u; quint32 u;
memcpy(&u, &f, sizeof(quint32)); memcpy(&u, &f, sizeof(quint32));
const quint32 signAndExp = u >> 23; const quint32 signAndExp = u >> 23;
const quint32 base = basetable[signAndExp]; const quint16 base = basetable[signAndExp];
const quint32 shift = shifttable[signAndExp]; const quint16 shift = shifttable[signAndExp];
const quint32 round = roundtable[signAndExp];
quint32 mantissa = (u & 0x007fffff); quint32 mantissa = (u & 0x007fffff);
if ((signAndExp & 0xff) == 0xff) { if ((signAndExp & 0xff) == 0xff) {
if (mantissa) // keep nan from truncating to inf if (mantissa) // keep nan from truncating to inf
mantissa = qMax(1U << shift, mantissa); mantissa = qMax(1U << shift, mantissa);
} else { } else {
mantissa += (1U << (shift - 1)) - 1; // rounding // round half to even
mantissa += round;
if (mantissa & (1 << shift))
--mantissa;
} }
// We use add as the mantissa may overflow causing // We use add as the mantissa may overflow causing

View File

@ -2,6 +2,7 @@
** **
** Copyright (C) 2016 by Southwest Research Institute (R) ** Copyright (C) 2016 by Southwest Research Institute (R)
** Copyright (C) 2019 Intel Corporation. ** Copyright (C) 2019 Intel Corporation.
** Copyright (C) 2020 The Qt Company Ltd.
** Contact: http://www.qt-project.org/legal ** Contact: http://www.qt-project.org/legal
** **
** This file is part of the QtCore module of the Qt Toolkit. ** This file is part of the QtCore module of the Qt Toolkit.
@ -38,7 +39,7 @@
** **
****************************************************************************/ ****************************************************************************/
/* This file was generated by gen_qfloat16_tables.cpp */ /* This file was generated by util/qfloat16-tables/gen_qfloat16_tables.cpp */
#include <QtCore/qfloat16.h> #include <QtCore/qfloat16.h>
@ -2231,7 +2232,7 @@ const quint32 qfloat16::offsettable[64] = {
1024U, 1024U,
}; };
const quint32 qfloat16::basetable[512] = { const quint16 qfloat16::basetable[512] = {
0x0U, 0x0U,
0x0U, 0x0U,
0x0U, 0x0U,
@ -2746,7 +2747,7 @@ const quint32 qfloat16::basetable[512] = {
0xFC00U, 0xFC00U,
}; };
const quint32 qfloat16::shifttable[512] = { const quint16 qfloat16::shifttable[512] = {
0x18U, 0x18U,
0x18U, 0x18U,
0x18U, 0x18U,
@ -3261,6 +3262,521 @@ const quint32 qfloat16::shifttable[512] = {
0xDU, 0xDU,
}; };
#endif // !__F16C__ && !__ARM_FP16_FORMAT_IEEE const quint32 qfloat16::roundtable[512] = {
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x1000000U,
0x400001U,
0x200000U,
0x100000U,
0x80000U,
0x40000U,
0x20000U,
0x10000U,
0x8000U,
0x4000U,
0x2000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x1000000U,
0x400001U,
0x200000U,
0x100000U,
0x80000U,
0x40000U,
0x20000U,
0x10000U,
0x8000U,
0x4000U,
0x2000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x1000U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
0x0U,
};
#endif // !__ARM_FP16_FORMAT_IEEE
QT_END_NAMESPACE QT_END_NAMESPACE

View File

@ -32,6 +32,10 @@
#include <math.h> #include <math.h>
//#define DO_FULL_TEST
static_assert(sizeof(float) == sizeof(quint32), "Float not 32-bit");
class tst_qfloat16: public QObject class tst_qfloat16: public QObject
{ {
Q_OBJECT Q_OBJECT
@ -48,6 +52,11 @@ private slots:
void promotionTests(); void promotionTests();
void arithOps_data(); void arithOps_data();
void arithOps(); void arithOps();
#if defined DO_FULL_TEST
void floatToFloat16Full_data();
void floatToFloat16Full();
void floatFromFloat16Full();
#endif
void floatToFloat16(); void floatToFloat16();
void floatFromFloat16(); void floatFromFloat16();
void finite_data(); void finite_data();
@ -344,6 +353,63 @@ void tst_qfloat16::arithOps()
QVERIFY(qFuzzyCompare(r4,1.f/val2)); QVERIFY(qFuzzyCompare(r4,1.f/val2));
} }
#if defined DO_FULL_TEST
void tst_qfloat16::floatToFloat16Full_data()
{
QTest::addColumn<quint32>("group");
for (quint32 j = 0x00; j < 0x100; ++j)
QTest::addRow("%02x", j) << j;
}
void tst_qfloat16::floatToFloat16Full()
{
QFETCH(quint32, group);
for (quint32 j = 0x00; j < 0x100; ++j) {
quint32 data[1<<16];
qfloat16 out[1<<16];
qfloat16 expected[1<<16];
float in[1<<16];
for (int i = 0; i < (1<<16); ++i)
data[i] = (group << 24) | (j << 16) | i;
memcpy(in, data, (1<<16)*sizeof(float));
for (int i = 0; i < (1<<16); ++i)
expected[i] = qfloat16(in[i]);
qFloatToFloat16(out, in, 1<<16);
for (int i = 0; i < (1<<16); ++i) {
if (out[i] != expected[i])
QVERIFY(qIsNaN(out[i]) && qIsNaN(expected[i]));
}
}
}
void tst_qfloat16::floatFromFloat16Full()
{
quint16 data[1<<16];
float out[1<<16];
float expected[1<<16];
for (int i = 0; i < (1<<16); ++i)
data[i] = i;
const qfloat16 *in = reinterpret_cast<const qfloat16 *>(data);
for (int i = 0; i < (1<<16); ++i)
expected[i] = float(in[i]);
qFloatFromFloat16(out, in, 1<<16);
for (int i = 0; i < (1<<16); ++i)
if (out[i] != expected[i])
QVERIFY(qIsNaN(out[i]) && qIsNaN(expected[i]));
}
#endif
void tst_qfloat16::floatToFloat16() void tst_qfloat16::floatToFloat16()
{ {
constexpr int count = 10000; constexpr int count = 10000;
@ -505,8 +571,8 @@ void tst_qfloat16::limits() // See also: qNaN() and infinity()
QCOMPARE(qFpClassify(high10), FP_NORMAL); QCOMPARE(qFpClassify(high10), FP_NORMAL);
// How many digits are significant ? (Casts avoid linker errors ...) // How many digits are significant ? (Casts avoid linker errors ...)
QCOMPARE(int(Bounds::digits10), 3); // ~9.78e-4 has enough sigificant digits: QCOMPARE(int(Bounds::digits10), 3); // ~9.88e-4 has enough sigificant digits:
qfloat16 below(9.781e-4f), above(9.789e-4f); // both round to ~9.785e-4 qfloat16 below(9.876e-4f), above(9.884e-4f); // both round to ~9.88e-4
QVERIFY(below == above); QVERIFY(below == above);
QCOMPARE(int(Bounds::max_digits10), 5); // we need 5 to distinguish these two: QCOMPARE(int(Bounds::max_digits10), 5); // we need 5 to distinguish these two:
QVERIFY(qfloat16(1000.5f) != qfloat16(1001.4f)); QVERIFY(qfloat16(1000.5f) != qfloat16(1001.4f));

View File

@ -2,6 +2,7 @@
** **
** Copyright (C) 2016 by Southwest Research Institute (R) ** Copyright (C) 2016 by Southwest Research Institute (R)
** Copyright (C) 2019 Intel Corporation. ** Copyright (C) 2019 Intel Corporation.
** Copyright (C) 2020 The Qt Company Ltd.
** Contact: http://www.qt-project.org/legal ** Contact: http://www.qt-project.org/legal
** **
** This file is part of the QtCore module of the Qt Toolkit. ** This file is part of the QtCore module of the Qt Toolkit.
@ -71,6 +72,7 @@ uint32_t convertmantissa(int32_t i)
// to more closely map the implementation given in the paper. // to more closely map the implementation given in the paper.
uint32_t basetable[512]; uint32_t basetable[512];
uint32_t shifttable[512]; uint32_t shifttable[512];
uint32_t roundtable[512];
int main() int main()
{ {
@ -113,50 +115,76 @@ int main()
int32_t e; int32_t e;
for (i = 0; i < 256; ++i) { for (i = 0; i < 256; ++i) {
e = i - 127; e = i - 127;
if (e < -24) { // Very small numbers map to zero if (e < -25) { // Very small numbers map to zero
basetable[i | 0x000] = 0x0000; basetable[i | 0x000] = 0x0000;
basetable[i | 0x100] = 0x8000; basetable[i | 0x100] = 0x8000;
shifttable[i | 0x000] = 24; shifttable[i | 0x000] = 24;
shifttable[i | 0x100] = 24; shifttable[i | 0x100] = 24;
roundtable[i | 0x000] = 0;
roundtable[i | 0x100] = 0;
} else if (e < -14) { // Small numbers map to denorms } else if (e < -14) { // Small numbers map to denorms
basetable[i | 0x000] = (0x0400 >> (-e - 14)); basetable[i | 0x000] = (0x0400 >> (-e - 14));
basetable[i | 0x100] = (0x0400 >> (-e - 14)) | 0x8000; basetable[i | 0x100] = (0x0400 >> (-e - 14)) | 0x8000;
shifttable[i | 0x000] = -e - 1; shifttable[i | 0x000] = -e - 1;
shifttable[i | 0x100] = -e - 1; shifttable[i | 0x100] = -e - 1;
if (e == -25) {
// rounds up
roundtable[i | 0x000] = (1 << 24);
roundtable[i | 0x100] = (1 << 24);
} else if (e == -24) {
// rounds half up
roundtable[i | 0x000] = (1 << 22) + 1;
roundtable[i | 0x100] = (1 << 22) + 1;
} else {
roundtable[i | 0x000] = (1 << (-e - 2));
roundtable[i | 0x100] = (1 << (-e - 2));
}
} else if (e <= 15) { // Normal numbers just lose precision } else if (e <= 15) { // Normal numbers just lose precision
basetable[i | 0x000] = ((e + 15) << 10); basetable[i | 0x000] = ((e + 15) << 10);
basetable[i | 0x100] = ((e + 15) << 10) | 0x8000; basetable[i | 0x100] = ((e + 15) << 10) | 0x8000;
shifttable[i | 0x000] = 13; shifttable[i | 0x000] = 13;
shifttable[i | 0x100] = 13; shifttable[i | 0x100] = 13;
roundtable[i | 0x000] = (1 << 12);
roundtable[i | 0x100] = (1 << 12);
} else if (e < 128) { // Large numbers map to Infinity } else if (e < 128) { // Large numbers map to Infinity
basetable[i | 0x000] = 0x7C00; basetable[i | 0x000] = 0x7C00;
basetable[i | 0x100] = 0xFC00; basetable[i | 0x100] = 0xFC00;
shifttable[i | 0x000] = 24; shifttable[i | 0x000] = 24;
shifttable[i | 0x100] = 24; shifttable[i | 0x100] = 24;
roundtable[i | 0x000] = 0;
roundtable[i | 0x100] = 0;
} else { // Infinity and NaN's stay Infinity and NaN's } else { // Infinity and NaN's stay Infinity and NaN's
basetable[i | 0x000] = 0x7C00; basetable[i | 0x000] = 0x7C00;
basetable[i | 0x100] = 0xFC00; basetable[i | 0x100] = 0xFC00;
shifttable[i | 0x000] = 13; shifttable[i | 0x000] = 13;
shifttable[i | 0x100] = 13; shifttable[i | 0x100] = 13;
roundtable[i | 0x000] = 0;
roundtable[i | 0x100] = 0;
} }
} }
printf("const quint32 qfloat16::basetable[512] = {\n"); printf("const quint16 qfloat16::basetable[512] = {\n");
for (i = 0; i < 512; i++) for (i = 0; i < 512; i++)
printf("0x%XU,\n", basetable[i]); printf("0x%XU,\n", basetable[i]);
printf("};\n\n"); printf("};\n\n");
printf("const quint32 qfloat16::shifttable[512] = {\n"); printf("const quint16 qfloat16::shifttable[512] = {\n");
for (i = 0; i < 512; i++) for (i = 0; i < 512; i++)
printf("0x%XU,\n", shifttable[i]); printf("0x%XU,\n", shifttable[i]);
printf("};\n\n"); printf("};\n\n");
printf("const quint32 qfloat16::roundtable[512] = {\n");
for (i = 0; i < 512; i++)
printf("0x%XU,\n", roundtable[i]);
printf("};\n\n");
printf("#endif // !__ARM_FP16_FORMAT_IEEE\n\n"); printf("#endif // !__ARM_FP16_FORMAT_IEEE\n\n");
printf("QT_END_NAMESPACE\n"); printf("QT_END_NAMESPACE\n");
return 0; return 0;