Round float->qfloat16 to even

Calibrated to match F16C and ARM-FP16 hardware conversions.

Change-Id: I3bdd4d3db3046fee4aeb24e4ce8b9bc9a06e0397
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
2020-07-29 12:11:35 +02:00 · 2020-07-29 12:11:35 +02:00 · d3ff95dcb8
commit d3ff95dcb8
parent a0e0b51001
4 changed files with 629 additions and 14 deletions
--- a/src/corelib/global/qfloat16.h
+++ b/src/corelib/global/qfloat16.h
@ -107,8 +107,9 @@ private:
    Q_CORE_EXPORT static const quint32 mantissatable[];
    Q_CORE_EXPORT static const quint32 exponenttable[];
    Q_CORE_EXPORT static const quint32 offsettable[];
-    Q_CORE_EXPORT static const quint32 basetable[];
-    Q_CORE_EXPORT static const quint32 shifttable[];
+    Q_CORE_EXPORT static const quint16 basetable[];
+    Q_CORE_EXPORT static const quint16 shifttable[];
+    Q_CORE_EXPORT static const quint32 roundtable[];

    friend bool qIsNull(qfloat16 f) noexcept;
 #if !defined(QT_NO_FLOAT16_OPERATORS)
@ -173,14 +174,18 @@ inline qfloat16::qfloat16(float f) noexcept
    quint32 u;
    memcpy(&u, &f, sizeof(quint32));
    const quint32 signAndExp = u >> 23;
-    const quint32 base = basetable[signAndExp];
-    const quint32 shift = shifttable[signAndExp];
+    const quint16 base = basetable[signAndExp];
+    const quint16 shift = shifttable[signAndExp];
+    const quint32 round = roundtable[signAndExp];
    quint32 mantissa = (u & 0x007fffff);
    if ((signAndExp & 0xff) == 0xff) {
        if (mantissa) // keep nan from truncating to inf
            mantissa = qMax(1U << shift, mantissa);
    } else {
-        mantissa += (1U << (shift - 1)) - 1; // rounding
+        // round half to even
+        mantissa += round;
+        if (mantissa & (1 << shift))
+            --mantissa;
    }

    // We use add as the mantissa may overflow causing
--- a/src/corelib/global/qfloat16tables.cpp
+++ b/src/corelib/global/qfloat16tables.cpp
@ -2,6 +2,7 @@
 **
 ** Copyright (C) 2016 by Southwest Research Institute (R)
 ** Copyright (C) 2019 Intel Corporation.
+** Copyright (C) 2020 The Qt Company Ltd.
 ** Contact: http://www.qt-project.org/legal
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@ -38,7 +39,7 @@
 **
 ****************************************************************************/

-/* This file was generated by gen_qfloat16_tables.cpp */
+/* This file was generated by util/qfloat16-tables/gen_qfloat16_tables.cpp */

 #include <QtCore/qfloat16.h>

@ -2231,7 +2232,7 @@ const quint32 qfloat16::offsettable[64] = {
 1024U,
 };

-const quint32 qfloat16::basetable[512] = {
+const quint16 qfloat16::basetable[512] = {
 0x0U,
 0x0U,
 0x0U,
@ -2746,7 +2747,7 @@ const quint32 qfloat16::basetable[512] = {
 0xFC00U,
 };

-const quint32 qfloat16::shifttable[512] = {
+const quint16 qfloat16::shifttable[512] = {
 0x18U,
 0x18U,
 0x18U,
@ -3261,6 +3262,521 @@ const quint32 qfloat16::shifttable[512] = {
 0xDU,
 };

-#endif // !__F16C__ && !__ARM_FP16_FORMAT_IEEE
+const quint32 qfloat16::roundtable[512] = {
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x1000000U,
+0x400001U,
+0x200000U,
+0x100000U,
+0x80000U,
+0x40000U,
+0x20000U,
+0x10000U,
+0x8000U,
+0x4000U,
+0x2000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x1000000U,
+0x400001U,
+0x200000U,
+0x100000U,
+0x80000U,
+0x40000U,
+0x20000U,
+0x10000U,
+0x8000U,
+0x4000U,
+0x2000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x1000U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+0x0U,
+};
+
+#endif // !__ARM_FP16_FORMAT_IEEE

 QT_END_NAMESPACE
--- a/tests/auto/corelib/global/qfloat16/tst_qfloat16.cpp
+++ b/tests/auto/corelib/global/qfloat16/tst_qfloat16.cpp
@ -32,6 +32,10 @@

 #include <math.h>

+//#define DO_FULL_TEST
+
+static_assert(sizeof(float) == sizeof(quint32), "Float not 32-bit");
+
 class tst_qfloat16: public QObject
 {
    Q_OBJECT
@ -48,6 +52,11 @@ private slots:
    void promotionTests();
    void arithOps_data();
    void arithOps();
+#if defined DO_FULL_TEST
+    void floatToFloat16Full_data();
+    void floatToFloat16Full();
+    void floatFromFloat16Full();
+#endif
    void floatToFloat16();
    void floatFromFloat16();
    void finite_data();
@ -344,6 +353,63 @@ void tst_qfloat16::arithOps()
    QVERIFY(qFuzzyCompare(r4,1.f/val2));
 }

+#if defined DO_FULL_TEST
+void tst_qfloat16::floatToFloat16Full_data()
+{
+    QTest::addColumn<quint32>("group");
+    for (quint32 j = 0x00; j < 0x100; ++j)
+        QTest::addRow("%02x", j) << j;
+
+}
+
+void tst_qfloat16::floatToFloat16Full()
+{
+    QFETCH(quint32, group);
+    for (quint32 j = 0x00; j < 0x100; ++j) {
+        quint32 data[1<<16];
+        qfloat16 out[1<<16];
+        qfloat16 expected[1<<16];
+        float in[1<<16];
+
+        for (int i = 0; i < (1<<16); ++i)
+            data[i] = (group << 24) | (j << 16) | i;
+
+        memcpy(in, data, (1<<16)*sizeof(float));
+
+        for (int i = 0; i < (1<<16); ++i)
+            expected[i] = qfloat16(in[i]);
+
+        qFloatToFloat16(out, in, 1<<16);
+
+        for (int i = 0; i < (1<<16); ++i) {
+            if (out[i] != expected[i])
+                QVERIFY(qIsNaN(out[i]) && qIsNaN(expected[i]));
+        }
+    }
+}
+
+void tst_qfloat16::floatFromFloat16Full()
+{
+    quint16 data[1<<16];
+    float out[1<<16];
+    float expected[1<<16];
+
+    for (int i = 0; i < (1<<16); ++i)
+        data[i] = i;
+
+    const qfloat16 *in = reinterpret_cast<const qfloat16 *>(data);
+
+    for (int i = 0; i < (1<<16); ++i)
+        expected[i] = float(in[i]);
+
+    qFloatFromFloat16(out, in, 1<<16);
+
+    for (int i = 0; i < (1<<16); ++i)
+        if (out[i] != expected[i])
+            QVERIFY(qIsNaN(out[i]) && qIsNaN(expected[i]));
+}
+#endif
+
 void tst_qfloat16::floatToFloat16()
 {
    constexpr int count = 10000;
@ -505,8 +571,8 @@ void tst_qfloat16::limits() // See also: qNaN() and infinity()
    QCOMPARE(qFpClassify(high10), FP_NORMAL);

    // How many digits are significant ?  (Casts avoid linker errors ...)
-    QCOMPARE(int(Bounds::digits10), 3); // ~9.78e-4 has enough sigificant digits:
-    qfloat16 below(9.781e-4f), above(9.789e-4f); // both round to ~9.785e-4
+    QCOMPARE(int(Bounds::digits10), 3); // ~9.88e-4 has enough significant digits:
+    qfloat16 below(9.876e-4f), above(9.884e-4f); // both round to ~9.88e-4
    QVERIFY(below == above);
    QCOMPARE(int(Bounds::max_digits10), 5); // we need 5 to distinguish these two:
    QVERIFY(qfloat16(1000.5f) != qfloat16(1001.4f));
--- a/util/qfloat16-tables/gen_qfloat16_tables.cpp
+++ b/util/qfloat16-tables/gen_qfloat16_tables.cpp
@ -2,6 +2,7 @@
 **
 ** Copyright (C) 2016 by Southwest Research Institute (R)
 ** Copyright (C) 2019 Intel Corporation.
+** Copyright (C) 2020 The Qt Company Ltd.
 ** Contact: http://www.qt-project.org/legal
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@ -71,6 +72,7 @@ uint32_t convertmantissa(int32_t i)
 // to more closely map the implementation given in the paper.
 uint32_t basetable[512];
 uint32_t shifttable[512];
+uint32_t roundtable[512];

 int main()
 {
@ -113,50 +115,76 @@ int main()
    int32_t e;
    for (i = 0; i < 256; ++i) {
        e = i - 127;
-        if (e < -24) {   // Very small numbers map to zero
+        if (e < -25) {   // Very small numbers map to zero
            basetable[i | 0x000] = 0x0000;
            basetable[i | 0x100] = 0x8000;
            shifttable[i | 0x000] = 24;
            shifttable[i | 0x100] = 24;
+            roundtable[i | 0x000] = 0;
+            roundtable[i | 0x100] = 0;

        } else if (e < -14) {             // Small numbers map to denorms
            basetable[i | 0x000] = (0x0400 >> (-e - 14));
            basetable[i | 0x100] = (0x0400 >> (-e - 14)) | 0x8000;
            shifttable[i | 0x000] = -e - 1;
            shifttable[i | 0x100] = -e - 1;
+            if (e == -25) {
+                // rounds up
+                roundtable[i | 0x000] = (1 << 24);
+                roundtable[i | 0x100] = (1 << 24);
+            } else if (e == -24) {
+                // rounds half up
+                roundtable[i | 0x000] = (1 << 22) + 1;
+                roundtable[i | 0x100] = (1 << 22) + 1;
+            } else {
+                roundtable[i | 0x000] = (1 << (-e - 2));
+                roundtable[i | 0x100] = (1 << (-e - 2));
+            }

        } else if (e <= 15) {            // Normal numbers just lose precision
            basetable[i | 0x000] = ((e + 15) << 10);
            basetable[i | 0x100] = ((e + 15) << 10) | 0x8000;
            shifttable[i | 0x000] = 13;
            shifttable[i | 0x100] = 13;
+            roundtable[i | 0x000] = (1 << 12);
+            roundtable[i | 0x100] = (1 << 12);

        } else if (e < 128) {            // Large numbers map to Infinity
            basetable[i | 0x000] = 0x7C00;
            basetable[i | 0x100] = 0xFC00;
            shifttable[i | 0x000] = 24;
            shifttable[i | 0x100] = 24;
+            roundtable[i | 0x000] = 0;
+            roundtable[i | 0x100] = 0;

        } else {                     // Infinity and NaN's stay Infinity and NaN's
            basetable[i | 0x000] = 0x7C00;
            basetable[i | 0x100] = 0xFC00;
            shifttable[i | 0x000] = 13;
            shifttable[i | 0x100] = 13;
+            roundtable[i | 0x000] = 0;
+            roundtable[i | 0x100] = 0;
        }
    }

-    printf("const quint32 qfloat16::basetable[512] = {\n");
+    printf("const quint16 qfloat16::basetable[512] = {\n");
    for (i = 0; i < 512; i++)
        printf("0x%XU,\n", basetable[i]);

    printf("};\n\n");

-    printf("const quint32 qfloat16::shifttable[512] = {\n");
+    printf("const quint16 qfloat16::shifttable[512] = {\n");
    for (i = 0; i < 512; i++)
        printf("0x%XU,\n", shifttable[i]);

    printf("};\n\n");

+    printf("const quint32 qfloat16::roundtable[512] = {\n");
+    for (i = 0; i < 512; i++)
+        printf("0x%XU,\n", roundtable[i]);
+
+    printf("};\n\n");
+
    printf("#endif // !__ARM_FP16_FORMAT_IEEE\n\n");
    printf("QT_END_NAMESPACE\n");
    return 0;