skia2/experimental/lowp-basic/lowp_experiments.cpp

/*
 * Copyright 2021 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Compile for x86_64 + ssse3 with:
//     c++ -O3 --std=c++17 -mssse3 experimental/lowp-basic/lowp_experiments.cpp -o lowp
//
// Compile for aarch64 (macOS) with:
//    c++ -O3 --std=c++17 -arch arm64 experimental/lowp-basic/lowp_experiments.cpp  -o lowp
//
// View assembly:
//    objdump -d lowp | less

template <int N, typename T> using V = T __attribute__((ext_vector_type(N)));

#if !defined(__clang__)
    static_assert(false, "This only works on clang.");
#endif

#if defined(__SSSE3__)
    #include <immintrin.h>
#endif

#if defined(__ARM_NEON)
    // From section 5.5.5 of the ARM C Language Extensions (ACLE)
    #include <arm_neon.h>
#endif

// Eight uint16_t lanes (8 x 16 = 128 bits). The lanes are declared unsigned, but the
// scalar models in this file receive each lane as int16_t, i.e. they treat the same
// bits as signed values. The name presumably refers to the Q15 fixed-point format.
using Q15 = V<8, uint16_t>;

// Scalar reference model of the SSSE3 intrinsic _mm_mulhrs_epi16.
//
// Each lane of a and b is taken as a signed 16-bit value. The two are multiplied in
// 32 bits, 1 << 14 is added for rounding, the sum is shifted right by 15, and the low
// 16 bits of that are stored in the matching lane of the returned vector.
static Q15 simulate_ssse3_mm_mulhrs_epi16(Q15 a, Q15 b) {
    auto mulhrs = [](int16_t x, int16_t y) -> int16_t {
        const int32_t wide = (int32_t)x * (int32_t)y;
        const int32_t half = 1 << 14;
        return (int16_t)((wide + half) >> 15);
    };

    Q15 out;
    int lane = 0;
    while (lane < 8) {
        out[lane] = mulhrs(a[lane], b[lane]);
        ++lane;
    }
    return out;
}

// Scalar reference model of the NEON intrinsic vqrdmulhq_s16.
//
// Each lane of a and b is taken as a signed 16-bit value. The doubled product is formed
// in 64 bits, 1 << 15 is added for rounding, and the sum is shifted right by 16. The
// outcome is then clamped to [-32768, 32767] before being stored in the result lane;
// the clamp matters for -32768 * -32768, which would otherwise come out as +32768.
static Q15 simulate_neon_vqrdmulhq_s16(Q15 a, Q15 b) {
    auto qrdmulh = [](int16_t x, int16_t y) -> int64_t {
        constexpr int kBits = 16;
        constexpr int64_t kHighest =  (1LL << (kBits - 1)) - 1;
        constexpr int64_t kLowest  = -(1LL << (kBits - 1));

        const int64_t half = 1 << (kBits - 1);
        const int64_t doubled = 2LL * (int64_t)x * (int64_t)y;
        const int64_t shifted = (doubled + half) >> kBits;

        // Saturate to the signed 16-bit range.
        if (shifted > kHighest) { return kHighest; }
        if (shifted < kLowest)  { return kLowest; }
        return shifted;
    };

    Q15 out;
    int lane = 0;
    while (lane < 8) {
        out[lane] = qrdmulh(a[lane], b[lane]);
        ++lane;
    }
    return out;
}

#if defined(__SSSE3__)
// Exhaustively compares simulate_ssse3_mm_mulhrs_epi16 against the real
// _mm_mulhrs_epi16 intrinsic.
//
// Every pair (i, j) of 16-bit values is used to initialize a and b. On the first lane
// whose results differ, the lane index, both operands and both results are printed and
// the process exits with status 1. Returns normally when all 2^32 pairs agree.
static void test_mm_mulhrs_epi16_simulation() {
    for (int i = -32768; i < 32768; i++) {
        for (int j = -32768; j < 32768; j++) {
            Q15 a(i);
            Q15 b(j);
            Q15 simResult = simulate_ssse3_mm_mulhrs_epi16(a, b);
            Q15 intrinsicResult = _mm_mulhrs_epi16(a, b);
            // The lane index has its own name so it does not shadow the operand loop
            // variable i, and the diagnostic labels it as a lane.
            for (int lane = 0; lane < 8; lane++) {
                if (simResult[lane] != intrinsicResult[lane]) {
                    printf("simulate_ssse3_mm_mulhrs_epi16 broken\n");
                    printf("lane: %d, a: %hx b: %hx, intrinsic: %hx, sim: %hx\n",
                           lane, a[lane], b[lane], intrinsicResult[lane], simResult[lane]);
                    std::exit(1);
                }
            }
        }
    }
}

// Emulates the ARM saturating rounding multiply (vqrdmulhq_s16) with SSSE3.
//
// _mm_mulhrs_epi16 does the rounding multiply without saturation. Any lane of that
// result equal to 0x8000 is then XORed with the all-ones compare mask, which turns it
// into 0x7fff; every other lane is XORed with zero and passes through unchanged.
static Q15 ssse3_vqrdmulhq_s16(Q15 a, Q15 b) {
    const Q15 rounded = _mm_mulhrs_epi16(a, b);

    constexpr Q15 overflowPattern(0x8000);
    const Q15 overflowMask = _mm_cmpeq_epi16(rounded, overflowPattern);

    const Q15 saturated = _mm_xor_si128(overflowMask, rounded);
    return saturated;
}

static void test_ssse3_vqrdmulhq_s16() {
    for (int i = -32768; i < 32768; i++) {
        for (int j = -32768; j < 32768; j++) {
            Q15 a(i);
            Q15 b(j);
            Q15 simResult = ssse3_vqrdmulhq_s16(a, b);
            Q15 realVqrdmulhqS16 = simulate_neon_vqrdmulhq_s16(a, b);
            for (int i = 0; i < 8; i++) {
                if (simResult[i] != realVqrdmulhqS16[i]) {
                    printf("simulating vqrdmulhq_s16 with ssse3 broken\n");
                    printf("i: %d, a: %hx b: %hx, intrinsic: %hx, sim: %hx\n",
                           i, a[i], b[i], realVqrdmulhqS16[i], simResult[i]);
                    exit(1);
                }
            }
        }
    }
}

#endif

#if defined(__ARM_NEON)
// Exhaustively compares simulate_neon_vqrdmulhq_s16 against the real vqrdmulhq_s16
// intrinsic.
//
// Every pair (i, j) of 16-bit values is used to initialize a and b. On the first lane
// whose results differ, the lane index, both operands and both results are printed and
// the process exits with status 1. Returns normally when all 2^32 pairs agree.
static void test_neon_vqrdmulhq_s16_simulation() {
    for (int i = -32768; i < 32768; i++) {
        for (int j = -32768; j < 32768; j++) {
            Q15 a(i);
            Q15 b(j);
            Q15 simResult = simulate_neon_vqrdmulhq_s16(a, b);
            Q15 intrinsicResult = vqrdmulhq_s16(a, b);
            // The lane index has its own name so it does not shadow the operand loop
            // variable i, and the diagnostic labels it as a lane.
            for (int lane = 0; lane < 8; lane++) {
                if (simResult[lane] != intrinsicResult[lane]) {
                    printf("simulate_neon_vqrdmulhq_s16 broken\n");
                    printf("lane: %d, a: %hx b: %hx, intrinsic: %hx, sim: %hx\n",
                           lane, a[lane], b[lane], intrinsicResult[lane], simResult[lane]);
                    std::exit(1);
                }
            }
        }
    }
}
#endif

// Runs the checks available for the target architecture. A failing check exits the
// process with status 1 itself, so "Done." is only printed when every enabled check
// returned normally.
int main() {
    #if defined(__SSSE3__)
        // Not run (call is commented out): test_mm_mulhrs_epi16_simulation();
        test_ssse3_vqrdmulhq_s16();
    #endif

    #if defined(__ARM_NEON)
        test_neon_vqrdmulhq_s16_simulation();
    #endif

    fputs("Done.\n", stdout);
    return 0;
}
basic experiments for understanding lowp Change-Id: I5f0c36b44cd5bfd3f978d892a342a1d008baceb2 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446176 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 16:28:44 +00:00			`/*`
			`* Copyright 2021 Google LLC`
			`*`
			`* Use of this source code is governed by a BSD-style license that can be`
			`* found in the LICENSE file.`
			`*/`

			`#include <cassert>`
			`#include <cstdio>`
			`#include <cstdint>`

			`// Compile for x86_64 + ssse3 with:`
add simulation for arm rounding saturating multiply Change-Id: I31dd90b2f09876d37c2c202d620cef4028af6266 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446183 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 20:14:47 +00:00			`// c++ -O3 --std=c++17 -mssse3 experimental/lowp-basic/lowp_experiments.cpp -o lowp`
basic experiments for understanding lowp Change-Id: I5f0c36b44cd5bfd3f978d892a342a1d008baceb2 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446176 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 16:28:44 +00:00			`//`
			`// Compile for aarch64 with (Mac os):`
add simulation for arm rounding saturating multiply Change-Id: I31dd90b2f09876d37c2c202d620cef4028af6266 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446183 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 20:14:47 +00:00			`// c++ -O3 --std=c++17 -arch arm64 experimental/lowp-basic/lowp_experiments.cpp -o lowp`
basic experiments for understanding lowp Change-Id: I5f0c36b44cd5bfd3f978d892a342a1d008baceb2 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446176 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 16:28:44 +00:00			`//`
			`// View assembly:`
			`// dumpobj -d lowp \| less`

			`template <int N, typename T> using V = T __attribute__((ext_vector_type(N)));`

			`#if !defined(__clang__)`
			`static_assert(false, "This only works on clang.");`
			`#endif`

			`#if defined(__SSSE3__)`
			`#include <immintrin.h>`
			`#endif`

			`#if defined(__ARM_NEON)`
			`// From section 5.5.5 of the ARM C Language Extensions (ACLE)`
			`#include <arm_neon.h>`
			`#endif`

			`using Q15 = V<8, uint16_t>;`

			`// A pure C version of the ssse3 intrinsic mm_mulhrs_epi16;`
			`static Q15 simulate_ssse3_mm_mulhrs_epi16(Q15 a, Q15 b) {`
			`Q15 result;`
			`auto m = [](int16_t r, int16_t s) {`
			`const int32_t rounding = 1 << 14;`
			`int32_t temp = (int32_t)r * (int32_t)s + rounding;`
			`return (int16_t)(temp >> 15);`
			`};`
			`for (int i = 0; i < 8; i++) {`
			`result[i] = m(a[i], b[i]);`
			`}`
			`return result;`
			`}`

implement ARM saturating multiply using ssse3 Change-Id: I530373e9b53e76a7cd1c0b3a5a1b05354975790c Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446186 Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 21:02:15 +00:00			`// A pure C version of the neon intrinsic vqrdmulhq_s16;`
			`static Q15 simulate_neon_vqrdmulhq_s16(Q15 a, Q15 b) {`
			`Q15 result;`
			`const int esize = 16;`
			`auto m = [](int16_t r, int16_t s) {`
			`const int64_t rounding = 1 << (esize - 1);`
			`int64_t product = 2LL * (int64_t)r * (int64_t)s + rounding;`
			`int64_t result = product >> esize;`

			`// Saturate the result`
			`if (int64_t limit = (1LL << (esize - 1)) - 1; result > limit) { result = limit; }`
			`if (int64_t limit = -(1LL << (esize - 1)) ; result < limit) { result = limit; }`
			`return result;`
			`};`
			`for (int i = 0; i < 8; i++) {`
			`result[i] = m(a[i], b[i]);`
			`}`
			`return result;`
			`}`

add simulation for arm rounding saturating multiply Change-Id: I31dd90b2f09876d37c2c202d620cef4028af6266 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446183 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 20:14:47 +00:00			`#if defined(__SSSE3__)`
basic experiments for understanding lowp Change-Id: I5f0c36b44cd5bfd3f978d892a342a1d008baceb2 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446176 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 16:28:44 +00:00			`static void test_mm_mulhrs_epi16_simulation() {`
			`for (int i = -32768; i < 32768; i++) {`
			`for (int j = -32768; j < 32768; j++) {`
			`Q15 a(i);`
			`Q15 b(j);`
			`Q15 simResult = simulate_ssse3_mm_mulhrs_epi16(a, b);`
			`Q15 intrinsicResult = _mm_mulhrs_epi16(a, b);`
			`for (int i = 0; i < 8; i++) {`
			`if (simResult[i] != intrinsicResult[i]) {`
			`printf("simulate_ssse3_mm_mulhrs_epi16 broken\n");`
			`printf("i: %d, a: %hx b: %hx, intrinsic: %hx, sim: %hx\n",`
			`i, a[i], b[i], intrinsicResult[i], simResult[i]);`
			`exit(1);`
			`}`
			`}`
			`}`
			`}`
			`}`
add simulation for arm rounding saturating multiply Change-Id: I31dd90b2f09876d37c2c202d620cef4028af6266 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446183 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 20:14:47 +00:00
implement ARM saturating multiply using ssse3 Change-Id: I530373e9b53e76a7cd1c0b3a5a1b05354975790c Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446186 Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 21:02:15 +00:00			`// Use ssse3 to simulate saturating multiply on arm.`
			`static Q15 ssse3_vqrdmulhq_s16(Q15 a, Q15 b) {`
			`constexpr Q15 limit(0x8000);`
			`const Q15 product = _mm_mulhrs_epi16(a, b);`
			`const Q15 eq = _mm_cmpeq_epi16(product, limit);`
			`return _mm_xor_si128(eq, product);`
			`}`
add simulation for arm rounding saturating multiply Change-Id: I31dd90b2f09876d37c2c202d620cef4028af6266 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446183 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 20:14:47 +00:00
implement ARM saturating multiply using ssse3 Change-Id: I530373e9b53e76a7cd1c0b3a5a1b05354975790c Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446186 Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 21:02:15 +00:00			`static void test_ssse3_vqrdmulhq_s16() {`
			`for (int i = -32768; i < 32768; i++) {`
			`for (int j = -32768; j < 32768; j++) {`
			`Q15 a(i);`
			`Q15 b(j);`
			`Q15 simResult = ssse3_vqrdmulhq_s16(a, b);`
			`Q15 realVqrdmulhqS16 = simulate_neon_vqrdmulhq_s16(a, b);`
			`for (int i = 0; i < 8; i++) {`
			`if (simResult[i] != realVqrdmulhqS16[i]) {`
			`printf("simulating vqrdmulhq_s16 with ssse3 broken\n");`
			`printf("i: %d, a: %hx b: %hx, intrinsic: %hx, sim: %hx\n",`
			`i, a[i], b[i], realVqrdmulhqS16[i], simResult[i]);`
			`exit(1);`
			`}`
			`}`
			`}`
add simulation for arm rounding saturating multiply Change-Id: I31dd90b2f09876d37c2c202d620cef4028af6266 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446183 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 20:14:47 +00:00			`}`
			`}`

implement ARM saturating multiply using ssse3 Change-Id: I530373e9b53e76a7cd1c0b3a5a1b05354975790c Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446186 Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 21:02:15 +00:00			`#endif`

add simulation for arm rounding saturating multiply Change-Id: I31dd90b2f09876d37c2c202d620cef4028af6266 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446183 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 20:14:47 +00:00			`#if defined(__ARM_NEON)`
			`static void test_neon_vqrdmulhq_s16_simulation() {`
			`for (int i = -32768; i < 32768; i++) {`
			`for (int j = -32768; j < 32768; j++) {`
			`Q15 a(i);`
			`Q15 b(j);`
			`Q15 simResult = simulate_neon_vqrdmulhq_s16(a, b);`
			`Q15 intrinsicResult = vqrdmulhq_s16(a, b);`
			`for (int i = 0; i < 8; i++) {`
			`if (simResult[i] != intrinsicResult[i]) {`
			`printf("simulate_neon_vqrdmulhq_s16 broken\n");`
			`printf("i: %d, a: %hx b: %hx, intrinsic: %hx, sim: %hx\n",`
			`i, a[i], b[i], intrinsicResult[i], simResult[i]);`
			`exit(1);`
			`}`
			`}`
			`}`
			`}`
			`}`
			`#endif`
basic experiments for understanding lowp Change-Id: I5f0c36b44cd5bfd3f978d892a342a1d008baceb2 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446176 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 16:28:44 +00:00
			`int main() {`
add simulation for arm rounding saturating multiply Change-Id: I31dd90b2f09876d37c2c202d620cef4028af6266 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446183 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 20:14:47 +00:00			`#if defined(__SSSE3__)`
implement ARM saturating multiply using ssse3 Change-Id: I530373e9b53e76a7cd1c0b3a5a1b05354975790c Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446186 Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 21:02:15 +00:00			`//test_mm_mulhrs_epi16_simulation();`
			`test_ssse3_vqrdmulhq_s16();`
add simulation for arm rounding saturating multiply Change-Id: I31dd90b2f09876d37c2c202d620cef4028af6266 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446183 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 20:14:47 +00:00			`#endif`
			`#if defined(__ARM_NEON)`
			`test_neon_vqrdmulhq_s16_simulation();`
			`#endif`
basic experiments for understanding lowp Change-Id: I5f0c36b44cd5bfd3f978d892a342a1d008baceb2 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/446176 Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: Herb Derby <herb@google.com> 2021-09-07 16:28:44 +00:00			`printf("Done.\n");`
			`return 0;`
			`}`