From fa64774820cb42594d3f5bc2059953510f038636 Mon Sep 17 00:00:00 2001
From: Mike Klein
Date: Thu, 16 Feb 2017 06:21:54 -0500
Subject: [PATCH] Flush to zero when loading f16 with sse2/sse4.1.

The multiply by 0x77800000 is quite slow when the input is denormalized.
We don't mind flushing those values (in the range of 1e-5) to zero.

Implement portable load_f16() / store_f16() too.

Change-Id: I125cff1c79ca71d9abe22ac7877136d86707cb56
Reviewed-on: https://skia-review.googlesource.com/8467
Reviewed-by: Mike Klein
Commit-Queue: Mike Klein
---
 src/jumper/SkJumper.cpp         |  2 +-
 src/jumper/SkJumper.h           |  1 +
 src/jumper/SkJumper_generated.h | 30 ++++++++++++++++++++++--------
 src/jumper/SkJumper_stages.cpp  | 28 ++++++++++++++++++++++++----
 4 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index a58a7d7a11..3b27ffe81e 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -18,7 +18,7 @@ static const SkJumper_constants kConstants = {
     {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f},
     0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f,      // from_srgb
     12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f,  // to_srgb
-    0x77800000, 0x07800000,              // fp16 <-> fp32
+    0x77800000, 0x07800000, 0x04000400,  // fp16 <-> fp32
 };
 
 using JumperStage = void(size_t, void**, const SkJumper_constants*);
diff --git a/src/jumper/SkJumper.h b/src/jumper/SkJumper.h
index f6088dd55b..7a42a52f10 100644
--- a/src/jumper/SkJumper.h
+++ b/src/jumper/SkJumper.h
@@ -46,6 +46,7 @@ struct SkJumper_constants {
     // fp16 <-> fp32
     uint32_t _0x77800000;
     uint32_t _0x07800000;
+    uint32_t _0x04000400;
 };
 
 #endif//SkJumper_DEFINED
diff --git a/src/jumper/SkJumper_generated.h b/src/jumper/SkJumper_generated.h
index 9641b5f7d0..591a3c7e21 100644
--- a/src/jumper/SkJumper_generated.h
+++ b/src/jumper/SkJumper_generated.h
@@ -1953,12 +1953,19 @@ static const unsigned char sse41_sk_load_f16[] = {
     0x48,0x8b,0x00,                // mov    (%rax),%rax
     0xf3,0x0f,0x6f,0x04,0xf8,      // movdqu (%rax,%rdi,8),%xmm0
     0xf3,0x0f,0x6f,0x4c,0xf8,0x10, // movdqu 0x10(%rax,%rdi,8),%xmm1
-    0x66,0x0f,0x6f,0xd8,           // movdqa %xmm0,%xmm3
-    0x66,0x0f,0x61,0xd9,           // punpcklwd %xmm1,%xmm3
+    0x66,0x0f,0x6f,0xd0,           // movdqa %xmm0,%xmm2
+    0x66,0x0f,0x61,0xd1,           // punpcklwd %xmm1,%xmm2
     0x66,0x0f,0x69,0xc1,           // punpckhwd %xmm1,%xmm0
+    0x66,0x44,0x0f,0x6f,0xc2,      // movdqa %xmm2,%xmm8
+    0x66,0x44,0x0f,0x61,0xc0,      // punpcklwd %xmm0,%xmm8
+    0x66,0x0f,0x69,0xd0,           // punpckhwd %xmm0,%xmm2
+    0x66,0x0f,0x6e,0x42,0x64,      // movd   0x64(%rdx),%xmm0
+    0x66,0x0f,0x70,0xd8,0x00,      // pshufd $0x0,%xmm0,%xmm3
     0x66,0x0f,0x6f,0xcb,           // movdqa %xmm3,%xmm1
-    0x66,0x0f,0x61,0xc8,           // punpcklwd %xmm0,%xmm1
-    0x66,0x0f,0x69,0xd8,           // punpckhwd %xmm0,%xmm3
+    0x66,0x41,0x0f,0x65,0xc8,      // pcmpgtw %xmm8,%xmm1
+    0x66,0x41,0x0f,0xdf,0xc8,      // pandn  %xmm8,%xmm1
+    0x66,0x0f,0x65,0xda,           // pcmpgtw %xmm2,%xmm3
+    0x66,0x0f,0xdf,0xda,           // pandn  %xmm2,%xmm3
     0x66,0x0f,0x38,0x33,0xc1,      // pmovzxwd %xmm1,%xmm0
     0x66,0x0f,0x72,0xf0,0x0d,      // pslld  $0xd,%xmm0
     0x66,0x0f,0x6e,0x52,0x5c,      // movd   0x5c(%rdx),%xmm2
@@ -2586,12 +2593,19 @@ static const unsigned char sse2_sk_load_f16[] = {
     0x48,0x8b,0x00,                // mov    (%rax),%rax
     0xf3,0x0f,0x6f,0x04,0xf8,      // movdqu (%rax,%rdi,8),%xmm0
     0xf3,0x0f,0x6f,0x4c,0xf8,0x10, // movdqu 0x10(%rax,%rdi,8),%xmm1
-    0x66,0x0f,0x6f,0xd8,           // movdqa %xmm0,%xmm3
-    0x66,0x0f,0x61,0xd9,           // punpcklwd %xmm1,%xmm3
+    0x66,0x0f,0x6f,0xd0,           // movdqa %xmm0,%xmm2
+    0x66,0x0f,0x61,0xd1,           // punpcklwd %xmm1,%xmm2
     0x66,0x0f,0x69,0xc1,           // punpckhwd %xmm1,%xmm0
+    0x66,0x44,0x0f,0x6f,0xc2,      // movdqa %xmm2,%xmm8
+    0x66,0x44,0x0f,0x61,0xc0,      // punpcklwd %xmm0,%xmm8
+    0x66,0x0f,0x69,0xd0,           // punpckhwd %xmm0,%xmm2
+    0x66,0x0f,0x6e,0x42,0x64,      // movd   0x64(%rdx),%xmm0
+    0x66,0x0f,0x70,0xd8,0x00,      // pshufd $0x0,%xmm0,%xmm3
     0x66,0x0f,0x6f,0xcb,           // movdqa %xmm3,%xmm1
-    0x66,0x0f,0x61,0xc8,           // punpcklwd %xmm0,%xmm1
-    0x66,0x0f,0x69,0xd8,           // punpckhwd %xmm0,%xmm3
+    0x66,0x41,0x0f,0x65,0xc8,      // pcmpgtw %xmm8,%xmm1
+    0x66,0x41,0x0f,0xdf,0xc8,      // pandn  %xmm8,%xmm1
+    0x66,0x0f,0x65,0xda,           // pcmpgtw %xmm2,%xmm3
+    0x66,0x0f,0xdf,0xda,           // pandn  %xmm2,%xmm3
     0x66,0x45,0x0f,0xef,0xc0,      // pxor   %xmm8,%xmm8
     0x66,0x0f,0x6f,0xc1,           // movdqa %xmm1,%xmm0
     0x66,0x41,0x0f,0x61,0xc0,      // punpcklwd %xmm8,%xmm0
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 6c106c3f05..20ea719727 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -402,8 +402,16 @@ STAGE(load_f16) {
     auto ptr = *(const uint64_t**)ctx + x;
 
 #if !defined(JUMPER)
-    // TODO:
-    (void)ptr;
+    auto half_to_float = [&](int16_t h) {
+        if (h < 0x0400) { h = 0; }                // Flush denorm and negative to zero.
+        return bit_cast<F>(h << 13)               // Line up the mantissa,
+             * bit_cast<F>(U32(k->_0x77800000));  // then fix up the exponent.
+    };
+    auto rgba = (const int16_t*)ptr;
+    r = half_to_float(rgba[0]);
+    g = half_to_float(rgba[1]);
+    b = half_to_float(rgba[2]);
+    a = half_to_float(rgba[3]);
 #elif defined(__aarch64__)
     auto halfs = vld4_f16((const float16_t*)ptr);
     r = vcvt_f32_f16(halfs.val[0]);
@@ -448,6 +456,11 @@
     auto rg = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
          ba = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3
 
+    // half_to_float() slows down ~10x for denorm inputs, so we flush them to zero.
+    // With a signed comparison this conveniently also flushes negative half floats to zero.
+    rg = _mm_andnot_si128(_mm_cmplt_epi16(rg, U32(k->_0x04000400)), rg);
+    ba = _mm_andnot_si128(_mm_cmplt_epi16(ba, U32(k->_0x04000400)), ba);
+
     auto half_to_float = [&](U32 h) {
         return bit_cast<F>(h << 13)               // Line up the mantissa,
              * bit_cast<F>(U32(k->_0x77800000));  // then fix up the exponent.
@@ -464,8 +477,15 @@ STAGE(store_f16) {
     auto ptr = *(uint64_t**)ctx + x;
 
 #if !defined(JUMPER)
-    // TODO:
-    (void)ptr;
+    auto float_to_half = [&](F f) {
+        return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000)))  // Fix up the exponent,
+               >> 13;                                               // then line up the mantissa.
+    };
+    auto rgba = (int16_t*)ptr;
+    rgba[0] = float_to_half(r);
+    rgba[1] = float_to_half(g);
+    rgba[2] = float_to_half(b);
+    rgba[3] = float_to_half(a);
 #elif defined(__aarch64__)
     float16x4x4_t halfs = {{
         vcvt_f16_f32(r),
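
For reference, the exponent-scaling trick the patch relies on works like this: once a
half's 5-bit exponent and 10-bit mantissa are shifted up 13 bits they line up with
f32's fields, and a multiply by 2^112 (f32 bits 0x77800000) rebias the exponent from
15 to 127; float-to-half is the inverse, multiplying by 2^-112 (bits 0x07800000) before
shifting back down. The new constant 0x04000400 is 0x0400 — the smallest normal half,
about 6.1e-5 — replicated into both 16-bit lanes of each 32-bit word, so pcmpgtw /
_mm_cmplt_epi16 can test all eight halfs at once and pandn / _mm_andnot_si128 can zero
the lanes that fall below it. Here is a minimal standalone scalar sketch of the same
round trip, not Skia code: f32_from_bits / bits_from_f32 are hypothetical memcpy
helpers standing in for SkJumper's bit_cast, and infinities, NaNs, and f32->f16
rounding are ignored, as in the patch.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Hypothetical local helpers standing in for SkJumper's bit_cast.
    static float    f32_from_bits(uint32_t u) { float f; std::memcpy(&f, &u, 4); return f; }
    static uint32_t bits_from_f32(float f)    { uint32_t u; std::memcpy(&u, &f, 4); return u; }

    // f16 -> f32: shift the half's exponent/mantissa up 13 bits so they line up
    // with f32's fields, then multiply by 2^112 (f32 bits 0x77800000) to rebias
    // the exponent from 15 to 127.
    static float half_to_float(int16_t h) {
        if (h < 0x0400) { h = 0; }  // Flush denorm and negative halfs to zero.
        return f32_from_bits((uint32_t)h << 13) * f32_from_bits(0x77800000);
    }

    // f32 -> f16: the inverse.  Multiply by 2^-112 (f32 bits 0x07800000) to
    // rebias the exponent, then shift down 13 bits to line the fields back up.
    static uint16_t float_to_half(float f) {
        return (uint16_t)(bits_from_f32(f * f32_from_bits(0x07800000)) >> 13);
    }

    int main() {
        std::printf("%g\n",     half_to_float(0x3C00));  // prints 1
        std::printf("0x%04x\n", float_to_half(0.5f));    // prints 0x3800
    }

Note that the compare is signed, so a single test flushes both denorms and negative
halfs: any half with its sign bit set reads as a negative int16 and lands below 0x0400.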