Add Sk4f::ToBytes(uint8_t[16], Sk4f, Sk4f, Sk4f, Sk4f)
This is a big speedup for float -> byte. E.g. gradient_linear_clamp_3color: x86-64 147µs -> 103µs (Broadwell MBP); arm64 2.03ms -> 648µs (Galaxy S6); armv7 1.12ms -> 489µs (Galaxy S6, same device!). BUG=skia: CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot;client.skia.android:Test-Android-GCC-Nexus9-CPU-Denver-Arm64-Debug-Trybot Review URL: https://codereview.chromium.org/1483953002
This commit is contained in:
parent
eeebdb538d
commit
9db43ac4ee
@ -62,10 +62,7 @@ struct Sk4fGradientBench : public Benchmark {
|
||||
c = b + dcdx,
|
||||
d = c + dcdx;
|
||||
for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) {
|
||||
a.toBytes((uint8_t*)(fDevice+i+0));
|
||||
b.toBytes((uint8_t*)(fDevice+i+1));
|
||||
c.toBytes((uint8_t*)(fDevice+i+2));
|
||||
d.toBytes((uint8_t*)(fDevice+i+3));
|
||||
Sk4f::ToBytes((uint8_t*)(fDevice+i), a, b, c, d);
|
||||
a = a + dcdx4;
|
||||
b = b + dcdx4;
|
||||
c = c + dcdx4;
|
||||
|
@ -113,6 +113,15 @@ public:
|
||||
fHi.toBytes(bytes+N/2);
|
||||
}
|
||||
|
||||
// Some implementations can do this faster.
|
||||
static void ToBytes(uint8_t bytes[4*N],
|
||||
const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
|
||||
a.toBytes(bytes+0*N);
|
||||
b.toBytes(bytes+1*N);
|
||||
c.toBytes(bytes+2*N);
|
||||
d.toBytes(bytes+3*N);
|
||||
}
|
||||
|
||||
SkNx operator + (const SkNx& o) const { return SkNx(fLo + o.fLo, fHi + o.fHi); }
|
||||
SkNx operator - (const SkNx& o) const { return SkNx(fLo - o.fLo, fHi - o.fHi); }
|
||||
SkNx operator * (const SkNx& o) const { return SkNx(fLo * o.fLo, fHi * o.fHi); }
|
||||
|
@ -748,10 +748,15 @@ template <bool apply_alpha> void ramp(SkPMColor dstC[], int n, const Sk4f& c, co
|
||||
Sk4f cd2 = cd0 + dc2;
|
||||
Sk4f cd3 = cd1 + dc2;
|
||||
while (n >= 4) {
|
||||
if (!apply_alpha) {
|
||||
Sk4f::ToBytes((uint8_t*)dstC, cd0, cd1, cd2, cd3);
|
||||
dstC += 4;
|
||||
} else {
|
||||
*dstC++ = trunc_from_255<apply_alpha>(cd0);
|
||||
*dstC++ = trunc_from_255<apply_alpha>(cd1);
|
||||
*dstC++ = trunc_from_255<apply_alpha>(cd2);
|
||||
*dstC++ = trunc_from_255<apply_alpha>(cd3);
|
||||
}
|
||||
cd0 = cd0 + dc4;
|
||||
cd1 = cd1 + dc4;
|
||||
cd2 = cd2 + dc4;
|
||||
|
@ -165,6 +165,14 @@ public:
|
||||
vst1_lane_u32((uint32_t*)bytes, (uint32x2_t)fix8, 0);
|
||||
}
|
||||
|
||||
static void ToBytes(uint8_t bytes[16],
|
||||
const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
|
||||
vst1q_u8(bytes, vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec),
|
||||
(uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0],
|
||||
vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec),
|
||||
(uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0]);
|
||||
}
|
||||
|
||||
SkNx approxInvert() const {
|
||||
float32x4_t est0 = vrecpeq_f32(fVec),
|
||||
est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0);
|
||||
|
@ -132,6 +132,15 @@ public:
|
||||
*(int*)bytes = _mm_cvtsi128_si32(fix8);
|
||||
}
|
||||
|
||||
static void ToBytes(uint8_t bytes[16],
|
||||
const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
|
||||
_mm_storeu_si128((__m128i*)bytes,
|
||||
_mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec),
|
||||
_mm_cvttps_epi32(b.fVec)),
|
||||
_mm_packus_epi16(_mm_cvttps_epi32(c.fVec),
|
||||
_mm_cvttps_epi32(d.fVec))));
|
||||
}
|
||||
|
||||
SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
|
||||
SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
|
||||
SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
|
||||
|
Loading…
Reference in New Issue
Block a user