Fix new IT blocks for ARMv8
ARMv8 specifies that an IT block should be followed by only one 16-bit instruction. * SkFloatToFixed_arm is back to a C implementation that mirrors the assembly code. * S32A_D565_Opaque_neon no longer hard-codes the 'ip' register as the temporary used inside the IT block; it uses a compiler-allocated operand instead, so the compiler can choose the register that is best in the context of the IT block. 'keep_dst' is replaced by 'ip' where it does not matter whether a low or a high register is used. BUG=skia: CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD Change-Id: If587110a0c74b637ae99460419d46cf969c694fc Reviewed-on: https://skia-review.googlesource.com/9346 Reviewed-by: Mike Klein <mtklein@chromium.org> Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
parent
b8a1392b02
commit
ac0e705af1
@ -102,19 +102,9 @@ inline SkFixed SkFixedMul_longlong(SkFixed a, SkFixed b) {
|
|||||||
*/
|
*/
|
||||||
SK_ALWAYS_INLINE SkFixed SkFloatToFixed_arm(float x)
|
SK_ALWAYS_INLINE SkFixed SkFloatToFixed_arm(float x)
|
||||||
{
|
{
|
||||||
int32_t y, z;
|
int32_t y;
|
||||||
asm("movs %1, %3, lsl #1 \n"
|
asm("vcvt.s32.f32 %0, %0, #16": "+w"(x));
|
||||||
"mov %2, #0x8E \n"
|
memcpy(&y, &x, sizeof(y));
|
||||||
"sub %1, %2, %1, lsr #24 \n"
|
|
||||||
"mov %2, %3, lsl #8 \n"
|
|
||||||
"orr %2, %2, #0x80000000 \n"
|
|
||||||
"mov %1, %2, lsr %1 \n"
|
|
||||||
"it cs \n"
|
|
||||||
"rsbcs %1, %1, #0 \n"
|
|
||||||
: "=r"(x), "=&r"(y), "=&r"(z)
|
|
||||||
: "r"(x)
|
|
||||||
: "cc"
|
|
||||||
);
|
|
||||||
return y;
|
return y;
|
||||||
}
|
}
|
||||||
inline SkFixed SkFixedMul_arm(SkFixed x, SkFixed y)
|
inline SkFixed SkFixedMul_arm(SkFixed x, SkFixed y)
|
||||||
|
@ -194,85 +194,85 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
|
|||||||
SkASSERT(255 == alpha);
|
SkASSERT(255 == alpha);
|
||||||
|
|
||||||
if (count >= 8) {
|
if (count >= 8) {
|
||||||
uint16_t* SK_RESTRICT keep_dst = 0;
|
int32_t tmp = 0;
|
||||||
|
|
||||||
asm volatile (
|
asm volatile (
|
||||||
"ands ip, %[count], #7 \n\t"
|
"ands %[tmp], %[count], #7 \n\t"
|
||||||
"vmov.u8 d31, #1<<7 \n\t"
|
"vmov.u8 d31, #1<<7 \n\t"
|
||||||
"vld1.16 {q12}, [%[dst]] \n\t"
|
"vld1.16 {q12}, [%[dst]] \n\t"
|
||||||
"vld4.8 {d0-d3}, [%[src]] \n\t"
|
"vld4.8 {d0-d3}, [%[src]] \n\t"
|
||||||
// Thumb does not support the standard ARM conditional
|
// Thumb does not support the standard ARM conditional
|
||||||
// instructions but instead requires the 'it' instruction
|
// instructions but instead requires the 'it' instruction
|
||||||
// to signal conditional execution
|
// to signal conditional execution
|
||||||
"it eq \n\t"
|
"it eq \n\t"
|
||||||
"moveq ip, #8 \n\t"
|
"moveq %[tmp], #8 \n\t"
|
||||||
"mov %[keep_dst], %[dst] \n\t"
|
"mov ip, %[dst] \n\t"
|
||||||
|
|
||||||
"add %[src], %[src], ip, LSL#2 \n\t"
|
"add %[src], %[src], %[tmp], LSL#2 \n\t"
|
||||||
"add %[dst], %[dst], ip, LSL#1 \n\t"
|
"add %[dst], %[dst], %[tmp], LSL#1 \n\t"
|
||||||
"subs %[count], %[count], ip \n\t"
|
"subs %[count], %[count], %[tmp] \n\t"
|
||||||
"b 9f \n\t"
|
"b 9f \n\t"
|
||||||
// LOOP
|
// LOOP
|
||||||
"2: \n\t"
|
"2: \n\t"
|
||||||
|
|
||||||
"vld1.16 {q12}, [%[dst]]! \n\t"
|
"vld1.16 {q12}, [%[dst]]! \n\t"
|
||||||
"vld4.8 {d0-d3}, [%[src]]! \n\t"
|
"vld4.8 {d0-d3}, [%[src]]! \n\t"
|
||||||
"vst1.16 {q10}, [%[keep_dst]] \n\t"
|
"vst1.16 {q10}, [ip] \n\t"
|
||||||
"sub %[keep_dst], %[dst], #8*2 \n\t"
|
"sub ip, %[dst], #8*2 \n\t"
|
||||||
"subs %[count], %[count], #8 \n\t"
|
"subs %[count], %[count], #8 \n\t"
|
||||||
"9: \n\t"
|
"9: \n\t"
|
||||||
"pld [%[dst],#32] \n\t"
|
"pld [%[dst],#32] \n\t"
|
||||||
// expand 0565 q12 to 8888 {d4-d7}
|
// expand 0565 q12 to 8888 {d4-d7}
|
||||||
"vmovn.u16 d4, q12 \n\t"
|
"vmovn.u16 d4, q12 \n\t"
|
||||||
"vshr.u16 q11, q12, #5 \n\t"
|
"vshr.u16 q11, q12, #5 \n\t"
|
||||||
"vshr.u16 q10, q12, #6+5 \n\t"
|
"vshr.u16 q10, q12, #6+5 \n\t"
|
||||||
"vmovn.u16 d5, q11 \n\t"
|
"vmovn.u16 d5, q11 \n\t"
|
||||||
"vmovn.u16 d6, q10 \n\t"
|
"vmovn.u16 d6, q10 \n\t"
|
||||||
"vshl.u8 d4, d4, #3 \n\t"
|
"vshl.u8 d4, d4, #3 \n\t"
|
||||||
"vshl.u8 d5, d5, #2 \n\t"
|
"vshl.u8 d5, d5, #2 \n\t"
|
||||||
"vshl.u8 d6, d6, #3 \n\t"
|
"vshl.u8 d6, d6, #3 \n\t"
|
||||||
|
|
||||||
"vmovl.u8 q14, d31 \n\t"
|
"vmovl.u8 q14, d31 \n\t"
|
||||||
"vmovl.u8 q13, d31 \n\t"
|
"vmovl.u8 q13, d31 \n\t"
|
||||||
"vmovl.u8 q12, d31 \n\t"
|
"vmovl.u8 q12, d31 \n\t"
|
||||||
|
|
||||||
// duplicate in 4/2/1 & 8pix vsns
|
// duplicate in 4/2/1 & 8pix vsns
|
||||||
"vmvn.8 d30, d3 \n\t"
|
"vmvn.8 d30, d3 \n\t"
|
||||||
"vmlal.u8 q14, d30, d6 \n\t"
|
"vmlal.u8 q14, d30, d6 \n\t"
|
||||||
"vmlal.u8 q13, d30, d5 \n\t"
|
"vmlal.u8 q13, d30, d5 \n\t"
|
||||||
"vmlal.u8 q12, d30, d4 \n\t"
|
"vmlal.u8 q12, d30, d4 \n\t"
|
||||||
"vshr.u16 q8, q14, #5 \n\t"
|
"vshr.u16 q8, q14, #5 \n\t"
|
||||||
"vshr.u16 q9, q13, #6 \n\t"
|
"vshr.u16 q9, q13, #6 \n\t"
|
||||||
"vaddhn.u16 d6, q14, q8 \n\t"
|
"vaddhn.u16 d6, q14, q8 \n\t"
|
||||||
"vshr.u16 q8, q12, #5 \n\t"
|
"vshr.u16 q8, q12, #5 \n\t"
|
||||||
"vaddhn.u16 d5, q13, q9 \n\t"
|
"vaddhn.u16 d5, q13, q9 \n\t"
|
||||||
"vaddhn.u16 d4, q12, q8 \n\t"
|
"vaddhn.u16 d4, q12, q8 \n\t"
|
||||||
// intentionally don't calculate alpha
|
// intentionally don't calculate alpha
|
||||||
// result in d4-d6
|
// result in d4-d6
|
||||||
|
|
||||||
#ifdef SK_PMCOLOR_IS_RGBA
|
#ifdef SK_PMCOLOR_IS_RGBA
|
||||||
"vqadd.u8 d6, d6, d0 \n\t"
|
"vqadd.u8 d6, d6, d0 \n\t"
|
||||||
"vqadd.u8 d5, d5, d1 \n\t"
|
"vqadd.u8 d5, d5, d1 \n\t"
|
||||||
"vqadd.u8 d4, d4, d2 \n\t"
|
"vqadd.u8 d4, d4, d2 \n\t"
|
||||||
#else
|
#else
|
||||||
"vqadd.u8 d6, d6, d2 \n\t"
|
"vqadd.u8 d6, d6, d2 \n\t"
|
||||||
"vqadd.u8 d5, d5, d1 \n\t"
|
"vqadd.u8 d5, d5, d1 \n\t"
|
||||||
"vqadd.u8 d4, d4, d0 \n\t"
|
"vqadd.u8 d4, d4, d0 \n\t"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// pack 8888 {d4-d6} to 0565 q10
|
// pack 8888 {d4-d6} to 0565 q10
|
||||||
"vshll.u8 q10, d6, #8 \n\t"
|
"vshll.u8 q10, d6, #8 \n\t"
|
||||||
"vshll.u8 q3, d5, #8 \n\t"
|
"vshll.u8 q3, d5, #8 \n\t"
|
||||||
"vshll.u8 q2, d4, #8 \n\t"
|
"vshll.u8 q2, d4, #8 \n\t"
|
||||||
"vsri.u16 q10, q3, #5 \n\t"
|
"vsri.u16 q10, q3, #5 \n\t"
|
||||||
"vsri.u16 q10, q2, #11 \n\t"
|
"vsri.u16 q10, q2, #11 \n\t"
|
||||||
|
|
||||||
"bne 2b \n\t"
|
"bne 2b \n\t"
|
||||||
|
|
||||||
"1: \n\t"
|
"1: \n\t"
|
||||||
"vst1.16 {q10}, [%[keep_dst]] \n\t"
|
"vst1.16 {q10}, [ip] \n\t"
|
||||||
: [count] "+r" (count)
|
: [count] "+r" (count)
|
||||||
: [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
|
: [dst] "r" (dst), [src] "r" (src), [tmp] "r"(tmp)
|
||||||
: "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
|
: "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
|
||||||
"d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
|
"d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
|
||||||
"d30","d31"
|
"d30","d31"
|
||||||
|
Loading…
Reference in New Issue
Block a user