Fix new IT blocks for ARMv8

ARMv8 specifies that an IT block should be followed by only one 16-bit instruction.
* SkFloatToFixed_arm goes back to a single VCVT-based conversion (one instruction plus a memcpy of the result) instead of the hand-written assembly sequence that contained the offending IT block.

* S32A_D565_Opaque_neon no longer hard-codes the temporary 'ip' register inside the IT block; a named 'tmp' operand lets the compiler choose a suitable register there, and 'keep_dst' is replaced by 'ip' in the places where a low versus high register does not matter.
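For context, a minimal hypothetical sketch (not code from this change) of what an ARMv8-clean IT block looks like in GCC inline assembly: the IT predicates exactly one 16-bit Thumb instruction, and the 'l' constraint keeps the operand in a low register so the 16-bit encoding is actually available. Assumes a Thumb-2 capable ARM target.

    // Hypothetical example: conditionally negate a value with an IT block
    // that covers a single 16-bit instruction, which ARMv8 still permits.
    static inline int negate_if_negative(int v) {
        int out = v;
        asm("cmp   %0, #0     \n\t"   // set flags
            "it    lt         \n\t"   // predicate only the next instruction
            "rsblt %0, %0, #0 \n\t"   // 16-bit RSB encoding, fine after IT on ARMv8
            : "+l"(out)               // 'l': low register (r0-r7) in Thumb mode
            :
            : "cc");
        return out;
    }

If the operand were allowed to land in a high register, the predicated instruction would need a 32-bit encoding, which is exactly the IT-block usage ARMv8 deprecates.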

BUG=skia:

CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD

Change-Id: If587110a0c74b637ae99460419d46cf969c694fc
Reviewed-on: https://skia-review.googlesource.com/9346
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Author: Amaury Le Leyzour
Date: 2017-03-07 16:58:08 -08:00
Committed-by: Skia Commit-Bot
Parent: b8a1392b02
Commit: ac0e705af1
2 changed files with 56 additions and 66 deletions


@@ -102,19 +102,9 @@ inline SkFixed SkFixedMul_longlong(SkFixed a, SkFixed b) {
  */
 SK_ALWAYS_INLINE SkFixed SkFloatToFixed_arm(float x)
 {
-    int32_t y, z;
-    asm("movs %1, %3, lsl #1 \n"
-        "mov %2, #0x8E \n"
-        "sub %1, %2, %1, lsr #24 \n"
-        "mov %2, %3, lsl #8 \n"
-        "orr %2, %2, #0x80000000 \n"
-        "mov %1, %2, lsr %1 \n"
-        "it cs \n"
-        "rsbcs %1, %1, #0 \n"
-        : "=r"(x), "=&r"(y), "=&r"(z)
-        : "r"(x)
-        : "cc"
-        );
+    int32_t y;
+    asm("vcvt.s32.f32 %0, %0, #16": "+w"(x));
+    memcpy(&y, &x, sizeof(y));
     return y;
 }
 inline SkFixed SkFixedMul_arm(SkFixed x, SkFixed y)
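For readers who don't have VCVT's fixed-point form memorized: the single "vcvt.s32.f32 %0, %0, #16" above converts a float straight to signed 16.16 fixed point. A rough portable C sketch of the same conversion (illustration only; it is not Skia's fallback path and ignores VCVT's saturate-on-overflow behavior):

    #include <stdint.h>

    typedef int32_t SkFixed;   // signed 16.16 fixed point, as in Skia

    // Multiply by 2^16 and truncate toward zero -- the same rounding VCVT
    // to fixed point uses, minus the saturation on out-of-range inputs.
    static inline SkFixed SkFloatToFixed_sketch(float x) {
        return (SkFixed)(x * 65536.0f);
    }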


@@ -194,85 +194,85 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
     SkASSERT(255 == alpha);
     if (count >= 8) {
-        uint16_t* SK_RESTRICT keep_dst = 0;
+        int32_t tmp = 0;
         asm volatile (
-        "ands ip, %[count], #7 \n\t"
+        "ands %[tmp], %[count], #7 \n\t"
         "vmov.u8 d31, #1<<7 \n\t"
         "vld1.16 {q12}, [%[dst]] \n\t"
         "vld4.8 {d0-d3}, [%[src]] \n\t"
         // Thumb does not support the standard ARM conditional
         // instructions but instead requires the 'it' instruction
         // to signal conditional execution
         "it eq \n\t"
-        "moveq ip, #8 \n\t"
-        "mov %[keep_dst], %[dst] \n\t"
-        "add %[src], %[src], ip, LSL#2 \n\t"
-        "add %[dst], %[dst], ip, LSL#1 \n\t"
-        "subs %[count], %[count], ip \n\t"
+        "moveq %[tmp], #8 \n\t"
+        "mov ip, %[dst] \n\t"
+        "add %[src], %[src], %[tmp], LSL#2 \n\t"
+        "add %[dst], %[dst], %[tmp], LSL#1 \n\t"
+        "subs %[count], %[count], %[tmp] \n\t"
         "b 9f \n\t"
         // LOOP
         "2: \n\t"
         "vld1.16 {q12}, [%[dst]]! \n\t"
         "vld4.8 {d0-d3}, [%[src]]! \n\t"
-        "vst1.16 {q10}, [%[keep_dst]] \n\t"
-        "sub %[keep_dst], %[dst], #8*2 \n\t"
+        "vst1.16 {q10}, [ip] \n\t"
+        "sub ip, %[dst], #8*2 \n\t"
         "subs %[count], %[count], #8 \n\t"
         "9: \n\t"
         "pld [%[dst],#32] \n\t"
         // expand 0565 q12 to 8888 {d4-d7}
         "vmovn.u16 d4, q12 \n\t"
         "vshr.u16 q11, q12, #5 \n\t"
         "vshr.u16 q10, q12, #6+5 \n\t"
         "vmovn.u16 d5, q11 \n\t"
         "vmovn.u16 d6, q10 \n\t"
         "vshl.u8 d4, d4, #3 \n\t"
         "vshl.u8 d5, d5, #2 \n\t"
         "vshl.u8 d6, d6, #3 \n\t"
         "vmovl.u8 q14, d31 \n\t"
         "vmovl.u8 q13, d31 \n\t"
         "vmovl.u8 q12, d31 \n\t"
         // duplicate in 4/2/1 & 8pix vsns
         "vmvn.8 d30, d3 \n\t"
         "vmlal.u8 q14, d30, d6 \n\t"
         "vmlal.u8 q13, d30, d5 \n\t"
         "vmlal.u8 q12, d30, d4 \n\t"
         "vshr.u16 q8, q14, #5 \n\t"
         "vshr.u16 q9, q13, #6 \n\t"
         "vaddhn.u16 d6, q14, q8 \n\t"
         "vshr.u16 q8, q12, #5 \n\t"
         "vaddhn.u16 d5, q13, q9 \n\t"
         "vaddhn.u16 d4, q12, q8 \n\t"
         // intentionally don't calculate alpha
         // result in d4-d6
 #ifdef SK_PMCOLOR_IS_RGBA
         "vqadd.u8 d6, d6, d0 \n\t"
         "vqadd.u8 d5, d5, d1 \n\t"
         "vqadd.u8 d4, d4, d2 \n\t"
 #else
         "vqadd.u8 d6, d6, d2 \n\t"
         "vqadd.u8 d5, d5, d1 \n\t"
         "vqadd.u8 d4, d4, d0 \n\t"
 #endif
         // pack 8888 {d4-d6} to 0565 q10
         "vshll.u8 q10, d6, #8 \n\t"
         "vshll.u8 q3, d5, #8 \n\t"
         "vshll.u8 q2, d4, #8 \n\t"
         "vsri.u16 q10, q3, #5 \n\t"
         "vsri.u16 q10, q2, #11 \n\t"
         "bne 2b \n\t"
         "1: \n\t"
-        "vst1.16 {q10}, [%[keep_dst]] \n\t"
+        "vst1.16 {q10}, [ip] \n\t"
         : [count] "+r" (count)
-        : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
+        : [dst] "r" (dst), [src] "r" (src), [tmp] "r"(tmp)
         : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
         "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
         "d30","d31"