Fix IT blocks for ARMv8

ARMv8 specifies that an IT block should be followed by only one 16-bit instruction.

* SkFloatToFixed_arm is back to a C implementation that mirrors the assembly code.
* S32A_D565_Opaque_neon no longer hard-codes 'ip' as the temporary register used around the IT block; the temporary is now a compiler-allocated operand, so the compiler can choose what is best in the context of the IT block. 'keep_dst' is replaced by 'ip' in the places where it does not matter whether a low or a high register is used.

BUG=skia:
CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD

Change-Id: If587110a0c74b637ae99460419d46cf969c694fc
Reviewed-on: https://skia-review.googlesource.com/9346
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
2017-03-07 16:58:08 -08:00 · 2017-03-07 16:58:08 -08:00 · ac0e705af1
commit ac0e705af1
parent b8a1392b02
2 changed files with 56 additions and 66 deletions
--- a/include/private/SkFixed.h
+++ b/include/private/SkFixed.h
@ -102,19 +102,9 @@ inline SkFixed SkFixedMul_longlong(SkFixed a, SkFixed b) {
    */
    SK_ALWAYS_INLINE SkFixed SkFloatToFixed_arm(float x)
    {
-        int32_t y, z;
-        asm("movs    %1, %3, lsl #1         \n"
-            "mov     %2, #0x8E              \n"
-            "sub     %1, %2, %1, lsr #24    \n"
-            "mov     %2, %3, lsl #8         \n"
-            "orr     %2, %2, #0x80000000    \n"
-            "mov     %1, %2, lsr %1         \n"
-            "it cs                          \n"
-            "rsbcs   %1, %1, #0             \n"
-            : "=r"(x), "=&r"(y), "=&r"(z)
-            : "r"(x)
-            : "cc"
-            );
+        int32_t y;
+        asm("vcvt.s32.f32 %0, %0, #16": "+w"(x));
+        memcpy(&y, &x, sizeof(y));
        return y;
    }
    inline SkFixed SkFixedMul_arm(SkFixed x, SkFixed y)
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@ -194,10 +194,10 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
    SkASSERT(255 == alpha);

    if (count >= 8) {
-        uint16_t* SK_RESTRICT keep_dst = 0;
+        int32_t tmp = 0;

        asm volatile (
-                      "ands       ip, %[count], #7            \n\t"
+                      "ands       %[tmp], %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                      \n\t"
                      "vld1.16    {q12}, [%[dst]]                 \n\t"
                      "vld4.8     {d0-d3}, [%[src]]               \n\t"
@ -205,20 +205,20 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                      \n\t"
-                      "moveq      ip, #8                      \n\t"
-                      "mov        %[keep_dst], %[dst]         \n\t"
+                      "moveq      %[tmp], #8                      \n\t"
+                      "mov        ip, %[dst]             \n\t"

-                      "add        %[src], %[src], ip, LSL#2   \n\t"
-                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
-                      "subs       %[count], %[count], ip      \n\t"
+                      "add        %[src], %[src], %[tmp], LSL#2   \n\t"
+                      "add        %[dst], %[dst], %[tmp], LSL#1   \n\t"
+                      "subs       %[count], %[count], %[tmp]      \n\t"
                      "b          9f                              \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!                \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!              \n\t"
-                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
-                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
+                      "vst1.16    {q10}, [ip]            \n\t"
+                      "sub        ip, %[dst], #8*2       \n\t"
                      "subs       %[count], %[count], #8          \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                    \n\t"
@ -270,9 +270,9 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                      "bne        2b                              \n\t"

                      "1:                                         \n\t"
-                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
+                      "vst1.16      {q10}, [ip]          \n\t"
                      : [count] "+r" (count)
-                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
+                      : [dst] "r" (dst), [src] "r" (src), [tmp] "r"(tmp)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"