Reland "[wasm-simd][liftoff][x64] Move v128.select into macro-assembler"
This is a reland of 2d5f981a04
The fix is in liftoff-assembler-x64: when AVX is not supported and
dst != mask, copy the mask into dst and call S128Select with dst as the mask.
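As background, v128.select is a bitwise blend; a minimal scalar model of one
64-bit lane (illustrative only, not part of this change) is:

    #include <cstdint>

    // One 64-bit lane of v128.select: each result bit comes from v1 where the
    // corresponding mask bit is 1, and from v2 where it is 0.
    uint64_t SelectLane(uint64_t mask, uint64_t v1, uint64_t v2) {
      return (v1 & mask) | (v2 & ~mask);
    }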
Original change's description:
> [wasm-simd][liftoff][x64] Move v128.select into macro-assembler
>
> This allows us to reuse this optimized code sequence in Liftoff.
>
> We can't do the same thing on IA32 yet: there is no kScratchDoubleReg
> defined in macro-assembler-ia32.cc. It is defined in code-generator-ia32
> as xmm0, but I'm not sure it is safe to just use that in the macro assembler.
>
> Change-Id: I6c761857c49d2518fbc82cd0796c62fc86665cb5
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2596581
> Commit-Queue: Zhi An Ng <zhin@chromium.org>
> Reviewed-by: Clemens Backes <clemensb@chromium.org>
> Reviewed-by: Bill Budge <bbudge@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#71915}
Change-Id: Ib96ce0e1d5762f6513ef87f240b25ef3ae59441f
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2612324
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71961}
parent 2893b9fbd6
commit 025443a4a8
@@ -2214,6 +2214,25 @@ void TurboAssembler::Psrld(XMMRegister dst, XMMRegister src, byte imm8) {
   }
 }
 
+void TurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
+                                XMMRegister src1, XMMRegister src2) {
+  // v128.select = v128.or(v128.and(v1, c), v128.andnot(v2, c)).
+  // pandn(x, y) = !x & y, so we have to flip the mask and input.
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vpandn(kScratchDoubleReg, mask, src2);
+    vpand(dst, src1, mask);
+    vpor(dst, dst, kScratchDoubleReg);
+  } else {
+    DCHECK_EQ(dst, mask);
+    // Use float ops as they are 1 byte shorter than int ops.
+    movaps(kScratchDoubleReg, mask);
+    andnps(kScratchDoubleReg, src2);
+    andps(dst, src1);
+    orps(dst, kScratchDoubleReg);
+  }
+}
+
 void TurboAssembler::Lzcntl(Register dst, Register src) {
   if (CpuFeatures::IsSupported(LZCNT)) {
     CpuFeatureScope scope(this, LZCNT);
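The non-AVX branch above is built from destructive two-operand SSE
instructions, which is why the mask has to start out in dst; a scalar sketch
of that register discipline (illustrative only, one 64-bit lane, not V8 code)
is:

    #include <cstdint>

    // Scalar model of the non-AVX S128Select sequence. 'scratch' stands in
    // for kScratchDoubleReg, and 'dst' must hold the mask on entry, which is
    // what DCHECK_EQ(dst, mask) enforces.
    uint64_t SseSelectLane(uint64_t dst /* == mask */, uint64_t src1,
                           uint64_t src2) {
      uint64_t scratch = dst;     // movaps(kScratchDoubleReg, mask)
      scratch = ~scratch & src2;  // andnps(kScratchDoubleReg, src2)
      dst &= src1;                // andps(dst, src1)
      dst |= scratch;             // orps(dst, kScratchDoubleReg)
      return dst;                 // (src1 & mask) | (src2 & ~mask)
    }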
@@ -578,6 +578,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src);
   void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src);
 
+  // Requires dst == mask when AVX is not supported.
+  void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
+                  XMMRegister src2);
+
   void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                    bool low, bool is_signed);
   // Requires that dst == src1 if AVX is not supported.
@@ -3762,25 +3762,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64S128Select: {
-      // v128.select = v128.or(v128.and(v1, c), v128.andnot(v2, c)).
-      // pandn(x, y) = !x & y, so we have to flip the mask and input.
-      XMMRegister dst = i.OutputSimd128Register();
-      XMMRegister mask = i.InputSimd128Register(0);
-      XMMRegister src1 = i.InputSimd128Register(1);
-      XMMRegister src2 = i.InputSimd128Register(2);
-      if (CpuFeatures::IsSupported(AVX)) {
-        CpuFeatureScope avx_scope(tasm(), AVX);
-        __ vpandn(kScratchDoubleReg, mask, src2);
-        __ vpand(dst, src1, mask);
-        __ vpor(dst, dst, kScratchDoubleReg);
-      } else {
-        DCHECK_EQ(dst, mask);
-        // Use float ops as they are 1 byte shorter than int ops.
-        __ movaps(kScratchDoubleReg, mask);
-        __ andnps(kScratchDoubleReg, src2);
-        __ andps(dst, src1);
-        __ orps(dst, kScratchDoubleReg);
-      }
+      __ S128Select(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                    i.InputSimd128Register(1), i.InputSimd128Register(2));
       break;
     }
     case kX64S128AndNot: {
@@ -7,6 +7,7 @@
 
 #include "src/base/platform/wrappers.h"
 #include "src/codegen/assembler.h"
+#include "src/codegen/cpu-features.h"
 #include "src/heap/memory-chunk.h"
 #include "src/wasm/baseline/liftoff-assembler.h"
 #include "src/wasm/simd-shuffle.h"
@@ -2756,17 +2757,14 @@ void LiftoffAssembler::emit_s128_select(LiftoffRegister dst,
                                         LiftoffRegister src1,
                                         LiftoffRegister src2,
                                         LiftoffRegister mask) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vxorps(kScratchDoubleReg, src1.fp(), src2.fp());
-    vandps(kScratchDoubleReg, kScratchDoubleReg, mask.fp());
-    vxorps(dst.fp(), kScratchDoubleReg, src2.fp());
+  // Ensure that we don't overwrite any inputs with the movdqu below.
+  DCHECK_NE(dst, src1);
+  DCHECK_NE(dst, src2);
+  if (!CpuFeatures::IsSupported(AVX) && dst != mask) {
+    movdqu(dst.fp(), mask.fp());
+    S128Select(dst.fp(), dst.fp(), src1.fp(), src2.fp());
   } else {
-    movaps(kScratchDoubleReg, src1.fp());
-    xorps(kScratchDoubleReg, src2.fp());
-    andps(kScratchDoubleReg, mask.fp());
-    if (dst.fp() != src2.fp()) movaps(dst.fp(), src2.fp());
-    xorps(dst.fp(), kScratchDoubleReg);
+    S128Select(dst.fp(), mask.fp(), src1.fp(), src2.fp());
   }
 }
 