[ia32][x64][liftoff] Share i64x2mul code

Optimize i64x2mul when AVX is supported to elide some moves.

Bug: v8:11589
Change-Id: Ide0bba502a35cbb632e3fc311c9697c5f54f9d82
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3163280
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76889}
This commit is contained in:
Ng Zhi An 2021-09-15 11:34:07 -07:00 committed by V8 LUCI CQ
parent af7232380f
commit 693112bfc0
6 changed files with 55 additions and 70 deletions

View File

@ -900,6 +900,51 @@ void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
Psubq(dst, xmm_tmp);
}
// Emits a lane-wise i64x2 multiply: for each 64-bit lane,
//   dst = lhs * rhs (mod 2^64)
// computed from 32-bit partial products, since SSE/AVX has no packed
// 64x64->64 multiply:
//   lo(l)*lo(r) + ((hi(l)*lo(r) + lo(l)*hi(r)) << 32)
// The hi(l)*hi(r) term is dropped because it only affects bits >= 64.
//
// dst may alias lhs and/or rhs; tmp1 and tmp2 must be distinct from
// dst, lhs, and rhs (enforced by the DCHECKs below). Clobbers tmp1
// and tmp2.
void SharedTurboAssembler::I64x2Mul(XMMRegister dst, XMMRegister lhs,
XMMRegister rhs, XMMRegister tmp1,
XMMRegister tmp2) {
// Temps must not overlap any operand: the algorithm reads lhs/rhs
// after writing the temps.
DCHECK(!AreAliased(dst, tmp1, tmp2));
DCHECK(!AreAliased(lhs, tmp1, tmp2));
DCHECK(!AreAliased(rhs, tmp1, tmp2));
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
// AVX three-operand forms let us shift/multiply straight into the
// temps without first copying the inputs (this is the move elision
// this path exists for).
// 1. Multiply high dword of each qword of left with right.
vpsrlq(tmp1, lhs, byte{32});
vpmuludq(tmp1, tmp1, rhs);
// 2. Multiply high dword of each qword of right with left.
vpsrlq(tmp2, rhs, byte{32});
vpmuludq(tmp2, tmp2, lhs);
// 3. Add 1 and 2, then shift left by 32 (this is the high dword of result).
vpaddq(tmp2, tmp2, tmp1);
vpsllq(tmp2, tmp2, byte{32});
// 4. Multiply low dwords (this is the low dword of result).
vpmuludq(dst, lhs, rhs);
// 5. Add 3 and 4.
vpaddq(dst, dst, tmp2);
} else {
// Same algorithm as AVX version, but with moves to not overwrite inputs.
movaps(tmp1, lhs);
movaps(tmp2, rhs);
// High dword of each qword of lhs times (low dword of) rhs.
psrlq(tmp1, byte{32});
pmuludq(tmp1, rhs);
// High dword of each qword of rhs times (low dword of) lhs.
psrlq(tmp2, byte{32});
pmuludq(tmp2, lhs);
// Sum the cross terms and shift into the high dword position.
paddq(tmp2, tmp1);
psllq(tmp2, byte{32});
// Low-dword product must end up in dst without clobbering an input
// we still need: pmuludq is destructive (dst is also a source).
if (dst == rhs) {
// pmuludq is commutative
pmuludq(dst, lhs);
} else {
if (dst != lhs) {
movaps(dst, lhs);
}
pmuludq(dst, rhs);
}
// Combine low product with the shifted cross terms.
paddq(dst, tmp2);
}
}
// 1. Unpack src0, src1 into even-number elements of scratch.
// 2. Unpack src1, src0 into even-number elements of dst.
// 3. Multiply 1. with 2.

View File

@ -441,6 +441,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
void I64x2ShrS(XMMRegister dst, XMMRegister src, Register shift,
XMMRegister xmm_tmp, XMMRegister xmm_shift,
Register tmp_shift);
void I64x2Mul(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
XMMRegister tmp1, XMMRegister tmp2);
void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool low, bool is_signed);
void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src);

View File

@ -1985,28 +1985,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kIA32I64x2Mul: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister left = i.InputSimd128Register(0);
XMMRegister right = i.InputSimd128Register(1);
XMMRegister tmp1 = i.TempSimd128Register(0);
XMMRegister tmp2 = i.TempSimd128Register(1);
__ Movaps(tmp1, left);
__ Movaps(tmp2, right);
// Multiply high dword of each qword of left with right.
__ Psrlq(tmp1, byte{32});
__ Pmuludq(tmp1, tmp1, right);
// Multiply high dword of each qword of right with left.
__ Psrlq(tmp2, byte{32});
__ Pmuludq(tmp2, tmp2, left);
__ Paddq(tmp2, tmp2, tmp1);
__ Psllq(tmp2, tmp2, byte{32});
__ Pmuludq(dst, left, right);
__ Paddq(dst, dst, tmp2);
__ I64x2Mul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), i.TempSimd128Register(0),
i.TempSimd128Register(1));
break;
}
case kIA32I64x2ShrU: {

View File

@ -2940,28 +2940,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I64x2Mul: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
XMMRegister left = i.InputSimd128Register(0);
XMMRegister right = i.InputSimd128Register(1);
XMMRegister tmp1 = i.TempSimd128Register(0);
XMMRegister tmp2 = kScratchDoubleReg;
__ Movdqa(tmp1, left);
__ Movdqa(tmp2, right);
// Multiply high dword of each qword of left with right.
__ Psrlq(tmp1, byte{32});
__ Pmuludq(tmp1, right);
// Multiply high dword of each qword of right with left.
__ Psrlq(tmp2, byte{32});
__ Pmuludq(tmp2, left);
__ Paddq(tmp2, tmp1);
__ Psllq(tmp2, byte{32});
__ Pmuludq(left, right);
__ Paddq(left, tmp2); // left == dst
__ I64x2Mul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), i.TempSimd128Register(0),
kScratchDoubleReg);
break;
}
case kX64I64x2Eq: {

View File

@ -3863,19 +3863,7 @@ void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
LiftoffRegister tmp2 =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs, tmp1));
Movaps(tmp1.fp(), lhs.fp());
Movaps(tmp2.fp(), rhs.fp());
// Multiply high dword of each qword of left with right.
Psrlq(tmp1.fp(), byte{32});
Pmuludq(tmp1.fp(), tmp1.fp(), rhs.fp());
// Multiply high dword of each qword of right with left.
Psrlq(tmp2.fp(), byte{32});
Pmuludq(tmp2.fp(), tmp2.fp(), lhs.fp());
Paddq(tmp2.fp(), tmp2.fp(), tmp1.fp());
Psllq(tmp2.fp(), tmp2.fp(), byte{32});
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmuludq, &Assembler::pmuludq>(
this, dst, lhs, rhs);
Paddq(dst.fp(), dst.fp(), tmp2.fp());
I64x2Mul(dst.fp(), lhs.fp(), rhs.fp(), tmp1.fp(), tmp2.fp());
}
void LiftoffAssembler::emit_i64x2_extmul_low_i32x4_s(LiftoffRegister dst,

View File

@ -3413,19 +3413,7 @@ void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
LiftoffRegister tmp2 =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs, tmp1));
Movaps(tmp1.fp(), lhs.fp());
Movaps(tmp2.fp(), rhs.fp());
// Multiply high dword of each qword of left with right.
Psrlq(tmp1.fp(), byte{32});
Pmuludq(tmp1.fp(), rhs.fp());
// Multiply high dword of each qword of right with left.
Psrlq(tmp2.fp(), byte{32});
Pmuludq(tmp2.fp(), lhs.fp());
Paddq(tmp2.fp(), tmp1.fp());
Psllq(tmp2.fp(), byte{32});
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmuludq, &Assembler::pmuludq>(
this, dst, lhs, rhs);
Paddq(dst.fp(), tmp2.fp());
I64x2Mul(dst.fp(), lhs.fp(), rhs.fp(), tmp1.fp(), tmp2.fp());
}
void LiftoffAssembler::emit_i64x2_extmul_low_i32x4_s(LiftoffRegister dst,