[wasm-simd][x64] Fix F64x2ConvertLowI32x4U isel and codegen

The previous instruction selection was too loose, it only required registers for the inputs. The codegen also used Unpcklps(dst, mask), and failed to use src at all. The test case was accidentally passing because dst == src (xmm0) by chance. We fix this bug requiring that for AVX, any register is fine, but for SSE, require dst == src. Also redefine Unpcklps to check dst == src in the no AVX case. Bug: v8:11265 Change-Id: I1988b2d2da8263512bf6e675e6297c50f55663f7 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2668918 Reviewed-by: Deepti Gandluri <gdeepti@chromium.org> Commit-Queue: Zhi An Ng <zhin@chromium.org> Cr-Commit-Position: refs/heads/master@{#72536}
2021-02-02 16:31:02 -08:00 · 2021-02-02 16:31:02 -08:00 · 5ce3afe2d7
commit 5ce3afe2d7
parent fec9c5d655
4 changed files with 25 additions and 5 deletions
--- a/src/codegen/x64/macro-assembler-x64.cc
+++ b/src/codegen/x64/macro-assembler-x64.cc
@ -1825,6 +1825,16 @@ void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
  }
 }

+void TurboAssembler::Unpcklps(XMMRegister dst, XMMRegister src1, Operand src2) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vunpcklps(dst, src1, src2);
+  } else {
+    DCHECK_EQ(dst, src1);
+    unpcklps(dst, src2);
+  }
+}
+
 void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                            byte imm8) {
  if (CpuFeatures::IsSupported(AVX)) {
--- a/src/codegen/x64/macro-assembler-x64.h
+++ b/src/codegen/x64/macro-assembler-x64.h
@ -162,7 +162,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
  AVX_OP(Addss, addss)
  AVX_OP(Addsd, addsd)
  AVX_OP(Mulsd, mulsd)
-  AVX_OP(Unpcklps, unpcklps)
  AVX_OP(Andps, andps)
  AVX_OP(Andnps, andnps)
  AVX_OP(Andpd, andpd)
@ -542,6 +541,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
  void Pmaddubsw(XMMRegister dst, XMMRegister src1, Operand src2);
  void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);

+  void Unpcklps(XMMRegister dst, XMMRegister src1, Operand src2);
  // Shufps that will mov src1 into dst if AVX is not supported.
  void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2, byte imm8);

--- a/src/compiler/backend/x64/code-generator-x64.cc
+++ b/src/compiler/backend/x64/code-generator-x64.cc
@ -2483,13 +2483,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
    }
    case kX64F64x2ConvertLowI32x4U: {
      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src = i.InputSimd128Register(0);
      // dst = [ src_low, 0x43300000, src_high, 0x4330000 ];
      // 0x43300000'00000000 is a special double where the significand bits
      // precisely represents all uint32 numbers.
      __ Unpcklps(
-          dst, __ ExternalReferenceAsOperand(
-                   ExternalReference::
-                       address_of_wasm_f64x2_convert_low_i32x4_u_int_mask()));
+          dst, src,
+          __ ExternalReferenceAsOperand(
+              ExternalReference::
+                  address_of_wasm_f64x2_convert_low_i32x4_u_int_mask()));
      __ Subpd(dst,
               __ ExternalReferenceAsOperand(
                   ExternalReference::address_of_wasm_double_2_power_52()));
--- a/src/compiler/backend/x64/instruction-selector-x64.cc
+++ b/src/compiler/backend/x64/instruction-selector-x64.cc
@ -10,7 +10,9 @@
 #include "src/base/platform/wrappers.h"
 #include "src/codegen/cpu-features.h"
 #include "src/codegen/machine-type.h"
+#include "src/compiler/backend/instruction-codes.h"
 #include "src/compiler/backend/instruction-selector-impl.h"
+#include "src/compiler/backend/instruction.h"
 #include "src/compiler/machine-operator.h"
 #include "src/compiler/node-matchers.h"
 #include "src/compiler/node-properties.h"
@ -2931,7 +2933,6 @@ VISIT_ATOMIC_BINOP(Xor)
 #define SIMD_UNOP_LIST(V)   \
  V(F64x2Sqrt)              \
  V(F64x2ConvertLowI32x4S)  \
-  V(F64x2ConvertLowI32x4U)  \
  V(F64x2PromoteLowF32x4)   \
  V(F32x4SConvertI32x4)     \
  V(F32x4Abs)               \
@ -3726,6 +3727,13 @@ void InstructionSelector::VisitI8x16Popcnt(Node* node) {
       arraysize(temps), temps);
 }

+void InstructionSelector::VisitF64x2ConvertLowI32x4U(Node* node) {
+  X64OperandGenerator g(this);
+  InstructionOperand dst =
+      IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
+  Emit(kX64F64x2ConvertLowI32x4U, dst, g.UseRegister(node->InputAt(0)));
+}
+
 void InstructionSelector::VisitI32x4TruncSatF64x2SZero(Node* node) {
  X64OperandGenerator g(this);
  if (CpuFeatures::IsSupported(AVX)) {