[wasm-simd] Implement F64x2 min max for x64

Also add an IsExtreme(double) overload. This wasn't causing issues because there was no codepath which exercised it (only approx operations did). Change-Id: If7583fb567137c428d16c0d2cdfc37e086f7f3fd Bug: v8:8460 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1726675 Reviewed-by: Michael Starzinger <mstarzinger@chromium.org> Reviewed-by: Bill Budge <bbudge@chromium.org> Commit-Queue: Zhi An Ng <zhin@chromium.org> Cr-Commit-Position: refs/heads/master@{#63053}
2019-07-31 13:13:36 -07:00 · 2019-07-31 13:13:36 -07:00 · e17ac92556
commit e17ac92556
parent 5cf67ad933
16 changed files with 97 additions and 3 deletions
--- a/src/codegen/x64/assembler-x64.h
+++ b/src/codegen/x64/assembler-x64.h
@ -1335,10 +1335,10 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
  AVX_S_3(vsub, 0x5c)
  AVX_S_3(vmul, 0x59)
  AVX_SP_3(vdiv, 0x5e)
-  AVX_SP_3(vmin, 0x5d)
-  AVX_SP_3(vmax, 0x5f)
+  AVX_S_3(vmin, 0x5d)
+  AVX_S_3(vmax, 0x5f)
  AVX_P_3(vand, 0x54)
-  AVX_P_3(vandn, 0x55)
+  AVX_3(vandnps, 0x55, vps)
  AVX_P_3(vor, 0x56)
  AVX_P_3(vxor, 0x57)
  AVX_3(vcvtsd2ss, 0x5a, vsd)
--- a/src/codegen/x64/sse-instr.h
+++ b/src/codegen/x64/sse-instr.h
@ -6,10 +6,13 @@
 #define V8_CODEGEN_X64_SSE_INSTR_H_

 #define SSE2_INSTRUCTION_LIST(V) \
+  V(andnpd, 66, 0F, 55)          \
  V(addpd, 66, 0F, 58)           \
  V(mulpd, 66, 0F, 59)           \
  V(cvtps2dq, 66, 0F, 5B)        \
  V(subpd, 66, 0F, 5C)           \
+  V(minpd, 66, 0F, 5D)           \
+  V(maxpd, 66, 0F, 5F)           \
  V(punpcklbw, 66, 0F, 60)       \
  V(punpcklwd, 66, 0F, 61)       \
  V(punpckldq, 66, 0F, 62)       \
--- a/src/compiler/backend/instruction-selector.cc
+++ b/src/compiler/backend/instruction-selector.cc
@ -1833,6 +1833,10 @@ void InstructionSelector::VisitNode(Node* node) {
      return MarkAsSimd128(node), VisitF64x2Sub(node);
    case IrOpcode::kF64x2Mul:
      return MarkAsSimd128(node), VisitF64x2Mul(node);
+    case IrOpcode::kF64x2Min:
+      return MarkAsSimd128(node), VisitF64x2Min(node);
+    case IrOpcode::kF64x2Max:
+      return MarkAsSimd128(node), VisitF64x2Max(node);
    case IrOpcode::kF64x2Eq:
      return MarkAsSimd128(node), VisitF64x2Eq(node);
    case IrOpcode::kF64x2Ne:
@ -2575,6 +2579,8 @@ void InstructionSelector::VisitF64x2Neg(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF64x2Add(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF64x2Sub(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF64x2Mul(Node* node) { UNIMPLEMENTED(); }
+void InstructionSelector::VisitF64x2Min(Node* node) { UNIMPLEMENTED(); }
+void InstructionSelector::VisitF64x2Max(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF64x2Eq(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF64x2Ne(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF64x2Lt(Node* node) { UNIMPLEMENTED(); }
--- a/src/compiler/backend/x64/code-generator-x64.cc
+++ b/src/compiler/backend/x64/code-generator-x64.cc
@ -2296,6 +2296,45 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      ASSEMBLE_SSE_BINOP(mulpd);
      break;
    }
+    case kX64F64x2Min: {
+      XMMRegister src1 = i.InputSimd128Register(1),
+                  dst = i.OutputSimd128Register();
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
+      // The minpd instruction doesn't propagate NaNs and -0's in its first
+      // operand. Perform minpd in both orders, merge the results, and adjust.
+      __ movapd(kScratchDoubleReg, src1);
+      __ minpd(kScratchDoubleReg, dst);
+      __ minpd(dst, src1);
+      // Propagate -0's and NaNs, which may be non-canonical.
+      __ orpd(kScratchDoubleReg, dst);
+      // Canonicalize NaNs by quieting and clearing the payload.
+      __ cmppd(dst, kScratchDoubleReg, 3);  // 3 = unordered: NaN lane mask.
+      __ orpd(kScratchDoubleReg, dst);
+      __ psrlq(dst, 13);  // Mask now covers only the low 51 payload bits.
+      __ andnpd(dst, kScratchDoubleReg);
+      break;
+    }
+    case kX64F64x2Max: {
+      XMMRegister src1 = i.InputSimd128Register(1),
+                  dst = i.OutputSimd128Register();
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
+      // The maxpd instruction doesn't propagate NaNs and +0's in its first
+      // operand. Perform maxpd in both orders, merge the results, and adjust.
+      __ movapd(kScratchDoubleReg, src1);
+      __ maxpd(kScratchDoubleReg, dst);
+      __ maxpd(dst, src1);
+      // Find discrepancies.
+      __ xorpd(dst, kScratchDoubleReg);
+      // Propagate NaNs, which may be non-canonical.
+      __ orpd(kScratchDoubleReg, dst);
+      // Propagate sign discrepancy and (subtle) quiet NaNs.
+      __ subpd(kScratchDoubleReg, dst);
+      // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
+      __ cmppd(dst, kScratchDoubleReg, 3);  // 3 = unordered: NaN lane mask.
+      __ psrlq(dst, 13);  // Mask now covers only the low 51 payload bits.
+      __ andnpd(dst, kScratchDoubleReg);
+      break;
+    }
    case kX64F64x2Eq: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ cmpeqpd(i.OutputSimd128Register(), i.InputSimd128Register(1));
--- a/src/compiler/backend/x64/instruction-codes-x64.h
+++ b/src/compiler/backend/x64/instruction-codes-x64.h
@ -167,6 +167,8 @@ namespace compiler {
  V(X64F64x2Add)                          \
  V(X64F64x2Sub)                          \
  V(X64F64x2Mul)                          \
+  V(X64F64x2Min)                          \
+  V(X64F64x2Max)                          \
  V(X64F64x2Eq)                           \
  V(X64F64x2Ne)                           \
  V(X64F64x2Lt)                           \
--- a/src/compiler/backend/x64/instruction-scheduler-x64.cc
+++ b/src/compiler/backend/x64/instruction-scheduler-x64.cc
@ -132,6 +132,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64F64x2Add:
    case kX64F64x2Sub:
    case kX64F64x2Mul:
+    case kX64F64x2Min:
+    case kX64F64x2Max:
    case kX64F64x2Eq:
    case kX64F64x2Ne:
    case kX64F64x2Lt:
--- a/src/compiler/backend/x64/instruction-selector-x64.cc
+++ b/src/compiler/backend/x64/instruction-selector-x64.cc
@ -2590,6 +2590,8 @@ VISIT_ATOMIC_BINOP(Xor)
  V(F64x2Add)              \
  V(F64x2Sub)              \
  V(F64x2Mul)              \
+  V(F64x2Min)              \
+  V(F64x2Max)              \
  V(F64x2Eq)               \
  V(F64x2Ne)               \
  V(F64x2Lt)               \
--- a/src/compiler/machine-operator.cc
+++ b/src/compiler/machine-operator.cc
@ -251,6 +251,8 @@ MachineType AtomicOpType(Operator const* op) {
  V(F64x2Add, Operator::kCommutative, 2, 0, 1)                                \
  V(F64x2Sub, Operator::kNoProperties, 2, 0, 1)                               \
  V(F64x2Mul, Operator::kCommutative, 2, 0, 1)                                \
+  V(F64x2Min, Operator::kCommutative, 2, 0, 1)                                \
+  V(F64x2Max, Operator::kCommutative, 2, 0, 1)                                \
  V(F64x2Eq, Operator::kCommutative, 2, 0, 1)                                 \
  V(F64x2Ne, Operator::kCommutative, 2, 0, 1)                                 \
  V(F64x2Lt, Operator::kNoProperties, 2, 0, 1)                                \
--- a/src/compiler/machine-operator.h
+++ b/src/compiler/machine-operator.h
@ -475,6 +475,8 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
  const Operator* F64x2Sub();
  const Operator* F64x2Mul();
  const Operator* F64x2ExtractLane(int32_t);
+  const Operator* F64x2Min();
+  const Operator* F64x2Max();
  const Operator* F64x2ReplaceLane(int32_t);
  const Operator* F64x2Eq();
  const Operator* F64x2Ne();
--- a/src/compiler/opcodes.h
+++ b/src/compiler/opcodes.h
@ -749,6 +749,8 @@
  V(F64x2Add)                   \
  V(F64x2Sub)                   \
  V(F64x2Mul)                   \
+  V(F64x2Min)                   \
+  V(F64x2Max)                   \
  V(F64x2Eq)                    \
  V(F64x2Ne)                    \
  V(F64x2Lt)                    \
--- a/src/compiler/wasm-compiler.cc
+++ b/src/compiler/wasm-compiler.cc
@ -4013,6 +4013,12 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
    case wasm::kExprF64x2Mul:
      return graph()->NewNode(mcgraph()->machine()->F64x2Mul(), inputs[0],
                              inputs[1]);
+    case wasm::kExprF64x2Min:
+      return graph()->NewNode(mcgraph()->machine()->F64x2Min(), inputs[0],
+                              inputs[1]);
+    case wasm::kExprF64x2Max:
+      return graph()->NewNode(mcgraph()->machine()->F64x2Max(), inputs[0],
+                              inputs[1]);
    case wasm::kExprF64x2Eq:
      return graph()->NewNode(mcgraph()->machine()->F64x2Eq(), inputs[0],
                              inputs[1]);
--- a/src/diagnostics/x64/disasm-x64.cc
+++ b/src/diagnostics/x64/disasm-x64.cc
@ -1847,6 +1847,8 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
        const char* mnemonic;
        if (opcode == 0x54) {
          mnemonic = "andpd";
+        } else if (opcode == 0x55) {
+          mnemonic = "andnpd";
        } else if (opcode == 0x56) {
          mnemonic = "orpd";
        } else if (opcode == 0x57) {
@ -1859,6 +1861,10 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
          mnemonic = "cvtps2dq";
        } else if (opcode == 0x5C) {
          mnemonic = "subpd";
+        } else if (opcode == 0x5D) {
+          mnemonic = "minpd";
+        } else if (opcode == 0x5F) {
+          mnemonic = "maxpd";
        } else if (opcode == 0x60) {
          mnemonic = "punpcklbw";
        } else if (opcode == 0x61) {
--- a/src/wasm/wasm-interpreter.cc
+++ b/src/wasm/wasm-interpreter.cc
@ -2244,6 +2244,8 @@ class ThreadImpl {
      BINOP_CASE(F64x2Add, f64x2, float2, 2, a + b)
      BINOP_CASE(F64x2Sub, f64x2, float2, 2, a - b)
      BINOP_CASE(F64x2Mul, f64x2, float2, 2, a * b)
+      BINOP_CASE(F64x2Min, f64x2, float2, 2, JSMin(a, b))
+      BINOP_CASE(F64x2Max, f64x2, float2, 2, JSMax(a, b))
      BINOP_CASE(F32x4Add, f32x4, float4, 4, a + b)
      BINOP_CASE(F32x4Sub, f32x4, float4, 4, a - b)
      BINOP_CASE(F32x4Mul, f32x4, float4, 4, a * b)
--- a/src/wasm/wasm-opcodes.cc
+++ b/src/wasm/wasm-opcodes.cc
@ -247,7 +247,9 @@ const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
    CASE_F32x4_OP(AddHoriz, "add_horizontal")
    CASE_F32x4_OP(RecipApprox, "recip_approx")
    CASE_F32x4_OP(RecipSqrtApprox, "recip_sqrt_approx")
+    CASE_F64x2_OP(Min, "min")
    CASE_F32x4_OP(Min, "min")
+    CASE_F64x2_OP(Max, "max")
    CASE_F32x4_OP(Max, "max")
    CASE_F32x4_OP(Lt, "lt")
    CASE_F32x4_OP(Le, "le")
--- a/src/wasm/wasm-opcodes.h
+++ b/src/wasm/wasm-opcodes.h
@ -406,6 +406,8 @@ bool IsJSCompatibleSignature(const FunctionSig* sig, bool hasBigIntFeature);
  V(F64x2Add, 0xfda5, s_ss)              \
  V(F64x2Sub, 0xfda6, s_ss)              \
  V(F64x2Mul, 0xfda7, s_ss)              \
+  V(F64x2Min, 0xfda9, s_ss)              \
+  V(F64x2Max, 0xfdaa, s_ss)              \
  V(I32x4SConvertF32x4, 0xfdab, s_s)     \
  V(I32x4UConvertF32x4, 0xfdac, s_s)     \
  V(F32x4SConvertI32x4, 0xfdaf, s_s)     \
--- a/test/cctest/wasm/test-run-wasm-simd.cc
+++ b/test/cctest/wasm/test-run-wasm-simd.cc
@ -1038,6 +1038,14 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2Le) {
  RunF64x2CompareOpTest(execution_tier, lower_simd, kExprF64x2Le, LessEqual);
 }

+bool IsExtreme(double x) {  // True for nonzero |x| outside the thresholds.
+  double abs_x = std::fabs(x);
+  const double kSmallFloatThreshold = 1.0e-298;  // Nonzero |x| below this.
+  const double kLargeFloatThreshold = 1.0e298;   // |x| above this.
+  return abs_x != 0.0f &&  // 0 and -0 are fine; 0.0f promotes to double.
+         (abs_x < kSmallFloatThreshold || abs_x > kLargeFloatThreshold);
+}
+
 bool IsSameNan(double expected, double actual) {
  // Sign is non-deterministic.
  uint64_t expected_bits = bit_cast<uint64_t>(expected) & ~0x8000000000000000;
@ -1209,6 +1217,14 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2Mul) {
  RunF64x2BinOpTest(execution_tier, lower_simd, kExprF64x2Mul, Mul);
 }

+WASM_SIMD_TEST_NO_LOWERING(F64x2Min) {  // JSMin: as in the interpreter.
+  RunF64x2BinOpTest(execution_tier, lower_simd, kExprF64x2Min, JSMin);
+}
+
+WASM_SIMD_TEST_NO_LOWERING(F64x2Max) {  // JSMax: as in the interpreter.
+  RunF64x2BinOpTest(execution_tier, lower_simd, kExprF64x2Max, JSMax);
+}
+
 #undef FOR_FLOAT64_NAN_INPUTS

 WASM_SIMD_TEST_NO_LOWERING(I64x2ExtractWithF64x2) {