[wasm simd] Make F32x4Min/Max IEEE 754 compliant

- Changes min and max sequences to propagate NaNs and signed zeroes. - Note that NaN propagation must preserve canonical NaNs. This is achieved by always returning canonical NaNs. This is also consistent with the WebAssembly scalar math spec. Bug: v8:8639 Change-Id: I04fdefabc54ea60f4d02e2081c32444a02dd6a83 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1524634 Commit-Queue: Bill Budge <bbudge@chromium.org> Reviewed-by: Deepti Gandluri <gdeepti@chromium.org> Cr-Commit-Position: refs/heads/master@{#60414}
2019-03-21 13:30:49 -07:00 · 2019-03-21 13:30:49 -07:00 · aa706218d6
commit aa706218d6
parent 41a4651af6
14 changed files with 155 additions and 64 deletions
--- a/src/arm/simulator-arm.cc
+++ b/src/arm/simulator-arm.cc
@ -19,6 +19,7 @@
 #include "src/objects-inl.h"
 #include "src/ostreams.h"
 #include "src/runtime/runtime-utils.h"
+#include "src/utils.h"
 #include "src/vector.h"

 // Only build the simulator if not compiling for real ARM hardware.
@ -4181,10 +4182,8 @@ void CompareGreater(Simulator* simulator, int Vd, int Vm, int Vn, bool ge) {
 }

 float MinMax(float a, float b, bool is_min) {
-  if (std::isnan(a) || std::isnan(b)) return NAN;
-  return is_min ? fmin(a, b) : fmax(a, b);
+  return is_min ? JSMin(a, b) : JSMax(a, b);
 }
-
 template <typename T>
 T MinMax(T a, T b, bool is_min) {
  return is_min ? std::min(a, b) : std::max(a, b);
--- a/src/compiler/backend/ia32/code-generator-ia32.cc
+++ b/src/compiler/backend/ia32/code-generator-ia32.cc
@ -1952,48 +1952,73 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      break;
    }
    case kSSEF32x4Min: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      // minps doesn't propagate NaN lanes in the first source. Compare this
-      // with itself to generate 1's in those lanes (quiet NaNs) and or them
-      // with the result of minps to simulate NaN propagation.
-      __ movaps(kScratchDoubleReg, i.InputSimd128Register(0));
-      __ cmpps(kScratchDoubleReg, kScratchDoubleReg, 0x4);
-      __ minps(i.OutputSimd128Register(), i.InputSimd128Register(1));
-      __ orps(i.OutputSimd128Register(), kScratchDoubleReg);
-
+      XMMRegister src1 = i.InputSimd128Register(1),
+                  dst = i.OutputSimd128Register();
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
+      // The minps instruction doesn't propagate NaNs and +0's in its first
+      // operand. Perform minps in both orders, merge the resuls, and adjust.
+      __ movaps(kScratchDoubleReg, src1);
+      __ minps(kScratchDoubleReg, dst);
+      __ minps(dst, src1);
+      // propagate -0's and NaNs, which may be non-canonical.
+      __ orps(dst, kScratchDoubleReg);
+      // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
+      __ movaps(kScratchDoubleReg, dst);
+      __ cmpps(dst, dst, 4);
+      __ psrld(dst, 10);
+      __ andnps(dst, kScratchDoubleReg);
      break;
    }
    case kAVXF32x4Min: {
      CpuFeatureScope avx_scope(tasm(), AVX);
-      // See comment above for minps and NaN propagation.
-      __ vcmpneqps(kScratchDoubleReg, i.InputSimd128Register(0),
-                   i.InputSimd128Register(0));  // Is NaN?
-      __ vminps(i.OutputSimd128Register(), i.InputSimd128Register(0),
-                i.InputOperand(1));
-      __ vorps(i.OutputSimd128Register(), i.OutputSimd128Register(),
-               kScratchDoubleReg);  // re-NaN-imate.
+      XMMRegister dst = i.OutputSimd128Register();
+      Operand src1 = i.InputOperand(1);
+      // See comment above for correction of minps.
+      __ movups(kScratchDoubleReg, src1);
+      __ vminps(kScratchDoubleReg, kScratchDoubleReg, dst);
+      __ vminps(dst, dst, src1);
+      __ vorps(dst, dst, kScratchDoubleReg);
+      __ vcmpneqps(kScratchDoubleReg, dst, dst);
+      __ vpsrld(kScratchDoubleReg, kScratchDoubleReg, 10);
+      __ vandnps(dst, kScratchDoubleReg, dst);
      break;
    }
    case kSSEF32x4Max: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      // maxps doesn't propagate NaN lanes in the first source. Compare this
-      // with itself to generate 1's in those lanes (quiet NaNs) and or them
-      // with the result of maxps to simulate NaN propagation.
-      __ movaps(kScratchDoubleReg, i.InputSimd128Register(0));
-      __ cmpps(kScratchDoubleReg, kScratchDoubleReg, 0x4);
-      __ maxps(i.OutputSimd128Register(), i.InputSimd128Register(1));
-      __ orps(i.OutputSimd128Register(), kScratchDoubleReg);
+      XMMRegister src1 = i.InputSimd128Register(1),
+                  dst = i.OutputSimd128Register();
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
+      // The maxps instruction doesn't propagate NaNs and +0's in its first
+      // operand. Perform maxps in both orders, merge the resuls, and adjust.
+      __ movaps(kScratchDoubleReg, src1);
+      __ maxps(kScratchDoubleReg, dst);
+      __ maxps(dst, src1);
+      // Find discrepancies.
+      __ xorps(kScratchDoubleReg, dst);
+      // Propagate NaNs, which may be non-canonical.
+      __ orps(dst, kScratchDoubleReg);
+      // Propagate sign discrepancy. NaNs and correct lanes are preserved.
+      __ subps(dst, kScratchDoubleReg);
+      // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
+      __ movaps(kScratchDoubleReg, dst);
+      __ cmpps(dst, dst, 4);
+      __ psrld(dst, 10);
+      __ andnps(dst, kScratchDoubleReg);
      break;
    }
    case kAVXF32x4Max: {
      CpuFeatureScope avx_scope(tasm(), AVX);
-      // See comment above for maxps and NaN propagation.
-      __ vcmpneqps(kScratchDoubleReg, i.InputSimd128Register(0),
-                   i.InputSimd128Register(0));
-      __ vmaxps(i.OutputSimd128Register(), i.InputSimd128Register(0),
-                i.InputOperand(1));
-      __ vorps(i.OutputSimd128Register(), i.OutputSimd128Register(),
-               kScratchDoubleReg);
+      XMMRegister dst = i.OutputSimd128Register();
+      Operand src1 = i.InputOperand(1);
+      // See comment above for correction of maxps.
+      __ movaps(kScratchDoubleReg, src1);
+      __ vmaxps(kScratchDoubleReg, kScratchDoubleReg, dst);
+      __ vmaxps(dst, dst, src1);
+      __ vxorps(kScratchDoubleReg, kScratchDoubleReg, dst);
+      __ vorps(dst, dst, kScratchDoubleReg);
+      __ vsubps(dst, dst, kScratchDoubleReg);
+      __ vcmpneqps(kScratchDoubleReg, dst, dst);
+      __ vpsrld(kScratchDoubleReg, kScratchDoubleReg, 10);
+      __ vandnps(dst, kScratchDoubleReg, dst);
      break;
    }
    case kSSEF32x4Eq: {
--- a/src/compiler/backend/x64/code-generator-x64.cc
+++ b/src/compiler/backend/x64/code-generator-x64.cc
@ -2347,25 +2347,43 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      break;
    }
    case kX64F32x4Min: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      // minps doesn't propagate NaN lanes in the first source. Compare this
-      // with itself to generate 1's in those lanes (quiet NaNs) and or them
-      // with the result of minps to simulate NaN propagation.
-      __ movaps(kScratchDoubleReg, i.InputSimd128Register(0));
-      __ cmpps(kScratchDoubleReg, kScratchDoubleReg, 0x4);
-      __ minps(i.OutputSimd128Register(), i.InputSimd128Register(1));
-      __ orps(i.OutputSimd128Register(), kScratchDoubleReg);
+      XMMRegister src1 = i.InputSimd128Register(1),
+                  dst = i.OutputSimd128Register();
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
+      // The minps instruction doesn't propagate NaNs and +0's in its first
+      // operand. Perform minps in both orders, merge the resuls, and adjust.
+      __ movaps(kScratchDoubleReg, src1);
+      __ minps(kScratchDoubleReg, dst);
+      __ minps(dst, src1);
+      // propagate -0's and NaNs, which may be non-canonical.
+      __ orps(dst, kScratchDoubleReg);
+      // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
+      __ movaps(kScratchDoubleReg, dst);
+      __ cmpps(dst, dst, 4);
+      __ psrld(dst, 10);
+      __ andnps(dst, kScratchDoubleReg);
      break;
    }
    case kX64F32x4Max: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      // maxps doesn't propagate NaN lanes in the first source. Compare this
-      // with itself to generate 1's in those lanes (quiet NaNs) and or them
-      // with the result of maxps to simulate NaN propagation.
-      __ movaps(kScratchDoubleReg, i.InputSimd128Register(0));
-      __ cmpps(kScratchDoubleReg, kScratchDoubleReg, 0x4);
-      __ maxps(i.OutputSimd128Register(), i.InputSimd128Register(1));
-      __ orps(i.OutputSimd128Register(), kScratchDoubleReg);
+      XMMRegister src1 = i.InputSimd128Register(1),
+                  dst = i.OutputSimd128Register();
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
+      // The maxps instruction doesn't propagate NaNs and +0's in its first
+      // operand. Perform maxps in both orders, merge the resuls, and adjust.
+      __ movaps(kScratchDoubleReg, src1);
+      __ maxps(kScratchDoubleReg, dst);
+      __ maxps(dst, src1);
+      // Find discrepancies.
+      __ xorps(kScratchDoubleReg, dst);
+      // Propagate NaNs, which may be non-canonical.
+      __ orps(dst, kScratchDoubleReg);
+      // Propagate sign discrepancy. NaNs and correct lanes are preserved.
+      __ subps(dst, kScratchDoubleReg);
+      // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
+      __ movaps(kScratchDoubleReg, dst);
+      __ cmpps(dst, dst, 4);
+      __ psrld(dst, 10);
+      __ andnps(dst, kScratchDoubleReg);
      break;
    }
    case kX64F32x4Eq: {
--- a/src/ia32/assembler-ia32.cc
+++ b/src/ia32/assembler-ia32.cc
@ -2280,6 +2280,13 @@ void Assembler::andps(XMMRegister dst, Operand src) {
  emit_sse_operand(dst, src);
 }

+void Assembler::andnps(XMMRegister dst, Operand src) {
+  EnsureSpace ensure_space(this);
+  EMIT(0x0F);
+  EMIT(0x55);
+  emit_sse_operand(dst, src);
+}
+
 void Assembler::orps(XMMRegister dst, Operand src) {
  EnsureSpace ensure_space(this);
  EMIT(0x0F);
@ -2468,21 +2475,13 @@ void Assembler::cmpltsd(XMMRegister dst, XMMRegister src) {
  EMIT(1);  // LT == 1
 }

-
-void Assembler::movaps(XMMRegister dst, XMMRegister src) {
+void Assembler::movaps(XMMRegister dst, Operand src) {
  EnsureSpace ensure_space(this);
  EMIT(0x0F);
  EMIT(0x28);
  emit_sse_operand(dst, src);
 }

-void Assembler::movups(XMMRegister dst, XMMRegister src) {
-  EnsureSpace ensure_space(this);
-  EMIT(0x0F);
-  EMIT(0x10);
-  emit_sse_operand(dst, src);
-}
-
 void Assembler::movups(XMMRegister dst, Operand src) {
  EnsureSpace ensure_space(this);
  EMIT(0x0F);
--- a/src/ia32/assembler-ia32.h
+++ b/src/ia32/assembler-ia32.h
@ -856,8 +856,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {

  void ucomiss(XMMRegister dst, XMMRegister src) { ucomiss(dst, Operand(src)); }
  void ucomiss(XMMRegister dst, Operand src);
-  void movaps(XMMRegister dst, XMMRegister src);
-  void movups(XMMRegister dst, XMMRegister src);
+  void movaps(XMMRegister dst, XMMRegister src) { movaps(dst, Operand(src)); }
+  void movaps(XMMRegister dst, Operand src);
+  void movups(XMMRegister dst, XMMRegister src) { movups(dst, Operand(src)); }
  void movups(XMMRegister dst, Operand src);
  void movups(Operand dst, XMMRegister src);
  void shufps(XMMRegister dst, XMMRegister src, byte imm8);
@ -869,6 +870,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {

  void andps(XMMRegister dst, Operand src);
  void andps(XMMRegister dst, XMMRegister src) { andps(dst, Operand(src)); }
+  void andnps(XMMRegister dst, Operand src);
+  void andnps(XMMRegister dst, XMMRegister src) { andnps(dst, Operand(src)); }
  void xorps(XMMRegister dst, Operand src);
  void xorps(XMMRegister dst, XMMRegister src) { xorps(dst, Operand(src)); }
  void orps(XMMRegister dst, Operand src);
@ -1320,9 +1323,10 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
  void vhaddps(XMMRegister dst, XMMRegister src1, Operand src2) {
    vinstr(0x7C, dst, src1, src2, kF2, k0F, kWIG);
  }
-  void vmovaps(XMMRegister dst, XMMRegister src) {
-    vps(0x28, dst, xmm0, Operand(src));
-  }
+  void vmovaps(XMMRegister dst, XMMRegister src) { vmovaps(dst, Operand(src)); }
+  void vmovaps(XMMRegister dst, Operand src) { vps(0x28, dst, xmm0, src); }
+  void vmovups(XMMRegister dst, XMMRegister src) { vmovups(dst, Operand(src)); }
+  void vmovups(XMMRegister dst, Operand src) { vps(0x10, dst, xmm0, src); }
  void vshufps(XMMRegister dst, XMMRegister src1, XMMRegister src2, byte imm8) {
    vshufps(dst, src1, Operand(src2), imm8);
  }
@ -1500,6 +1504,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {

 #define PACKED_OP_LIST(V) \
  V(and, 0x54)            \
+  V(andn, 0x55)           \
  V(or, 0x56)             \
  V(xor, 0x57)            \
  V(add, 0x58)            \
--- a/src/ia32/disasm-ia32.cc
+++ b/src/ia32/disasm-ia32.cc
@ -1119,6 +1119,11 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
                       NameOfXMMRegister(vvvv));
        current += PrintRightXMMOperand(current);
        break;
+      case 0x55:
+        AppendToBuffer("vandnps %s,%s,", NameOfXMMRegister(regop),
+                       NameOfXMMRegister(vvvv));
+        current += PrintRightXMMOperand(current);
+        break;
      case 0x57:
        AppendToBuffer("vxorps %s,%s,", NameOfXMMRegister(regop),
                       NameOfXMMRegister(vvvv));
--- a/src/ia32/macro-assembler-ia32.h
+++ b/src/ia32/macro-assembler-ia32.h
@ -282,6 +282,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
  AVX_OP3_XO(Punpckhbw, punpckhbw)
  AVX_OP3_XO(Pxor, pxor)
  AVX_OP3_XO(Andps, andps)
+  AVX_OP3_XO(Andnps, andnps)
  AVX_OP3_XO(Andpd, andpd)
  AVX_OP3_XO(Xorps, xorps)
  AVX_OP3_XO(Xorpd, xorpd)
--- a/src/x64/assembler-x64.cc
+++ b/src/x64/assembler-x64.cc
@ -2829,6 +2829,21 @@ void Assembler::andps(XMMRegister dst, Operand src) {
  emit_sse_operand(dst, src);
 }

+void Assembler::andnps(XMMRegister dst, XMMRegister src) {
+  EnsureSpace ensure_space(this);
+  emit_optional_rex_32(dst, src);
+  emit(0x0F);
+  emit(0x55);
+  emit_sse_operand(dst, src);
+}
+
+void Assembler::andnps(XMMRegister dst, Operand src) {
+  EnsureSpace ensure_space(this);
+  emit_optional_rex_32(dst, src);
+  emit(0x0F);
+  emit(0x55);
+  emit_sse_operand(dst, src);
+}

 void Assembler::orps(XMMRegister dst, XMMRegister src) {
  EnsureSpace ensure_space(this);
--- a/src/x64/assembler-x64.h
+++ b/src/x64/assembler-x64.h
@ -872,6 +872,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {

  void andps(XMMRegister dst, XMMRegister src);
  void andps(XMMRegister dst, Operand src);
+  void andnps(XMMRegister dst, XMMRegister src);
+  void andnps(XMMRegister dst, Operand src);
  void orps(XMMRegister dst, XMMRegister src);
  void orps(XMMRegister dst, Operand src);
  void xorps(XMMRegister dst, XMMRegister src);
@ -1320,6 +1322,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
  AVX_SP_3(vmin, 0x5d)
  AVX_SP_3(vmax, 0x5f)
  AVX_P_3(vand, 0x54)
+  AVX_P_3(vandn, 0x55)
  AVX_P_3(vor, 0x56)
  AVX_P_3(vxor, 0x57)
  AVX_3(vcvtsd2ss, 0x5a, vsd)
--- a/src/x64/disasm-x64.cc
+++ b/src/x64/disasm-x64.cc
@ -1313,6 +1313,11 @@ int DisassemblerX64::AVXInstruction(byte* data) {
                       NameOfXMMRegister(vvvv));
        current += PrintRightXMMOperand(current);
        break;
+      case 0x55:
+        AppendToBuffer("vandnps %s,%s,", NameOfXMMRegister(regop),
+                       NameOfXMMRegister(vvvv));
+        current += PrintRightXMMOperand(current);
+        break;
      case 0x57:
        AppendToBuffer("vxorps %s,%s,", NameOfXMMRegister(regop),
                       NameOfXMMRegister(vvvv));
--- a/src/x64/macro-assembler-x64.h
+++ b/src/x64/macro-assembler-x64.h
@ -147,6 +147,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
  AVX_OP(Addsd, addsd)
  AVX_OP(Mulsd, mulsd)
  AVX_OP(Andps, andps)
+  AVX_OP(Andnps, andnps)
  AVX_OP(Andpd, andpd)
  AVX_OP(Orpd, orpd)
  AVX_OP(Cmpeqps, cmpeqps)
--- a/test/cctest/test-disasm-ia32.cc
+++ b/test/cctest/test-disasm-ia32.cc
@ -399,6 +399,8 @@ TEST(DisasmIa320) {
    // logic operation
    __ andps(xmm0, xmm1);
    __ andps(xmm0, Operand(ebx, ecx, times_4, 10000));
+    __ andnps(xmm0, xmm1);
+    __ andnps(xmm0, Operand(ebx, ecx, times_4, 10000));
    __ orps(xmm0, xmm1);
    __ orps(xmm0, Operand(ebx, ecx, times_4, 10000));
    __ xorps(xmm0, xmm1);
@ -619,6 +621,8 @@ TEST(DisasmIa320) {

      __ vandps(xmm0, xmm1, xmm2);
      __ vandps(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000));
+      __ vandnps(xmm0, xmm1, xmm2);
+      __ vandnps(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000));
      __ vxorps(xmm0, xmm1, xmm2);
      __ vxorps(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000));
      __ vaddps(xmm0, xmm1, xmm2);
--- a/test/cctest/test-disasm-x64.cc
+++ b/test/cctest/test-disasm-x64.cc
@ -395,6 +395,8 @@ TEST(DisasmX64) {
    // logic operation
    __ andps(xmm0, xmm1);
    __ andps(xmm0, Operand(rbx, rcx, times_4, 10000));
+    __ andnps(xmm0, xmm1);
+    __ andnps(xmm0, Operand(rbx, rcx, times_4, 10000));
    __ orps(xmm0, xmm1);
    __ orps(xmm0, Operand(rbx, rcx, times_4, 10000));
    __ xorps(xmm0, xmm1);
@ -684,6 +686,8 @@ TEST(DisasmX64) {

      __ vandps(xmm0, xmm9, xmm2);
      __ vandps(xmm9, xmm1, Operand(rbx, rcx, times_4, 10000));
+      __ vandnps(xmm0, xmm9, xmm2);
+      __ vandnps(xmm9, xmm1, Operand(rbx, rcx, times_4, 10000));
      __ vxorps(xmm0, xmm1, xmm9);
      __ vxorps(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
      __ vhaddps(xmm0, xmm1, xmm9);
--- a/test/cctest/wasm/test-run-wasm-simd.cc
+++ b/test/cctest/wasm/test-run-wasm-simd.cc
@ -520,8 +520,15 @@ void RunF32x4BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
        float actual = ReadLittleEndianValue<float>(&g[i]);
        if (std::isnan(expected)) {
          CHECK(std::isnan(actual));
+          // Finally, check that NaN outputs are canonical. The sign bit is
+          // allowed to be non-deterministic.
+          uint32_t expected_bits = bit_cast<uint32_t>(expected) & ~0x80000000;
+          uint32_t actual_bits = bit_cast<uint32_t>(actual) & ~0x80000000;
+          CHECK_EQ(expected_bits, actual_bits);
        } else {
          CHECK_EQ(expected, actual);
+          // The sign of 0's must match.
+          CHECK_EQ(std::signbit(expected), std::signbit(actual));
        }
      }
    }