[wasm-simd] Implement rounding average on x64 and interpreter

This change also templatizes the test helpers so that the same test
function can be reused for both signed and unsigned data types.

We implement the new function RoundingAverageUnsigned in overflowing-math,
rather than in base/utils, since the intermediate addition can overflow the
unwidened lane type.
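
To illustrate why (a standalone sketch, not part of the patch): if the sum
is reduced modulo the lane type before the shift, the carry bit of the
rounding average is lost; widening to uint64_t first preserves it.

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t a = 255, b = 255;
      // Sum wrapped to the lane width, as unwidened lane arithmetic would do:
      uint8_t wrapped = static_cast<uint8_t>(a + b + 1);  // 511 mod 256 = 255
      assert((wrapped >> 1) == 127);                      // wrong average
      // Widened first, as RoundingAverageUnsigned does:
      uint64_t wide = static_cast<uint64_t>(a) + static_cast<uint64_t>(b) + 1;
      assert((wide >> 1) == 255);                         // avgr_u(255, 255) == 255
      return 0;
    }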

SIMD scalar lowering and implementation for other backends will follow
in future patches.

Bug: v8:10039
Change-Id: I70735f7b6536f197869ef1afbccaf5649e7e8448
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1958007
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Tobias Tebbi <tebbi@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65531}
Author:    Ng Zhi An
Committed: 2019-12-09 10:32:55 +00:00 by Commit Bot
Parent:    c82338295a
Commit:    90b42052c6

17 changed files with 92 additions and 14 deletions

src/base/overflowing-math.h

@@ -83,6 +83,13 @@ inline float RecipSqrt(float a) {
   return -std::numeric_limits<float>::infinity();
 }
 
+template <typename T>
+inline T RoundingAverageUnsigned(T a, T b) {
+  static_assert(std::is_unsigned<T>::value, "Only for unsigned types");
+  static_assert(sizeof(T) < sizeof(uint64_t), "Must be smaller than uint64_t");
+  return (static_cast<uint64_t>(a) + static_cast<uint64_t>(b) + 1) >> 1;
+}
+
 }  // namespace base
 }  // namespace v8

src/codegen/x64/macro-assembler-x64.h

@@ -187,6 +187,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   AVX_OP(Pxor, pxor)
   AVX_OP(Psubd, psubd)
   AVX_OP(Pslld, pslld)
+  AVX_OP(Pavgb, pavgb)
+  AVX_OP(Pavgw, pavgw)
   AVX_OP(Psrad, psrad)
   AVX_OP(Psrld, psrld)
   AVX_OP(Paddd, paddd)
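
Note (schematic sketch, not from the patch; types and helpers are V8's):
each AVX_OP entry generates a macro-assembler wrapper that emits the
VEX-encoded form when AVX is available and falls back to the SSE2 form
otherwise. Roughly, AVX_OP(Pavgw, pavgw) provides the equivalent of:

    void Pavgw(XMMRegister dst, XMMRegister src) {
      if (CpuFeatures::IsSupported(AVX)) {
        CpuFeatureScope avx_scope(this, AVX);
        vpavgw(dst, dst, src);  // AVX: three-operand, sources preserved
      } else {
        pavgw(dst, src);        // SSE2: two-operand, dst is also a source
      }
    }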

src/codegen/x64/sse-instr.h

@@ -79,8 +79,10 @@
   V(psllw, 66, 0F, F1) \
   V(pslld, 66, 0F, F2) \
   V(psllq, 66, 0F, F3) \
+  V(pavgb, 66, 0F, E0) \
   V(psraw, 66, 0F, E1) \
   V(psrad, 66, 0F, E2) \
+  V(pavgw, 66, 0F, E3) \
   V(psrlw, 66, 0F, D1) \
   V(psrld, 66, 0F, D2) \
   V(psrlq, 66, 0F, D3) \
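
The columns after each mnemonic are its encoding (mandatory prefix 66,
escape byte 0F, opcode byte), so pavgb is 66 0F E0 and pavgw is 66 0F E3.
Their semantics match the SSE2 unsigned-average intrinsics; a standalone
check (not part of the patch):

    #include <emmintrin.h>  // SSE2
    #include <cassert>
    #include <cstdint>

    int main() {
      __m128i a = _mm_set1_epi8(static_cast<char>(255));
      __m128i b = _mm_set1_epi8(1);
      __m128i avg = _mm_avg_epu8(a, b);  // per lane: (255 + 1 + 1) >> 1 = 128
      assert(static_cast<uint8_t>(_mm_extract_epi16(avg, 0)) == 128);
      return 0;
    }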

src/compiler/backend/instruction-selector.cc

@@ -2095,6 +2095,8 @@ void InstructionSelector::VisitNode(Node* node) {
       return MarkAsSimd128(node), VisitI16x8GtU(node);
     case IrOpcode::kI16x8GeU:
       return MarkAsSimd128(node), VisitI16x8GeU(node);
+    case IrOpcode::kI16x8RoundingAverageU:
+      return MarkAsSimd128(node), VisitI16x8RoundingAverageU(node);
     case IrOpcode::kI8x16Splat:
       return MarkAsSimd128(node), VisitI8x16Splat(node);
     case IrOpcode::kI8x16ExtractLaneU:
@@ -2149,6 +2151,8 @@ void InstructionSelector::VisitNode(Node* node) {
       return MarkAsSimd128(node), VisitI8x16GtU(node);
     case IrOpcode::kI8x16GeU:
       return MarkAsSimd128(node), VisitI8x16GeU(node);
+    case IrOpcode::kI8x16RoundingAverageU:
+      return MarkAsSimd128(node), VisitI8x16RoundingAverageU(node);
     case IrOpcode::kS128Zero:
       return MarkAsSimd128(node), VisitS128Zero(node);
     case IrOpcode::kS128And:
@@ -2630,6 +2634,12 @@ void InstructionSelector::VisitF64x2SConvertI64x2(Node* node) {
 void InstructionSelector::VisitF64x2UConvertI64x2(Node* node) {
   UNIMPLEMENTED();
 }
+void InstructionSelector::VisitI16x8RoundingAverageU(Node* node) {
+  UNIMPLEMENTED();
+}
+void InstructionSelector::VisitI8x16RoundingAverageU(Node* node) {
+  UNIMPLEMENTED();
+}
 #if !V8_TARGET_ARCH_ARM64
 #if !V8_TARGET_ARCH_ARM
 void InstructionSelector::VisitLoadTransform(Node* node) { UNIMPLEMENTED(); }

src/compiler/backend/x64/code-generator-x64.cc

@@ -3339,6 +3339,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ pcmpeqw(dst, src);
       break;
     }
+    case kX64I16x8RoundingAverageU: {
+      __ Pavgw(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      break;
+    }
     case kX64I8x16Splat: {
       CpuFeatureScope sse_scope(tasm(), SSSE3);
       XMMRegister dst = i.OutputSimd128Register();
@@ -3578,6 +3582,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ pcmpeqb(dst, src);
       break;
     }
+    case kX64I8x16RoundingAverageU: {
+      __ Pavgb(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      break;
+    }
     case kX64S128And: {
       __ pand(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
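
Pavgw/Pavgb take only the output register and input 1 here because, as
with the neighboring binop cases, the instruction selector is assumed to
constrain the output to the same register as input 0, letting the
two-operand SSE form overwrite its first input in place. A sketch of that
selection pattern (assumed; not part of this hunk):

    void VisitBinop(InstructionSelector* selector, Node* node,
                    ArchOpcode opcode) {
      X64OperandGenerator g(selector);
      // Output aliases input 0, so "pavgw dst, src1" computes dst = avg(in0, in1).
      selector->Emit(opcode, g.DefineSameAsFirst(node),
                     g.UseRegister(node->InputAt(0)),
                     g.UseRegister(node->InputAt(1)));
    }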

src/compiler/backend/x64/instruction-codes-x64.h

@@ -274,6 +274,7 @@ namespace compiler {
   V(X64I16x8MaxU)              \
   V(X64I16x8GtU)               \
   V(X64I16x8GeU)               \
+  V(X64I16x8RoundingAverageU)  \
   V(X64I8x16Splat)             \
   V(X64I8x16ExtractLaneU)      \
   V(X64I8x16ExtractLaneS)      \
@@ -301,6 +302,7 @@ namespace compiler {
   V(X64I8x16MaxU)              \
   V(X64I8x16GtU)               \
   V(X64I8x16GeU)               \
+  V(X64I8x16RoundingAverageU)  \
   V(X64S128Zero)               \
   V(X64S128Not)                \
   V(X64S128And)                \

src/compiler/backend/x64/instruction-scheduler-x64.cc

@@ -246,6 +246,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I16x8MaxU:
     case kX64I16x8GtU:
     case kX64I16x8GeU:
+    case kX64I16x8RoundingAverageU:
     case kX64I8x16Splat:
     case kX64I8x16ExtractLaneU:
     case kX64I8x16ExtractLaneS:
@@ -273,6 +274,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I8x16MaxU:
     case kX64I8x16GtU:
     case kX64I8x16GeU:
+    case kX64I8x16RoundingAverageU:
     case kX64S128And:
     case kX64S128Or:
     case kX64S128Xor:

src/compiler/backend/x64/instruction-selector-x64.cc

@@ -2675,6 +2675,7 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I16x8MinU)              \
   V(I16x8MaxU)              \
   V(I16x8GeU)               \
+  V(I16x8RoundingAverageU)  \
   V(I8x16SConvertI16x8)     \
   V(I8x16Add)               \
   V(I8x16AddSaturateS)      \
@@ -2690,6 +2691,7 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I8x16MinU)              \
   V(I8x16MaxU)              \
   V(I8x16GeU)               \
+  V(I8x16RoundingAverageU)  \
   V(S128And)                \
   V(S128Or)                 \
   V(S128Xor)

src/compiler/machine-operator.cc

@@ -421,6 +421,7 @@ MachineType AtomicOpType(Operator const* op) {
   V(I16x8MaxU, Operator::kCommutative, 2, 0, 1)              \
   V(I16x8GtU, Operator::kNoProperties, 2, 0, 1)              \
   V(I16x8GeU, Operator::kNoProperties, 2, 0, 1)              \
+  V(I16x8RoundingAverageU, Operator::kCommutative, 2, 0, 1)  \
   V(I8x16Splat, Operator::kNoProperties, 1, 0, 1)            \
   V(I8x16Neg, Operator::kNoProperties, 1, 0, 1)              \
   V(I8x16Shl, Operator::kNoProperties, 2, 0, 1)              \
@@ -445,6 +446,7 @@ MachineType AtomicOpType(Operator const* op) {
   V(I8x16MaxU, Operator::kCommutative, 2, 0, 1)              \
   V(I8x16GtU, Operator::kNoProperties, 2, 0, 1)              \
   V(I8x16GeU, Operator::kNoProperties, 2, 0, 1)              \
+  V(I8x16RoundingAverageU, Operator::kCommutative, 2, 0, 1)  \
   V(S128Load, Operator::kNoProperties, 2, 0, 1)              \
   V(S128Store, Operator::kNoProperties, 3, 0, 1)             \
   V(S128Zero, Operator::kNoProperties, 0, 0, 1)              \
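
Reading the columns by this list's apparent convention, V(name, properties,
value_input_count, control_input_count, output_count): both new operators
are pure binops (two value inputs, one output) marked Operator::kCommutative,
which is sound because the rounding average is symmetric in its operands.
A standalone check of that symmetry (not part of the patch):

    #include <cassert>
    #include <cstdint>

    uint16_t AvgrU(uint16_t a, uint16_t b) {
      return static_cast<uint16_t>(
          (static_cast<uint64_t>(a) + static_cast<uint64_t>(b) + 1) >> 1);
    }

    int main() {
      // Sample the 16-bit range with coprime strides.
      for (uint32_t a = 0; a < 0x10000; a += 257) {
        for (uint32_t b = 0; b < 0x10000; b += 263) {
          assert(AvgrU(static_cast<uint16_t>(a), static_cast<uint16_t>(b)) ==
                 AvgrU(static_cast<uint16_t>(b), static_cast<uint16_t>(a)));
        }
      }
      return 0;
    }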

src/compiler/machine-operator.h

@@ -666,6 +666,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
   const Operator* I16x8MaxU();
   const Operator* I16x8GtU();
   const Operator* I16x8GeU();
+  const Operator* I16x8RoundingAverageU();
   const Operator* I8x16Splat();
   const Operator* I8x16ExtractLaneU(int32_t);
@@ -695,6 +696,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
   const Operator* I8x16MaxU();
   const Operator* I8x16GtU();
   const Operator* I8x16GeU();
+  const Operator* I8x16RoundingAverageU();
   const Operator* S128Load();
   const Operator* S128Store();

src/compiler/opcodes.h

@@ -876,6 +876,7 @@
   V(I16x8LeU)               \
   V(I16x8GtU)               \
   V(I16x8GeU)               \
+  V(I16x8RoundingAverageU)  \
   V(I8x16Splat)             \
   V(I8x16ExtractLaneU)      \
   V(I8x16ExtractLaneS)      \
@@ -907,6 +908,7 @@
   V(I8x16LeU)               \
   V(I8x16GtU)               \
   V(I8x16GeU)               \
+  V(I8x16RoundingAverageU)  \
   V(S128Load)               \
   V(S128Store)              \
   V(S128Zero)               \

src/compiler/wasm-compiler.cc

@@ -4407,6 +4407,9 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
     case wasm::kExprI16x8GeU:
       return graph()->NewNode(mcgraph()->machine()->I16x8GeU(), inputs[0],
                               inputs[1]);
+    case wasm::kExprI16x8RoundingAverageU:
+      return graph()->NewNode(mcgraph()->machine()->I16x8RoundingAverageU(),
+                              inputs[0], inputs[1]);
     case wasm::kExprI8x16Splat:
       return graph()->NewNode(mcgraph()->machine()->I8x16Splat(), inputs[0]);
     case wasm::kExprI8x16Neg:
@@ -4489,6 +4492,9 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
     case wasm::kExprI8x16GeU:
       return graph()->NewNode(mcgraph()->machine()->I8x16GeU(), inputs[0],
                               inputs[1]);
+    case wasm::kExprI8x16RoundingAverageU:
+      return graph()->NewNode(mcgraph()->machine()->I8x16RoundingAverageU(),
+                              inputs[0], inputs[1]);
     case wasm::kExprS128And:
       return graph()->NewNode(mcgraph()->machine()->S128And(), inputs[0],
                               inputs[1]);

src/diagnostics/x64/disasm-x64.cc

@@ -2006,10 +2006,14 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
     mnemonic = "paddusw";
   } else if (opcode == 0xDE) {
     mnemonic = "pmaxub";
+  } else if (opcode == 0xE0) {
+    mnemonic = "pavgb";
   } else if (opcode == 0xE1) {
     mnemonic = "psraw";
   } else if (opcode == 0xE2) {
     mnemonic = "psrad";
+  } else if (opcode == 0xE3) {
+    mnemonic = "pavgw";
   } else if (opcode == 0xE8) {
     mnemonic = "psubsb";
   } else if (opcode == 0xE9) {

src/wasm/wasm-interpreter.cc

@@ -2325,6 +2325,8 @@ class ThreadImpl {
       BINOP_CASE(I16x8AddSaturateU, i16x8, int8, 8, SaturateAdd<uint16_t>(a, b))
      BINOP_CASE(I16x8SubSaturateS, i16x8, int8, 8, SaturateSub<int16_t>(a, b))
      BINOP_CASE(I16x8SubSaturateU, i16x8, int8, 8, SaturateSub<uint16_t>(a, b))
+      BINOP_CASE(I16x8RoundingAverageU, i16x8, int8, 8,
+                 base::RoundingAverageUnsigned<uint16_t>(a, b))
       BINOP_CASE(I8x16Add, i8x16, int16, 16, base::AddWithWraparound(a, b))
       BINOP_CASE(I8x16Sub, i8x16, int16, 16, base::SubWithWraparound(a, b))
       BINOP_CASE(I8x16Mul, i8x16, int16, 16, base::MulWithWraparound(a, b))
@@ -2340,6 +2342,8 @@ class ThreadImpl {
       BINOP_CASE(I8x16SubSaturateS, i8x16, int16, 16, SaturateSub<int8_t>(a, b))
       BINOP_CASE(I8x16SubSaturateU, i8x16, int16, 16,
                  SaturateSub<uint8_t>(a, b))
+      BINOP_CASE(I8x16RoundingAverageU, i8x16, int16, 16,
+                 base::RoundingAverageUnsigned<uint8_t>(a, b))
 #undef BINOP_CASE
 #define UNOP_CASE(op, name, stype, count, expr) \
   case kExpr##op: {                             \
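
A note on the macro arguments: the lane-view names in this file count
lanes rather than bits, so int8 is the 8-lane (16-bit) view used by the
I16x8 ops and int16 is the 16-lane (8-bit) view used by the I8x16 ops.
BINOP_CASE evaluates the expression once per lane, with a and b bound to
the corresponding lanes of the two inputs. A standalone model of the
I16x8 case (simplified; not V8's actual macro expansion):

    #include <array>
    #include <cassert>
    #include <cstdint>

    using int8 = std::array<int16_t, 8>;  // 8 lanes of 16 bits, per V8's naming

    int8 I16x8RoundingAverageU(const int8& s1, const int8& s2) {
      int8 res;
      for (int i = 0; i < 8; ++i) {
        // Lanes are stored signed; the unsigned op reinterprets them.
        uint16_t a = static_cast<uint16_t>(s1[i]);
        uint16_t b = static_cast<uint16_t>(s2[i]);
        res[i] = static_cast<int16_t>(
            (static_cast<uint64_t>(a) + static_cast<uint64_t>(b) + 1) >> 1);
      }
      return res;
    }

    int main() {
      int8 a, b;
      a.fill(static_cast<int16_t>(0xffff));  // 65535 in each lane, as unsigned
      b.fill(1);
      assert(static_cast<uint16_t>(I16x8RoundingAverageU(a, b)[0]) == 0x8000);
      return 0;
    }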

src/wasm/wasm-opcodes.cc

@@ -334,6 +334,9 @@ const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
     CASE_I64x2_OP(Load32x2S, "load32x2_s")
     CASE_I64x2_OP(Load32x2U, "load32x2_u")
+    CASE_I8x16_OP(RoundingAverageU, "avgr_u")
+    CASE_I16x8_OP(RoundingAverageU, "avgr_u")
 
     // Atomic operations.
     CASE_OP(AtomicNotify, "atomic.notify")
     CASE_INT_OP(AtomicWait, "atomic.wait")

src/wasm/wasm-opcodes.h

@@ -444,6 +444,8 @@ bool IsJSCompatibleSignature(const FunctionSig* sig, const WasmFeatures&);
   V(I32x4Load16x4U, 0xfdd5, s_s)          \
   V(I64x2Load32x2S, 0xfdd6, s_s)          \
   V(I64x2Load32x2U, 0xfdd7, s_s)          \
+  V(I8x16RoundingAverageU, 0xfdd9, s_ss)  \
+  V(I16x8RoundingAverageU, 0xfdda, s_ss)  \
   V(I16x8AddHoriz, 0xfdbd, s_ss)          \
   V(I32x4AddHoriz, 0xfdbe, s_ss)          \
   V(F32x4AddHoriz, 0xfdbf, s_ss)          \
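
Here s_ss denotes the s128 x s128 -> s128 signature. In the binary format,
assuming the usual SIMD-proposal encoding, these opcodes serialize as the
0xfd prefix followed by the opcode index as a LEB128 u32, so the two
instructions appear in a module as:

    #include <cstdint>

    // Sketch (assumed encoding): 0xfd prefix + LEB128(opcode index).
    const uint8_t kI8x16AvgrU[] = {0xfd, 0xd9, 0x01};  // 0xd9 = 217 -> LEB128 d9 01
    const uint8_t kI16x8AvgrU[] = {0xfd, 0xda, 0x01};  // 0xda = 218 -> LEB128 da 01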

test/cctest/wasm/test-run-wasm-simd.cc

@@ -2059,11 +2059,12 @@ WASM_SIMD_TEST(I16x8Neg) {
                      base::NegateWithWraparound);
 }
 
+template <typename T = int16_t, typename OpType = T (*)(T, T)>
 void RunI16x8BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
-                       WasmOpcode opcode, Int16BinOp expected_op) {
-  WasmRunner<int32_t, int32_t, int32_t> r(execution_tier, lower_simd);
+                       WasmOpcode opcode, OpType expected_op) {
+  WasmRunner<int32_t, T, T> r(execution_tier, lower_simd);
   // Global to hold output.
-  int16_t* g = r.builder().AddGlobal<int16_t>(kWasmS128);
+  T* g = r.builder().template AddGlobal<T>(kWasmS128);
   // Build fn to splat test values, perform binop, and write the result.
   byte value1 = 0, value2 = 1;
   byte temp1 = r.AllocateLocal(kWasmS128);
@@ -2074,12 +2075,12 @@ void RunI16x8BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                                            WASM_GET_LOCAL(temp2))),
         WASM_ONE);
 
-  FOR_INT16_INPUTS(x) {
-    FOR_INT16_INPUTS(y) {
+  for (T x : compiler::ValueHelper::GetVector<T>()) {
+    for (T y : compiler::ValueHelper::GetVector<T>()) {
       r.Call(x, y);
-      int16_t expected = expected_op(x, y);
+      T expected = expected_op(x, y);
       for (int i = 0; i < 8; i++) {
-        CHECK_EQ(expected, ReadLittleEndianValue<int16_t>(&g[i]));
+        CHECK_EQ(expected, ReadLittleEndianValue<T>(&g[i]));
       }
     }
   }
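
The .template in r.builder().template AddGlobal<T>(kWasmS128) is the C++
dependent-name disambiguator: r's type now depends on T, so without it the
< after AddGlobal would parse as a less-than operator. A minimal standalone
illustration (hypothetical Builder type, not the test's):

    template <typename T, typename Builder>
    T* AddTypedGlobal(Builder& builder) {
      // Without ".template", "builder.AddGlobal < T" parses as a comparison.
      return builder.template AddGlobal<T>(0);
    }

    struct DummyBuilder {
      template <typename T>
      T* AddGlobal(int /*kind*/) { static T slot{}; return &slot; }
    };

    int main() {
      DummyBuilder builder;
      *AddTypedGlobal<int>(builder) = 42;
      return 0;
    }
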
@@ -2180,6 +2181,14 @@ WASM_SIMD_TEST(I16x8LeU) {
                     UnsignedLessEqual);
 }
 
+#if V8_TARGET_ARCH_X64
+WASM_SIMD_TEST_NO_LOWERING(I16x8RoundingAverageU) {
+  RunI16x8BinOpTest<uint16_t>(execution_tier, lower_simd,
+                              kExprI16x8RoundingAverageU,
+                              base::RoundingAverageUnsigned);
+}
+#endif  // V8_TARGET_ARCH_X64
+
 void RunI16x8ShiftOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                          WasmOpcode opcode, Int16ShiftOp expected_op) {
   // Intentionally shift by 16, should be no-op.
@@ -2276,11 +2285,12 @@ WASM_SIMD_TEST(I8x16ConvertI16x8) {
   }
 }
 
+template <typename T = int8_t, typename OpType = T (*)(T, T)>
 void RunI8x16BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
-                       WasmOpcode opcode, Int8BinOp expected_op) {
-  WasmRunner<int32_t, int32_t, int32_t> r(execution_tier, lower_simd);
+                       WasmOpcode opcode, OpType expected_op) {
+  WasmRunner<int32_t, T, T> r(execution_tier, lower_simd);
   // Global to hold output.
-  int8_t* g = r.builder().AddGlobal<int8_t>(kWasmS128);
+  T* g = r.builder().template AddGlobal<T>(kWasmS128);
   // Build fn to splat test values, perform binop, and write the result.
   byte value1 = 0, value2 = 1;
   byte temp1 = r.AllocateLocal(kWasmS128);
@@ -2291,12 +2301,12 @@ void RunI8x16BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                                            WASM_GET_LOCAL(temp2))),
         WASM_ONE);
 
-  FOR_INT8_INPUTS(x) {
-    FOR_INT8_INPUTS(y) {
+  for (T x : compiler::ValueHelper::GetVector<T>()) {
+    for (T y : compiler::ValueHelper::GetVector<T>()) {
       r.Call(x, y);
-      int8_t expected = expected_op(x, y);
+      T expected = expected_op(x, y);
       for (int i = 0; i < 16; i++) {
-        CHECK_EQ(expected, ReadLittleEndianValue<int8_t>(&g[i]));
+        CHECK_EQ(expected, ReadLittleEndianValue<T>(&g[i]));
       }
     }
   }
@@ -2397,6 +2407,14 @@ WASM_SIMD_TEST(I8x16Mul) {
                     base::MulWithWraparound);
 }
 
+#if V8_TARGET_ARCH_X64
+WASM_SIMD_TEST_NO_LOWERING(I8x16RoundingAverageU) {
+  RunI8x16BinOpTest<uint8_t>(execution_tier, lower_simd,
+                             kExprI8x16RoundingAverageU,
+                             base::RoundingAverageUnsigned);
+}
+#endif  // V8_TARGET_ARCH_X64
+
 void RunI8x16ShiftOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                          WasmOpcode opcode, Int8ShiftOp expected_op) {
   // Intentionally shift by 8, should be no-op.