[wasm-simd] Implement rounding average on x64 and interpreter

This change also templatizes the test helpers so that the same test
function can be reused for both signed and unsigned data types.

We implement the new function RoundingAverageUnsigned in overflowing-math,
rather than in base/utils, since the intermediate addition can overflow the
unwidened lane type.
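
To illustrate why (a standalone sketch, not part of the patch): if the sum
is reduced modulo the lane type before the shift, the carry bit of the
rounding average is lost; widening to uint64_t first preserves it.

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t a = 255, b = 255;
      // Sum wrapped to the lane width, as unwidened lane arithmetic would do:
      uint8_t wrapped = static_cast<uint8_t>(a + b + 1);  // 511 mod 256 = 255
      assert((wrapped >> 1) == 127);                      // wrong average
      // Widened first, as RoundingAverageUnsigned does:
      uint64_t wide = static_cast<uint64_t>(a) + static_cast<uint64_t>(b) + 1;
      assert((wide >> 1) == 255);                         // avgr_u(255, 255) == 255
      return 0;
    }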

SIMD scalar lowering and implementation for other backends will follow
in future patches.

Bug: v8:10039
Change-Id: I70735f7b6536f197869ef1afbccaf5649e7e8448
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1958007
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Tobias Tebbi <tebbi@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65531}
Author:    Ng Zhi An
Committed: 2019-12-09 10:32:55 +00:00 by Commit Bot
Parent:    c82338295a
Commit:    90b42052c6

17 changed files with 92 additions and 14 deletions

src/base/overflowing-math.h

@@ -83,6 +83,13 @@ inline float RecipSqrt(float a) {
   return -std::numeric_limits<float>::infinity();
 }
 
+template <typename T>
+inline T RoundingAverageUnsigned(T a, T b) {
+  static_assert(std::is_unsigned<T>::value, "Only for unsigned types");
+  static_assert(sizeof(T) < sizeof(uint64_t), "Must be smaller than uint64_t");
+  return (static_cast<uint64_t>(a) + static_cast<uint64_t>(b) + 1) >> 1;
+}
+
 }  // namespace base
 }  // namespace v8

src/codegen/x64/macro-assembler-x64.h

@@ -187,6 +187,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   AVX_OP(Pxor, pxor)
   AVX_OP(Psubd, psubd)
   AVX_OP(Pslld, pslld)
+  AVX_OP(Pavgb, pavgb)
+  AVX_OP(Pavgw, pavgw)
   AVX_OP(Psrad, psrad)
   AVX_OP(Psrld, psrld)
   AVX_OP(Paddd, paddd)
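
Note (schematic sketch, not from the patch; types and helpers are V8's):
each AVX_OP entry generates a macro-assembler wrapper that emits the
VEX-encoded form when AVX is available and falls back to the SSE2 form
otherwise. Roughly, AVX_OP(Pavgw, pavgw) provides the equivalent of:

    void Pavgw(XMMRegister dst, XMMRegister src) {
      if (CpuFeatures::IsSupported(AVX)) {
        CpuFeatureScope avx_scope(this, AVX);
        vpavgw(dst, dst, src);  // AVX: three-operand, sources preserved
      } else {
        pavgw(dst, src);        // SSE2: two-operand, dst is also a source
      }
    }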

src/codegen/x64/sse-instr.h

@@ -79,8 +79,10 @@
   V(psllw, 66, 0F, F1) \
   V(pslld, 66, 0F, F2) \
   V(psllq, 66, 0F, F3) \
+  V(pavgb, 66, 0F, E0) \
   V(psraw, 66, 0F, E1) \
   V(psrad, 66, 0F, E2) \
+  V(pavgw, 66, 0F, E3) \
   V(psrlw, 66, 0F, D1) \
   V(psrld, 66, 0F, D2) \
   V(psrlq, 66, 0F, D3) \
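
The columns after each mnemonic are its encoding (mandatory prefix 66,
escape byte 0F, opcode byte), so pavgb is 66 0F E0 and pavgw is 66 0F E3.
Their semantics match the SSE2 unsigned-average intrinsics; a standalone
check (not part of the patch):

    #include <emmintrin.h>  // SSE2
    #include <cassert>
    #include <cstdint>

    int main() {
      __m128i a = _mm_set1_epi8(static_cast<char>(255));
      __m128i b = _mm_set1_epi8(1);
      __m128i avg = _mm_avg_epu8(a, b);  // per lane: (255 + 1 + 1) >> 1 = 128
      assert(static_cast<uint8_t>(_mm_extract_epi16(avg, 0)) == 128);
      return 0;
    }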

src/compiler/backend/instruction-selector.cc

@@ -2095,6 +2095,8 @@ void InstructionSelector::VisitNode(Node* node) {
       return MarkAsSimd128(node), VisitI16x8GtU(node);
     case IrOpcode::kI16x8GeU:
       return MarkAsSimd128(node), VisitI16x8GeU(node);
+    case IrOpcode::kI16x8RoundingAverageU:
+      return MarkAsSimd128(node), VisitI16x8RoundingAverageU(node);
     case IrOpcode::kI8x16Splat:
       return MarkAsSimd128(node), VisitI8x16Splat(node);
     case IrOpcode::kI8x16ExtractLaneU:
@@ -2149,6 +2151,8 @@ void InstructionSelector::VisitNode(Node* node) {
       return MarkAsSimd128(node), VisitI8x16GtU(node);
     case IrOpcode::kI8x16GeU:
       return MarkAsSimd128(node), VisitI8x16GeU(node);
+    case IrOpcode::kI8x16RoundingAverageU:
+      return MarkAsSimd128(node), VisitI8x16RoundingAverageU(node);
     case IrOpcode::kS128Zero:
       return MarkAsSimd128(node), VisitS128Zero(node);
     case IrOpcode::kS128And:
@@ -2630,6 +2634,12 @@ void InstructionSelector::VisitF64x2SConvertI64x2(Node* node) {
 void InstructionSelector::VisitF64x2UConvertI64x2(Node* node) {
   UNIMPLEMENTED();
 }
+void InstructionSelector::VisitI16x8RoundingAverageU(Node* node) {
+  UNIMPLEMENTED();
+}
+void InstructionSelector::VisitI8x16RoundingAverageU(Node* node) {
+  UNIMPLEMENTED();
+}
 #if !V8_TARGET_ARCH_ARM64
 #if !V8_TARGET_ARCH_ARM
 void InstructionSelector::VisitLoadTransform(Node* node) { UNIMPLEMENTED(); }

src/compiler/backend/x64/code-generator-x64.cc

@@ -3339,6 +3339,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ pcmpeqw(dst, src);
       break;
     }
+    case kX64I16x8RoundingAverageU: {
+      __ Pavgw(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      break;
+    }
     case kX64I8x16Splat: {
       CpuFeatureScope sse_scope(tasm(), SSSE3);
       XMMRegister dst = i.OutputSimd128Register();
@@ -3578,6 +3582,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ pcmpeqb(dst, src);
       break;
     }
+    case kX64I8x16RoundingAverageU: {
+      __ Pavgb(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      break;
+    }
     case kX64S128And: {
       __ pand(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
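
Pavgw/Pavgb take only the output register and input 1 here because, as
with the neighboring binop cases, the instruction selector is assumed to
constrain the output to the same register as input 0, letting the
two-operand SSE form overwrite its first input in place. A sketch of that
selection pattern (assumed; not part of this hunk):

    void VisitBinop(InstructionSelector* selector, Node* node,
                    ArchOpcode opcode) {
      X64OperandGenerator g(selector);
      // Output aliases input 0, so "pavgw dst, src1" computes dst = avg(in0, in1).
      selector->Emit(opcode, g.DefineSameAsFirst(node),
                     g.UseRegister(node->InputAt(0)),
                     g.UseRegister(node->InputAt(1)));
    }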

src/compiler/backend/x64/instruction-codes-x64.h

@@ -274,6 +274,7 @@ namespace compiler {
   V(X64I16x8MaxU)              \
   V(X64I16x8GtU)               \
   V(X64I16x8GeU)               \
+  V(X64I16x8RoundingAverageU)  \
   V(X64I8x16Splat)             \
   V(X64I8x16ExtractLaneU)      \
   V(X64I8x16ExtractLaneS)      \
@@ -301,6 +302,7 @@ namespace compiler {
   V(X64I8x16MaxU)              \
   V(X64I8x16GtU)               \
   V(X64I8x16GeU)               \
+  V(X64I8x16RoundingAverageU)  \
   V(X64S128Zero)               \
   V(X64S128Not)                \
   V(X64S128And)                \

src/compiler/backend/x64/instruction-scheduler-x64.cc

@@ -246,6 +246,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I16x8MaxU:
     case kX64I16x8GtU:
     case kX64I16x8GeU:
+    case kX64I16x8RoundingAverageU:
     case kX64I8x16Splat:
     case kX64I8x16ExtractLaneU:
     case kX64I8x16ExtractLaneS:
@@ -273,6 +274,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I8x16MaxU:
     case kX64I8x16GtU:
     case kX64I8x16GeU:
+    case kX64I8x16RoundingAverageU:
     case kX64S128And:
     case kX64S128Or:
     case kX64S128Xor:

src/compiler/backend/x64/instruction-selector-x64.cc

@@ -2675,6 +2675,7 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I16x8MinU)              \
   V(I16x8MaxU)              \
   V(I16x8GeU)               \
+  V(I16x8RoundingAverageU)  \
   V(I8x16SConvertI16x8)     \
   V(I8x16Add)               \
   V(I8x16AddSaturateS)      \
@@ -2690,6 +2691,7 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I8x16MinU)              \
   V(I8x16MaxU)              \
   V(I8x16GeU)               \
+  V(I8x16RoundingAverageU)  \
   V(S128And)                \
   V(S128Or)                 \
   V(S128Xor)

src/compiler/machine-operator.cc

@@ -421,6 +421,7 @@ MachineType AtomicOpType(Operator const* op) {
   V(I16x8MaxU, Operator::kCommutative, 2, 0, 1)              \
   V(I16x8GtU, Operator::kNoProperties, 2, 0, 1)              \
   V(I16x8GeU, Operator::kNoProperties, 2, 0, 1)              \
+  V(I16x8RoundingAverageU, Operator::kCommutative, 2, 0, 1)  \
   V(I8x16Splat, Operator::kNoProperties, 1, 0, 1)            \
   V(I8x16Neg, Operator::kNoProperties, 1, 0, 1)              \
   V(I8x16Shl, Operator::kNoProperties, 2, 0, 1)              \
@@ -445,6 +446,7 @@ MachineType AtomicOpType(Operator const* op) {
   V(I8x16MaxU, Operator::kCommutative, 2, 0, 1)              \
   V(I8x16GtU, Operator::kNoProperties, 2, 0, 1)              \
   V(I8x16GeU, Operator::kNoProperties, 2, 0, 1)              \
+  V(I8x16RoundingAverageU, Operator::kCommutative, 2, 0, 1)  \
   V(S128Load, Operator::kNoProperties, 2, 0, 1)              \
   V(S128Store, Operator::kNoProperties, 3, 0, 1)             \
   V(S128Zero, Operator::kNoProperties, 0, 0, 1)              \
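
Reading the columns by this list's apparent convention, V(name, properties,
value_input_count, control_input_count, output_count): both new operators
are pure binops (two value inputs, one output) marked Operator::kCommutative,
which is sound because the rounding average is symmetric in its operands.
A standalone check of that symmetry (not part of the patch):

    #include <cassert>
    #include <cstdint>

    uint16_t AvgrU(uint16_t a, uint16_t b) {
      return static_cast<uint16_t>(
          (static_cast<uint64_t>(a) + static_cast<uint64_t>(b) + 1) >> 1);
    }

    int main() {
      // Sample the 16-bit range with coprime strides.
      for (uint32_t a = 0; a < 0x10000; a += 257) {
        for (uint32_t b = 0; b < 0x10000; b += 263) {
          assert(AvgrU(static_cast<uint16_t>(a), static_cast<uint16_t>(b)) ==
                 AvgrU(static_cast<uint16_t>(b), static_cast<uint16_t>(a)));
        }
      }
      return 0;
    }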

src/compiler/machine-operator.h

@@ -666,6 +666,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
   const Operator* I16x8MaxU();
   const Operator* I16x8GtU();
   const Operator* I16x8GeU();
+  const Operator* I16x8RoundingAverageU();
   const Operator* I8x16Splat();
   const Operator* I8x16ExtractLaneU(int32_t);
@@ -695,6 +696,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
   const Operator* I8x16MaxU();
   const Operator* I8x16GtU();
   const Operator* I8x16GeU();
+  const Operator* I8x16RoundingAverageU();
   const Operator* S128Load();
   const Operator* S128Store();

src/compiler/opcodes.h

@@ -876,6 +876,7 @@
   V(I16x8LeU)               \
   V(I16x8GtU)               \
   V(I16x8GeU)               \
+  V(I16x8RoundingAverageU)  \
   V(I8x16Splat)             \
   V(I8x16ExtractLaneU)      \
   V(I8x16ExtractLaneS)      \
@@ -907,6 +908,7 @@
   V(I8x16LeU)               \
   V(I8x16GtU)               \
   V(I8x16GeU)               \
+  V(I8x16RoundingAverageU)  \
   V(S128Load)               \
   V(S128Store)              \
   V(S128Zero)               \

src/compiler/wasm-compiler.cc

@@ -4407,6 +4407,9 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
     case wasm::kExprI16x8GeU:
       return graph()->NewNode(mcgraph()->machine()->I16x8GeU(), inputs[0],
                               inputs[1]);
+    case wasm::kExprI16x8RoundingAverageU:
+      return graph()->NewNode(mcgraph()->machine()->I16x8RoundingAverageU(),
+                              inputs[0], inputs[1]);
     case wasm::kExprI8x16Splat:
       return graph()->NewNode(mcgraph()->machine()->I8x16Splat(), inputs[0]);
     case wasm::kExprI8x16Neg:
@@ -4489,6 +4492,9 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
     case wasm::kExprI8x16GeU:
       return graph()->NewNode(mcgraph()->machine()->I8x16GeU(), inputs[0],
                               inputs[1]);
+    case wasm::kExprI8x16RoundingAverageU:
+      return graph()->NewNode(mcgraph()->machine()->I8x16RoundingAverageU(),
+                              inputs[0], inputs[1]);
     case wasm::kExprS128And:
       return graph()->NewNode(mcgraph()->machine()->S128And(), inputs[0],
                               inputs[1]);

src/diagnostics/x64/disasm-x64.cc

@@ -2006,10 +2006,14 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
     mnemonic = "paddusw";
   } else if (opcode == 0xDE) {
     mnemonic = "pmaxub";
+  } else if (opcode == 0xE0) {
+    mnemonic = "pavgb";
   } else if (opcode == 0xE1) {
     mnemonic = "psraw";
   } else if (opcode == 0xE2) {
     mnemonic = "psrad";
+  } else if (opcode == 0xE3) {
+    mnemonic = "pavgw";
   } else if (opcode == 0xE8) {
     mnemonic = "psubsb";
   } else if (opcode == 0xE9) {

src/wasm/wasm-interpreter.cc

@@ -2325,6 +2325,8 @@ class ThreadImpl {
       BINOP_CASE(I16x8AddSaturateU, i16x8, int8, 8, SaturateAdd<uint16_t>(a, b))
      BINOP_CASE(I16x8SubSaturateS, i16x8, int8, 8, SaturateSub<int16_t>(a, b))
      BINOP_CASE(I16x8SubSaturateU, i16x8, int8, 8, SaturateSub<uint16_t>(a, b))
+      BINOP_CASE(I16x8RoundingAverageU, i16x8, int8, 8,
+                 base::RoundingAverageUnsigned<uint16_t>(a, b))
       BINOP_CASE(I8x16Add, i8x16, int16, 16, base::AddWithWraparound(a, b))
       BINOP_CASE(I8x16Sub, i8x16, int16, 16, base::SubWithWraparound(a, b))
       BINOP_CASE(I8x16Mul, i8x16, int16, 16, base::MulWithWraparound(a, b))
@@ -2340,6 +2342,8 @@ class ThreadImpl {
       BINOP_CASE(I8x16SubSaturateS, i8x16, int16, 16, SaturateSub<int8_t>(a, b))
       BINOP_CASE(I8x16SubSaturateU, i8x16, int16, 16,
                  SaturateSub<uint8_t>(a, b))
+      BINOP_CASE(I8x16RoundingAverageU, i8x16, int16, 16,
+                 base::RoundingAverageUnsigned<uint8_t>(a, b))
 #undef BINOP_CASE
 #define UNOP_CASE(op, name, stype, count, expr) \
   case kExpr##op: {                             \
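
A note on the macro arguments: the lane-view names in this file count
lanes rather than bits, so int8 is the 8-lane (16-bit) view used by the
I16x8 ops and int16 is the 16-lane (8-bit) view used by the I8x16 ops.
BINOP_CASE evaluates the expression once per lane, with a and b bound to
the corresponding lanes of the two inputs. A standalone model of the
I16x8 case (simplified; not V8's actual macro expansion):

    #include <array>
    #include <cassert>
    #include <cstdint>

    using int8 = std::array<int16_t, 8>;  // 8 lanes of 16 bits, per V8's naming

    int8 I16x8RoundingAverageU(const int8& s1, const int8& s2) {
      int8 res;
      for (int i = 0; i < 8; ++i) {
        // Lanes are stored signed; the unsigned op reinterprets them.
        uint16_t a = static_cast<uint16_t>(s1[i]);
        uint16_t b = static_cast<uint16_t>(s2[i]);
        res[i] = static_cast<int16_t>(
            (static_cast<uint64_t>(a) + static_cast<uint64_t>(b) + 1) >> 1);
      }
      return res;
    }

    int main() {
      int8 a, b;
      a.fill(static_cast<int16_t>(0xffff));  // 65535 in each lane, as unsigned
      b.fill(1);
      assert(static_cast<uint16_t>(I16x8RoundingAverageU(a, b)[0]) == 0x8000);
      return 0;
    }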

src/wasm/wasm-opcodes.cc

@@ -334,6 +334,9 @@ const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
     CASE_I64x2_OP(Load32x2S, "load32x2_s")
     CASE_I64x2_OP(Load32x2U, "load32x2_u")
+    CASE_I8x16_OP(RoundingAverageU, "avgr_u")
+    CASE_I16x8_OP(RoundingAverageU, "avgr_u")
 
     // Atomic operations.
     CASE_OP(AtomicNotify, "atomic.notify")
     CASE_INT_OP(AtomicWait, "atomic.wait")

src/wasm/wasm-opcodes.h

@@ -444,6 +444,8 @@ bool IsJSCompatibleSignature(const FunctionSig* sig, const WasmFeatures&);
   V(I32x4Load16x4U, 0xfdd5, s_s)          \
   V(I64x2Load32x2S, 0xfdd6, s_s)          \
   V(I64x2Load32x2U, 0xfdd7, s_s)          \
+  V(I8x16RoundingAverageU, 0xfdd9, s_ss)  \
+  V(I16x8RoundingAverageU, 0xfdda, s_ss)  \
   V(I16x8AddHoriz, 0xfdbd, s_ss)          \
   V(I32x4AddHoriz, 0xfdbe, s_ss)          \
   V(F32x4AddHoriz, 0xfdbf, s_ss)          \
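
Here s_ss denotes the s128 x s128 -> s128 signature. In the binary format,
assuming the usual SIMD-proposal encoding, these opcodes serialize as the
0xfd prefix followed by the opcode index as a LEB128 u32, so the two
instructions appear in a module as:

    #include <cstdint>

    // Sketch (assumed encoding): 0xfd prefix + LEB128(opcode index).
    const uint8_t kI8x16AvgrU[] = {0xfd, 0xd9, 0x01};  // 0xd9 = 217 -> LEB128 d9 01
    const uint8_t kI16x8AvgrU[] = {0xfd, 0xda, 0x01};  // 0xda = 218 -> LEB128 da 01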

test/cctest/wasm/test-run-wasm-simd.cc

@@ -2059,11 +2059,12 @@ WASM_SIMD_TEST(I16x8Neg) {
                      base::NegateWithWraparound);
 }
 
+template <typename T = int16_t, typename OpType = T (*)(T, T)>
 void RunI16x8BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
-                       WasmOpcode opcode, Int16BinOp expected_op) {
-  WasmRunner<int32_t, int32_t, int32_t> r(execution_tier, lower_simd);
+                       WasmOpcode opcode, OpType expected_op) {
+  WasmRunner<int32_t, T, T> r(execution_tier, lower_simd);
   // Global to hold output.
-  int16_t* g = r.builder().AddGlobal<int16_t>(kWasmS128);
+  T* g = r.builder().template AddGlobal<T>(kWasmS128);
   // Build fn to splat test values, perform binop, and write the result.
   byte value1 = 0, value2 = 1;
   byte temp1 = r.AllocateLocal(kWasmS128);
@@ -2074,12 +2075,12 @@ void RunI16x8BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                                            WASM_GET_LOCAL(temp2))),
         WASM_ONE);
 
-  FOR_INT16_INPUTS(x) {
-    FOR_INT16_INPUTS(y) {
+  for (T x : compiler::ValueHelper::GetVector<T>()) {
+    for (T y : compiler::ValueHelper::GetVector<T>()) {
       r.Call(x, y);
-      int16_t expected = expected_op(x, y);
+      T expected = expected_op(x, y);
       for (int i = 0; i < 8; i++) {
-        CHECK_EQ(expected, ReadLittleEndianValue<int16_t>(&g[i]));
+        CHECK_EQ(expected, ReadLittleEndianValue<T>(&g[i]));
       }
     }
   }
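
The .template in r.builder().template AddGlobal<T>(kWasmS128) is the C++
dependent-name disambiguator: r's type now depends on T, so without it the
< after AddGlobal would parse as a less-than operator. A minimal standalone
illustration (hypothetical Builder type, not the test's):

    template <typename T, typename Builder>
    T* AddTypedGlobal(Builder& builder) {
      // Without ".template", "builder.AddGlobal < T" parses as a comparison.
      return builder.template AddGlobal<T>(0);
    }

    struct DummyBuilder {
      template <typename T>
      T* AddGlobal(int /*kind*/) { static T slot{}; return &slot; }
    };

    int main() {
      DummyBuilder builder;
      *AddTypedGlobal<int>(builder) = 42;
      return 0;
    }
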
@@ -2180,6 +2181,14 @@ WASM_SIMD_TEST(I16x8LeU) {
                     UnsignedLessEqual);
 }
 
+#if V8_TARGET_ARCH_X64
+WASM_SIMD_TEST_NO_LOWERING(I16x8RoundingAverageU) {
+  RunI16x8BinOpTest<uint16_t>(execution_tier, lower_simd,
+                              kExprI16x8RoundingAverageU,
+                              base::RoundingAverageUnsigned);
+}
+#endif  // V8_TARGET_ARCH_X64
+
 void RunI16x8ShiftOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                          WasmOpcode opcode, Int16ShiftOp expected_op) {
   // Intentionally shift by 16, should be no-op.
@@ -2276,11 +2285,12 @@ WASM_SIMD_TEST(I8x16ConvertI16x8) {
   }
 }
 
+template <typename T = int8_t, typename OpType = T (*)(T, T)>
 void RunI8x16BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
-                       WasmOpcode opcode, Int8BinOp expected_op) {
-  WasmRunner<int32_t, int32_t, int32_t> r(execution_tier, lower_simd);
+                       WasmOpcode opcode, OpType expected_op) {
+  WasmRunner<int32_t, T, T> r(execution_tier, lower_simd);
   // Global to hold output.
-  int8_t* g = r.builder().AddGlobal<int8_t>(kWasmS128);
+  T* g = r.builder().template AddGlobal<T>(kWasmS128);
   // Build fn to splat test values, perform binop, and write the result.
   byte value1 = 0, value2 = 1;
   byte temp1 = r.AllocateLocal(kWasmS128);
@@ -2291,12 +2301,12 @@ void RunI8x16BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                                            WASM_GET_LOCAL(temp2))),
         WASM_ONE);
 
-  FOR_INT8_INPUTS(x) {
-    FOR_INT8_INPUTS(y) {
+  for (T x : compiler::ValueHelper::GetVector<T>()) {
+    for (T y : compiler::ValueHelper::GetVector<T>()) {
       r.Call(x, y);
-      int8_t expected = expected_op(x, y);
+      T expected = expected_op(x, y);
       for (int i = 0; i < 16; i++) {
-        CHECK_EQ(expected, ReadLittleEndianValue<int8_t>(&g[i]));
+        CHECK_EQ(expected, ReadLittleEndianValue<T>(&g[i]));
       }
     }
   }
@@ -2397,6 +2407,14 @@ WASM_SIMD_TEST(I8x16Mul) {
                     base::MulWithWraparound);
 }
 
+#if V8_TARGET_ARCH_X64
+WASM_SIMD_TEST_NO_LOWERING(I8x16RoundingAverageU) {
+  RunI8x16BinOpTest<uint8_t>(execution_tier, lower_simd,
+                             kExprI8x16RoundingAverageU,
+                             base::RoundingAverageUnsigned);
+}
+#endif  // V8_TARGET_ARCH_X64
+
 void RunI8x16ShiftOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                          WasmOpcode opcode, Int8ShiftOp expected_op) {
   // Intentionally shift by 8, should be no-op.