[relaxed-simd] Fix ordering of relaxed FMA/FNMA operands

New ordering and rationale described here:
https://github.com/WebAssembly/relaxed-simd/issues/27#issuecomment-1190859982

Bug: v8:12284
Change-Id: I61829617b55ee92642485c18400523e659dc0349
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4109474
Reviewed-by: Ilya Rezvov <irezvov@chromium.org>
Commit-Queue: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/main@{#84913}
This commit is contained in:
Deepti Gandluri 2022-12-15 12:58:04 -08:00 committed by V8 LUCI CQ
parent c38e2ce46e
commit b0c2b7797a
5 changed files with 73 additions and 63 deletions

View File

@@ -1279,37 +1279,37 @@ void SharedTurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
if (CpuFeatures::IsSupported(FMA3)) { \
CpuFeatureScope fma3_scope(this, FMA3); \
if (dst == src1) { \
vfmadd231##ps_or_pd(dst, src2, src3); \
vfmadd213##ps_or_pd(dst, src2, src3); \
} else if (dst == src2) { \
vfmadd132##ps_or_pd(dst, src1, src3); \
vfmadd213##ps_or_pd(dst, src1, src3); \
} else if (dst == src3) { \
vfmadd213##ps_or_pd(dst, src2, src1); \
vfmadd231##ps_or_pd(dst, src2, src1); \
} else { \
CpuFeatureScope avx_scope(this, AVX); \
vmovups(dst, src1); \
vfmadd231##ps_or_pd(dst, src2, src3); \
vfmadd213##ps_or_pd(dst, src2, src3); \
} \
} else if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope avx_scope(this, AVX); \
vmul##ps_or_pd(tmp, src2, src3); \
vadd##ps_or_pd(dst, src1, tmp); \
vmul##ps_or_pd(tmp, src1, src2); \
vadd##ps_or_pd(dst, tmp, src3); \
} else { \
if (dst == src1) { \
movaps(tmp, src2); \
mul##ps_or_pd(tmp, src3); \
add##ps_or_pd(dst, tmp); \
mul##ps_or_pd(dst, src2); \
add##ps_or_pd(dst, src3); \
} else if (dst == src2) { \
DCHECK_NE(src2, src1); \
mul##ps_or_pd(src2, src3); \
add##ps_or_pd(src2, src1); \
mul##ps_or_pd(dst, src1); \
add##ps_or_pd(dst, src3); \
} else if (dst == src3) { \
DCHECK_NE(src3, src1); \
mul##ps_or_pd(src3, src2); \
add##ps_or_pd(src3, src1); \
movaps(tmp, src1); \
mul##ps_or_pd(tmp, src2); \
add##ps_or_pd(dst, tmp); \
} else { \
movaps(dst, src2); \
mul##ps_or_pd(dst, src3); \
add##ps_or_pd(dst, src1); \
movaps(dst, src1); \
mul##ps_or_pd(dst, src2); \
add##ps_or_pd(dst, src3); \
} \
}
@@ -1319,25 +1319,25 @@ void SharedTurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
if (CpuFeatures::IsSupported(FMA3)) { \
CpuFeatureScope fma3_scope(this, FMA3); \
if (dst == src1) { \
vfnmadd231##ps_or_pd(dst, src2, src3); \
vfnmadd213##ps_or_pd(dst, src2, src3); \
} else if (dst == src2) { \
vfnmadd132##ps_or_pd(dst, src1, src3); \
vfnmadd213##ps_or_pd(dst, src1, src3); \
} else if (dst == src3) { \
vfnmadd213##ps_or_pd(dst, src2, src1); \
vfnmadd231##ps_or_pd(dst, src2, src1); \
} else { \
CpuFeatureScope avx_scope(this, AVX); \
vmovups(dst, src1); \
vfnmadd231##ps_or_pd(dst, src2, src3); \
vfnmadd213##ps_or_pd(dst, src2, src3); \
} \
} else if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope avx_scope(this, AVX); \
vmul##ps_or_pd(tmp, src2, src3); \
vsub##ps_or_pd(dst, src1, tmp); \
vmul##ps_or_pd(tmp, src1, src2); \
vsub##ps_or_pd(dst, src3, tmp); \
} else { \
movaps(tmp, src2); \
mul##ps_or_pd(tmp, src3); \
if (dst != src1) { \
movaps(dst, src1); \
movaps(tmp, src1); \
mul##ps_or_pd(tmp, src2); \
if (dst != src3) { \
movaps(dst, src3); \
} \
sub##ps_or_pd(dst, tmp); \
}

View File

@@ -2157,6 +2157,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Instr(dst, i.InputSimd128Register(1).Format(f), \
i.InputSimd128Register(2).Format(f)); \
break; \
}
// Expands a codegen case for a relaxed fused multiply-add/subtract op
// (Qfma/Qfms) using Arm64 Fmla/Fmls. Those instructions are destructive on
// the accumulator operand, so the output register must alias input 2 (the
// addend c in a*b + c) — the DCHECK enforces this; the instruction selector
// is expected to constrain the output to input 2 accordingly.
#define SIMD_DESTRUCTIVE_RELAXED_FUSED_CASE(Op, Instr, FORMAT) \
case Op: { \
VRegister dst = i.OutputSimd128Register().V##FORMAT(); \
DCHECK_EQ(dst, i.InputSimd128Register(2).V##FORMAT()); \
__ Instr(dst, i.InputSimd128Register(0).V##FORMAT(), \
i.InputSimd128Register(1).V##FORMAT()); \
break; \
}
SIMD_BINOP_LANE_SIZE_CASE(kArm64FMin, Fmin);
SIMD_BINOP_LANE_SIZE_CASE(kArm64FMax, Fmax);
@@ -2273,8 +2281,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_FCM_L_CASE(kArm64FLe, le, ge);
SIMD_FCM_G_CASE(kArm64FGt, gt);
SIMD_FCM_G_CASE(kArm64FGe, ge);
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F64x2Qfma, Fmla, 2D);
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F64x2Qfms, Fmls, 2D);
SIMD_DESTRUCTIVE_RELAXED_FUSED_CASE(kArm64F64x2Qfma, Fmla, 2D);
SIMD_DESTRUCTIVE_RELAXED_FUSED_CASE(kArm64F64x2Qfms, Fmls, 2D);
case kArm64F64x2Pmin: {
VRegister dst = i.OutputSimd128Register().V2D();
VRegister lhs = i.InputSimd128Register(0).V2D();
@@ -2307,8 +2315,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1).Format(s_f), i.InputInt8(2));
break;
}
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F32x4Qfma, Fmla, 4S);
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F32x4Qfms, Fmls, 4S);
SIMD_DESTRUCTIVE_RELAXED_FUSED_CASE(kArm64F32x4Qfma, Fmla, 4S);
SIMD_DESTRUCTIVE_RELAXED_FUSED_CASE(kArm64F32x4Qfms, Fmls, 4S);
case kArm64F32x4Pmin: {
VRegister dst = i.OutputSimd128Register().V4S();
VRegister lhs = i.InputSimd128Register(0).V4S();
@@ -2895,6 +2903,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
#undef SIMD_BINOP_LANE_SIZE_CASE
#undef SIMD_DESTRUCTIVE_BINOP_CASE
#undef SIMD_DESTRUCTIVE_BINOP_LANE_SIZE_CASE
#undef SIMD_DESTRUCTIVE_RELAXED_FUSED_CASE
#undef SIMD_REDUCE_OP_CASE
#undef ASSEMBLE_SIMD_SHIFT_LEFT
#undef ASSEMBLE_SIMD_SHIFT_RIGHT

View File

@@ -4303,7 +4303,7 @@ void InstructionSelector::VisitI64x2RelaxedLaneSelect(Node* node) {
#define VISIT_SIMD_QFMOP(op) \
void InstructionSelector::Visit##op(Node* node) { \
Arm64OperandGenerator g(this); \
Emit(kArm64##op, g.DefineSameAsFirst(node), \
Emit(kArm64##op, g.DefineSameAsInput(node, 2), \
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)), \
g.UseRegister(node->InputAt(2))); \
}

View File

@@ -66,23 +66,23 @@ constexpr double large_n<double> = 1e200;
template <>
constexpr float large_n<float> = 1e20;
// Fused Multiply-Add performs a + b * c.
// Fused Multiply-Add performs a * b + c.
template <typename T>
static constexpr FMOperation<T> qfma_array[] = {
{1.0f, 2.0f, 3.0f, 7.0f, 7.0f},
// fused: a + b * c = -inf + (positive overflow) = -inf
// unfused: a + b * c = -inf + inf = NaN
{-std::numeric_limits<T>::infinity(), large_n<T>, large_n<T>,
{2.0f, 3.0f, 1.0f, 7.0f, 7.0f},
// fused: a * b + c = (positive overflow) + -inf = -inf
// unfused: a * b + c = inf + -inf = NaN
{large_n<T>, large_n<T>, -std::numeric_limits<T>::infinity(),
-std::numeric_limits<T>::infinity(), std::numeric_limits<T>::quiet_NaN()},
// fused: a + b * c = inf + (negative overflow) = inf
// unfused: a + b * c = inf + -inf = NaN
{std::numeric_limits<T>::infinity(), -large_n<T>, large_n<T>,
// fused: a * b + c = (negative overflow) + inf = inf
// unfused: a * b + c = -inf + inf = NaN
{-large_n<T>, large_n<T>, std::numeric_limits<T>::infinity(),
std::numeric_limits<T>::infinity(), std::numeric_limits<T>::quiet_NaN()},
// NaN
{std::numeric_limits<T>::quiet_NaN(), 2.0f, 3.0f,
{2.0f, 3.0f, std::numeric_limits<T>::quiet_NaN(),
std::numeric_limits<T>::quiet_NaN(), std::numeric_limits<T>::quiet_NaN()},
// -NaN
{-std::numeric_limits<T>::quiet_NaN(), 2.0f, 3.0f,
{2.0f, 3.0f, -std::numeric_limits<T>::quiet_NaN(),
std::numeric_limits<T>::quiet_NaN(), std::numeric_limits<T>::quiet_NaN()}};
template <typename T>
@@ -90,23 +90,23 @@ static constexpr base::Vector<const FMOperation<T>> qfma_vector() {
return base::ArrayVector(qfma_array<T>);
}
// Fused Multiply-Subtract performs a - b * c.
// Fused Multiply-Subtract performs -(a * b) + c.
template <typename T>
static constexpr FMOperation<T> qfms_array[]{
{1.0f, 2.0f, 3.0f, -5.0f, -5.0f},
// fused: a - b * c = inf - (positive overflow) = inf
// unfused: a - b * c = inf - inf = NaN
{std::numeric_limits<T>::infinity(), large_n<T>, large_n<T>,
{2.0f, 3.0f, 1.0f, -5.0f, -5.0f},
// fused: -(a * b) + c = - (positive overflow) + inf = inf
// unfused: -(a * b) + c = - inf + inf = NaN
{large_n<T>, large_n<T>, std::numeric_limits<T>::infinity(),
std::numeric_limits<T>::infinity(), std::numeric_limits<T>::quiet_NaN()},
// fused: a - b * c = -inf - (negative overflow) = -inf
// unfused: a - b * c = -inf - -inf = NaN
{-std::numeric_limits<T>::infinity(), -large_n<T>, large_n<T>,
// fused: -(a * b) + c = - (negative overflow) + -inf = -inf
// unfused: -(a * b) + c = - -inf + -inf = NaN
{-large_n<T>, large_n<T>, -std::numeric_limits<T>::infinity(),
-std::numeric_limits<T>::infinity(), std::numeric_limits<T>::quiet_NaN()},
// NaN
{std::numeric_limits<T>::quiet_NaN(), 2.0f, 3.0f,
{2.0f, 3.0f, std::numeric_limits<T>::quiet_NaN(),
std::numeric_limits<T>::quiet_NaN(), std::numeric_limits<T>::quiet_NaN()},
// -NaN
{-std::numeric_limits<T>::quiet_NaN(), 2.0f, 3.0f,
{2.0f, 3.0f, -std::numeric_limits<T>::quiet_NaN(),
std::numeric_limits<T>::quiet_NaN(), std::numeric_limits<T>::quiet_NaN()}};
template <typename T>

View File

@@ -2865,18 +2865,19 @@ class WasmInterpreterInternals {
REDUCTION_CASE(I16x8AllTrue, i16x8, int8, 8, &)
REDUCTION_CASE(I8x16AllTrue, i8x16, int16, 16, &)
#undef REDUCTION_CASE
#define QFM_CASE(op, name, stype, count, operation) \
case kExpr##op: { \
stype c = Pop().to_s128().to_##name(); \
stype b = Pop().to_s128().to_##name(); \
stype a = Pop().to_s128().to_##name(); \
stype res; \
for (size_t i = 0; i < count; i++) { \
res.val[LANE(i, res)] = \
a.val[LANE(i, a)] operation(b.val[LANE(i, b)] * c.val[LANE(i, c)]); \
} \
Push(WasmValue(Simd128(res))); \
return true; \
// Expands an interpreter case for a relaxed fused multiply-add/subtract op.
// Per lane it computes operation(a * b) + c, matching the relaxed-simd
// operand ordering: with operation `+` (Qfma) this is a*b + c, with unary
// `-` (Qfms) it is -(a*b) + c. Operands are popped in reverse order
// (c, then b, then a) since c was pushed last.
#define QFM_CASE(op, name, stype, count, operation) \
case kExpr##op: { \
stype c = Pop().to_s128().to_##name(); \
stype b = Pop().to_s128().to_##name(); \
stype a = Pop().to_s128().to_##name(); \
stype res; \
for (size_t i = 0; i < count; i++) { \
res.val[LANE(i, res)] = \
operation(a.val[LANE(i, a)] * b.val[LANE(i, b)]) + \
c.val[LANE(i, c)]; \
} \
Push(WasmValue(Simd128(res))); \
return true; \
}
QFM_CASE(F32x4Qfma, f32x4, float4, 4, +)
QFM_CASE(F32x4Qfms, f32x4, float4, 4, -)