[relaxed-simd] Fix ordering of relaxed FMA/FNMA operands
The new operand ordering and its rationale are described here: https://github.com/WebAssembly/relaxed-simd/issues/27#issuecomment-1190859982
Bug: v8:12284
Change-Id: I61829617b55ee92642485c18400523e659dc0349
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4109474
Reviewed-by: Ilya Rezvov <irezvov@chromium.org>
Commit-Queue: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/main@{#84913}
This commit is contained in:
parent
c38e2ce46e
commit
b0c2b7797a
@ -1279,37 +1279,37 @@ void SharedTurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
|
||||
if (CpuFeatures::IsSupported(FMA3)) { \
|
||||
CpuFeatureScope fma3_scope(this, FMA3); \
|
||||
if (dst == src1) { \
|
||||
vfmadd231##ps_or_pd(dst, src2, src3); \
|
||||
vfmadd213##ps_or_pd(dst, src2, src3); \
|
||||
} else if (dst == src2) { \
|
||||
vfmadd132##ps_or_pd(dst, src1, src3); \
|
||||
vfmadd213##ps_or_pd(dst, src1, src3); \
|
||||
} else if (dst == src3) { \
|
||||
vfmadd213##ps_or_pd(dst, src2, src1); \
|
||||
vfmadd231##ps_or_pd(dst, src2, src1); \
|
||||
} else { \
|
||||
CpuFeatureScope avx_scope(this, AVX); \
|
||||
vmovups(dst, src1); \
|
||||
vfmadd231##ps_or_pd(dst, src2, src3); \
|
||||
vfmadd213##ps_or_pd(dst, src2, src3); \
|
||||
} \
|
||||
} else if (CpuFeatures::IsSupported(AVX)) { \
|
||||
CpuFeatureScope avx_scope(this, AVX); \
|
||||
vmul##ps_or_pd(tmp, src2, src3); \
|
||||
vadd##ps_or_pd(dst, src1, tmp); \
|
||||
vmul##ps_or_pd(tmp, src1, src2); \
|
||||
vadd##ps_or_pd(dst, tmp, src3); \
|
||||
} else { \
|
||||
if (dst == src1) { \
|
||||
movaps(tmp, src2); \
|
||||
mul##ps_or_pd(tmp, src3); \
|
||||
add##ps_or_pd(dst, tmp); \
|
||||
mul##ps_or_pd(dst, src2); \
|
||||
add##ps_or_pd(dst, src3); \
|
||||
} else if (dst == src2) { \
|
||||
DCHECK_NE(src2, src1); \
|
||||
mul##ps_or_pd(src2, src3); \
|
||||
add##ps_or_pd(src2, src1); \
|
||||
mul##ps_or_pd(dst, src1); \
|
||||
add##ps_or_pd(dst, src3); \
|
||||
} else if (dst == src3) { \
|
||||
DCHECK_NE(src3, src1); \
|
||||
mul##ps_or_pd(src3, src2); \
|
||||
add##ps_or_pd(src3, src1); \
|
||||
movaps(tmp, src1); \
|
||||
mul##ps_or_pd(tmp, src2); \
|
||||
add##ps_or_pd(dst, tmp); \
|
||||
} else { \
|
||||
movaps(dst, src2); \
|
||||
mul##ps_or_pd(dst, src3); \
|
||||
add##ps_or_pd(dst, src1); \
|
||||
movaps(dst, src1); \
|
||||
mul##ps_or_pd(dst, src2); \
|
||||
add##ps_or_pd(dst, src3); \
|
||||
} \
|
||||
}
|
||||
|
||||
@ -1319,25 +1319,25 @@ void SharedTurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
|
||||
if (CpuFeatures::IsSupported(FMA3)) { \
|
||||
CpuFeatureScope fma3_scope(this, FMA3); \
|
||||
if (dst == src1) { \
|
||||
vfnmadd231##ps_or_pd(dst, src2, src3); \
|
||||
vfnmadd213##ps_or_pd(dst, src2, src3); \
|
||||
} else if (dst == src2) { \
|
||||
vfnmadd132##ps_or_pd(dst, src1, src3); \
|
||||
vfnmadd213##ps_or_pd(dst, src1, src3); \
|
||||
} else if (dst == src3) { \
|
||||
vfnmadd213##ps_or_pd(dst, src2, src1); \
|
||||
vfnmadd231##ps_or_pd(dst, src2, src1); \
|
||||
} else { \
|
||||
CpuFeatureScope avx_scope(this, AVX); \
|
||||
vmovups(dst, src1); \
|
||||
vfnmadd231##ps_or_pd(dst, src2, src3); \
|
||||
vfnmadd213##ps_or_pd(dst, src2, src3); \
|
||||
} \
|
||||
} else if (CpuFeatures::IsSupported(AVX)) { \
|
||||
CpuFeatureScope avx_scope(this, AVX); \
|
||||
vmul##ps_or_pd(tmp, src2, src3); \
|
||||
vsub##ps_or_pd(dst, src1, tmp); \
|
||||
vmul##ps_or_pd(tmp, src1, src2); \
|
||||
vsub##ps_or_pd(dst, src3, tmp); \
|
||||
} else { \
|
||||
movaps(tmp, src2); \
|
||||
mul##ps_or_pd(tmp, src3); \
|
||||
if (dst != src1) { \
|
||||
movaps(dst, src1); \
|
||||
movaps(tmp, src1); \
|
||||
mul##ps_or_pd(tmp, src2); \
|
||||
if (dst != src3) { \
|
||||
movaps(dst, src3); \
|
||||
} \
|
||||
sub##ps_or_pd(dst, tmp); \
|
||||
}
|
||||
|
@ -2157,6 +2157,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
__ Instr(dst, i.InputSimd128Register(1).Format(f), \
|
||||
i.InputSimd128Register(2).Format(f)); \
|
||||
break; \
|
||||
}
|
||||
// Expands to a `case Op:` that emits a destructive three-operand SIMD
// instruction for the relaxed fused multiply ops.
//   Op     - the arch opcode handled by the case.
//   Instr  - the assembler method to call.
//   FORMAT - vector arrangement suffix pasted onto V (used below as 2D / 4S).
// The output register must be the same register as input 2 (checked by the
// DCHECK); Instr is then called with (dst, input 0, input 1).
// NOTE(review): presumably Instr accumulates into dst (e.g. Fmla computing
// dst += in0 * in1), which would yield in0 * in1 + in2 -- confirm against the
// assembler documentation.
#define SIMD_DESTRUCTIVE_RELAXED_FUSED_CASE(Op, Instr, FORMAT) \
|
||||
case Op: { \
|
||||
VRegister dst = i.OutputSimd128Register().V##FORMAT(); \
|
||||
DCHECK_EQ(dst, i.InputSimd128Register(2).V##FORMAT()); \
|
||||
__ Instr(dst, i.InputSimd128Register(0).V##FORMAT(), \
|
||||
i.InputSimd128Register(1).V##FORMAT()); \
|
||||
break; \
|
||||
}
|
||||
SIMD_BINOP_LANE_SIZE_CASE(kArm64FMin, Fmin);
|
||||
SIMD_BINOP_LANE_SIZE_CASE(kArm64FMax, Fmax);
|
||||
@ -2273,8 +2281,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
SIMD_FCM_L_CASE(kArm64FLe, le, ge);
|
||||
SIMD_FCM_G_CASE(kArm64FGt, gt);
|
||||
SIMD_FCM_G_CASE(kArm64FGe, ge);
|
||||
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F64x2Qfma, Fmla, 2D);
|
||||
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F64x2Qfms, Fmls, 2D);
|
||||
SIMD_DESTRUCTIVE_RELAXED_FUSED_CASE(kArm64F64x2Qfma, Fmla, 2D);
|
||||
SIMD_DESTRUCTIVE_RELAXED_FUSED_CASE(kArm64F64x2Qfms, Fmls, 2D);
|
||||
case kArm64F64x2Pmin: {
|
||||
VRegister dst = i.OutputSimd128Register().V2D();
|
||||
VRegister lhs = i.InputSimd128Register(0).V2D();
|
||||
@ -2307,8 +2315,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
i.InputSimd128Register(1).Format(s_f), i.InputInt8(2));
|
||||
break;
|
||||
}
|
||||
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F32x4Qfma, Fmla, 4S);
|
||||
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F32x4Qfms, Fmls, 4S);
|
||||
SIMD_DESTRUCTIVE_RELAXED_FUSED_CASE(kArm64F32x4Qfma, Fmla, 4S);
|
||||
SIMD_DESTRUCTIVE_RELAXED_FUSED_CASE(kArm64F32x4Qfms, Fmls, 4S);
|
||||
case kArm64F32x4Pmin: {
|
||||
VRegister dst = i.OutputSimd128Register().V4S();
|
||||
VRegister lhs = i.InputSimd128Register(0).V4S();
|
||||
@ -2895,6 +2903,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
#undef SIMD_BINOP_LANE_SIZE_CASE
|
||||
#undef SIMD_DESTRUCTIVE_BINOP_CASE
|
||||
#undef SIMD_DESTRUCTIVE_BINOP_LANE_SIZE_CASE
|
||||
#undef SIMD_DESTRUCTIVE_RELAXED_FUSED_CASE
|
||||
#undef SIMD_REDUCE_OP_CASE
|
||||
#undef ASSEMBLE_SIMD_SHIFT_LEFT
|
||||
#undef ASSEMBLE_SIMD_SHIFT_RIGHT
|
||||
|
@ -4303,7 +4303,7 @@ void InstructionSelector::VisitI64x2RelaxedLaneSelect(Node* node) {
|
||||
#define VISIT_SIMD_QFMOP(op) \
|
||||
void InstructionSelector::Visit##op(Node* node) { \
|
||||
Arm64OperandGenerator g(this); \
|
||||
Emit(kArm64##op, g.DefineSameAsFirst(node), \
|
||||
Emit(kArm64##op, g.DefineSameAsInput(node, 2), \
|
||||
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)), \
|
||||
g.UseRegister(node->InputAt(2))); \
|
||||
}
|
||||
|
@ -66,23 +66,23 @@ constexpr double large_n<double> = 1e200;
|
||||
template <>
|
||||
constexpr float large_n<float> = 1e20;
|
||||
|
||||
// Fused Multiply-Add performs a + b * c.
|
||||
// Fused Multiply-Add performs a * b + c.
|
||||
template <typename T>
|
||||
static constexpr FMOperation<T> qfma_array[] = {
|
||||
{1.0f, 2.0f, 3.0f, 7.0f, 7.0f},
|
||||
// fused: a + b * c = -inf + (positive overflow) = -inf
|
||||
// unfused: a + b * c = -inf + inf = NaN
|
||||
{-std::numeric_limits<T>::infinity(), large_n<T>, large_n<T>,
|
||||
{2.0f, 3.0f, 1.0f, 7.0f, 7.0f},
|
||||
// fused: a * b + c = (positive overflow) + -inf = -inf
|
||||
// unfused: a * b + c = inf + -inf = NaN
|
||||
{large_n<T>, large_n<T>, -std::numeric_limits<T>::infinity(),
|
||||
-std::numeric_limits<T>::infinity(), std::numeric_limits<T>::quiet_NaN()},
|
||||
// fused: a + b * c = inf + (negative overflow) = inf
|
||||
// unfused: a + b * c = inf + -inf = NaN
|
||||
{std::numeric_limits<T>::infinity(), -large_n<T>, large_n<T>,
|
||||
// fused: a * b + c = (negative overflow) + inf = inf
|
||||
// unfused: a * b + c = -inf + inf = NaN
|
||||
{-large_n<T>, large_n<T>, std::numeric_limits<T>::infinity(),
|
||||
std::numeric_limits<T>::infinity(), std::numeric_limits<T>::quiet_NaN()},
|
||||
// NaN
|
||||
{std::numeric_limits<T>::quiet_NaN(), 2.0f, 3.0f,
|
||||
{2.0f, 3.0f, std::numeric_limits<T>::quiet_NaN(),
|
||||
std::numeric_limits<T>::quiet_NaN(), std::numeric_limits<T>::quiet_NaN()},
|
||||
// -NaN
|
||||
{-std::numeric_limits<T>::quiet_NaN(), 2.0f, 3.0f,
|
||||
{2.0f, 3.0f, -std::numeric_limits<T>::quiet_NaN(),
|
||||
std::numeric_limits<T>::quiet_NaN(), std::numeric_limits<T>::quiet_NaN()}};
|
||||
|
||||
template <typename T>
|
||||
@ -90,23 +90,23 @@ static constexpr base::Vector<const FMOperation<T>> qfma_vector() {
|
||||
return base::ArrayVector(qfma_array<T>);
|
||||
}
|
||||
|
||||
// Fused Multiply-Subtract performs a - b * c.
|
||||
// Fused Multiply-Subtract performs -(a * b) + c.
|
||||
template <typename T>
|
||||
static constexpr FMOperation<T> qfms_array[]{
|
||||
{1.0f, 2.0f, 3.0f, -5.0f, -5.0f},
|
||||
// fused: a - b * c = inf - (positive overflow) = inf
|
||||
// unfused: a - b * c = inf - inf = NaN
|
||||
{std::numeric_limits<T>::infinity(), large_n<T>, large_n<T>,
|
||||
{2.0f, 3.0f, 1.0f, -5.0f, -5.0f},
|
||||
// fused: -(a * b) + c = - (positive overflow) + inf = inf
|
||||
// unfused: -(a * b) + c = - inf + inf = NaN
|
||||
{large_n<T>, large_n<T>, std::numeric_limits<T>::infinity(),
|
||||
std::numeric_limits<T>::infinity(), std::numeric_limits<T>::quiet_NaN()},
|
||||
// fused: a - b * c = -inf - (negative overflow) = -inf
|
||||
// unfused: a - b * c = -inf - -inf = NaN
|
||||
{-std::numeric_limits<T>::infinity(), -large_n<T>, large_n<T>,
|
||||
// fused: -(a * b) + c = - (negative overflow) + -inf = -inf
|
||||
// unfused: -(a * b) + c = - (-inf) + -inf = NaN
|
||||
{-large_n<T>, large_n<T>, -std::numeric_limits<T>::infinity(),
|
||||
-std::numeric_limits<T>::infinity(), std::numeric_limits<T>::quiet_NaN()},
|
||||
// NaN
|
||||
{std::numeric_limits<T>::quiet_NaN(), 2.0f, 3.0f,
|
||||
{2.0f, 3.0f, std::numeric_limits<T>::quiet_NaN(),
|
||||
std::numeric_limits<T>::quiet_NaN(), std::numeric_limits<T>::quiet_NaN()},
|
||||
// -NaN
|
||||
{-std::numeric_limits<T>::quiet_NaN(), 2.0f, 3.0f,
|
||||
{2.0f, 3.0f, -std::numeric_limits<T>::quiet_NaN(),
|
||||
std::numeric_limits<T>::quiet_NaN(), std::numeric_limits<T>::quiet_NaN()}};
|
||||
|
||||
template <typename T>
|
||||
|
@ -2865,18 +2865,19 @@ class WasmInterpreterInternals {
|
||||
REDUCTION_CASE(I16x8AllTrue, i16x8, int8, 8, &)
|
||||
REDUCTION_CASE(I8x16AllTrue, i8x16, int16, 16, &)
|
||||
#undef REDUCTION_CASE
|
||||
#define QFM_CASE(op, name, stype, count, operation) \
|
||||
case kExpr##op: { \
|
||||
stype c = Pop().to_s128().to_##name(); \
|
||||
stype b = Pop().to_s128().to_##name(); \
|
||||
stype a = Pop().to_s128().to_##name(); \
|
||||
stype res; \
|
||||
for (size_t i = 0; i < count; i++) { \
|
||||
res.val[LANE(i, res)] = \
|
||||
a.val[LANE(i, a)] operation(b.val[LANE(i, b)] * c.val[LANE(i, c)]); \
|
||||
} \
|
||||
Push(WasmValue(Simd128(res))); \
|
||||
return true; \
|
||||
#define QFM_CASE(op, name, stype, count, operation) \
|
||||
case kExpr##op: { \
|
||||
stype c = Pop().to_s128().to_##name(); \
|
||||
stype b = Pop().to_s128().to_##name(); \
|
||||
stype a = Pop().to_s128().to_##name(); \
|
||||
stype res; \
|
||||
for (size_t i = 0; i < count; i++) { \
|
||||
res.val[LANE(i, res)] = \
|
||||
operation(a.val[LANE(i, a)] * b.val[LANE(i, b)]) + \
|
||||
c.val[LANE(i, c)]; \
|
||||
} \
|
||||
Push(WasmValue(Simd128(res))); \
|
||||
return true; \
|
||||
}
|
||||
QFM_CASE(F32x4Qfma, f32x4, float4, 4, +)
|
||||
QFM_CASE(F32x4Qfms, f32x4, float4, 4, -)
|
||||
|
Loading…
Reference in New Issue
Block a user