[wasm-simd][ia32] Prototype extended pairwise addition

Codegen is identical to x64.

Tweaked a macro definition to do a dst == src1 check when AVX is not
supported, and updated a single caller in LiftOff.

Bug: v8:11086
Change-Id: Ic9645f3d1bf1c26a1aa6db6bc2fa67fc991f8bbb
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2579928
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71756}
This commit is contained in:
Zhi An Ng 2020-12-14 23:25:01 +00:00 committed by Commit Bot
parent bc4308f37b
commit d7de8fa4cb
9 changed files with 92 additions and 11 deletions

View File

@ -505,15 +505,16 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
#define AVX_OP3_WITH_TYPE_SCOPE(macro_name, name, dst_type, src_type, \
sse_scope) \
void macro_name(dst_type dst, src_type src) { \
void macro_name(dst_type dst, dst_type src1, src_type src2) { \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope scope(this, AVX); \
v##name(dst, dst, src); \
v##name(dst, src1, src2); \
return; \
} \
if (CpuFeatures::IsSupported(sse_scope)) { \
CpuFeatureScope scope(this, sse_scope); \
name(dst, src); \
DCHECK_EQ(dst, src1); \
name(dst, src2); \
return; \
} \
UNREACHABLE(); \
@ -523,6 +524,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP3_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, Operand, SSE4_1)
AVX_OP3_XO_SSE4(Pmaxsd, pmaxsd)
AVX_OP3_WITH_TYPE_SCOPE(Pmaddubsw, pmaddubsw, XMMRegister, XMMRegister, SSSE3)
#undef AVX_OP3_XO_SSE4
#undef AVX_OP3_WITH_TYPE_SCOPE

View File

@ -63,9 +63,10 @@
V(pxor, 66, 0F, EF)
#define SSSE3_INSTRUCTION_LIST(V) \
V(phaddd, 66, 0F, 38, 02) \
V(phaddw, 66, 0F, 38, 01) \
V(pshufb, 66, 0F, 38, 00) \
V(phaddw, 66, 0F, 38, 01) \
V(phaddd, 66, 0F, 38, 02) \
V(pmaddubsw, 66, 0F, 38, 04) \
V(psignb, 66, 0F, 38, 08) \
V(psignw, 66, 0F, 38, 09) \
V(psignd, 66, 0F, 38, 0A)

View File

@ -2323,6 +2323,54 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
break;
}
case kIA32I32x4ExtAddPairwiseI16x8S: {
  // Signed extended pairwise add: i16x8 -> i32x4.
  XMMRegister result = i.OutputSimd128Register();
  XMMRegister input = i.InputSimd128Register(0);
  // Materialize i16x8.splat(1) in kScratchDoubleReg: compare-equal of a
  // register with itself sets every bit, and a logical right shift of each
  // word by 15 leaves 0x0001 per lane.
  __ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
  __ Psrlw(kScratchDoubleReg, byte{15});
  // pmaddwd multiplies the signed words of its two sources into signed
  // doublewords and then adds adjacent products, so multiplying by 1 is a
  // widening pairwise add:
  //   input  = |a|b|c|d|e|f|g|h|
  //   result = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
  __ Pmaddwd(result, input, kScratchDoubleReg);
  break;
}
case kIA32I32x4ExtAddPairwiseI16x8U: {
  // Unsigned extended pairwise add: i16x8 -> i32x4. Each 32-bit lane of src
  // holds two 16-bit values; the result is the zero-extended sum of the two.
  XMMRegister dst = i.OutputSimd128Register();
  XMMRegister src = i.InputSimd128Register(0);
  // src = |a|b|c|d|e|f|g|h|
  // kScratchDoubleReg = i32x4.splat(0x0000FFFF)
  __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
  __ Psrld(kScratchDoubleReg, kScratchDoubleReg, byte{16});
  // kScratchDoubleReg = |0|b|0|d|0|f|0|h|
  __ Pand(kScratchDoubleReg, src);
  // dst = |0|a|0|c|0|e|0|g|
  __ Psrld(dst, src, byte{16});
  // dst = |a+b|c+d|e+f|g+h|
  // The first addend has to be dst (the shifted-down halves computed just
  // above), not src: adding src would produce src + (src & 0xFFFF) whenever
  // the register allocator gives dst and src different registers.
  __ Paddd(dst, dst, kScratchDoubleReg);
  break;
}
case kIA32I16x8ExtAddPairwiseI8x16S: {
  // Signed extended pairwise add: i8x16 -> i16x8.
  XMMRegister dst = i.OutputSimd128Register();
  XMMRegister src = i.InputSimd128Register(0);
  // dst is overwritten with the splat constant before src is read, so the
  // two must not share a register.
  DCHECK_NE(dst, src);
  // dst = i8x16.splat(1)
  __ Move(dst, uint32_t{0x01010101});
  __ Pshufd(dst, dst, byte{0});
  // pmaddubsw treats its first source as unsigned bytes and its second as
  // signed bytes, so the splat of 1 goes first and src keeps its sign.
  __ Pmaddubsw(dst, dst, src);
  break;
}
case kIA32I16x8ExtAddPairwiseI8x16U: {
// Unsigned extended pairwise add: i8x16 -> i16x8.
XMMRegister dst = i.OutputSimd128Register();
// kScratchDoubleReg = i8x16.splat(1)
__ Move(kScratchDoubleReg, uint32_t{0x01010101});
__ Pshufd(kScratchDoubleReg, kScratchDoubleReg, byte{0});
// pmaddubsw reads its first source as unsigned bytes and its second as
// signed bytes, so here the input goes first and the splat of 1 second.
__ Pmaddubsw(dst, i.InputSimd128Register(0), kScratchDoubleReg);
break;
}
case kIA32I32x4SignSelect: {
ASSEMBLE_SIMD_SIGN_SELECT(blendvps);
break;

View File

@ -235,6 +235,8 @@ namespace compiler {
V(IA32I32x4ExtMulHighI16x8S) \
V(IA32I32x4ExtMulLowI16x8U) \
V(IA32I32x4ExtMulHighI16x8U) \
V(IA32I32x4ExtAddPairwiseI16x8S) \
V(IA32I32x4ExtAddPairwiseI16x8U) \
V(IA32I16x8Splat) \
V(IA32I16x8ExtractLaneS) \
V(IA32I16x8SConvertI8x16Low) \
@ -293,6 +295,8 @@ namespace compiler {
V(IA32I16x8ExtMulHighI8x16S) \
V(IA32I16x8ExtMulLowI8x16U) \
V(IA32I16x8ExtMulHighI8x16U) \
V(IA32I16x8ExtAddPairwiseI8x16S) \
V(IA32I16x8ExtAddPairwiseI8x16U) \
V(IA32I8x16Splat) \
V(IA32I8x16ExtractLaneS) \
V(IA32Pinsrb) \

View File

@ -217,6 +217,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I32x4ExtMulHighI16x8S:
case kIA32I32x4ExtMulLowI16x8U:
case kIA32I32x4ExtMulHighI16x8U:
case kIA32I32x4ExtAddPairwiseI16x8S:
case kIA32I32x4ExtAddPairwiseI16x8U:
case kIA32I16x8Splat:
case kIA32I16x8ExtractLaneS:
case kIA32I16x8SConvertI8x16Low:
@ -275,6 +277,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I16x8ExtMulHighI8x16S:
case kIA32I16x8ExtMulLowI8x16U:
case kIA32I16x8ExtMulHighI8x16U:
case kIA32I16x8ExtAddPairwiseI8x16S:
case kIA32I16x8ExtAddPairwiseI8x16U:
case kIA32I8x16Splat:
case kIA32I8x16ExtractLaneS:
case kIA32Pinsrb:

View File

@ -2360,7 +2360,7 @@ void InstructionSelector::VisitF64x2Max(Node* node) {
}
void InstructionSelector::VisitF64x2Splat(Node* node) {
VisitRRSimd(this, node, kIA32F64x2Splat, kIA32F64x2Splat);
VisitRRSimd(this, node, kIA32F64x2Splat);
}
void InstructionSelector::VisitF64x2ExtractLane(Node* node) {
@ -2427,7 +2427,7 @@ void InstructionSelector::VisitI64x2Mul(Node* node) {
}
void InstructionSelector::VisitF32x4Splat(Node* node) {
VisitRRSimd(this, node, kIA32F32x4Splat, kIA32F32x4Splat);
VisitRRSimd(this, node, kIA32F32x4Splat);
}
void InstructionSelector::VisitF32x4ExtractLane(Node* node) {
@ -3083,6 +3083,24 @@ void InstructionSelector::VisitI64x2SignSelect(Node* node) {
VisitSignSelect(this, node, kIA32I64x2SignSelect);
}
// Selects kIA32I32x4ExtAddPairwiseI16x8S for |node| by delegating to
// VisitRRSimd, which chooses the operand constraints (defined elsewhere in
// this file).
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8S(Node* node) {
VisitRRSimd(this, node, kIA32I32x4ExtAddPairwiseI16x8S);
}
// Selects kIA32I32x4ExtAddPairwiseI16x8U for |node| by delegating to
// VisitRRSimd, which chooses the operand constraints (defined elsewhere in
// this file).
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8U(Node* node) {
VisitRRSimd(this, node, kIA32I32x4ExtAddPairwiseI16x8U);
}
// Selects kIA32I16x8ExtAddPairwiseI8x16S for |node|. The input is requested
// via UseUniqueRegister rather than going through VisitRRSimd; the code
// generator for this opcode overwrites the output register before reading the
// input and DCHECKs that the two differ.
void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16S(Node* node) {
  IA32OperandGenerator g(this);
  InstructionOperand output = g.DefineAsRegister(node);
  InstructionOperand input = g.UseUniqueRegister(node->InputAt(0));
  Emit(kIA32I16x8ExtAddPairwiseI8x16S, output, input);
}
// Selects kIA32I16x8ExtAddPairwiseI8x16U for |node| by delegating to
// VisitRRSimd, which chooses the operand constraints (defined elsewhere in
// this file).
void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16U(Node* node) {
VisitRRSimd(this, node, kIA32I16x8ExtAddPairwiseI8x16U);
}
// static
MachineOperatorBuilder::Flags
InstructionSelector::SupportedMachineOperatorFlags() {

View File

@ -2814,7 +2814,8 @@ void InstructionSelector::VisitPrefetchNonTemporal(Node* node) {
void InstructionSelector::VisitI8x16Popcnt(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64 && \
!V8_TARGET_ARCH_IA32
// TODO(v8:11086) Prototype extended pairwise add.
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8S(Node* node) {
UNIMPLEMENTED();
@ -2829,6 +2830,7 @@ void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16U(Node* node) {
UNIMPLEMENTED();
}
#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
// && !V8_TARGET_ARCH_IA32
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64 && \
!V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_MIPS64

View File

@ -4112,7 +4112,7 @@ void LiftoffAssembler::emit_i32x4_uconvert_f32x4(LiftoffRegister dst,
Cvttps2dq(tmp, tmp);
Pxor(tmp, liftoff::kScratchDoubleReg);
Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
Pmaxsd(tmp, liftoff::kScratchDoubleReg);
Pmaxsd(tmp, tmp, liftoff::kScratchDoubleReg);
// Convert to int. Overflow lanes above max_signed will be 0x80000000.
Cvttps2dq(dst.fp(), dst.fp());
// Add (src-max_signed) for overflow lanes.

View File

@ -1884,7 +1884,8 @@ WASM_SIMD_TEST(S128Not) {
[](int32_t x) { return ~x; });
}
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64 || \
V8_TARGET_ARCH_IA32
// TODO(v8:11086) Prototype i32x4.extadd_pairwise_i16x8_{s,u}
template <typename Narrow, typename Wide>
void RunExtAddPairwiseTest(TestExecutionTier execution_tier,
@ -1933,7 +1934,8 @@ WASM_SIMD_TEST_NO_LOWERING(I16x8ExtAddPairwiseI8x16U) {
kExprI16x8ExtAddPairwiseI8x16U,
kExprI8x16Splat);
}
#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64 ||
// V8_TARGET_ARCH_IA32
void RunI32x4BinOpTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode opcode, Int32BinOp expected_op) {