[wasm-simd][x64] Optimize extended pairwise add
Use external references to hold splat values.

Bug: v8:11349,v8:11086
Change-Id: I829d136ae7c7f8e28de991d06f6a321551402ae1
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2648972
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72348}
Parent: bc67a3b592
Commit: 03482bb35c
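Editor's note (not part of the commit): the change moves the splat constants out of the instruction stream and into 16-byte-aligned static memory reachable through external references, so pmaddwd/pmaddubsw can consume them directly as memory operands instead of rebuilding the splat in a register (pcmpeqw/psrlw, or Move plus Pshufd) before every use. A minimal standalone C++ sketch of such a constant; the name kI16x8Splat0x0001 and the accessor are illustrative only:

// Sketch only: a 16-byte aligned constant in static memory, analogous to
// wasm_i16x8_splat_0x0001 below. Legacy SSE instructions that take a memory
// operand (e.g. pmaddwd xmm, m128) require 16-byte alignment, hence alignas(16).
#include <cstdint>

constexpr struct alignas(16) {
  uint64_t a;
  uint64_t b;
} kI16x8Splat0x0001 = {uint64_t{0x00010001'00010001},
                       uint64_t{0x00010001'00010001}};

static_assert(alignof(decltype(kI16x8Splat0x0001)) == 16, "must be 16-byte aligned");

// The code generator reaches such a constant through an external reference and
// folds its address into the instruction as a memory operand.
const void* SplatConstantAddress() { return &kI16x8Splat0x0001; }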
@@ -81,12 +81,24 @@ constexpr struct alignas(16) {
 } wasm_i8x16_popcnt_mask = {uint64_t{0x03020201'02010100},
                             uint64_t{0x04030302'03020201}};
 
+constexpr struct alignas(16) {
+  uint64_t a;
+  uint64_t b;
+} wasm_i8x16_splat_0x01 = {uint64_t{0x01010101'01010101},
+                           uint64_t{0x01010101'01010101}};
+
 constexpr struct alignas(16) {
   uint64_t a;
   uint64_t b;
 } wasm_i8x16_splat_0x0f = {uint64_t{0x0F0F0F0F'0F0F0F0F},
                            uint64_t{0x0F0F0F0F'0F0F0F0F}};
 
+constexpr struct alignas(16) {
+  uint64_t a;
+  uint64_t b;
+} wasm_i16x8_splat_0x0001 = {uint64_t{0x00010001'00010001},
+                             uint64_t{0x00010001'00010001}};
+
 constexpr struct alignas(16) {
   uint64_t a;
   uint64_t b;
@@ -562,10 +574,18 @@ ExternalReference ExternalReference::address_of_wasm_i8x16_popcnt_mask() {
   return ExternalReference(reinterpret_cast<Address>(&wasm_i8x16_popcnt_mask));
 }
 
+ExternalReference ExternalReference::address_of_wasm_i8x16_splat_0x01() {
+  return ExternalReference(reinterpret_cast<Address>(&wasm_i8x16_splat_0x01));
+}
+
 ExternalReference ExternalReference::address_of_wasm_i8x16_splat_0x0f() {
   return ExternalReference(reinterpret_cast<Address>(&wasm_i8x16_splat_0x0f));
 }
 
+ExternalReference ExternalReference::address_of_wasm_i16x8_splat_0x0001() {
+  return ExternalReference(reinterpret_cast<Address>(&wasm_i16x8_splat_0x0001));
+}
+
 ExternalReference
 ExternalReference::address_of_wasm_f64x2_convert_low_i32x4_u_int_mask() {
   return ExternalReference(
@@ -116,7 +116,9 @@ class StatsCounter;
   V(address_of_the_hole_nan, "the_hole_nan")                                 \
   V(address_of_uint32_bias, "uint32_bias")                                   \
   V(address_of_wasm_i8x16_popcnt_mask, "wasm_i8x16_popcnt_mask")             \
+  V(address_of_wasm_i8x16_splat_0x01, "wasm_i8x16_splat_0x01")               \
   V(address_of_wasm_i8x16_splat_0x0f, "wasm_i8x16_splat_0x0f")               \
+  V(address_of_wasm_i16x8_splat_0x0001, "wasm_16x8_splat_0x0001")            \
   V(bytecode_size_table_address, "Bytecodes::bytecode_size_table_address")   \
   V(check_object_type, "check_object_type")                                  \
   V(compute_integer_hash, "ComputeSeededHash")                               \
@@ -695,6 +695,16 @@ int TurboAssembler::PopCallerSaved(SaveFPRegsMode fp_mode, Register exclusion1,
   return bytes;
 }
 
+void TurboAssembler::Movdqa(XMMRegister dst, Operand src) {
+  // See comments in Movdqa(XMMRegister, XMMRegister).
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vmovdqa(dst, src);
+  } else {
+    movaps(dst, src);
+  }
+}
+
 void TurboAssembler::Movdqa(XMMRegister dst, XMMRegister src) {
   if (CpuFeatures::IsSupported(AVX)) {
     CpuFeatureScope avx_scope(this, AVX);
@@ -1770,6 +1780,16 @@ void TurboAssembler::RetpolineJump(Register reg) {
   ret(0);
 }
 
+void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1, Operand src2) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vpmaddwd(dst, src1, src2);
+  } else {
+    DCHECK_EQ(dst, src1);
+    pmaddwd(dst, src2);
+  }
+}
+
 void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1,
                              XMMRegister src2) {
   if (CpuFeatures::IsSupported(AVX)) {
@@ -1781,6 +1801,18 @@ void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1,
   }
 }
 
+void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
+                               Operand src2) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vpmaddubsw(dst, src1, src2);
+  } else {
+    CpuFeatureScope ssse3_scope(this, SSSE3);
+    DCHECK_EQ(dst, src1);
+    pmaddubsw(dst, src2);
+  }
+}
+
 void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
                                XMMRegister src2) {
   if (CpuFeatures::IsSupported(AVX)) {
@@ -361,6 +361,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
                    Label::Distance condition_met_distance = Label::kFar);
 
   void Movapd(XMMRegister dst, XMMRegister src);
+  void Movdqa(XMMRegister dst, Operand src);
   void Movdqa(XMMRegister dst, XMMRegister src);
 
   template <typename Dst, typename Src>
@@ -536,7 +537,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   void DebugBreak() override;
 
   // Supports both AVX (dst != src1) and SSE (checks that dst == src1).
+  void Pmaddwd(XMMRegister dst, XMMRegister src1, Operand src2);
   void Pmaddwd(XMMRegister dst, XMMRegister src1, XMMRegister src2);
+  void Pmaddubsw(XMMRegister dst, XMMRegister src1, Operand src2);
   void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
 
   // Shufps that will mov src1 into dst if AVX is not supported.
@@ -3153,15 +3153,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kX64I32x4ExtAddPairwiseI16x8S: {
      XMMRegister dst = i.OutputSimd128Register();
-      XMMRegister src = i.InputSimd128Register(0);
-      // kScratchDoubleReg = |1|1|1|1|1|1|1|1|
-      __ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
-      __ Psrlw(kScratchDoubleReg, byte{15});
-      // pmaddwd multiplies signed words in kScratchDoubleReg and src, producing
-      // signed doublewords, then adds pairwise.
-      // src = |a|b|c|d|e|f|g|h|
+      XMMRegister src1 = i.InputSimd128Register(0);
+      // pmaddwd multiplies signed words in src1 and src2, producing signed
+      // doublewords, then adds pairwise.
+      // src1 = |a|b|c|d|e|f|g|h|
+      // src2 = |1|1|1|1|1|1|1|1|
       // dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
-      __ Pmaddwd(dst, src, kScratchDoubleReg);
+      Operand src2 = __ ExternalReferenceAsOperand(
+          ExternalReference::address_of_wasm_i16x8_splat_0x0001());
+      __ Pmaddwd(dst, src1, src2);
       break;
     }
     case kX64I32x4ExtAddPairwiseI16x8U: {
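Editor's note (not part of the commit): a standalone SSE2 intrinsics sketch of the identity the lowering above uses. pmaddwd against a splat of 0x0001 multiplies every 16-bit lane by 1 and adds adjacent pairs, which is exactly i32x4.extadd_pairwise_i16x8_s:

// Standalone sketch (plain SSE2 intrinsics, not V8 code).
#include <emmintrin.h>  // SSE2: _mm_madd_epi16
#include <cstdint>
#include <cstdio>

// Aligned constant analogous to wasm_i16x8_splat_0x0001.
alignas(16) static const uint16_t kSplat0x0001[8] = {1, 1, 1, 1, 1, 1, 1, 1};

int main() {
  alignas(16) int16_t src[8] = {1, 2, 3, 4, -5, 6, 7, -8};
  __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(src));
  __m128i ones = _mm_load_si128(reinterpret_cast<const __m128i*>(kSplat0x0001));
  // pmaddwd: signed word multiply, then pairwise add into doublewords:
  // dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
  __m128i dst = _mm_madd_epi16(v, ones);
  alignas(16) int32_t out[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(out), dst);
  std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 3 7 1 -1
  return 0;
}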
@@ -3402,19 +3402,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src = i.InputSimd128Register(0);
       DCHECK_NE(dst, src);
-      // dst = i8x16.splat(1)
-      __ Move(dst, uint32_t{0x01010101});
-      __ Pshufd(dst, dst, byte{0});
+      __ Movdqa(dst,
+                __ ExternalReferenceAsOperand(
+                    ExternalReference::address_of_wasm_i8x16_splat_0x01()));
       __ Pmaddubsw(dst, dst, src);
       break;
     }
     case kX64I16x8ExtAddPairwiseI8x16U: {
       XMMRegister dst = i.OutputSimd128Register();
-      XMMRegister src = i.InputSimd128Register(0);
-      // dst = i8x16.splat(1)
-      __ Move(kScratchDoubleReg, uint32_t{0x01010101});
-      __ Pshufd(kScratchDoubleReg, kScratchDoubleReg, byte{0});
-      __ Pmaddubsw(dst, src, kScratchDoubleReg);
+      XMMRegister src1 = i.InputSimd128Register(0);
+      Operand src2 = __ ExternalReferenceAsOperand(
+          ExternalReference::address_of_wasm_i8x16_splat_0x01());
+      __ Pmaddubsw(dst, src1, src2);
       break;
     }
     case kX64I16x8Q15MulRSatS: {
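Editor's note (not part of the commit): the pmaddubsw lowerings above rely on the instruction's asymmetric signedness: it multiplies unsigned bytes from its first operand with signed bytes from its second operand before the pairwise add. That is why the signed case loads the 0x01 splat into dst and passes src as the second operand (Pmaddubsw(dst, dst, src)), while the unsigned case passes src first and the splat as a memory operand second. A standalone SSSE3 intrinsics sketch (compile with -mssse3):

// Standalone sketch (SSSE3 intrinsics, not V8 code).
#include <tmmintrin.h>  // SSSE3: _mm_maddubs_epi16
#include <cstdint>
#include <cstdio>

int main() {
  alignas(16) int8_t src[16] = {-1, 2, 3, -4, 5, 6, -7, 8,
                                9, 10, 11, 12, 13, 14, 15, 16};
  alignas(16) uint8_t ones[16] = {1, 1, 1, 1, 1, 1, 1, 1,
                                  1, 1, 1, 1, 1, 1, 1, 1};
  __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(src));
  __m128i splat1 = _mm_load_si128(reinterpret_cast<const __m128i*>(ones));

  // Signed extadd_pairwise: the splat of 1 must be the FIRST (unsigned)
  // operand so the bytes of src keep their sign in the multiply.
  __m128i s = _mm_maddubs_epi16(splat1, v);

  // Unsigned extadd_pairwise: src goes first and is treated as unsigned;
  // the splat of 1 is the signed second operand.
  __m128i u = _mm_maddubs_epi16(v, splat1);

  alignas(16) int16_t out_s[8], out_u[8];
  _mm_store_si128(reinterpret_cast<__m128i*>(out_s), s);
  _mm_store_si128(reinterpret_cast<__m128i*>(out_u), u);
  std::printf("signed:   %d %d ...\n", out_s[0], out_s[1]);  // 1 -1 ...
  std::printf("unsigned: %d %d ...\n", out_u[0], out_u[1]);  // 257 255 ...
  return 0;
}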