[wasm-simd][x64] Optimize extended pairwise add

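Load the splat constants (i8x16 0x01, i16x8 0x0001) from 16-byte-aligned static data via external references, instead of materializing them in a scratch register at each use.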

Bug: v8:11349,v8:11086
Change-Id: I829d136ae7c7f8e28de991d06f6a321551402ae1
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2648972
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72348}
This commit is contained in:
Ng Zhi An 2021-01-26 11:32:32 -08:00 committed by Commit Bot
parent bc67a3b592
commit 03482bb35c
5 changed files with 72 additions and 16 deletions

View File

@ -81,12 +81,24 @@ constexpr struct alignas(16) {
} wasm_i8x16_popcnt_mask = {uint64_t{0x03020201'02010100},
uint64_t{0x04030302'03020201}};
// 16-byte-aligned 128-bit constant in which every byte is 0x01. Only its
// address is handed out (see address_of_wasm_i8x16_splat_0x01).
constexpr struct alignas(16) {
  uint64_t a{0x01010101'01010101};
  uint64_t b{0x01010101'01010101};
} wasm_i8x16_splat_0x01{};
// 16-byte-aligned 128-bit constant in which every byte is 0x0F. Only its
// address is handed out (see address_of_wasm_i8x16_splat_0x0f).
constexpr struct alignas(16) {
  uint64_t a{0x0F0F0F0F'0F0F0F0F};
  uint64_t b{0x0F0F0F0F'0F0F0F0F};
} wasm_i8x16_splat_0x0f{};
// 16-byte-aligned 128-bit constant in which every 16-bit word is 0x0001. Only
// its address is handed out (see address_of_wasm_i16x8_splat_0x0001).
constexpr struct alignas(16) {
  uint64_t a{0x00010001'00010001};
  uint64_t b{0x00010001'00010001};
} wasm_i16x8_splat_0x0001{};
constexpr struct alignas(16) {
uint64_t a;
uint64_t b;
@ -562,10 +574,18 @@ ExternalReference ExternalReference::address_of_wasm_i8x16_popcnt_mask() {
return ExternalReference(reinterpret_cast<Address>(&wasm_i8x16_popcnt_mask));
}
// Returns an ExternalReference wrapping the address of the
// wasm_i8x16_splat_0x01 constant.
ExternalReference ExternalReference::address_of_wasm_i8x16_splat_0x01() {
  const Address splat_address =
      reinterpret_cast<Address>(&wasm_i8x16_splat_0x01);
  return ExternalReference(splat_address);
}
// Returns an ExternalReference wrapping the address of the
// wasm_i8x16_splat_0x0f constant.
ExternalReference ExternalReference::address_of_wasm_i8x16_splat_0x0f() {
  const Address splat_address =
      reinterpret_cast<Address>(&wasm_i8x16_splat_0x0f);
  return ExternalReference(splat_address);
}
// Returns an ExternalReference wrapping the address of the
// wasm_i16x8_splat_0x0001 constant.
ExternalReference ExternalReference::address_of_wasm_i16x8_splat_0x0001() {
  const Address splat_address =
      reinterpret_cast<Address>(&wasm_i16x8_splat_0x0001);
  return ExternalReference(splat_address);
}
ExternalReference
ExternalReference::address_of_wasm_f64x2_convert_low_i32x4_u_int_mask() {
return ExternalReference(

View File

@ -116,7 +116,9 @@ class StatsCounter;
V(address_of_the_hole_nan, "the_hole_nan") \
V(address_of_uint32_bias, "uint32_bias") \
V(address_of_wasm_i8x16_popcnt_mask, "wasm_i8x16_popcnt_mask") \
V(address_of_wasm_i8x16_splat_0x01, "wasm_i8x16_splat_0x01") \
V(address_of_wasm_i8x16_splat_0x0f, "wasm_i8x16_splat_0x0f") \
V(address_of_wasm_i16x8_splat_0x0001, "wasm_i16x8_splat_0x0001") \
V(bytecode_size_table_address, "Bytecodes::bytecode_size_table_address") \
V(check_object_type, "check_object_type") \
V(compute_integer_hash, "ComputeSeededHash") \

View File

@ -695,6 +695,16 @@ int TurboAssembler::PopCallerSaved(SaveFPRegsMode fp_mode, Register exclusion1,
return bytes;
}
// Loads |src| into |dst|, emitting vmovdqa when AVX is supported and movaps
// otherwise.
void TurboAssembler::Movdqa(XMMRegister dst, Operand src) {
  // See comments in Movdqa(XMMRegister, XMMRegister).
  if (!CpuFeatures::IsSupported(AVX)) {
    movaps(dst, src);
    return;
  }
  CpuFeatureScope avx_scope(this, AVX);
  vmovdqa(dst, src);
}
void TurboAssembler::Movdqa(XMMRegister dst, XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
@ -1770,6 +1780,16 @@ void TurboAssembler::RetpolineJump(Register reg) {
ret(0);
}
// Emits vpmaddwd(dst, src1, src2) when AVX is supported. Otherwise emits the
// two-operand pmaddwd(dst, src2), which requires dst == src1 (DCHECKed).
void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1, Operand src2) {
  if (!CpuFeatures::IsSupported(AVX)) {
    // The SSE form has no separate destination, so dst doubles as src1.
    DCHECK_EQ(dst, src1);
    pmaddwd(dst, src2);
    return;
  }
  CpuFeatureScope avx_scope(this, AVX);
  vpmaddwd(dst, src1, src2);
}
void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
if (CpuFeatures::IsSupported(AVX)) {
@ -1781,6 +1801,18 @@ void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1,
}
}
// Emits vpmaddubsw(dst, src1, src2) when AVX is supported. Otherwise emits the
// two-operand pmaddubsw(dst, src2) under an SSSE3 feature scope, which
// requires dst == src1 (DCHECKed).
void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
                               Operand src2) {
  if (!CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope ssse3_scope(this, SSSE3);
    // The SSE form has no separate destination, so dst doubles as src1.
    DCHECK_EQ(dst, src1);
    pmaddubsw(dst, src2);
    return;
  }
  CpuFeatureScope avx_scope(this, AVX);
  vpmaddubsw(dst, src1, src2);
}
void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
if (CpuFeatures::IsSupported(AVX)) {

View File

@ -361,6 +361,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
Label::Distance condition_met_distance = Label::kFar);
void Movapd(XMMRegister dst, XMMRegister src);
// Loads src into dst. The Operand overload emits vmovdqa when AVX is
// supported and movaps otherwise.
void Movdqa(XMMRegister dst, Operand src);
void Movdqa(XMMRegister dst, XMMRegister src);
template <typename Dst, typename Src>
@ -536,7 +537,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void DebugBreak() override;
// Pmaddwd and Pmaddubsw support both AVX (dst != src1) and SSE (checks that
// dst == src1). With AVX, the Operand overloads emit the three-operand
// vpmaddwd / vpmaddubsw; without AVX they emit the two-operand pmaddwd /
// pmaddubsw (the latter under an SSSE3 feature scope).
void Pmaddwd(XMMRegister dst, XMMRegister src1, Operand src2);
void Pmaddwd(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void Pmaddubsw(XMMRegister dst, XMMRegister src1, Operand src2);
void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
// Shufps that will mov src1 into dst if AVX is not supported.

View File

@ -3153,15 +3153,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I32x4ExtAddPairwiseI16x8S: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
// kScratchDoubleReg = |1|1|1|1|1|1|1|1|
__ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
__ Psrlw(kScratchDoubleReg, byte{15});
// pmaddwd multiplies signed words in kScratchDoubleReg and src, producing
// signed doublewords, then adds pairwise.
// src = |a|b|c|d|e|f|g|h|
XMMRegister src1 = i.InputSimd128Register(0);
// pmaddwd multiplies signed words in src1 and src2, producing signed
// doublewords, then adds pairwise.
// src1 = |a|b|c|d|e|f|g|h|
// src2 = |1|1|1|1|1|1|1|1|
// dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
__ Pmaddwd(dst, src, kScratchDoubleReg);
Operand src2 = __ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i16x8_splat_0x0001());
__ Pmaddwd(dst, src1, src2);
break;
}
case kX64I32x4ExtAddPairwiseI16x8U: {
@ -3402,19 +3402,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
DCHECK_NE(dst, src);
// dst = i8x16.splat(1)
__ Move(dst, uint32_t{0x01010101});
__ Pshufd(dst, dst, byte{0});
__ Movdqa(dst,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x01()));
__ Pmaddubsw(dst, dst, src);
break;
}
case kX64I16x8ExtAddPairwiseI8x16U: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
// dst = i8x16.splat(1)
__ Move(kScratchDoubleReg, uint32_t{0x01010101});
__ Pshufd(kScratchDoubleReg, kScratchDoubleReg, byte{0});
__ Pmaddubsw(dst, src, kScratchDoubleReg);
XMMRegister src1 = i.InputSimd128Register(0);
Operand src2 = __ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x01());
__ Pmaddubsw(dst, src1, src2);
break;
}
case kX64I16x8Q15MulRSatS: {