From 03482bb35c06cda14aa0d46e144c88b319ba6615 Mon Sep 17 00:00:00 2001
From: Ng Zhi An
Date: Tue, 26 Jan 2021 11:32:32 -0800
Subject: [PATCH] [wasm-simd][x64] Optimize extended pairwise add

Use external references to hold splat values.

Bug: v8:11349,v8:11086
Change-Id: I829d136ae7c7f8e28de991d06f6a321551402ae1
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2648972
Reviewed-by: Deepti Gandluri
Commit-Queue: Zhi An Ng
Cr-Commit-Position: refs/heads/master@{#72348}
---
 src/codegen/external-reference.cc             | 20 ++++++++++++
 src/codegen/external-reference.h              |  2 ++
 src/codegen/x64/macro-assembler-x64.cc        | 32 +++++++++++++++++++
 src/codegen/x64/macro-assembler-x64.h         |  3 ++
 .../backend/x64/code-generator-x64.cc         | 31 +++++++++---------
 5 files changed, 72 insertions(+), 16 deletions(-)

diff --git a/src/codegen/external-reference.cc b/src/codegen/external-reference.cc
index b0788309e5..bf8076de51 100644
--- a/src/codegen/external-reference.cc
+++ b/src/codegen/external-reference.cc
@@ -81,12 +81,24 @@ constexpr struct alignas(16) {
 } wasm_i8x16_popcnt_mask = {uint64_t{0x03020201'02010100},
                             uint64_t{0x04030302'03020201}};
 
+constexpr struct alignas(16) {
+  uint64_t a;
+  uint64_t b;
+} wasm_i8x16_splat_0x01 = {uint64_t{0x01010101'01010101},
+                           uint64_t{0x01010101'01010101}};
+
 constexpr struct alignas(16) {
   uint64_t a;
   uint64_t b;
 } wasm_i8x16_splat_0x0f = {uint64_t{0x0F0F0F0F'0F0F0F0F},
                            uint64_t{0x0F0F0F0F'0F0F0F0F}};
 
+constexpr struct alignas(16) {
+  uint64_t a;
+  uint64_t b;
+} wasm_i16x8_splat_0x0001 = {uint64_t{0x00010001'00010001},
+                             uint64_t{0x00010001'00010001}};
+
 constexpr struct alignas(16) {
   uint64_t a;
   uint64_t b;
@@ -562,10 +574,18 @@ ExternalReference ExternalReference::address_of_wasm_i8x16_popcnt_mask() {
   return ExternalReference(reinterpret_cast<Address>(&wasm_i8x16_popcnt_mask));
 }
 
+ExternalReference ExternalReference::address_of_wasm_i8x16_splat_0x01() {
+  return ExternalReference(reinterpret_cast<Address>(&wasm_i8x16_splat_0x01));
+}
+
 ExternalReference ExternalReference::address_of_wasm_i8x16_splat_0x0f() {
   return ExternalReference(reinterpret_cast<Address>(&wasm_i8x16_splat_0x0f));
 }
 
+ExternalReference ExternalReference::address_of_wasm_i16x8_splat_0x0001() {
+  return ExternalReference(reinterpret_cast<Address>(&wasm_i16x8_splat_0x0001));
+}
+
 ExternalReference
 ExternalReference::address_of_wasm_f64x2_convert_low_i32x4_u_int_mask() {
   return ExternalReference(
diff --git a/src/codegen/external-reference.h b/src/codegen/external-reference.h
index d466327118..ba09fb1126 100644
--- a/src/codegen/external-reference.h
+++ b/src/codegen/external-reference.h
@@ -116,7 +116,9 @@ class StatsCounter;
   V(address_of_the_hole_nan, "the_hole_nan")                                   \
   V(address_of_uint32_bias, "uint32_bias")                                     \
   V(address_of_wasm_i8x16_popcnt_mask, "wasm_i8x16_popcnt_mask")               \
+  V(address_of_wasm_i8x16_splat_0x01, "wasm_i8x16_splat_0x01")                 \
   V(address_of_wasm_i8x16_splat_0x0f, "wasm_i8x16_splat_0x0f")                 \
+  V(address_of_wasm_i16x8_splat_0x0001, "wasm_i16x8_splat_0x0001")             \
   V(bytecode_size_table_address, "Bytecodes::bytecode_size_table_address")     \
   V(check_object_type, "check_object_type")                                    \
   V(compute_integer_hash, "ComputeSeededHash")                                 \
diff --git a/src/codegen/x64/macro-assembler-x64.cc b/src/codegen/x64/macro-assembler-x64.cc
index b63b72d706..06aba6d2fe 100644
--- a/src/codegen/x64/macro-assembler-x64.cc
+++ b/src/codegen/x64/macro-assembler-x64.cc
@@ -695,6 +695,16 @@ int TurboAssembler::PopCallerSaved(SaveFPRegsMode fp_mode, Register exclusion1,
   return bytes;
 }
 
+void TurboAssembler::Movdqa(XMMRegister dst, Operand src) {
+  // See comments in Movdqa(XMMRegister, XMMRegister).
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vmovdqa(dst, src);
+  } else {
+    movaps(dst, src);
+  }
+}
+
 void TurboAssembler::Movdqa(XMMRegister dst, XMMRegister src) {
   if (CpuFeatures::IsSupported(AVX)) {
     CpuFeatureScope avx_scope(this, AVX);
@@ -1770,6 +1780,16 @@ void TurboAssembler::RetpolineJump(Register reg) {
   ret(0);
 }
 
+void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1, Operand src2) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vpmaddwd(dst, src1, src2);
+  } else {
+    DCHECK_EQ(dst, src1);
+    pmaddwd(dst, src2);
+  }
+}
+
 void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1,
                              XMMRegister src2) {
   if (CpuFeatures::IsSupported(AVX)) {
@@ -1781,6 +1801,18 @@ void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1,
   }
 }
 
+void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
+                               Operand src2) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vpmaddubsw(dst, src1, src2);
+  } else {
+    CpuFeatureScope ssse3_scope(this, SSSE3);
+    DCHECK_EQ(dst, src1);
+    pmaddubsw(dst, src2);
+  }
+}
+
 void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
                                XMMRegister src2) {
   if (CpuFeatures::IsSupported(AVX)) {
diff --git a/src/codegen/x64/macro-assembler-x64.h b/src/codegen/x64/macro-assembler-x64.h
index 5f2798d6d1..c4636f6f18 100644
--- a/src/codegen/x64/macro-assembler-x64.h
+++ b/src/codegen/x64/macro-assembler-x64.h
@@ -361,6 +361,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
                      Label::Distance condition_met_distance = Label::kFar);
 
   void Movapd(XMMRegister dst, XMMRegister src);
+  void Movdqa(XMMRegister dst, Operand src);
   void Movdqa(XMMRegister dst, XMMRegister src);
 
   template
@@ -536,7 +537,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   void DebugBreak() override;
 
   // Supports both AVX (dst != src1) and SSE (checks that dst == src1).
+  void Pmaddwd(XMMRegister dst, XMMRegister src1, Operand src2);
   void Pmaddwd(XMMRegister dst, XMMRegister src1, XMMRegister src2);
+  void Pmaddubsw(XMMRegister dst, XMMRegister src1, Operand src2);
   void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
 
   // Shufps that will mov src1 into dst if AVX is not supported.
diff --git a/src/compiler/backend/x64/code-generator-x64.cc b/src/compiler/backend/x64/code-generator-x64.cc
index 36064a868f..299ef9aaf0 100644
--- a/src/compiler/backend/x64/code-generator-x64.cc
+++ b/src/compiler/backend/x64/code-generator-x64.cc
@@ -3153,15 +3153,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kX64I32x4ExtAddPairwiseI16x8S: {
       XMMRegister dst = i.OutputSimd128Register();
-      XMMRegister src = i.InputSimd128Register(0);
-      // kScratchDoubleReg = |1|1|1|1|1|1|1|1|
-      __ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
-      __ Psrlw(kScratchDoubleReg, byte{15});
-      // pmaddwd multiplies signed words in kScratchDoubleReg and src, producing
-      // signed doublewords, then adds pairwise.
-      // src = |a|b|c|d|e|f|g|h|
+      XMMRegister src1 = i.InputSimd128Register(0);
+      // pmaddwd multiplies signed words in src1 and src2, producing signed
+      // doublewords, then adds pairwise.
+      // src1 = |a|b|c|d|e|f|g|h|
+      // src2 = |1|1|1|1|1|1|1|1|
       // dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
-      __ Pmaddwd(dst, src, kScratchDoubleReg);
+      Operand src2 = __ ExternalReferenceAsOperand(
+          ExternalReference::address_of_wasm_i16x8_splat_0x0001());
+      __ Pmaddwd(dst, src1, src2);
       break;
     }
     case kX64I32x4ExtAddPairwiseI16x8U: {
@@ -3402,19 +3402,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src = i.InputSimd128Register(0);
       DCHECK_NE(dst, src);
-      // dst = i8x16.splat(1)
-      __ Move(dst, uint32_t{0x01010101});
-      __ Pshufd(dst, dst, byte{0});
+      __ Movdqa(dst,
+                __ ExternalReferenceAsOperand(
+                    ExternalReference::address_of_wasm_i8x16_splat_0x01()));
       __ Pmaddubsw(dst, dst, src);
       break;
     }
     case kX64I16x8ExtAddPairwiseI8x16U: {
       XMMRegister dst = i.OutputSimd128Register();
-      XMMRegister src = i.InputSimd128Register(0);
-      // dst = i8x16.splat(1)
-      __ Move(kScratchDoubleReg, uint32_t{0x01010101});
-      __ Pshufd(kScratchDoubleReg, kScratchDoubleReg, byte{0});
-      __ Pmaddubsw(dst, src, kScratchDoubleReg);
+      XMMRegister src1 = i.InputSimd128Register(0);
+      Operand src2 = __ ExternalReferenceAsOperand(
+          ExternalReference::address_of_wasm_i8x16_splat_0x01());
+      __ Pmaddubsw(dst, src1, src2);
       break;
     }
     case kX64I16x8Q15MulRSatS: {