[wasm-simd][ia32][x64] Optimize extended add pairwise for AVX

Optimize extadd_pairwise when AVX is supported. If SSE4_1 is available,
we can use the SSE version of the same code sequence. However, there is
a potentially better lowering that only requires SSE2, which has fewer
instructions, but will only work better if we have rip-relative
constants (otherwise even using ExternalReference it would be 2 moves).

Bug: v8:11086,v8:11349
Change-Id: Iac6f31cf8052161846ff5242b4c18c638c83e0f6
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2719298
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73079}
This commit is contained in:
Ng Zhi An 2021-02-25 09:24:30 -08:00 committed by Commit Bot
parent 8380ebb277
commit 1d31814978
2 changed files with 71 additions and 20 deletions

View File

@ -1182,16 +1182,41 @@ void TurboAssembler::I32x4ExtAddPairwiseI16x8S(XMMRegister dst, XMMRegister src,
// Emits code for the unsigned pairwise extending add i16x8 -> i32x4: every
// 32-bit lane of |dst| receives the sum of the two zero-extended 16-bit halves
// of the matching 32-bit lane of |src|.
//
// |dst| may be the same register as |src|; every path below reads |src| for
// the last time before (or in the same instruction as) the first write to
// |dst|. |tmp| is clobbered on every path.
// NOTE(review): assumes |tmp| is distinct from both |dst| and |src| -- confirm
// at the call sites.
void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
                                               XMMRegister tmp) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // src = |a|b|c|d|e|f|g|h| (low)
    // tmp = |0|a|0|c|0|e|0|g|
    // The shift goes to tmp (not dst) so that src stays intact for the blend
    // even when dst aliases src.
    vpsrld(tmp, src, 16);
    // Mask 0xAA takes the odd (upper) words from tmp, which are all zero, and
    // the even (lower) words from src.
    // dst = |0|b|0|d|0|f|0|h|
    vpblendw(dst, src, tmp, 0xAA);
    // dst = |a+b|c+d|e+f|g+h|
    vpaddd(dst, tmp, dst);
  } else if (CpuFeatures::IsSupported(SSE4_1)) {
    CpuFeatureScope sse_scope(this, SSE4_1);
    // There is a potentially better lowering if we get rip-relative constants,
    // see https://github.com/WebAssembly/simd/pull/380.
    // src = |a|b|c|d|e|f|g|h| (low)
    // tmp = |0|a|0|c|0|e|0|g|
    movaps(tmp, src);
    psrld(tmp, 16);
    if (dst != src) {
      movaps(dst, src);
    }
    // dst = |0|b|0|d|0|f|0|h|
    pblendw(dst, tmp, 0xAA);
    // dst = |a+b|c+d|e+f|g+h|
    paddd(dst, tmp);
  } else {
    // SSE2-only fallback: build the low-half mask in tmp instead of blending.
    // src = |a|b|c|d|e|f|g|h| (low)
    // tmp = i32x4.splat(0x0000FFFF)
    pcmpeqd(tmp, tmp);
    psrld(tmp, byte{16});
    // tmp = |0|b|0|d|0|f|0|h|
    pand(tmp, src);
    // dst = |0|a|0|c|0|e|0|g|
    if (dst != src) {
      movaps(dst, src);
    }
    psrld(dst, byte{16});
    // dst = |a+b|c+d|e+f|g+h|
    paddd(dst, tmp);
  }
}
void TurboAssembler::ShlPair(Register high, Register low, uint8_t shift) {

View File

@ -10,6 +10,7 @@
#include "src/base/utils/random-number-generator.h"
#include "src/codegen/callable.h"
#include "src/codegen/code-factory.h"
#include "src/codegen/cpu-features.h"
#include "src/codegen/external-reference-table.h"
#include "src/codegen/macro-assembler.h"
#include "src/codegen/register-configuration.h"
@ -2589,16 +2590,41 @@ void TurboAssembler::I16x8ExtAddPairwiseI8x16S(XMMRegister dst,
// Emits code for the unsigned pairwise extending add i16x8 -> i32x4: every
// 32-bit lane of |dst| receives the sum of the two zero-extended 16-bit halves
// of the matching 32-bit lane of |src|.
//
// |dst| may be the same register as |src|; every path below reads |src| for
// the last time before (or in the same instruction as) the first write to
// |dst|. kScratchDoubleReg is clobbered on every path.
// NOTE(review): assumes neither |dst| nor |src| is kScratchDoubleReg --
// confirm at the call sites.
void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
                                               XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // src = |a|b|c|d|e|f|g|h| (low)
    // kScratchDoubleReg = |0|a|0|c|0|e|0|g|
    // The shift goes to the scratch register (not dst) so that src stays
    // intact for the blend even when dst aliases src.
    vpsrld(kScratchDoubleReg, src, 16);
    // Mask 0xAA takes the odd (upper) words from the scratch register, which
    // are all zero, and the even (lower) words from src.
    // dst = |0|b|0|d|0|f|0|h|
    vpblendw(dst, src, kScratchDoubleReg, 0xAA);
    // dst = |a+b|c+d|e+f|g+h|
    vpaddd(dst, kScratchDoubleReg, dst);
  } else if (CpuFeatures::IsSupported(SSE4_1)) {
    CpuFeatureScope sse_scope(this, SSE4_1);
    // There is a potentially better lowering if we get rip-relative constants,
    // see https://github.com/WebAssembly/simd/pull/380.
    // src = |a|b|c|d|e|f|g|h| (low)
    // kScratchDoubleReg = |0|a|0|c|0|e|0|g|
    movaps(kScratchDoubleReg, src);
    psrld(kScratchDoubleReg, 16);
    if (dst != src) {
      movaps(dst, src);
    }
    // dst = |0|b|0|d|0|f|0|h|
    pblendw(dst, kScratchDoubleReg, 0xAA);
    // dst = |a+b|c+d|e+f|g+h|
    paddd(dst, kScratchDoubleReg);
  } else {
    // SSE2-only fallback: build the low-half mask in the scratch register
    // instead of blending.
    // src = |a|b|c|d|e|f|g|h| (low)
    // kScratchDoubleReg = i32x4.splat(0x0000FFFF)
    pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
    psrld(kScratchDoubleReg, byte{16});
    // kScratchDoubleReg = |0|b|0|d|0|f|0|h|
    pand(kScratchDoubleReg, src);
    // dst = |0|a|0|c|0|e|0|g|
    if (dst != src) {
      movaps(dst, src);
    }
    psrld(dst, byte{16});
    // dst = |a+b|c+d|e+f|g+h|
    paddd(dst, kScratchDoubleReg);
  }
}
void TurboAssembler::I8x16Swizzle(XMMRegister dst, XMMRegister src,