[wasm-simd][x64] Optimize some integer widen_high ops

Optimize:
- i32x4.widen_high_i16x8_s
- i32x4.widen_high_i16x8_u
- i16x8.widen_high_i8x16_s
- i16x8.widen_high_i8x16_u

These optimizations were suggested in http://b/175364869.

The main change is to move away from palignr, which has a dependency on
dst; in addition, its AVX version is 2 bytes longer than punpckhqdq.

For the signed and unsigned variants, we have slightly different
optimizations. Unsigned variants can use a punpckh* instruction with a
zeroed scratch register, which effectively zero-extends. Signed variants
use the movhlps instruction to move the high half to the low half of dst,
then use packed sign-extension instructions.

The common fallback for these instructions is to use pshufd, which does
not have a dependency on dst, but is 1 byte longer than the punpckh*
instructions.

FIXED=b/175364869

Change-Id: If28da2aaa8f6e39a58e63b01cc9a81bbbb294606
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2591853
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71856}
This commit is contained in:
Zhi An Ng 2020-12-17 01:33:26 +00:00 committed by Commit Bot
parent 8c48059844
commit b145152db2
3 changed files with 98 additions and 12 deletions

View File

@ -14,6 +14,7 @@
#include "src/codegen/register-configuration.h"
#include "src/codegen/string-constants.h"
#include "src/codegen/x64/assembler-x64.h"
#include "src/codegen/x64/register-x64.h"
#include "src/common/external-pointer.h"
#include "src/common/globals.h"
#include "src/debug/debug.h"
@ -1988,6 +1989,88 @@ void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src,
}
}
// Emits code for i32x4.widen_high_i16x8_s: sign-extends the four 16-bit lanes
// in the upper 64 bits of |src| into four 32-bit lanes of |dst|.
void TurboAssembler::I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    // Duplicate the upper 64 bits of src into both halves of dst, then widen
    // the low half in place.
    vpunpckhqdq(dst, src, src);
    vpmovsxwd(dst, dst);
    return;
  }

  // SSE: first bring the upper 64 bits of src down into the low half of dst.
  // NOTE(review): pmovsxwd below is an SSE4.1 instruction; presumably callers
  // guarantee SSE4.1 is available -- confirm whether a CpuFeatureScope is
  // required here.
  if (dst == src) {
    // movhlps is 2 bytes shorter than pshufd, but has a dependency on dst
    // (harmless here, since dst is also the input).
    movhlps(dst, src);
  } else {
    // pshufd with 0xEE selects dwords 3,2,3,2 and has no dependency on dst.
    pshufd(dst, src, 0xEE);
  }
  // Both paths finish by sign-extending the low four words to dwords.
  pmovsxwd(dst, dst);
}
// Emits code for i32x4.widen_high_i16x8_u: zero-extends the four 16-bit lanes
// in the upper 64 bits of |src| into four 32-bit lanes of |dst|. Writes
// kScratchDoubleReg when dst == src.
void TurboAssembler::I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    // Interleaving the high words of src with zero words zero-extends them:
    //   zero = |0|0|0|0|0|0|0|0|
    //   src  = |a|b|c|d|e|f|g|h|
    //   dst  = |0|a|0|b|0|c|0|d|
    // dst itself can hold the zeros unless it aliases src.
    XMMRegister zero = dst;
    if (dst == src) {
      zero = kScratchDoubleReg;
    }
    vpxor(zero, zero, zero);
    vpunpckhwd(dst, src, zero);
    return;
  }

  if (dst == src) {
    // Zero the scratch register and interleave in place; xorps can be
    // executed on more ports than pshufd.
    xorps(kScratchDoubleReg, kScratchDoubleReg);
    punpckhwd(dst, kScratchDoubleReg);
    return;
  }

  // Distinct registers: pshufd has no dependency on dst. Move the upper
  // 64 bits of src into the low half of dst, then zero-extend.
  // NOTE(review): pmovzxwd is an SSE4.1 instruction; presumably callers
  // guarantee SSE4.1 is available -- confirm.
  pshufd(dst, src, 0xEE);
  pmovzxwd(dst, dst);
}
// Emits code for i16x8.widen_high_i8x16_s: sign-extends the eight 8-bit lanes
// in the upper 64 bits of |src| into eight 16-bit lanes of |dst|.
void TurboAssembler::I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    // Duplicate the upper 64 bits of src into both halves of dst, then widen
    // the low half in place.
    vpunpckhqdq(dst, src, src);
    vpmovsxbw(dst, dst);
    return;
  }

  // SSE: first bring the upper 64 bits of src down into the low half of dst.
  // NOTE(review): pmovsxbw below is an SSE4.1 instruction; presumably callers
  // guarantee SSE4.1 is available -- confirm whether a CpuFeatureScope is
  // required here.
  if (dst == src) {
    // movhlps is 2 bytes shorter than pshufd, but has a dependency on dst
    // (harmless here, since dst is also the input).
    movhlps(dst, src);
  } else {
    // pshufd with 0xEE selects dwords 3,2,3,2 and has no dependency on dst.
    pshufd(dst, src, 0xEE);
  }
  // Both paths finish by sign-extending the low eight bytes to words.
  pmovsxbw(dst, dst);
}
// Emits code for i16x8.widen_high_i8x16_u: zero-extends the eight 8-bit lanes
// in the upper 64 bits of |src| into eight 16-bit lanes of |dst|. Writes
// kScratchDoubleReg when dst == src.
void TurboAssembler::I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    // Interleaving the high bytes of src with zero bytes zero-extends them:
    //   zero = |0|0|0|0|0|0|0|0 | 0|0|0|0|0|0|0|0|
    //   src  = |a|b|c|d|e|f|g|h | i|j|k|l|m|n|o|p|
    //   dst  = |0|a|0|b|0|c|0|d | 0|e|0|f|0|g|0|h|
    // dst itself can hold the zeros unless it aliases src.
    XMMRegister zero = dst;
    if (dst == src) {
      zero = kScratchDoubleReg;
    }
    vpxor(zero, zero, zero);
    vpunpckhbw(dst, src, zero);
    return;
  }

  if (dst == src) {
    // Zero the scratch register and interleave in place; xorps can be
    // executed on more ports than pshufd.
    xorps(kScratchDoubleReg, kScratchDoubleReg);
    punpckhbw(dst, kScratchDoubleReg);
    return;
  }

  // Distinct registers: pshufd has no dependency on dst. Move the upper
  // 64 bits of src into the low half of dst, then zero-extend.
  // NOTE(review): pmovzxbw is an SSE4.1 instruction; presumably callers
  // guarantee SSE4.1 is available -- confirm.
  pshufd(dst, src, 0xEE);
  pmovzxbw(dst, dst);
}
// Two-operand convenience overload: forwards to the three-operand Psrld with
// |dst| used as both the destination and the source operand.
void TurboAssembler::Psrld(XMMRegister dst, byte imm8) {
Psrld(dst, dst, imm8);
}

View File

@ -565,6 +565,13 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
// Supports both SSE and AVX. Move src1 to dst if they are not equal on SSE.
void Pshufb(XMMRegister dst, XMMRegister src1, XMMRegister src2);
// These Wasm SIMD ops do not have direct lowerings on x64. These
// helpers are optimized to produce the fastest and smallest codegen.
// Each helper widens the lanes in the upper 64 bits of |src| to twice their
// width and writes the result to |dst|: the S variants sign-extend, the U
// variants zero-extend. |dst| and |src| may be the same register; in that
// case the U variants overwrite kScratchDoubleReg.
void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src);
void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src);
void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src);
void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src);
void CompareRoot(Register with, RootIndex index);
void CompareRoot(Operand with, RootIndex index);

View File

@ -2963,9 +2963,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I32x4SConvertI16x8High: {
XMMRegister dst = i.OutputSimd128Register();
__ Palignr(dst, i.InputSimd128Register(0), uint8_t{8});
__ Pmovsxwd(dst, dst);
__ I32x4SConvertI16x8High(i.OutputSimd128Register(),
i.InputSimd128Register(0));
break;
}
case kX64I32x4Neg: {
@ -3069,9 +3068,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I32x4UConvertI16x8High: {
XMMRegister dst = i.OutputSimd128Register();
__ Palignr(dst, i.InputSimd128Register(0), uint8_t{8});
__ Pmovzxwd(dst, dst);
__ I32x4UConvertI16x8High(i.OutputSimd128Register(),
i.InputSimd128Register(0));
break;
}
case kX64I32x4ShrU: {
@ -3188,9 +3186,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I16x8SConvertI8x16High: {
XMMRegister dst = i.OutputSimd128Register();
__ Palignr(dst, i.InputSimd128Register(0), uint8_t{8});
__ Pmovsxbw(dst, dst);
__ I16x8SConvertI8x16High(i.OutputSimd128Register(),
i.InputSimd128Register(0));
break;
}
case kX64I16x8Neg: {
@ -3278,9 +3275,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I16x8UConvertI8x16High: {
XMMRegister dst = i.OutputSimd128Register();
__ Palignr(dst, i.InputSimd128Register(0), uint8_t{8});
__ Pmovzxbw(dst, dst);
__ I16x8UConvertI8x16High(i.OutputSimd128Register(),
i.InputSimd128Register(0));
break;
}
case kX64I16x8ShrU: {