[wasm-simd] Share i8x16 shr_s shr_u implementation

Move the implementation into shared macro-assembler. TurboFan and
Liftoff for both ia32 and x64 can now share the implementation. No
functionality change expected.

Bug: v8:11589
Change-Id: I8d3567ef6e4a430fe8e007e44d5d55cf8e8a6a7a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3088273
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#76264}
Ng Zhi An 2021-08-12 10:01:33 -07:00 committed by V8 LUCI CQ
parent 682affed8d
commit 4955ecfc68
6 changed files with 151 additions and 187 deletions
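For context on what is being shared: SSE and AVX have packed 16-bit shifts (psraw/psrlw) but no packed 8-bit shifts, so wasm's i8x16.shr_s and i8x16.shr_u must be emulated. A minimal scalar sketch of the lane semantics the shared helpers implement (hypothetical reference code, not part of this CL):

#include <cstdint>

// i8x16.shr_s reference: per-lane arithmetic shift, count taken mod 8.
// Relies on >> of a negative int being an arithmetic shift, which holds
// on the toolchains V8 targets.
void I8x16ShrS_Reference(int8_t lanes[16], uint32_t count) {
  count &= 7;
  for (int i = 0; i < 16; i++) {
    lanes[i] = static_cast<int8_t>(lanes[i] >> count);
  }
}

// i8x16.shr_u reference: per-lane logical shift, count taken mod 8.
void I8x16ShrU_Reference(uint8_t lanes[16], uint32_t count) {
  count &= 7;
  for (int i = 0; i < 16; i++) {
    lanes[i] = static_cast<uint8_t>(lanes[i] >> count);
  }
}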

View File

@@ -6,6 +6,7 @@
#include "src/codegen/assembler.h"
#include "src/codegen/cpu-features.h"
#include "src/codegen/register-arch.h"
#if V8_TARGET_ARCH_IA32
#include "src/codegen/ia32/register-ia32.h"
@@ -18,6 +19,17 @@
namespace v8 {
namespace internal {
void SharedTurboAssembler::Move(Register dst, uint32_t src) {
// Helper to paper over the different assembler function names.
#if V8_TARGET_ARCH_IA32
mov(dst, Immediate(src));
#elif V8_TARGET_ARCH_X64
movl(dst, Immediate(src));
#else
#error Unsupported target architecture.
#endif
}
void SharedTurboAssembler::Move(Register dst, Register src) {
// Helper to paper over the different assembler function names.
if (dst != src) {
@@ -31,6 +43,17 @@ void SharedTurboAssembler::Move(Register dst, Register src) {
}
}
void SharedTurboAssembler::Add(Register dst, Immediate src) {
// Helper to paper over the different assembler function names.
#if V8_TARGET_ARCH_IA32
add(dst, src);
#elif V8_TARGET_ARCH_X64
addq(dst, src);
#else
#error Unsupported target architecture.
#endif
}
void SharedTurboAssembler::And(Register dst, Immediate src) {
// Helper to paper over the different assembler function names.
#if V8_TARGET_ARCH_IA32
@@ -231,6 +254,80 @@ void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
}
}
void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
uint8_t src2, XMMRegister tmp2) {
// Unpack bytes into words, do word (16-bit) shifts, and repack.
DCHECK_NE(dst, tmp2);
uint8_t shift = truncate_to_int3(src2) + 8;
Punpckhbw(tmp2, src1);
Punpcklbw(dst, src1);
Psraw(tmp2, shift);
Psraw(dst, shift);
Packsswb(dst, tmp2);
}
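// Why shifting the widened lanes by shift + 8 is correct: Punpck{l,h}bw
// puts each source byte in the *high* byte of a 16-bit lane, so an
// arithmetic word shift by shift + 8 both applies the byte shift and
// sign-extends the result into the low byte; Packsswb then narrows with
// signed saturation, which is lossless because each result already fits
// in int8. A hypothetical single-lane model (illustrative, not part of
// this CL):
static int8_t I8x16ShrSLaneModel(int8_t lane, uint8_t shift /* 0..7 */,
                                 uint8_t junk) {
  // punpcklbw: source byte into the high half, pre-existing junk below.
  int16_t word =
      static_cast<int16_t>((static_cast<uint8_t>(lane) << 8) | junk);
  // psraw by shift + 8: the junk bits fall off, the sign fills from above.
  int16_t shifted = static_cast<int16_t>(word >> (shift + 8));
  // packsswb: identity here since shifted is in [-128, 127]. For example,
  // lane == -100, shift == 3 yields -13 regardless of the junk byte.
  return static_cast<int8_t>(shifted);
}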
void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
Register src2, Register tmp1,
XMMRegister tmp2, XMMRegister tmp3) {
DCHECK(!AreAliased(dst, tmp2, tmp3));
DCHECK_NE(src1, tmp2);
// Unpack the bytes into words, do arithmetic shifts, and repack.
Punpckhbw(tmp2, src1);
Punpcklbw(dst, src1);
// Prepare shift value
Move(tmp1, src2);
// Take shift value modulo 8.
And(tmp1, Immediate(7));
Add(tmp1, Immediate(8));
Movd(tmp3, tmp1);
Psraw(tmp2, tmp3);
Psraw(dst, tmp3);
Packsswb(dst, tmp2);
}
void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
uint8_t src2, Register tmp1,
XMMRegister tmp2) {
DCHECK_NE(dst, tmp2);
if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
movaps(dst, src1);
src1 = dst;
}
// Perform 16-bit shift, then mask away high bits.
uint8_t shift = truncate_to_int3(src2);
Psrlw(dst, src1, shift);
uint8_t bmask = 0xff >> shift;
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
Move(tmp1, mask);
Movd(tmp2, tmp1);
Pshufd(tmp2, tmp2, byte{0});
Pand(dst, tmp2);
}
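// The unsigned immediate path skips the widen/repack dance: Psrlw shifts
// every byte lane correctly except that the adjacent lane's low bits leak
// into each byte's high bits, and Pand with the broadcast mask 0xff >> shift
// clears exactly those leaked bits. A hypothetical per-byte model
// (illustrative, not part of this CL):
static uint8_t I8x16ShrULaneModel(uint8_t lane, uint8_t neighbor,
                                  uint8_t shift /* 0..7 */) {
  // psrlw: the neighboring lane's low bits land in this lane's high bits.
  uint8_t shifted =
      static_cast<uint8_t>((lane >> shift) | (neighbor << (8 - shift)));
  // pand with the broadcast byte mask keeps only the genuine result bits.
  uint8_t bmask = static_cast<uint8_t>(0xFF >> shift);
  return static_cast<uint8_t>(shifted & bmask);  // == lane >> shift
}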
void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
Register src2, Register tmp1,
XMMRegister tmp2, XMMRegister tmp3) {
DCHECK(!AreAliased(dst, tmp2, tmp3));
DCHECK_NE(src1, tmp2);
// Unpack the bytes into words, do logical shifts, and repack.
Punpckhbw(tmp2, src1);
Punpcklbw(dst, src1);
// Prepare shift value.
Move(tmp1, src2);
// Take shift value modulo 8.
And(tmp1, Immediate(7));
Add(tmp1, Immediate(8));
Movd(tmp3, tmp1);
Psrlw(tmp2, tmp3);
Psrlw(dst, tmp3);
Packuswb(dst, tmp2);
}
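// For a variable count the 0xff >> shift mask would have to be built at run
// time, so both variable-count paths reuse the widen/shift/repack shape and
// differ only in Psraw vs. Psrlw and Packsswb vs. Packuswb. The unsigned
// repack never clamps: after a logical shift by shift + 8 every word lane is
// back in [0, 255]. An exhaustive check (hypothetical, not part of this CL):
static void CheckI8x16ShrURepackIsLossless() {
  for (int b = 0; b < 256; b++) {
    for (int s = 0; s < 8; s++) {
      uint16_t word = static_cast<uint16_t>(b << 8);  // byte in the high half
      uint16_t shifted = word >> (s + 8);             // Psrlw by s + 8
      DCHECK_LE(shifted, 0xFF);  // so Packuswb saturation is a no-op
      DCHECK_EQ(shifted, static_cast<uint16_t>(b >> s));
    }
  }
}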
void SharedTurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch,
bool is_signed) {

View File

@@ -33,8 +33,10 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
public:
using TurboAssemblerBase::TurboAssemblerBase;
void Move(Register dst, uint32_t src);
// Move if registers are not identical.
void Move(Register dst, Register src);
void Add(Register dst, Immediate src);
void And(Register dst, Immediate src);
void Movapd(XMMRegister dst, XMMRegister src);
@@ -298,6 +300,14 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
void F32x4Splat(XMMRegister dst, DoubleRegister src);
void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane);
void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
void I8x16ShrS(XMMRegister dst, XMMRegister src1, uint8_t src2,
XMMRegister tmp2);
void I8x16ShrS(XMMRegister dst, XMMRegister src1, Register src2,
Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
void I8x16ShrU(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1,
XMMRegister tmp2);
void I8x16ShrU(XMMRegister dst, XMMRegister src1, Register src2,
Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool is_signed);
void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2,

View File

@@ -3147,28 +3147,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kIA32I8x16ShrS: {
XMMRegister dst = i.OutputSimd128Register();
// TODO(zhin): remove this restriction from instruction-selector.
DCHECK_EQ(dst, i.InputSimd128Register(0));
if (HasImmediateInput(instr, 1)) {
__ Punpckhbw(kScratchDoubleReg, dst);
__ Punpcklbw(dst, dst);
uint8_t shift = i.InputInt3(1) + 8;
__ Psraw(kScratchDoubleReg, shift);
__ Psraw(dst, shift);
__ Packsswb(dst, kScratchDoubleReg);
__ I8x16ShrS(dst, i.InputSimd128Register(0), i.InputInt3(1),
kScratchDoubleReg);
} else {
Register tmp = i.ToRegister(instr->TempAt(0));
XMMRegister tmp_simd = i.TempSimd128Register(1);
// Unpack the bytes into words, do arithmetic shifts, and repack.
__ Punpckhbw(kScratchDoubleReg, dst);
__ Punpcklbw(dst, dst);
__ mov(tmp, i.InputRegister(1));
// Take shift value modulo 8.
__ and_(tmp, 7);
__ add(tmp, Immediate(8));
__ Movd(tmp_simd, tmp);
__ Psraw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
__ Psraw(dst, dst, tmp_simd);
__ Packsswb(dst, kScratchDoubleReg);
__ I8x16ShrS(dst, i.InputSimd128Register(0), i.InputRegister(1),
i.TempRegister(0), kScratchDoubleReg,
i.TempSimd128Register(1));
}
break;
}
@@ -3271,34 +3258,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kIA32I8x16ShrU: {
XMMRegister dst = i.OutputSimd128Register();
// TODO(zhin): remove this restriction from instruction-selector.
DCHECK_EQ(dst, i.InputSimd128Register(0));
Register tmp = i.ToRegister(instr->TempAt(0));
XMMRegister tmp_simd = i.TempSimd128Register(1);
if (HasImmediateInput(instr, 1)) {
// Perform 16-bit shift, then mask away high bits.
uint8_t shift = i.InputInt3(1);
__ Psrlw(dst, dst, byte{shift});
uint8_t bmask = 0xff >> shift;
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
__ mov(tmp, mask);
__ Movd(tmp_simd, tmp);
__ Pshufd(tmp_simd, tmp_simd, uint8_t{0});
__ Pand(dst, tmp_simd);
__ I8x16ShrU(dst, i.InputSimd128Register(0), i.InputInt3(1), tmp,
kScratchDoubleReg);
} else {
// Unpack the bytes into words, do logical shifts, and repack.
__ Punpckhbw(kScratchDoubleReg, dst);
__ Punpcklbw(dst, dst);
__ mov(tmp, i.InputRegister(1));
// Take shift value modulo 8.
__ and_(tmp, 7);
__ add(tmp, Immediate(8));
__ Movd(tmp_simd, tmp);
__ Psrlw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
__ Psrlw(dst, dst, tmp_simd);
__ Packuswb(dst, kScratchDoubleReg);
__ I8x16ShrU(dst, i.InputSimd128Register(0), i.InputRegister(1), tmp,
kScratchDoubleReg, i.TempSimd128Register(1));
}
break;
}
case kIA32I8x16MinU: {

View File

@@ -3528,30 +3528,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I8x16ShrS: {
XMMRegister dst = i.OutputSimd128Register();
// TODO(zhin): remove this restriction from instruction-selector.
DCHECK_EQ(dst, i.InputSimd128Register(0));
if (HasImmediateInput(instr, 1)) {
__ Punpckhbw(kScratchDoubleReg, dst);
__ Punpcklbw(dst, dst);
uint8_t shift = i.InputInt3(1) + 8;
__ Psraw(kScratchDoubleReg, shift);
__ Psraw(dst, shift);
__ Packsswb(dst, kScratchDoubleReg);
__ I8x16ShrS(dst, i.InputSimd128Register(0), i.InputInt3(1),
kScratchDoubleReg);
} else {
// Temp registers for shift mask and additional moves to XMM registers.
Register tmp = i.ToRegister(instr->TempAt(0));
XMMRegister tmp_simd = i.TempSimd128Register(1);
// Unpack the bytes into words, do arithmetic shifts, and repack.
__ Punpckhbw(kScratchDoubleReg, dst);
__ Punpcklbw(dst, dst);
// Prepare shift value
__ movq(tmp, i.InputRegister(1));
// Take shift value modulo 8.
__ andq(tmp, Immediate(7));
__ addq(tmp, Immediate(8));
__ Movq(tmp_simd, tmp);
__ Psraw(kScratchDoubleReg, tmp_simd);
__ Psraw(dst, tmp_simd);
__ Packsswb(dst, kScratchDoubleReg);
// TODO(zhin): use kScratchRegister instead of TempRegister.
__ I8x16ShrS(dst, i.InputSimd128Register(0), i.InputRegister(1),
i.TempRegister(0), kScratchDoubleReg,
i.TempSimd128Register(1));
}
break;
}
@@ -3607,34 +3593,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I8x16ShrU: {
XMMRegister dst = i.OutputSimd128Register();
// Unpack the bytes into words, do logical shifts, and repack.
// TODO(zhin): remove this restriction from instruction-selector.
DCHECK_EQ(dst, i.InputSimd128Register(0));
// Temp registers for shift mask and additional moves to XMM registers.
Register tmp = i.ToRegister(instr->TempAt(0));
XMMRegister tmp_simd = i.TempSimd128Register(1);
// TODO(zhin): use kScratchRegister instead of tmp.
Register tmp = i.TempRegister(0);
if (HasImmediateInput(instr, 1)) {
// Perform 16-bit shift, then mask away high bits.
uint8_t shift = i.InputInt3(1);
__ Psrlw(dst, byte{shift});
uint8_t bmask = 0xff >> shift;
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
__ movl(tmp, Immediate(mask));
__ Movd(tmp_simd, tmp);
__ Pshufd(tmp_simd, tmp_simd, byte{0});
__ Pand(dst, tmp_simd);
__ I8x16ShrU(dst, i.InputSimd128Register(0), i.InputInt3(1), tmp,
kScratchDoubleReg);
} else {
__ Punpckhbw(kScratchDoubleReg, dst);
__ Punpcklbw(dst, dst);
// Prepare shift value
__ movq(tmp, i.InputRegister(1));
// Take shift value modulo 8.
__ andq(tmp, Immediate(7));
__ addq(tmp, Immediate(8));
__ Movq(tmp_simd, tmp);
__ Psrlw(kScratchDoubleReg, tmp_simd);
__ Psrlw(dst, tmp_simd);
__ Packuswb(dst, kScratchDoubleReg);
__ I8x16ShrU(dst, i.InputSimd128Register(0), i.InputRegister(1), tmp,
kScratchDoubleReg, i.TempSimd128Register(1));
}
break;
}

View File

@@ -2718,40 +2718,6 @@ void EmitSimdShiftOpImm(LiftoffAssembler* assm, LiftoffRegister dst,
}
}
enum class ShiftSignedness { kSigned, kUnsigned };
template <bool is_signed>
void EmitI8x16Shr(LiftoffAssembler* assm, LiftoffRegister dst,
LiftoffRegister lhs, LiftoffRegister rhs) {
// Same algorithm is used for both signed and unsigned shifts, the only
// difference is the actual shift and pack in the end. This is the same
// algorithm as used in code-generator-ia32.cc
Register tmp =
assm->GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(rhs)).gp();
XMMRegister tmp_simd =
assm->GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs)).fp();
// Unpack the bytes into words, do logical shifts, and repack.
assm->Punpckhbw(liftoff::kScratchDoubleReg, lhs.fp());
assm->Punpcklbw(dst.fp(), lhs.fp());
assm->mov(tmp, rhs.gp());
// Take shift value modulo 8.
assm->and_(tmp, 7);
assm->add(tmp, Immediate(8));
assm->Movd(tmp_simd, tmp);
if (is_signed) {
assm->Psraw(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg,
tmp_simd);
assm->Psraw(dst.fp(), dst.fp(), tmp_simd);
assm->Packsswb(dst.fp(), liftoff::kScratchDoubleReg);
} else {
assm->Psrlw(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg,
tmp_simd);
assm->Psrlw(dst.fp(), dst.fp(), tmp_simd);
assm->Packuswb(dst.fp(), liftoff::kScratchDoubleReg);
}
}
inline void EmitAnyTrue(LiftoffAssembler* assm, LiftoffRegister dst,
LiftoffRegister src) {
Register tmp =
@@ -3416,39 +3382,32 @@ void LiftoffAssembler::emit_i8x16_shli(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_shr_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitI8x16Shr</*is_signed=*/true>(this, dst, lhs, rhs);
Register tmp = GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(rhs)).gp();
XMMRegister tmp_simd =
GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs)).fp();
I8x16ShrS(dst.fp(), lhs.fp(), rhs.gp(), tmp, liftoff::kScratchDoubleReg,
tmp_simd);
}
void LiftoffAssembler::emit_i8x16_shri_s(LiftoffRegister dst,
LiftoffRegister lhs, int32_t rhs) {
Punpckhbw(liftoff::kScratchDoubleReg, lhs.fp());
Punpcklbw(dst.fp(), lhs.fp());
uint8_t shift = (rhs & 7) + 8;
Psraw(liftoff::kScratchDoubleReg, shift);
Psraw(dst.fp(), shift);
Packsswb(dst.fp(), liftoff::kScratchDoubleReg);
I8x16ShrS(dst.fp(), lhs.fp(), rhs, liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_i8x16_shr_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitI8x16Shr</*is_signed=*/false>(this, dst, lhs, rhs);
Register tmp = GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(rhs)).gp();
XMMRegister tmp_simd =
GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs)).fp();
I8x16ShrU(dst.fp(), lhs.fp(), rhs.gp(), tmp, liftoff::kScratchDoubleReg,
tmp_simd);
}
void LiftoffAssembler::emit_i8x16_shri_u(LiftoffRegister dst,
LiftoffRegister lhs, int32_t rhs) {
Register tmp = GetUnusedRegister(kGpReg, {}).gp();
// Perform 16-bit shift, then mask away high bits.
uint8_t shift = rhs & 7;
liftoff::EmitSimdShiftOpImm<&Assembler::vpsrlw, &Assembler::psrlw, 3>(
this, dst, lhs, rhs);
uint8_t bmask = 0xff >> shift;
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
mov(tmp, mask);
Movd(liftoff::kScratchDoubleReg, tmp);
Pshufd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, uint8_t{0});
Pand(dst.fp(), liftoff::kScratchDoubleReg);
I8x16ShrU(dst.fp(), lhs.fp(), rhs, tmp, liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_i8x16_add(LiftoffRegister dst, LiftoffRegister lhs,

View File

@@ -2335,29 +2335,6 @@ void EmitSimdShiftOpImm(LiftoffAssembler* assm, LiftoffRegister dst,
}
}
template <bool is_signed>
void EmitI8x16Shr(LiftoffAssembler* assm, LiftoffRegister dst,
LiftoffRegister lhs, LiftoffRegister rhs) {
// Same algorithm as the one in code-generator-x64.cc.
assm->Punpckhbw(kScratchDoubleReg, lhs.fp());
assm->Punpcklbw(dst.fp(), lhs.fp());
// Prepare shift value
assm->movq(kScratchRegister, rhs.gp());
// Take shift value modulo 8.
assm->andq(kScratchRegister, Immediate(7));
assm->addq(kScratchRegister, Immediate(8));
assm->Movq(liftoff::kScratchDoubleReg2, kScratchRegister);
if (is_signed) {
assm->Psraw(kScratchDoubleReg, liftoff::kScratchDoubleReg2);
assm->Psraw(dst.fp(), liftoff::kScratchDoubleReg2);
assm->Packsswb(dst.fp(), kScratchDoubleReg);
} else {
assm->Psrlw(kScratchDoubleReg, liftoff::kScratchDoubleReg2);
assm->Psrlw(dst.fp(), liftoff::kScratchDoubleReg2);
assm->Packuswb(dst.fp(), kScratchDoubleReg);
}
}
inline void EmitAnyTrue(LiftoffAssembler* assm, LiftoffRegister dst,
LiftoffRegister src) {
assm->xorq(dst.gp(), dst.gp());
@@ -2973,43 +2950,25 @@ void LiftoffAssembler::emit_i8x16_shli(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_shr_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitI8x16Shr</*is_signed=*/true>(this, dst, lhs, rhs);
I8x16ShrS(dst.fp(), lhs.fp(), rhs.gp(), kScratchRegister, kScratchDoubleReg,
liftoff::kScratchDoubleReg2);
}
void LiftoffAssembler::emit_i8x16_shri_s(LiftoffRegister dst,
LiftoffRegister lhs, int32_t rhs) {
Punpckhbw(kScratchDoubleReg, lhs.fp());
Punpcklbw(dst.fp(), lhs.fp());
uint8_t shift = (rhs & 7) + 8;
Psraw(kScratchDoubleReg, shift);
Psraw(dst.fp(), shift);
Packsswb(dst.fp(), kScratchDoubleReg);
I8x16ShrS(dst.fp(), lhs.fp(), rhs, kScratchDoubleReg);
}
void LiftoffAssembler::emit_i8x16_shr_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitI8x16Shr</*is_signed=*/false>(this, dst, lhs, rhs);
I8x16ShrU(dst.fp(), lhs.fp(), rhs.gp(), kScratchRegister, kScratchDoubleReg,
liftoff::kScratchDoubleReg2);
}
void LiftoffAssembler::emit_i8x16_shri_u(LiftoffRegister dst,
LiftoffRegister lhs, int32_t rhs) {
// Perform 16-bit shift, then mask away high bits.
uint8_t shift = rhs & 7; // i.InputInt3(1);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpsrlw(dst.fp(), lhs.fp(), byte{shift});
} else {
if (dst != lhs) Movaps(dst.fp(), lhs.fp());
psrlw(dst.fp(), byte{shift});
}
uint8_t bmask = 0xff >> shift;
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
movl(kScratchRegister, Immediate(mask));
Movd(kScratchDoubleReg, kScratchRegister);
Pshufd(kScratchDoubleReg, kScratchDoubleReg, byte{0});
Pand(dst.fp(), kScratchDoubleReg);
I8x16ShrU(dst.fp(), lhs.fp(), rhs, kScratchRegister, kScratchDoubleReg);
}
void LiftoffAssembler::emit_i8x16_add(LiftoffRegister dst, LiftoffRegister lhs,