[wasm-simd] Share i8x16 shr_s shr_u implementation
Move the implementation into the shared macro-assembler. TurboFan and
Liftoff for both ia32 and x64 can now share the implementation. No
functionality change expected.

Bug: v8:11589
Change-Id: I8d3567ef6e4a430fe8e007e44d5d55cf8e8a6a7a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3088273
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#76264}
parent 682affed8d
commit 4955ecfc68
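Background on the trick the shared helpers below rely on: x86 has no per-byte SIMD shift, so the i8x16 shifts are synthesized from 16-bit (word) shifts. A minimal scalar model of the signed path follows; it is not part of the commit, and the function name and exhaustive check are illustrative only.

// Scalar model of the unpack/shift/pack trick used by I8x16ShrS below.
// punpcklbw/punpckhbw of a register with itself turns each byte b into the
// word (b << 8) | b; psraw by (n & 7) + 8 then produces the per-byte
// arithmetic shift, and packsswb's signed saturation is exact here because
// every result already fits in an int8.
#include <cassert>
#include <cstdint>

int8_t I8ShrSModel(uint8_t b, uint8_t n) {
  int16_t w = static_cast<int16_t>((b << 8) | b);  // punpck[lh]bw with itself
  return static_cast<int8_t>(w >> ((n & 7) + 8));  // psraw by (n mod 8) + 8
}

int main() {
  for (int b = 0; b < 256; ++b) {
    for (int n = 0; n < 8; ++n) {
      assert(I8ShrSModel(static_cast<uint8_t>(b), static_cast<uint8_t>(n)) ==
             static_cast<int8_t>(static_cast<int8_t>(b) >> n));
    }
  }
  return 0;
}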
@@ -6,6 +6,7 @@
 #include "src/codegen/assembler.h"
+#include "src/codegen/cpu-features.h"
 #include "src/codegen/register-arch.h"
 
 #if V8_TARGET_ARCH_IA32
 #include "src/codegen/ia32/register-ia32.h"
@@ -18,6 +19,17 @@
 namespace v8 {
 namespace internal {
 
+void SharedTurboAssembler::Move(Register dst, uint32_t src) {
+  // Helper to paper over the different assembler function names.
+#if V8_TARGET_ARCH_IA32
+  mov(dst, Immediate(src));
+#elif V8_TARGET_ARCH_X64
+  movl(dst, Immediate(src));
+#else
+#error Unsupported target architecture.
+#endif
+}
+
 void SharedTurboAssembler::Move(Register dst, Register src) {
   // Helper to paper over the different assembler function names.
   if (dst != src) {
@@ -31,6 +43,17 @@ void SharedTurboAssembler::Move(Register dst, Register src) {
   }
 }
 
+void SharedTurboAssembler::Add(Register dst, Immediate src) {
+  // Helper to paper over the different assembler function names.
+#if V8_TARGET_ARCH_IA32
+  add(dst, src);
+#elif V8_TARGET_ARCH_X64
+  addq(dst, src);
+#else
+#error Unsupported target architecture.
+#endif
+}
+
 void SharedTurboAssembler::And(Register dst, Immediate src) {
   // Helper to paper over the different assembler function names.
 #if V8_TARGET_ARCH_IA32
@@ -231,6 +254,80 @@ void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
   }
 }
 
+void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
+                                     uint8_t src2, XMMRegister tmp2) {
+  // Unpack bytes into words, do word (16-bit) shifts, and repack.
+  DCHECK_NE(dst, tmp2);
+  uint8_t shift = truncate_to_int3(src2) + 8;
+
+  Punpckhbw(tmp2, src1);
+  Punpcklbw(dst, src1);
+  Psraw(tmp2, shift);
+  Psraw(dst, shift);
+  Packsswb(dst, tmp2);
+}
+
+void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
+                                     Register src2, Register tmp1,
+                                     XMMRegister tmp2, XMMRegister tmp3) {
+  DCHECK(!AreAliased(dst, tmp2, tmp3));
+  DCHECK_NE(src1, tmp2);
+
+  // Unpack the bytes into words, do arithmetic shifts, and repack.
+  Punpckhbw(tmp2, src1);
+  Punpcklbw(dst, src1);
+  // Prepare shift value
+  Move(tmp1, src2);
+  // Take shift value modulo 8.
+  And(tmp1, Immediate(7));
+  Add(tmp1, Immediate(8));
+  Movd(tmp3, tmp1);
+  Psraw(tmp2, tmp3);
+  Psraw(dst, tmp3);
+  Packsswb(dst, tmp2);
+}
+
+void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
+                                     uint8_t src2, Register tmp1,
+                                     XMMRegister tmp2) {
+  DCHECK_NE(dst, tmp2);
+  if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
+    movaps(dst, src1);
+    src1 = dst;
+  }
+
+  // Perform 16-bit shift, then mask away high bits.
+  uint8_t shift = truncate_to_int3(src2);
+  Psrlw(dst, src1, shift);
+
+  uint8_t bmask = 0xff >> shift;
+  uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
+  Move(tmp1, mask);
+  Movd(tmp2, tmp1);
+  Pshufd(tmp2, tmp2, byte{0});
+  Pand(dst, tmp2);
+}
+
+void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
+                                     Register src2, Register tmp1,
+                                     XMMRegister tmp2, XMMRegister tmp3) {
+  DCHECK(!AreAliased(dst, tmp2, tmp3));
+  DCHECK_NE(src1, tmp2);
+
+  // Unpack the bytes into words, do logical shifts, and repack.
+  Punpckhbw(tmp2, src1);
+  Punpcklbw(dst, src1);
+  // Prepare shift value.
+  Move(tmp1, src2);
+  // Take shift value modulo 8.
+  And(tmp1, Immediate(7));
+  Add(tmp1, Immediate(8));
+  Movd(tmp3, tmp1);
+  Psrlw(tmp2, tmp3);
+  Psrlw(dst, tmp3);
+  Packuswb(dst, tmp2);
+}
+
 void SharedTurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
                                           XMMRegister src2, XMMRegister scratch,
                                           bool is_signed) {
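A note on the immediate-count I8x16ShrU above: psrlw shifts whole 16-bit lanes, so the top bits of each byte receive bits leaked from its neighbour, and the splatted byte mask clears them (Pshufd with control 0 then replicates the 32-bit pattern across the full register before the Pand). A sketch of the mask construction, mirroring the bmask/mask lines above; ByteMask is not a V8 function.

// Sketch of the byte mask used by the immediate-count I8x16ShrU.
#include <cassert>
#include <cstdint>

uint32_t ByteMask(uint8_t shift) {
  uint32_t bmask = 0xffu >> (shift & 7);  // keep the low (8 - shift) bits
  return bmask << 24 | bmask << 16 | bmask << 8 | bmask;  // one copy per byte
}

int main() {
  assert(ByteMask(0) == 0xffffffffu);  // shift 0: nothing leaks, keep all bits
  assert(ByteMask(3) == 0x1f1f1f1fu);  // shift 3: each byte keeps its low 5 bits
  assert(ByteMask(7) == 0x01010101u);  // shift 7: only the lowest bit survives
  return 0;
}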
@@ -33,8 +33,10 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
  public:
   using TurboAssemblerBase::TurboAssemblerBase;
 
+  void Move(Register dst, uint32_t src);
   // Move if registers are not identical.
   void Move(Register dst, Register src);
+  void Add(Register dst, Immediate src);
   void And(Register dst, Immediate src);
 
   void Movapd(XMMRegister dst, XMMRegister src);
@@ -298,6 +300,14 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   void F32x4Splat(XMMRegister dst, DoubleRegister src);
   void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane);
   void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
+  void I8x16ShrS(XMMRegister dst, XMMRegister src1, uint8_t src2,
+                 XMMRegister tmp2);
+  void I8x16ShrS(XMMRegister dst, XMMRegister src1, Register src2,
+                 Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
+  void I8x16ShrU(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1,
+                 XMMRegister tmp2);
+  void I8x16ShrU(XMMRegister dst, XMMRegister src1, Register src2,
+                 Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
   void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                       XMMRegister scratch, bool is_signed);
   void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
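The scalar Move/Add/And helpers declared above exist because the ia32 and x64 assemblers spell the same operation differently (mov vs. movl, add vs. addq), so shared SIMD code can stay architecture-agnostic. A sketch of the pattern for a helper this commit does not add; Sub is hypothetical and shown only to illustrate the #if dispatch used by Move/Add/And in the diff above.

// Hypothetical additional helper, following the Move/Add/And pattern;
// not part of this commit.
void SharedTurboAssembler::Sub(Register dst, Immediate src) {
  // Helper to paper over the different assembler function names.
#if V8_TARGET_ARCH_IA32
  sub(dst, src);
#elif V8_TARGET_ARCH_X64
  subq(dst, src);
#else
#error Unsupported target architecture.
#endif
}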
@@ -3147,28 +3147,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kIA32I8x16ShrS: {
      XMMRegister dst = i.OutputSimd128Register();
      // TODO(zhin): remove this restriction from instruction-selector.
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      if (HasImmediateInput(instr, 1)) {
-        __ Punpckhbw(kScratchDoubleReg, dst);
-        __ Punpcklbw(dst, dst);
-        uint8_t shift = i.InputInt3(1) + 8;
-        __ Psraw(kScratchDoubleReg, shift);
-        __ Psraw(dst, shift);
-        __ Packsswb(dst, kScratchDoubleReg);
+        __ I8x16ShrS(dst, i.InputSimd128Register(0), i.InputInt3(1),
+                     kScratchDoubleReg);
      } else {
-        Register tmp = i.ToRegister(instr->TempAt(0));
-        XMMRegister tmp_simd = i.TempSimd128Register(1);
-        // Unpack the bytes into words, do arithmetic shifts, and repack.
-        __ Punpckhbw(kScratchDoubleReg, dst);
-        __ Punpcklbw(dst, dst);
-        __ mov(tmp, i.InputRegister(1));
-        // Take shift value modulo 8.
-        __ and_(tmp, 7);
-        __ add(tmp, Immediate(8));
-        __ Movd(tmp_simd, tmp);
-        __ Psraw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
-        __ Psraw(dst, dst, tmp_simd);
-        __ Packsswb(dst, kScratchDoubleReg);
+        __ I8x16ShrS(dst, i.InputSimd128Register(0), i.InputRegister(1),
+                     i.TempRegister(0), kScratchDoubleReg,
+                     i.TempSimd128Register(1));
      }
      break;
    }
@@ -3271,34 +3258,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
    }
    case kIA32I8x16ShrU: {
      XMMRegister dst = i.OutputSimd128Register();
      // TODO(zhin): remove this restriction from instruction-selector.
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      Register tmp = i.ToRegister(instr->TempAt(0));
-      XMMRegister tmp_simd = i.TempSimd128Register(1);
 
      if (HasImmediateInput(instr, 1)) {
-        // Perform 16-bit shift, then mask away high bits.
-        uint8_t shift = i.InputInt3(1);
-        __ Psrlw(dst, dst, byte{shift});
-
-        uint8_t bmask = 0xff >> shift;
-        uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
-        __ mov(tmp, mask);
-        __ Movd(tmp_simd, tmp);
-        __ Pshufd(tmp_simd, tmp_simd, uint8_t{0});
-        __ Pand(dst, tmp_simd);
+        __ I8x16ShrU(dst, i.InputSimd128Register(0), i.InputInt3(1), tmp,
+                     kScratchDoubleReg);
      } else {
-        // Unpack the bytes into words, do logical shifts, and repack.
-        __ Punpckhbw(kScratchDoubleReg, dst);
-        __ Punpcklbw(dst, dst);
-        __ mov(tmp, i.InputRegister(1));
-        // Take shift value modulo 8.
-        __ and_(tmp, 7);
-        __ add(tmp, Immediate(8));
-        __ Movd(tmp_simd, tmp);
-        __ Psrlw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
-        __ Psrlw(dst, dst, tmp_simd);
-        __ Packuswb(dst, kScratchDoubleReg);
+        __ I8x16ShrU(dst, i.InputSimd128Register(0), i.InputRegister(1), tmp,
+                     kScratchDoubleReg, i.TempSimd128Register(1));
      }
 
      break;
    }
    case kIA32I8x16MinU: {
@@ -3528,30 +3528,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
    }
    case kX64I8x16ShrS: {
      XMMRegister dst = i.OutputSimd128Register();
      // TODO(zhin): remove this restriction from instruction-selector.
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      if (HasImmediateInput(instr, 1)) {
-        __ Punpckhbw(kScratchDoubleReg, dst);
-        __ Punpcklbw(dst, dst);
-        uint8_t shift = i.InputInt3(1) + 8;
-        __ Psraw(kScratchDoubleReg, shift);
-        __ Psraw(dst, shift);
-        __ Packsswb(dst, kScratchDoubleReg);
+        __ I8x16ShrS(dst, i.InputSimd128Register(0), i.InputInt3(1),
+                     kScratchDoubleReg);
      } else {
-        // Temp registers for shift mask andadditional moves to XMM registers.
-        Register tmp = i.ToRegister(instr->TempAt(0));
-        XMMRegister tmp_simd = i.TempSimd128Register(1);
-        // Unpack the bytes into words, do arithmetic shifts, and repack.
-        __ Punpckhbw(kScratchDoubleReg, dst);
-        __ Punpcklbw(dst, dst);
-        // Prepare shift value
-        __ movq(tmp, i.InputRegister(1));
-        // Take shift value modulo 8.
-        __ andq(tmp, Immediate(7));
-        __ addq(tmp, Immediate(8));
-        __ Movq(tmp_simd, tmp);
-        __ Psraw(kScratchDoubleReg, tmp_simd);
-        __ Psraw(dst, tmp_simd);
-        __ Packsswb(dst, kScratchDoubleReg);
+        // TODO(zhin): use kScratchRegister instead of TempRegister.
+        __ I8x16ShrS(dst, i.InputSimd128Register(0), i.InputRegister(1),
+                     i.TempRegister(0), kScratchDoubleReg,
+                     i.TempSimd128Register(1));
      }
      break;
    }
@@ -3607,34 +3593,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
    }
    case kX64I8x16ShrU: {
      XMMRegister dst = i.OutputSimd128Register();
-      // Unpack the bytes into words, do logical shifts, and repack.
      // TODO(zhin): remove this restriction from instruction-selector.
      DCHECK_EQ(dst, i.InputSimd128Register(0));
-      // Temp registers for shift mask andadditional moves to XMM registers.
-      Register tmp = i.ToRegister(instr->TempAt(0));
-      XMMRegister tmp_simd = i.TempSimd128Register(1);
+      // TODO(zhin): use kScratchRegister instead of tmp.
+      Register tmp = i.TempRegister(0);
      if (HasImmediateInput(instr, 1)) {
-        // Perform 16-bit shift, then mask away high bits.
-        uint8_t shift = i.InputInt3(1);
-        __ Psrlw(dst, byte{shift});
-
-        uint8_t bmask = 0xff >> shift;
-        uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
-        __ movl(tmp, Immediate(mask));
-        __ Movd(tmp_simd, tmp);
-        __ Pshufd(tmp_simd, tmp_simd, byte{0});
-        __ Pand(dst, tmp_simd);
+        __ I8x16ShrU(dst, i.InputSimd128Register(0), i.InputInt3(1), tmp,
+                     kScratchDoubleReg);
      } else {
-        __ Punpckhbw(kScratchDoubleReg, dst);
-        __ Punpcklbw(dst, dst);
-        // Prepare shift value
-        __ movq(tmp, i.InputRegister(1));
-        // Take shift value modulo 8.
-        __ andq(tmp, Immediate(7));
-        __ addq(tmp, Immediate(8));
-        __ Movq(tmp_simd, tmp);
-        __ Psrlw(kScratchDoubleReg, tmp_simd);
-        __ Psrlw(dst, tmp_simd);
-        __ Packuswb(dst, kScratchDoubleReg);
+        __ I8x16ShrU(dst, i.InputSimd128Register(0), i.InputRegister(1), tmp,
+                     kScratchDoubleReg, i.TempSimd128Register(1));
      }
      break;
    }
@@ -2718,40 +2718,6 @@ void EmitSimdShiftOpImm(LiftoffAssembler* assm, LiftoffRegister dst,
   }
 }
 
-enum class ShiftSignedness { kSigned, kUnsigned };
-
-template <bool is_signed>
-void EmitI8x16Shr(LiftoffAssembler* assm, LiftoffRegister dst,
-                  LiftoffRegister lhs, LiftoffRegister rhs) {
-  // Same algorithm is used for both signed and unsigned shifts, the only
-  // difference is the actual shift and pack in the end. This is the same
-  // algorithm as used in code-generator-ia32.cc
-  Register tmp =
-      assm->GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(rhs)).gp();
-  XMMRegister tmp_simd =
-      assm->GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs)).fp();
-
-  // Unpack the bytes into words, do logical shifts, and repack.
-  assm->Punpckhbw(liftoff::kScratchDoubleReg, lhs.fp());
-  assm->Punpcklbw(dst.fp(), lhs.fp());
-  assm->mov(tmp, rhs.gp());
-  // Take shift value modulo 8.
-  assm->and_(tmp, 7);
-  assm->add(tmp, Immediate(8));
-  assm->Movd(tmp_simd, tmp);
-  if (is_signed) {
-    assm->Psraw(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg,
-                tmp_simd);
-    assm->Psraw(dst.fp(), dst.fp(), tmp_simd);
-    assm->Packsswb(dst.fp(), liftoff::kScratchDoubleReg);
-  } else {
-    assm->Psrlw(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg,
-                tmp_simd);
-    assm->Psrlw(dst.fp(), dst.fp(), tmp_simd);
-    assm->Packuswb(dst.fp(), liftoff::kScratchDoubleReg);
-  }
-}
-
 inline void EmitAnyTrue(LiftoffAssembler* assm, LiftoffRegister dst,
                         LiftoffRegister src) {
   Register tmp =
@@ -3416,39 +3382,32 @@ void LiftoffAssembler::emit_i8x16_shli(LiftoffRegister dst, LiftoffRegister lhs,
 void LiftoffAssembler::emit_i8x16_shr_s(LiftoffRegister dst,
                                         LiftoffRegister lhs,
                                         LiftoffRegister rhs) {
-  liftoff::EmitI8x16Shr</*is_signed=*/true>(this, dst, lhs, rhs);
+  Register tmp = GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(rhs)).gp();
+  XMMRegister tmp_simd =
+      GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs)).fp();
+  I8x16ShrS(dst.fp(), lhs.fp(), rhs.gp(), tmp, liftoff::kScratchDoubleReg,
+            tmp_simd);
 }
 
 void LiftoffAssembler::emit_i8x16_shri_s(LiftoffRegister dst,
                                          LiftoffRegister lhs, int32_t rhs) {
-  Punpckhbw(liftoff::kScratchDoubleReg, lhs.fp());
-  Punpcklbw(dst.fp(), lhs.fp());
-  uint8_t shift = (rhs & 7) + 8;
-  Psraw(liftoff::kScratchDoubleReg, shift);
-  Psraw(dst.fp(), shift);
-  Packsswb(dst.fp(), liftoff::kScratchDoubleReg);
+  I8x16ShrS(dst.fp(), lhs.fp(), rhs, liftoff::kScratchDoubleReg);
 }
 
 void LiftoffAssembler::emit_i8x16_shr_u(LiftoffRegister dst,
                                         LiftoffRegister lhs,
                                         LiftoffRegister rhs) {
-  liftoff::EmitI8x16Shr</*is_signed=*/false>(this, dst, lhs, rhs);
+  Register tmp = GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(rhs)).gp();
+  XMMRegister tmp_simd =
+      GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs)).fp();
+  I8x16ShrU(dst.fp(), lhs.fp(), rhs.gp(), tmp, liftoff::kScratchDoubleReg,
+            tmp_simd);
 }
 
 void LiftoffAssembler::emit_i8x16_shri_u(LiftoffRegister dst,
                                          LiftoffRegister lhs, int32_t rhs) {
   Register tmp = GetUnusedRegister(kGpReg, {}).gp();
-  // Perform 16-bit shift, then mask away high bits.
-  uint8_t shift = rhs & 7;
-  liftoff::EmitSimdShiftOpImm<&Assembler::vpsrlw, &Assembler::psrlw, 3>(
-      this, dst, lhs, rhs);
-
-  uint8_t bmask = 0xff >> shift;
-  uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
-  mov(tmp, mask);
-  Movd(liftoff::kScratchDoubleReg, tmp);
-  Pshufd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, uint8_t{0});
-  Pand(dst.fp(), liftoff::kScratchDoubleReg);
+  I8x16ShrU(dst.fp(), lhs.fp(), rhs, tmp, liftoff::kScratchDoubleReg);
 }
 
 void LiftoffAssembler::emit_i8x16_add(LiftoffRegister dst, LiftoffRegister lhs,
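The register-count unsigned path, which both Liftoff ports now route through the shared I8x16ShrU, uses the same unpack trick as the signed case but with logical word shifts and packuswb. A scalar model, illustration only; the function name and check are this note's own, not V8 code.

// Scalar model of the register-count I8x16ShrU path. The word (b << 8) | b
// logically shifted by (n % 8) + 8 leaves b >> n zero-extended in the word,
// and packuswb's unsigned saturation is exact since b >> n <= 255.
#include <cassert>
#include <cstdint>

uint8_t I8ShrUModel(uint8_t b, uint32_t n) {
  uint16_t w = static_cast<uint16_t>((b << 8) | b);  // punpck[lh]bw with itself
  return static_cast<uint8_t>(w >> ((n & 7) + 8));   // psrlw by (n mod 8) + 8
}

int main() {
  for (int b = 0; b < 256; ++b) {
    for (uint32_t n = 0; n < 8; ++n) {
      assert(I8ShrUModel(static_cast<uint8_t>(b), n) ==
             static_cast<uint8_t>(b >> n));
    }
  }
  return 0;
}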
@@ -2335,29 +2335,6 @@ void EmitSimdShiftOpImm(LiftoffAssembler* assm, LiftoffRegister dst,
   }
 }
 
-template <bool is_signed>
-void EmitI8x16Shr(LiftoffAssembler* assm, LiftoffRegister dst,
-                  LiftoffRegister lhs, LiftoffRegister rhs) {
-  // Same algorithm as the one in code-generator-x64.cc.
-  assm->Punpckhbw(kScratchDoubleReg, lhs.fp());
-  assm->Punpcklbw(dst.fp(), lhs.fp());
-  // Prepare shift value
-  assm->movq(kScratchRegister, rhs.gp());
-  // Take shift value modulo 8.
-  assm->andq(kScratchRegister, Immediate(7));
-  assm->addq(kScratchRegister, Immediate(8));
-  assm->Movq(liftoff::kScratchDoubleReg2, kScratchRegister);
-  if (is_signed) {
-    assm->Psraw(kScratchDoubleReg, liftoff::kScratchDoubleReg2);
-    assm->Psraw(dst.fp(), liftoff::kScratchDoubleReg2);
-    assm->Packsswb(dst.fp(), kScratchDoubleReg);
-  } else {
-    assm->Psrlw(kScratchDoubleReg, liftoff::kScratchDoubleReg2);
-    assm->Psrlw(dst.fp(), liftoff::kScratchDoubleReg2);
-    assm->Packuswb(dst.fp(), kScratchDoubleReg);
-  }
-}
-
 inline void EmitAnyTrue(LiftoffAssembler* assm, LiftoffRegister dst,
                         LiftoffRegister src) {
   assm->xorq(dst.gp(), dst.gp());
@@ -2973,43 +2950,25 @@ void LiftoffAssembler::emit_i8x16_shli(LiftoffRegister dst, LiftoffRegister lhs,
 void LiftoffAssembler::emit_i8x16_shr_s(LiftoffRegister dst,
                                         LiftoffRegister lhs,
                                         LiftoffRegister rhs) {
-  liftoff::EmitI8x16Shr</*is_signed=*/true>(this, dst, lhs, rhs);
+  I8x16ShrS(dst.fp(), lhs.fp(), rhs.gp(), kScratchRegister, kScratchDoubleReg,
+            liftoff::kScratchDoubleReg2);
 }
 
 void LiftoffAssembler::emit_i8x16_shri_s(LiftoffRegister dst,
                                          LiftoffRegister lhs, int32_t rhs) {
-  Punpckhbw(kScratchDoubleReg, lhs.fp());
-  Punpcklbw(dst.fp(), lhs.fp());
-  uint8_t shift = (rhs & 7) + 8;
-  Psraw(kScratchDoubleReg, shift);
-  Psraw(dst.fp(), shift);
-  Packsswb(dst.fp(), kScratchDoubleReg);
+  I8x16ShrS(dst.fp(), lhs.fp(), rhs, kScratchDoubleReg);
 }
 
 void LiftoffAssembler::emit_i8x16_shr_u(LiftoffRegister dst,
                                         LiftoffRegister lhs,
                                         LiftoffRegister rhs) {
-  liftoff::EmitI8x16Shr</*is_signed=*/false>(this, dst, lhs, rhs);
+  I8x16ShrU(dst.fp(), lhs.fp(), rhs.gp(), kScratchRegister, kScratchDoubleReg,
+            liftoff::kScratchDoubleReg2);
 }
 
 void LiftoffAssembler::emit_i8x16_shri_u(LiftoffRegister dst,
                                          LiftoffRegister lhs, int32_t rhs) {
-  // Perform 16-bit shift, then mask away high bits.
-  uint8_t shift = rhs & 7;  // i.InputInt3(1);
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpsrlw(dst.fp(), lhs.fp(), byte{shift});
-  } else if (dst != lhs) {
-    Movaps(dst.fp(), lhs.fp());
-    psrlw(dst.fp(), byte{shift});
-  }
-
-  uint8_t bmask = 0xff >> shift;
-  uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
-  movl(kScratchRegister, Immediate(mask));
-  Movd(kScratchDoubleReg, kScratchRegister);
-  Pshufd(kScratchDoubleReg, kScratchDoubleReg, byte{0});
-  Pand(dst.fp(), kScratchDoubleReg);
+  I8x16ShrU(dst.fp(), lhs.fp(), rhs, kScratchRegister, kScratchDoubleReg);
 }
 
 void LiftoffAssembler::emit_i8x16_add(LiftoffRegister dst, LiftoffRegister lhs,