[wasm-simd] Move extmul into SharedTurboAssembler

i16x8.extmul_low is left in the arch-specific macro-assemblers because
those implementations rely on other functions defined in the same file.
We can come back and move it in a follow-up.

Bug: v8:11589
Change-Id: I2ea81c50ed52cc3e59e001b5e80aaf6b93a6572c
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2786280
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73688}
Author: Ng Zhi An, 2021-03-25 11:21:02 -07:00 (committed by Commit Bot)
Parent: 950b281ffe
Commit: 6acd0e4ab0
8 changed files with 161 additions and 283 deletions

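For context on what is being moved: the Wasm extended-multiply operations widen each input lane and multiply, producing lanes of twice the width. Below is a minimal scalar sketch of the i64x2.extmul_{low,high}_i32x4_{s,u} semantics; the function name and array-based signature are illustrative only, not V8 code.

#include <cstdint>

// Reference semantics for i64x2.extmul_{low,high}_i32x4_{s,u}: take two i32
// lanes from each input (lanes 0..1 for "low", 2..3 for "high"), widen them
// to 64 bits (signed or unsigned), and multiply. Results are raw lane bits.
void I64x2ExtMulRef(uint64_t dst[2], const int32_t a[4], const int32_t b[4],
                    bool low, bool is_signed) {
  const int base = low ? 0 : 2;
  for (int i = 0; i < 2; ++i) {
    if (is_signed) {
      dst[i] = static_cast<uint64_t>(static_cast<int64_t>(a[base + i]) *
                                     static_cast<int64_t>(b[base + i]));
    } else {
      dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(a[base + i])) *
               static_cast<uint64_t>(static_cast<uint32_t>(b[base + i]));
    }
  }
}

The i32x4 and i16x8 variants follow the same pattern one lane width down. The shared helpers moved in this CL implement this with SSE/AVX unpack-and-multiply sequences; they take the scratch register as an explicit parameter, so the x64 call sites below now pass kScratchDoubleReg themselves.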

@@ -642,60 +642,6 @@ void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1,
}
}
// 1. Unpack src1, src1 into even-numbered dword elements of scratch.
// 2. Unpack src2, src2 into even-numbered dword elements of dst.
// 3. Multiply 1. with 2.
// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
void TurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch,
bool low, bool is_signed) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
if (low) {
vpunpckldq(scratch, src1, src1);
vpunpckldq(dst, src2, src2);
} else {
vpunpckhdq(scratch, src1, src1);
vpunpckhdq(dst, src2, src2);
}
if (is_signed) {
vpmuldq(dst, scratch, dst);
} else {
vpmuludq(dst, scratch, dst);
}
} else {
uint8_t mask = low ? 0x50 : 0xFA;
pshufd(scratch, src1, mask);
pshufd(dst, src2, mask);
if (is_signed) {
CpuFeatureScope sse4_scope(this, SSE4_1);
pmuldq(dst, scratch);
} else {
pmuludq(dst, scratch);
}
}
}
// 1. Multiply low word into scratch.
// 2. Multiply high word (can be signed or unsigned) into dst.
// 3. Unpack and interleave scratch and dst into dst.
void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch,
bool low, bool is_signed) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vpmullw(scratch, src1, src2);
is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst);
} else {
DCHECK_EQ(dst, src1);
movaps(scratch, src1);
pmullw(dst, src2);
is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2);
low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch);
}
}
void TurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch,
bool is_signed) {
@@ -704,76 +650,6 @@ void TurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
Pmullw(dst, scratch);
}
void TurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vpunpckhbw(scratch, src1, src1);
vpsraw(scratch, scratch, 8);
vpunpckhbw(dst, src2, src2);
vpsraw(dst, dst, 8);
vpmullw(dst, dst, scratch);
} else {
if (dst != src1) {
movaps(dst, src1);
}
movaps(scratch, src2);
punpckhbw(dst, dst);
psraw(dst, 8);
punpckhbw(scratch, scratch);
psraw(scratch, 8);
pmullw(dst, scratch);
}
}
void TurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch) {
// The logic here is slightly complicated to handle all the cases of register
// aliasing. This allows flexibility for callers in TurboFan and Liftoff.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
if (src1 == src2) {
vpxor(scratch, scratch, scratch);
vpunpckhbw(dst, src1, scratch);
vpmullw(dst, dst, dst);
} else {
if (dst == src2) {
// We overwrite dst, then use src2, so swap src1 and src2.
std::swap(src1, src2);
}
vpxor(scratch, scratch, scratch);
vpunpckhbw(dst, src1, scratch);
vpunpckhbw(scratch, src2, scratch);
vpmullw(dst, dst, scratch);
}
} else {
if (src1 == src2) {
xorps(scratch, scratch);
if (dst != src1) {
movaps(dst, src1);
}
punpckhbw(dst, scratch);
pmullw(dst, dst);
} else {
// When dst == src1, nothing special needs to be done.
// When dst == src2, swap src1 and src2, since we overwrite dst.
// When dst is unique, copy src1 to dst first.
if (dst == src2) {
std::swap(src1, src2);
// Now, dst == src1.
} else if (dst != src1) {
// dst != src1 && dst != src2.
movaps(dst, src1);
}
xorps(scratch, scratch);
punpckhbw(dst, scratch);
punpckhbw(scratch, src2);
psrlw(scratch, 8);
pmullw(dst, scratch);
}
}
}
void TurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
XMMRegister src1, XMMRegister src2,
XMMRegister scratch) {

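The I32x4ExtMul sequence above builds each 32-bit product from two 16-bit multiplies: the full product of 16-bit lanes a and b is pmulh(u)w(a, b) in the high half and pmullw(a, b) in the low half, and punpcklwd/punpckhwd interleave those halves back into 32-bit lanes. A scalar sketch of one lane (illustrative only, not V8 code):

#include <cstdint>

// One lane of I32x4ExtMul: assemble the widened 16x16->32 product from the
// low half (pmullw, identical for signed and unsigned) and the high half
// (pmulhw for signed, pmulhuw for unsigned), then recombine as hi:lo -- the
// same recombination punpcklwd/punpckhwd perform across the whole register.
uint32_t I32x4ExtMulLaneRef(uint16_t a, uint16_t b, bool is_signed) {
  const uint16_t lo = static_cast<uint16_t>(static_cast<uint32_t>(a) * b);
  uint16_t hi;
  if (is_signed) {
    const int32_t p = static_cast<int32_t>(static_cast<int16_t>(a)) *
                      static_cast<int32_t>(static_cast<int16_t>(b));
    hi = static_cast<uint16_t>(static_cast<uint32_t>(p) >> 16);
  } else {
    hi = static_cast<uint16_t>((static_cast<uint32_t>(a) * b) >> 16);
  }
  return (static_cast<uint32_t>(hi) << 16) | lo;
}

Because the low half is sign-agnostic, only the high-half multiply differs between the signed and unsigned variants, which is why is_signed selects just pmulhw vs. pmulhuw in the helper.
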
@@ -712,17 +712,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
// These Wasm SIMD ops do not have direct lowerings on IA32. These
// helpers are optimized to produce the fastest and smallest codegen.
// Defined here to allow usage on both TurboFan and Liftoff.
void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool low, bool is_signed);
// Requires that dst == src1 if AVX is not supported.
void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool low, bool is_signed);
void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool is_signed);
void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch);
void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch);
// Requires dst == mask when AVX is not supported.
void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
XMMRegister src2, XMMRegister scratch);

@@ -18,6 +18,78 @@
namespace v8 {
namespace internal {
void SharedTurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1,
XMMRegister src2,
XMMRegister scratch) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vpunpckhbw(scratch, src1, src1);
vpsraw(scratch, scratch, 8);
vpunpckhbw(dst, src2, src2);
vpsraw(dst, dst, 8);
vpmullw(dst, dst, scratch);
} else {
if (dst != src1) {
movaps(dst, src1);
}
movaps(scratch, src2);
punpckhbw(dst, dst);
psraw(dst, 8);
punpckhbw(scratch, scratch);
psraw(scratch, 8);
pmullw(dst, scratch);
}
}
void SharedTurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1,
XMMRegister src2,
XMMRegister scratch) {
// The logic here is slightly complicated to handle all the cases of register
// aliasing. This allows flexibility for callers in TurboFan and Liftoff.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
if (src1 == src2) {
vpxor(scratch, scratch, scratch);
vpunpckhbw(dst, src1, scratch);
vpmullw(dst, dst, dst);
} else {
if (dst == src2) {
// We overwrite dst, then use src2, so swap src1 and src2.
std::swap(src1, src2);
}
vpxor(scratch, scratch, scratch);
vpunpckhbw(dst, src1, scratch);
vpunpckhbw(scratch, src2, scratch);
vpmullw(dst, dst, scratch);
}
} else {
if (src1 == src2) {
xorps(scratch, scratch);
if (dst != src1) {
movaps(dst, src1);
}
punpckhbw(dst, scratch);
pmullw(dst, dst);
} else {
// When dst == src1, nothing special needs to be done.
// When dst == src2, swap src1 and src2, since we overwrite dst.
// When dst is unique, copy src1 to dst first.
if (dst == src2) {
std::swap(src1, src2);
// Now, dst == src1.
} else if (dst != src1) {
// dst != src1 && dst != src2.
movaps(dst, src1);
}
xorps(scratch, scratch);
punpckhbw(dst, scratch);
punpckhbw(scratch, src2);
psrlw(scratch, 8);
pmullw(dst, scratch);
}
}
}
void SharedTurboAssembler::I16x8SConvertI8x16High(XMMRegister dst,
XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
@@ -65,6 +137,26 @@ void SharedTurboAssembler::I16x8UConvertI8x16High(XMMRegister dst,
}
}
// 1. Multiply low word into scratch.
// 2. Multiply high word (can be signed or unsigned) into dst.
// 3. Unpack and interleave scratch and dst into dst.
void SharedTurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch,
bool low, bool is_signed) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vpmullw(scratch, src1, src2);
is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst);
} else {
DCHECK_EQ(dst, src1);
movaps(scratch, src1);
pmullw(dst, src2);
is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2);
low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch);
}
}
void SharedTurboAssembler::I32x4SConvertI16x8High(XMMRegister dst,
XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
@@ -112,6 +204,40 @@ void SharedTurboAssembler::I32x4UConvertI16x8High(XMMRegister dst,
}
}
// 1. Unpack src1, src1 into even-numbered dword elements of scratch.
// 2. Unpack src2, src2 into even-numbered dword elements of dst.
// 3. Multiply 1. with 2.
// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
void SharedTurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch,
bool low, bool is_signed) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
if (low) {
vpunpckldq(scratch, src1, src1);
vpunpckldq(dst, src2, src2);
} else {
vpunpckhdq(scratch, src1, src1);
vpunpckhdq(dst, src2, src2);
}
if (is_signed) {
vpmuldq(dst, scratch, dst);
} else {
vpmuludq(dst, scratch, dst);
}
} else {
uint8_t mask = low ? 0x50 : 0xFA;
pshufd(scratch, src1, mask);
pshufd(dst, src2, mask);
if (is_signed) {
CpuFeatureScope sse4_scope(this, SSE4_1);
pmuldq(dst, scratch);
} else {
pmuludq(dst, scratch);
}
}
}
void SharedTurboAssembler::I64x2SConvertI32x4High(XMMRegister dst,
XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) {

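On the non-AVX path of I64x2ExtMul above, pshufd with mask 0x50 or 0xFA stands in for punpckldq/punpckhdq so that src1/src2 are not clobbered. A small sketch of what the masks select (illustrative only, not V8 code):

#include <cstdint>

// pshufd dst, src, imm8 copies src dword ((imm8 >> 2*i) & 3) into dst dword i.
void PshufdRef(uint32_t dst[4], const uint32_t src[4], uint8_t imm8) {
  for (int i = 0; i < 4; ++i) {
    dst[i] = src[(imm8 >> (2 * i)) & 3];
  }
}
// mask 0x50 = 0b01'01'00'00 -> dst = {src[0], src[0], src[1], src[1]}
// mask 0xFA = 0b11'11'10'10 -> dst = {src[2], src[2], src[3], src[3]}
// That is exactly what punpckldq/punpckhdq of a register with itself produce.
// pmuldq/pmuludq read only the even-numbered dwords of each operand, so after
// the shuffle they multiply lane pairs {0,1} (low) or {2,3} (high) into two
// 64-bit products, without destroying the original inputs.

Note also that the CpuFeatureScope in the signed branch is needed only for pmuldq, which is SSE4.1; the unsigned pmuludq is baseline SSE2.
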
@@ -23,12 +23,21 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
public:
using TurboAssemblerBase::TurboAssemblerBase;
void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch);
void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch);
void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src);
void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src,
XMMRegister scratch);
// Requires that dst == src1 if AVX is not supported.
void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool low, bool is_signed);
void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src);
void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src,
XMMRegister scratch);
void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool low, bool is_signed);
void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src);
void I64x2UConvertI32x4High(XMMRegister dst, XMMRegister src,
XMMRegister scratch);

@@ -2132,60 +2132,6 @@ void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1,
}
}
// 1. Unpack src1, src1 into even-numbered dword elements of scratch.
// 2. Unpack src2, src2 into even-numbered dword elements of dst.
// 3. Multiply 1. with 2.
// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
void TurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
XMMRegister src2, bool low, bool is_signed) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
if (low) {
vpunpckldq(kScratchDoubleReg, src1, src1);
vpunpckldq(dst, src2, src2);
} else {
vpunpckhdq(kScratchDoubleReg, src1, src1);
vpunpckhdq(dst, src2, src2);
}
if (is_signed) {
vpmuldq(dst, kScratchDoubleReg, dst);
} else {
vpmuludq(dst, kScratchDoubleReg, dst);
}
} else {
uint8_t mask = low ? 0x50 : 0xFA;
pshufd(kScratchDoubleReg, src1, mask);
pshufd(dst, src2, mask);
if (is_signed) {
CpuFeatureScope sse4_scope(this, SSE4_1);
pmuldq(dst, kScratchDoubleReg);
} else {
pmuludq(dst, kScratchDoubleReg);
}
}
}
// 1. Multiply low word into scratch.
// 2. Multiply high word (can be signed or unsigned) into dst.
// 3. Unpack and interleave scratch and dst into dst.
void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
XMMRegister src2, bool low, bool is_signed) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vpmullw(kScratchDoubleReg, src1, src2);
is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
low ? vpunpcklwd(dst, kScratchDoubleReg, dst)
: vpunpckhwd(dst, kScratchDoubleReg, dst);
} else {
DCHECK_EQ(dst, src1);
movaps(kScratchDoubleReg, src1);
pmullw(dst, src2);
is_signed ? pmulhw(kScratchDoubleReg, src2)
: pmulhuw(kScratchDoubleReg, src2);
low ? punpcklwd(dst, kScratchDoubleReg) : punpckhwd(dst, kScratchDoubleReg);
}
}
void TurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
XMMRegister src2, bool is_signed) {
is_signed ? Pmovsxbw(kScratchDoubleReg, src1)
@@ -2194,76 +2140,6 @@ void TurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
Pmullw(dst, kScratchDoubleReg);
}
void TurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vpunpckhbw(kScratchDoubleReg, src1, src1);
vpsraw(kScratchDoubleReg, kScratchDoubleReg, 8);
vpunpckhbw(dst, src2, src2);
vpsraw(dst, dst, 8);
vpmullw(dst, dst, kScratchDoubleReg);
} else {
if (dst != src1) {
movaps(dst, src1);
}
movaps(kScratchDoubleReg, src2);
punpckhbw(dst, dst);
psraw(dst, 8);
punpckhbw(kScratchDoubleReg, kScratchDoubleReg);
psraw(kScratchDoubleReg, 8);
pmullw(dst, kScratchDoubleReg);
}
}
void TurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
// The logic here is slightly complicated to handle all the cases of register
// aliasing. This allows flexibility for callers in TurboFan and Liftoff.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
if (src1 == src2) {
vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
vpunpckhbw(dst, src1, kScratchDoubleReg);
vpmullw(dst, dst, dst);
} else {
if (dst == src2) {
// We overwrite dst, then use src2, so swap src1 and src2.
std::swap(src1, src2);
}
vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
vpunpckhbw(dst, src1, kScratchDoubleReg);
vpunpckhbw(kScratchDoubleReg, src2, kScratchDoubleReg);
vpmullw(dst, dst, kScratchDoubleReg);
}
} else {
if (src1 == src2) {
xorps(kScratchDoubleReg, kScratchDoubleReg);
if (dst != src1) {
movaps(dst, src1);
}
punpckhbw(dst, kScratchDoubleReg);
pmullw(dst, dst);
} else {
// When dst == src1, nothing special needs to be done.
// When dst == src2, swap src1 and src2, since we overwrite dst.
// When dst is unique, copy src1 to dst first.
if (dst == src2) {
std::swap(src1, src2);
// Now, dst == src1.
} else if (dst != src1) {
// dst != src1 && dst != src2.
movaps(dst, src1);
}
xorps(kScratchDoubleReg, kScratchDoubleReg);
punpckhbw(dst, kScratchDoubleReg);
punpckhbw(kScratchDoubleReg, src2);
psrlw(kScratchDoubleReg, 8);
pmullw(dst, kScratchDoubleReg);
}
}
}
void TurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
// k = i16x8.splat(0x8000)

@@ -612,15 +612,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
XMMRegister src2);
void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
bool low, bool is_signed);
// Requires that dst == src1 if AVX is not supported.
void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
bool low, bool is_signed);
// TODO(zhin): Move this into shared-ia32-x64-macro-assembler.
void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
bool is_signed);
void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2);

@@ -2884,25 +2884,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I64x2ExtMulLowI32x4S: {
__ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/true,
i.InputSimd128Register(1), kScratchDoubleReg, /*low=*/true,
/*is_signed=*/true);
break;
}
case kX64I64x2ExtMulHighI32x4S: {
__ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/false,
i.InputSimd128Register(1), kScratchDoubleReg,
/*low=*/false,
/*is_signed=*/true);
break;
}
case kX64I64x2ExtMulLowI32x4U: {
__ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/true,
i.InputSimd128Register(1), kScratchDoubleReg, /*low=*/true,
/*is_signed=*/false);
break;
}
case kX64I64x2ExtMulHighI32x4U: {
__ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/false,
i.InputSimd128Register(1), kScratchDoubleReg,
/*low=*/false,
/*is_signed=*/false);
break;
}
@@ -3320,7 +3322,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I16x8ExtMulHighI8x16S: {
__ I16x8ExtMulHighS(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
i.InputSimd128Register(1), kScratchDoubleReg);
break;
}
case kX64I16x8ExtMulLowI8x16U: {
@@ -3330,7 +3332,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I16x8ExtMulHighI8x16U: {
__ I16x8ExtMulHighU(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
i.InputSimd128Register(1), kScratchDoubleReg);
break;
}
case kX64I16x8ExtAddPairwiseI8x16S: {
@@ -3632,25 +3634,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I32x4ExtMulLowI16x8S: {
__ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/true,
i.InputSimd128Register(1), kScratchDoubleReg, /*low=*/true,
/*is_signed=*/true);
break;
}
case kX64I32x4ExtMulHighI16x8S: {
__ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/false,
i.InputSimd128Register(1), kScratchDoubleReg,
/*low=*/false,
/*is_signed=*/true);
break;
}
case kX64I32x4ExtMulLowI16x8U: {
__ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/true,
i.InputSimd128Register(1), kScratchDoubleReg, /*low=*/true,
/*is_signed=*/false);
break;
}
case kX64I32x4ExtMulHighI16x8U: {
__ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/false,
i.InputSimd128Register(1), kScratchDoubleReg,
/*low=*/false,
/*is_signed=*/false);
break;
}

@@ -9,6 +9,7 @@
#include "src/codegen/assembler.h"
#include "src/codegen/cpu-features.h"
#include "src/codegen/machine-type.h"
#include "src/codegen/x64/register-x64.h"
#include "src/heap/memory-chunk.h"
#include "src/wasm/baseline/liftoff-assembler.h"
#include "src/wasm/simd-shuffle.h"
@@ -3233,13 +3234,13 @@ void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_u(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_extmul_high_i8x16_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I16x8ExtMulHighS(dst.fp(), src1.fp(), src2.fp());
I16x8ExtMulHighS(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg);
}
void LiftoffAssembler::emit_i16x8_extmul_high_i8x16_u(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I16x8ExtMulHighU(dst.fp(), src1.fp(), src2.fp());
I16x8ExtMulHighU(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg);
}
void LiftoffAssembler::emit_i16x8_q15mulr_sat_s(LiftoffRegister dst,
@@ -3380,16 +3381,16 @@ inline void I32x4ExtMulHelper(LiftoffAssembler* assm, XMMRegister dst,
bool is_signed) {
// I32x4ExtMul requires dst == src1 if AVX is not supported.
if (CpuFeatures::IsSupported(AVX) || dst == src1) {
assm->I32x4ExtMul(dst, src1, src2, low, is_signed);
assm->I32x4ExtMul(dst, src1, src2, kScratchDoubleReg, low, is_signed);
} else if (dst != src2) {
// dst != src1 && dst != src2
assm->movaps(dst, src1);
assm->I32x4ExtMul(dst, dst, src2, low, is_signed);
assm->I32x4ExtMul(dst, dst, src2, kScratchDoubleReg, low, is_signed);
} else {
// dst == src2
// Extended multiplication is commutative, so multiplying src2 by src1
// gives the same result.
assm->movaps(dst, src2);
assm->I32x4ExtMul(dst, dst, src1, low, is_signed);
assm->I32x4ExtMul(dst, dst, src1, kScratchDoubleReg, low, is_signed);
}
}
} // namespace liftoff
@@ -3521,27 +3522,28 @@ void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i64x2_extmul_low_i32x4_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/true, /*is_signed=*/true);
I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg, /*low=*/true,
/*is_signed=*/true);
}
void LiftoffAssembler::emit_i64x2_extmul_low_i32x4_u(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/true,
I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg, /*low=*/true,
/*is_signed=*/false);
}
void LiftoffAssembler::emit_i64x2_extmul_high_i32x4_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/false,
I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg, /*low=*/false,
/*is_signed=*/true);
}
void LiftoffAssembler::emit_i64x2_extmul_high_i32x4_u(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/false,
I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg, /*low=*/false,
/*is_signed=*/false);
}