[wasm-simd] Move extmul into SharedTurboAssembler
Left i16x8.extmul_low in the arch-specific macro-assemblers because it relies
on other functions defined in the same file. We can come back and move it
afterwards.

Bug: v8:11589
Change-Id: I2ea81c50ed52cc3e59e001b5e80aaf6b93a6572c
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2786280
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73688}
parent 950b281ffe
commit 6acd0e4ab0
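In short, the extmul helpers move from the per-architecture TurboAssemblers into SharedTurboAssembler, and because the shared code cannot assume x64's implicit kScratchDoubleReg, callers now pass the scratch register explicitly. A minimal before/after sketch of a call site, with illustrative register names (dst, lhs, rhs) that are not taken verbatim from any one file:

    // Before: the x64-only helper used kScratchDoubleReg internally.
    //   void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
    //                    bool low, bool is_signed);
    __ I64x2ExtMul(dst, lhs, rhs, /*low=*/true, /*is_signed=*/true);

    // After: the shared helper takes the scratch register as a parameter.
    //   void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
    //                    XMMRegister scratch, bool low, bool is_signed);
    __ I64x2ExtMul(dst, lhs, rhs, kScratchDoubleReg, /*low=*/true,
                   /*is_signed=*/true);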
@@ -642,60 +642,6 @@ void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1,
  }
}

// 1. Unpack src0, src1 into even-number elements of scratch.
// 2. Unpack src1, src0 into even-number elements of dst.
// 3. Multiply 1. with 2.
// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
void TurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
                                 XMMRegister src2, XMMRegister scratch,
                                 bool low, bool is_signed) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    if (low) {
      vpunpckldq(scratch, src1, src1);
      vpunpckldq(dst, src2, src2);
    } else {
      vpunpckhdq(scratch, src1, src1);
      vpunpckhdq(dst, src2, src2);
    }
    if (is_signed) {
      vpmuldq(dst, scratch, dst);
    } else {
      vpmuludq(dst, scratch, dst);
    }
  } else {
    uint8_t mask = low ? 0x50 : 0xFA;
    pshufd(scratch, src1, mask);
    pshufd(dst, src2, mask);
    if (is_signed) {
      CpuFeatureScope sse4_scope(this, SSE4_1);
      pmuldq(dst, scratch);
    } else {
      pmuludq(dst, scratch);
    }
  }
}

// 1. Multiply low word into scratch.
// 2. Multiply high word (can be signed or unsigned) into dst.
// 3. Unpack and interleave scratch and dst into dst.
void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
                                 XMMRegister src2, XMMRegister scratch,
                                 bool low, bool is_signed) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpmullw(scratch, src1, src2);
    is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
    low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst);
  } else {
    DCHECK_EQ(dst, src1);
    movaps(scratch, src1);
    pmullw(dst, src2);
    is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2);
    low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch);
  }
}

void TurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
                                    XMMRegister src2, XMMRegister scratch,
                                    bool is_signed) {
@@ -704,76 +650,6 @@ void TurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
  Pmullw(dst, scratch);
}

void TurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1,
                                      XMMRegister src2, XMMRegister scratch) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpunpckhbw(scratch, src1, src1);
    vpsraw(scratch, scratch, 8);
    vpunpckhbw(dst, src2, src2);
    vpsraw(dst, dst, 8);
    vpmullw(dst, dst, scratch);
  } else {
    if (dst != src1) {
      movaps(dst, src1);
    }
    movaps(scratch, src2);
    punpckhbw(dst, dst);
    psraw(dst, 8);
    punpckhbw(scratch, scratch);
    psraw(scratch, 8);
    pmullw(dst, scratch);
  }
}

void TurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1,
                                      XMMRegister src2, XMMRegister scratch) {
  // The logic here is slightly complicated to handle all the cases of register
  // aliasing. This allows flexibility for callers in TurboFan and Liftoff.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    if (src1 == src2) {
      vpxor(scratch, scratch, scratch);
      vpunpckhbw(dst, src1, scratch);
      vpmullw(dst, dst, dst);
    } else {
      if (dst == src2) {
        // We overwrite dst, then use src2, so swap src1 and src2.
        std::swap(src1, src2);
      }
      vpxor(scratch, scratch, scratch);
      vpunpckhbw(dst, src1, scratch);
      vpunpckhbw(scratch, src2, scratch);
      vpmullw(dst, dst, scratch);
    }
  } else {
    if (src1 == src2) {
      xorps(scratch, scratch);
      if (dst != src1) {
        movaps(dst, src1);
      }
      punpckhbw(dst, scratch);
      pmullw(dst, scratch);
    } else {
      // When dst == src1, nothing special needs to be done.
      // When dst == src2, swap src1 and src2, since we overwrite dst.
      // When dst is unique, copy src1 to dst first.
      if (dst == src2) {
        std::swap(src1, src2);
        // Now, dst == src1.
      } else if (dst != src1) {
        // dst != src1 && dst != src2.
        movaps(dst, src1);
      }
      xorps(scratch, scratch);
      punpckhbw(dst, scratch);
      punpckhbw(scratch, src2);
      psrlw(scratch, 8);
      pmullw(dst, scratch);
    }
  }
}

void TurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
                                XMMRegister src1, XMMRegister src2,
                                XMMRegister scratch) {
@@ -712,17 +712,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
  // These Wasm SIMD ops do not have direct lowerings on IA32. These
  // helpers are optimized to produce the fastest and smallest codegen.
  // Defined here to allow usage on both TurboFan and Liftoff.
  void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                   XMMRegister scratch, bool low, bool is_signed);
  // Requires that dst == src1 if AVX is not supported.
  void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                   XMMRegister scratch, bool low, bool is_signed);
  void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                      XMMRegister scratch, bool is_signed);
  void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                        XMMRegister scratch);
  void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                        XMMRegister scratch);
  // Requires dst == mask when AVX is not supported.
  void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
                  XMMRegister src2, XMMRegister scratch);
@@ -18,6 +18,78 @@
namespace v8 {
namespace internal {

void SharedTurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister scratch) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpunpckhbw(scratch, src1, src1);
    vpsraw(scratch, scratch, 8);
    vpunpckhbw(dst, src2, src2);
    vpsraw(dst, dst, 8);
    vpmullw(dst, dst, scratch);
  } else {
    if (dst != src1) {
      movaps(dst, src1);
    }
    movaps(scratch, src2);
    punpckhbw(dst, dst);
    psraw(dst, 8);
    punpckhbw(scratch, scratch);
    psraw(scratch, 8);
    pmullw(dst, scratch);
  }
}

void SharedTurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister scratch) {
  // The logic here is slightly complicated to handle all the cases of register
  // aliasing. This allows flexibility for callers in TurboFan and Liftoff.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    if (src1 == src2) {
      vpxor(scratch, scratch, scratch);
      vpunpckhbw(dst, src1, scratch);
      vpmullw(dst, dst, dst);
    } else {
      if (dst == src2) {
        // We overwrite dst, then use src2, so swap src1 and src2.
        std::swap(src1, src2);
      }
      vpxor(scratch, scratch, scratch);
      vpunpckhbw(dst, src1, scratch);
      vpunpckhbw(scratch, src2, scratch);
      vpmullw(dst, dst, scratch);
    }
  } else {
    if (src1 == src2) {
      xorps(scratch, scratch);
      if (dst != src1) {
        movaps(dst, src1);
      }
      punpckhbw(dst, scratch);
      pmullw(dst, scratch);
    } else {
      // When dst == src1, nothing special needs to be done.
      // When dst == src2, swap src1 and src2, since we overwrite dst.
      // When dst is unique, copy src1 to dst first.
      if (dst == src2) {
        std::swap(src1, src2);
        // Now, dst == src1.
      } else if (dst != src1) {
        // dst != src1 && dst != src2.
        movaps(dst, src1);
      }
      xorps(scratch, scratch);
      punpckhbw(dst, scratch);
      punpckhbw(scratch, src2);
      psrlw(scratch, 8);
      pmullw(dst, scratch);
    }
  }
}

void SharedTurboAssembler::I16x8SConvertI8x16High(XMMRegister dst,
                                                  XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
@@ -65,6 +137,26 @@ void SharedTurboAssembler::I16x8UConvertI8x16High(XMMRegister dst,
  }
}

// 1. Multiply low word into scratch.
// 2. Multiply high word (can be signed or unsigned) into dst.
// 3. Unpack and interleave scratch and dst into dst.
void SharedTurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
                                       XMMRegister src2, XMMRegister scratch,
                                       bool low, bool is_signed) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpmullw(scratch, src1, src2);
    is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
    low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst);
  } else {
    DCHECK_EQ(dst, src1);
    movaps(scratch, src1);
    pmullw(dst, src2);
    is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2);
    low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch);
  }
}

void SharedTurboAssembler::I32x4SConvertI16x8High(XMMRegister dst,
                                                  XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
@@ -112,6 +204,40 @@ void SharedTurboAssembler::I32x4UConvertI16x8High(XMMRegister dst,
  }
}

// 1. Unpack src0, src1 into even-number elements of scratch.
// 2. Unpack src1, src0 into even-number elements of dst.
// 3. Multiply 1. with 2.
// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
void SharedTurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
                                       XMMRegister src2, XMMRegister scratch,
                                       bool low, bool is_signed) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    if (low) {
      vpunpckldq(scratch, src1, src1);
      vpunpckldq(dst, src2, src2);
    } else {
      vpunpckhdq(scratch, src1, src1);
      vpunpckhdq(dst, src2, src2);
    }
    if (is_signed) {
      vpmuldq(dst, scratch, dst);
    } else {
      vpmuludq(dst, scratch, dst);
    }
  } else {
    uint8_t mask = low ? 0x50 : 0xFA;
    pshufd(scratch, src1, mask);
    pshufd(dst, src2, mask);
    if (is_signed) {
      CpuFeatureScope sse4_scope(this, SSE4_1);
      pmuldq(dst, scratch);
    } else {
      pmuludq(dst, scratch);
    }
  }
}

void SharedTurboAssembler::I64x2SConvertI32x4High(XMMRegister dst,
                                                  XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
@@ -23,12 +23,21 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
 public:
  using TurboAssemblerBase::TurboAssemblerBase;

  void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                        XMMRegister scratch);
  void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                        XMMRegister scratch);
  void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src);
  void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src,
                              XMMRegister scratch);
  // Requires that dst == src1 if AVX is not supported.
  void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                   XMMRegister scratch, bool low, bool is_signed);
  void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src);
  void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src,
                              XMMRegister scratch);
  void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                   XMMRegister scratch, bool low, bool is_signed);
  void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src);
  void I64x2UConvertI32x4High(XMMRegister dst, XMMRegister src,
                              XMMRegister scratch);
@@ -2132,60 +2132,6 @@ void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1,
  }
}

// 1. Unpack src0, src0 into even-number elements of scratch.
// 2. Unpack src1, src1 into even-number elements of dst.
// 3. Multiply 1. with 2.
// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
void TurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
                                 XMMRegister src2, bool low, bool is_signed) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    if (low) {
      vpunpckldq(kScratchDoubleReg, src1, src1);
      vpunpckldq(dst, src2, src2);
    } else {
      vpunpckhdq(kScratchDoubleReg, src1, src1);
      vpunpckhdq(dst, src2, src2);
    }
    if (is_signed) {
      vpmuldq(dst, kScratchDoubleReg, dst);
    } else {
      vpmuludq(dst, kScratchDoubleReg, dst);
    }
  } else {
    uint8_t mask = low ? 0x50 : 0xFA;
    pshufd(kScratchDoubleReg, src1, mask);
    pshufd(dst, src2, mask);
    if (is_signed) {
      CpuFeatureScope avx_scope(this, SSE4_1);
      pmuldq(dst, kScratchDoubleReg);
    } else {
      pmuludq(dst, kScratchDoubleReg);
    }
  }
}

// 1. Multiply low word into scratch.
// 2. Multiply high word (can be signed or unsigned) into dst.
// 3. Unpack and interleave scratch and dst into dst.
void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
                                 XMMRegister src2, bool low, bool is_signed) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpmullw(kScratchDoubleReg, src1, src2);
    is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
    low ? vpunpcklwd(dst, kScratchDoubleReg, dst)
        : vpunpckhwd(dst, kScratchDoubleReg, dst);
  } else {
    DCHECK_EQ(dst, src1);
    movaps(kScratchDoubleReg, src1);
    pmullw(dst, src2);
    is_signed ? pmulhw(kScratchDoubleReg, src2)
              : pmulhuw(kScratchDoubleReg, src2);
    low ? punpcklwd(dst, kScratchDoubleReg) : punpckhwd(dst, kScratchDoubleReg);
  }
}

void TurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
                                    XMMRegister src2, bool is_signed) {
  is_signed ? Pmovsxbw(kScratchDoubleReg, src1)
@@ -2194,76 +2140,6 @@ void TurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
  Pmullw(dst, kScratchDoubleReg);
}

void TurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1,
                                      XMMRegister src2) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpunpckhbw(kScratchDoubleReg, src1, src1);
    vpsraw(kScratchDoubleReg, kScratchDoubleReg, 8);
    vpunpckhbw(dst, src2, src2);
    vpsraw(dst, dst, 8);
    vpmullw(dst, dst, kScratchDoubleReg);
  } else {
    if (dst != src1) {
      movaps(dst, src1);
    }
    movaps(kScratchDoubleReg, src2);
    punpckhbw(dst, dst);
    psraw(dst, 8);
    punpckhbw(kScratchDoubleReg, kScratchDoubleReg);
    psraw(kScratchDoubleReg, 8);
    pmullw(dst, kScratchDoubleReg);
  }
}

void TurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1,
                                      XMMRegister src2) {
  // The logic here is slightly complicated to handle all the cases of register
  // aliasing. This allows flexibility for callers in TurboFan and Liftoff.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    if (src1 == src2) {
      vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      vpunpckhbw(dst, src1, kScratchDoubleReg);
      vpmullw(dst, dst, dst);
    } else {
      if (dst == src2) {
        // We overwrite dst, then use src2, so swap src1 and src2.
        std::swap(src1, src2);
      }
      vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      vpunpckhbw(dst, src1, kScratchDoubleReg);
      vpunpckhbw(kScratchDoubleReg, src2, kScratchDoubleReg);
      vpmullw(dst, dst, kScratchDoubleReg);
    }
  } else {
    if (src1 == src2) {
      xorps(kScratchDoubleReg, kScratchDoubleReg);
      if (dst != src1) {
        movaps(dst, src1);
      }
      punpckhbw(dst, kScratchDoubleReg);
      pmullw(dst, kScratchDoubleReg);
    } else {
      // When dst == src1, nothing special needs to be done.
      // When dst == src2, swap src1 and src2, since we overwrite dst.
      // When dst is unique, copy src1 to dst first.
      if (dst == src2) {
        std::swap(src1, src2);
        // Now, dst == src1.
      } else if (dst != src1) {
        // dst != src1 && dst != src2.
        movaps(dst, src1);
      }
      xorps(kScratchDoubleReg, kScratchDoubleReg);
      punpckhbw(dst, kScratchDoubleReg);
      punpckhbw(kScratchDoubleReg, src2);
      psrlw(kScratchDoubleReg, 8);
      pmullw(dst, kScratchDoubleReg);
    }
  }
}

void TurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
                                      XMMRegister src2) {
  // k = i16x8.splat(0x8000)
@@ -612,15 +612,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
  void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
                  XMMRegister src2);

  void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                   bool low, bool is_signed);
  // Requires that dst == src1 if AVX is not supported.
  void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                   bool low, bool is_signed);
  // TODO(zhin): Move this into shared-ia32-x64-macro-assembler.
  void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                      bool is_signed);
  void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2);
  void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2);

  void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2);
@@ -2884,25 +2884,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
    }
    case kX64I64x2ExtMulLowI32x4S: {
      __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
                     i.InputSimd128Register(1), /*low=*/true,
                     i.InputSimd128Register(1), kScratchDoubleReg, /*low=*/true,
                     /*is_signed=*/true);
      break;
    }
    case kX64I64x2ExtMulHighI32x4S: {
      __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
                     i.InputSimd128Register(1), /*low=*/false,
                     i.InputSimd128Register(1), kScratchDoubleReg,
                     /*low=*/false,
                     /*is_signed=*/true);
      break;
    }
    case kX64I64x2ExtMulLowI32x4U: {
      __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
                     i.InputSimd128Register(1), /*low=*/true,
                     i.InputSimd128Register(1), kScratchDoubleReg, /*low=*/true,
                     /*is_signed=*/false);
      break;
    }
    case kX64I64x2ExtMulHighI32x4U: {
      __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
                     i.InputSimd128Register(1), /*low=*/false,
                     i.InputSimd128Register(1), kScratchDoubleReg,
                     /*low=*/false,
                     /*is_signed=*/false);
      break;
    }
@@ -3320,7 +3322,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
    }
    case kX64I16x8ExtMulHighI8x16S: {
      __ I16x8ExtMulHighS(i.OutputSimd128Register(), i.InputSimd128Register(0),
                          i.InputSimd128Register(1));
                          i.InputSimd128Register(1), kScratchDoubleReg);
      break;
    }
    case kX64I16x8ExtMulLowI8x16U: {
@@ -3330,7 +3332,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
    }
    case kX64I16x8ExtMulHighI8x16U: {
      __ I16x8ExtMulHighU(i.OutputSimd128Register(), i.InputSimd128Register(0),
                          i.InputSimd128Register(1));
                          i.InputSimd128Register(1), kScratchDoubleReg);
      break;
    }
    case kX64I16x8ExtAddPairwiseI8x16S: {
@@ -3632,25 +3634,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
    }
    case kX64I32x4ExtMulLowI16x8S: {
      __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
                     i.InputSimd128Register(1), /*low=*/true,
                     i.InputSimd128Register(1), kScratchDoubleReg, /*low=*/true,
                     /*is_signed=*/true);
      break;
    }
    case kX64I32x4ExtMulHighI16x8S: {
      __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
                     i.InputSimd128Register(1), /*low=*/false,
                     i.InputSimd128Register(1), kScratchDoubleReg,
                     /*low=*/false,
                     /*is_signed=*/true);
      break;
    }
    case kX64I32x4ExtMulLowI16x8U: {
      __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
                     i.InputSimd128Register(1), /*low=*/true,
                     i.InputSimd128Register(1), kScratchDoubleReg, /*low=*/true,
                     /*is_signed=*/false);
      break;
    }
    case kX64I32x4ExtMulHighI16x8U: {
      __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
                     i.InputSimd128Register(1), /*low=*/false,
                     i.InputSimd128Register(1), kScratchDoubleReg,
                     /*low=*/false,
                     /*is_signed=*/false);
      break;
    }
@@ -9,6 +9,7 @@
#include "src/codegen/assembler.h"
#include "src/codegen/cpu-features.h"
#include "src/codegen/machine-type.h"
#include "src/codegen/x64/register-x64.h"
#include "src/heap/memory-chunk.h"
#include "src/wasm/baseline/liftoff-assembler.h"
#include "src/wasm/simd-shuffle.h"
@@ -3233,13 +3234,13 @@ void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_u(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_extmul_high_i8x16_s(LiftoffRegister dst,
                                                       LiftoffRegister src1,
                                                       LiftoffRegister src2) {
  I16x8ExtMulHighS(dst.fp(), src1.fp(), src2.fp());
  I16x8ExtMulHighS(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg);
}

void LiftoffAssembler::emit_i16x8_extmul_high_i8x16_u(LiftoffRegister dst,
                                                       LiftoffRegister src1,
                                                       LiftoffRegister src2) {
  I16x8ExtMulHighU(dst.fp(), src1.fp(), src2.fp());
  I16x8ExtMulHighU(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg);
}

void LiftoffAssembler::emit_i16x8_q15mulr_sat_s(LiftoffRegister dst,
@@ -3380,16 +3381,16 @@ inline void I32x4ExtMulHelper(LiftoffAssembler* assm, XMMRegister dst,
                              bool is_signed) {
  // I32x4ExtMul requires dst == src1 if AVX is not supported.
  if (CpuFeatures::IsSupported(AVX) || dst == src1) {
    assm->I32x4ExtMul(dst, src1, src2, low, is_signed);
    assm->I32x4ExtMul(dst, src1, src2, kScratchDoubleReg, low, is_signed);
  } else if (dst != src2) {
    // dst != src1 && dst != src2
    assm->movaps(dst, src1);
    assm->I32x4ExtMul(dst, dst, src2, low, is_signed);
    assm->I32x4ExtMul(dst, dst, src2, kScratchDoubleReg, low, is_signed);
  } else {
    // dst == src2
    // Extended multiplication is commutative,
    assm->movaps(dst, src2);
    assm->I32x4ExtMul(dst, dst, src1, low, is_signed);
    assm->I32x4ExtMul(dst, dst, src1, kScratchDoubleReg, low, is_signed);
  }
}
}  // namespace liftoff
@@ -3521,27 +3522,28 @@ void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i64x2_extmul_low_i32x4_s(LiftoffRegister dst,
                                                     LiftoffRegister src1,
                                                     LiftoffRegister src2) {
  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/true, /*is_signed=*/true);
  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg, /*low=*/true,
              /*is_signed=*/true);
}

void LiftoffAssembler::emit_i64x2_extmul_low_i32x4_u(LiftoffRegister dst,
                                                     LiftoffRegister src1,
                                                     LiftoffRegister src2) {
  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/true,
  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg, /*low=*/true,
              /*is_signed=*/false);
}

void LiftoffAssembler::emit_i64x2_extmul_high_i32x4_s(LiftoffRegister dst,
                                                      LiftoffRegister src1,
                                                      LiftoffRegister src2) {
  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/false,
  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg, /*low=*/false,
              /*is_signed=*/true);
}

void LiftoffAssembler::emit_i64x2_extmul_high_i32x4_u(LiftoffRegister dst,
                                                      LiftoffRegister src1,
                                                      LiftoffRegister src2) {
  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/false,
  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg, /*low=*/false,
              /*is_signed=*/false);
}