[wasm-simd][ia32] Convert ext mul macros into macro-assembler functions
This will make these functions usable from Liftoff when we later implement extended multiply instructions in Liftoff. This is similar to the x64 versions, except that each function takes a scratch register as a parameter. Bug: v8:11262 Change-Id: Ief3d8cdde59da9e05a468286315bcae6d13863d9 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2603768 Reviewed-by: Bill Budge <bbudge@chromium.org> Commit-Queue: Zhi An Ng <zhin@chromium.org> Cr-Commit-Position: refs/heads/master@{#71907}
This commit is contained in:
parent
4cfbe2d1d9
commit
afd3f63e30
@ -650,6 +650,76 @@ void TurboAssembler::Roundpd(XMMRegister dst, XMMRegister src,
|
||||
}
|
||||
}
|
||||
|
||||
// 1. Unpack src0, src1 into even-number elements of scratch.
|
||||
// 2. Unpack src1, src0 into even-number elements of dst.
|
||||
// 3. Multiply 1. with 2.
|
||||
// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
|
||||
void TurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
|
||||
XMMRegister src2, XMMRegister scratch,
|
||||
bool low, bool is_signed) {
|
||||
if (CpuFeatures::IsSupported(AVX)) {
|
||||
CpuFeatureScope avx_scope(this, AVX);
|
||||
if (low) {
|
||||
vpunpckldq(scratch, src1, src1);
|
||||
vpunpckldq(dst, src2, src2);
|
||||
} else {
|
||||
vpunpckhdq(scratch, src1, src1);
|
||||
vpunpckhdq(dst, src2, src2);
|
||||
}
|
||||
if (is_signed) {
|
||||
vpmuldq(dst, scratch, dst);
|
||||
} else {
|
||||
vpmuludq(dst, scratch, dst);
|
||||
}
|
||||
} else {
|
||||
uint8_t mask = low ? 0x50 : 0xFA;
|
||||
pshufd(scratch, src1, mask);
|
||||
pshufd(dst, src2, mask);
|
||||
if (is_signed) {
|
||||
CpuFeatureScope sse4_scope(this, SSE4_1);
|
||||
pmuldq(dst, scratch);
|
||||
} else {
|
||||
pmuludq(dst, scratch);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 1. Multiply low word into scratch.
|
||||
// 2. Multiply high word (can be signed or unsigned) into dst.
|
||||
// 3. Unpack and interleave scratch and dst into dst.
|
||||
void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
|
||||
XMMRegister src2, XMMRegister scratch,
|
||||
bool low, bool is_signed) {
|
||||
if (CpuFeatures::IsSupported(AVX)) {
|
||||
CpuFeatureScope avx_scope(this, AVX);
|
||||
vpmullw(scratch, src1, src2);
|
||||
is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
|
||||
low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst);
|
||||
} else {
|
||||
DCHECK_EQ(dst, src1);
|
||||
movdqu(scratch, src1);
|
||||
pmullw(dst, src2);
|
||||
is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2);
|
||||
low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch);
|
||||
}
|
||||
}
|
||||
|
||||
// Extended multiply of 8-bit lanes into 16-bit lanes.
// Each input is widened to 16-bit lanes (sign-extended when is_signed,
// zero-extended otherwise) and the widened values are multiplied into dst.
// For the high half, Palignr by 8 bytes first brings the upper 8 bytes of the
// input into the lower 8 bytes, which is what the widening instruction reads.
void TurboAssembler::I16x8ExtMul(XMMRegister dst, XMMRegister src1,
                                 XMMRegister src2, XMMRegister scratch,
                                 bool low, bool is_signed) {
  // Widens the low 8 bytes of |from| into eight 16-bit lanes of |to|.
  auto widen = [this, is_signed](XMMRegister to, XMMRegister from) {
    if (is_signed) {
      Pmovsxbw(to, from);
    } else {
      Pmovzxbw(to, from);
    }
  };

  if (!low) {
    Palignr(scratch, src1, uint8_t{8});
    widen(scratch, scratch);
    Palignr(dst, src2, uint8_t{8});
    widen(dst, dst);
  } else {
    // scratch is filled from src1 before dst is written, so dst may alias
    // src1.
    widen(scratch, src1);
    widen(dst, src2);
  }
  Pmullw(dst, scratch);
}
|
||||
|
||||
void TurboAssembler::ShlPair(Register high, Register low, uint8_t shift) {
|
||||
DCHECK_GE(63, shift);
|
||||
if (shift >= 32) {
|
||||
|
@ -611,6 +611,17 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
|
||||
void Roundps(XMMRegister dst, XMMRegister src, RoundingMode mode);
|
||||
void Roundpd(XMMRegister dst, XMMRegister src, RoundingMode mode);
|
||||
|
||||
// These Wasm SIMD ops do not have direct lowerings on IA32. These
|
||||
// helpers are optimized to produce the fastest and smallest codegen.
|
||||
// Defined here to allow usage on both TurboFan and Liftoff.
|
||||
// Common parameters: dst receives the result, src1 and src2 are the inputs,
// scratch is overwritten as a temporary, low selects the low (true) or high
// (false) half of the input lanes, and is_signed selects the signed (true) or
// unsigned (false) variant of the operation.
void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
|
||||
XMMRegister scratch, bool low, bool is_signed);
|
||||
// Requires that dst == src1 if AVX is not supported.
|
||||
void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
|
||||
XMMRegister scratch, bool low, bool is_signed);
|
||||
void I16x8ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
|
||||
XMMRegister scratch, bool low, bool is_signed);
|
||||
|
||||
void Push(Register src) { push(src); }
|
||||
void Push(Operand src) { push(src); }
|
||||
void Push(Immediate value);
|
||||
|
@ -561,74 +561,6 @@ class OutOfLineRecordWrite final : public OutOfLineCode {
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
// 1. Unpack one input (src0 without AVX, src1 with AVX) into even-number
//    elements of scratch.
|
||||
// 2. Unpack the other input (src1 without AVX, src0 with AVX) into
//    even-number elements of dst.
|
||||
// 3. Multiply 1. with 2.
|
||||
// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
|
||||
// We only need SSE4_1 for pmuldq (signed ext mul), but enable in both signed
|
||||
// and unsigned cases to reduce macro duplication.
|
||||
// Macro parameters: UNPACK_INSTR and MUL_INSTR are instruction names (used
// with a "v" prefix on the AVX path; UNPACK_INSTR is only used there), and
// SHUFFLE_CONST is the pshufd immediate used on the non-AVX path.
#define ASSEMBLE_SIMD_I64X2_EXT_MUL(UNPACK_INSTR, MUL_INSTR, SHUFFLE_CONST) \
|
||||
do { \
|
||||
XMMRegister dst = i.OutputSimd128Register(); \
|
||||
XMMRegister src0 = i.InputSimd128Register(0); \
|
||||
Operand src1 = i.InputOperand(1); \
|
||||
if (CpuFeatures::IsSupported(AVX)) { \
|
||||
CpuFeatureScope avx_scope(tasm(), AVX); \
|
||||
__ movdqu(kScratchDoubleReg, src1); \
|
||||
__ v##UNPACK_INSTR(kScratchDoubleReg, kScratchDoubleReg, \
|
||||
kScratchDoubleReg); \
|
||||
__ v##UNPACK_INSTR(dst, src0, src0); \
|
||||
__ v##MUL_INSTR(dst, kScratchDoubleReg, dst); \
|
||||
} else { \
|
||||
CpuFeatureScope sse4_scope(tasm(), SSE4_1); \
|
||||
DCHECK_EQ(dst, src0); \
|
||||
__ pshufd(kScratchDoubleReg, src0, SHUFFLE_CONST); \
|
||||
__ pshufd(dst, src1, SHUFFLE_CONST); \
|
||||
__ MUL_INSTR(dst, kScratchDoubleReg); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
// 1. Multiply low word into scratch.
|
||||
// 2. Multiply high word (can be signed or unsigned) into dst.
|
||||
// 3. Unpack and interleave scratch and dst into dst (with AVX), or into
//    scratch followed by a move to dst (without AVX).
|
||||
// Macro parameters: MUL_HIGH_INSTR and UNPACK_INSTR are instruction names,
// used with a "v" prefix on the AVX path. The non-AVX path requires
// dst == src0.
#define ASSEMBLE_SIMD_I32X4_EXT_MUL(MUL_HIGH_INSTR, UNPACK_INSTR) \
|
||||
do { \
|
||||
XMMRegister dst = i.OutputSimd128Register(); \
|
||||
XMMRegister src0 = i.InputSimd128Register(0); \
|
||||
Operand src1 = i.InputOperand(1); \
|
||||
if (CpuFeatures::IsSupported(AVX)) { \
|
||||
CpuFeatureScope avx_scope(tasm(), AVX); \
|
||||
__ vpmullw(kScratchDoubleReg, src0, src1); \
|
||||
__ v##MUL_HIGH_INSTR(dst, src0, src1); \
|
||||
__ v##UNPACK_INSTR(dst, kScratchDoubleReg, dst); \
|
||||
} else { \
|
||||
DCHECK_EQ(dst, src0); \
|
||||
__ movdqu(kScratchDoubleReg, src0); \
|
||||
__ pmullw(kScratchDoubleReg, src1); \
|
||||
__ MUL_HIGH_INSTR(dst, src1); \
|
||||
__ UNPACK_INSTR(kScratchDoubleReg, dst); \
|
||||
__ movdqu(dst, kScratchDoubleReg); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
// Applies EXTEND_MACRO_INSTR to input 0 (into the scratch register) and to
// input 1 (into dst), then multiplies the two with Pmullw, leaving the result
// in dst.
#define ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(EXTEND_MACRO_INSTR) \
|
||||
do { \
|
||||
XMMRegister dst = i.OutputSimd128Register(); \
|
||||
__ EXTEND_MACRO_INSTR(kScratchDoubleReg, i.InputSimd128Register(0)); \
|
||||
__ EXTEND_MACRO_INSTR(dst, i.InputOperand(1)); \
|
||||
__ Pmullw(dst, kScratchDoubleReg); \
|
||||
} while (false)
|
||||
|
||||
// Same as the LOW variant, except that each input is first passed through
// Palignr with an 8-byte shift before EXTEND_MACRO_INSTR is applied in place;
// the two extended values are then multiplied with Pmullw into dst.
#define ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(EXTEND_MACRO_INSTR) \
|
||||
do { \
|
||||
XMMRegister dst = i.OutputSimd128Register(); \
|
||||
__ Palignr(kScratchDoubleReg, i.InputSimd128Register(0), uint8_t{8}); \
|
||||
__ EXTEND_MACRO_INSTR(kScratchDoubleReg, kScratchDoubleReg); \
|
||||
__ Palignr(dst, i.InputOperand(1), uint8_t{8}); \
|
||||
__ EXTEND_MACRO_INSTR(dst, dst); \
|
||||
__ Pmullw(dst, kScratchDoubleReg); \
|
||||
} while (false)
|
||||
|
||||
void CodeGenerator::AssembleDeconstructFrame() {
|
||||
__ mov(esp, ebp);
|
||||
__ pop(ebp);
|
||||
@ -2160,51 +2092,75 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
break;
|
||||
}
|
||||
case kIA32I64x2ExtMulLowI32x4S: {
|
||||
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckldq, pmuldq, 0x50);
|
||||
__ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputSimd128Register(1), kScratchDoubleReg,
|
||||
/*low=*/true, /*is_signed=*/true);
|
||||
break;
|
||||
}
|
||||
case kIA32I64x2ExtMulHighI32x4S: {
|
||||
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckhdq, pmuldq, 0xFA);
|
||||
__ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputSimd128Register(1), kScratchDoubleReg,
|
||||
/*low=*/false, /*is_signed=*/true);
|
||||
break;
|
||||
}
|
||||
case kIA32I64x2ExtMulLowI32x4U: {
|
||||
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckldq, pmuludq, 0x50);
|
||||
__ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputSimd128Register(1), kScratchDoubleReg,
|
||||
/*low=*/true, /*is_signed=*/false);
|
||||
break;
|
||||
}
|
||||
case kIA32I64x2ExtMulHighI32x4U: {
|
||||
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckhdq, pmuludq, 0xFA);
|
||||
__ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputSimd128Register(1), kScratchDoubleReg,
|
||||
/*low=*/false, /*is_signed=*/false);
|
||||
break;
|
||||
}
|
||||
case kIA32I32x4ExtMulLowI16x8S: {
|
||||
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhw, punpcklwd);
|
||||
__ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputSimd128Register(1), kScratchDoubleReg,
|
||||
/*low=*/true, /*is_signed=*/true);
|
||||
break;
|
||||
}
|
||||
case kIA32I32x4ExtMulHighI16x8S: {
|
||||
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhw, punpckhwd);
|
||||
__ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputSimd128Register(1), kScratchDoubleReg,
|
||||
/*low=*/false, /*is_signed=*/true);
|
||||
break;
|
||||
}
|
||||
case kIA32I32x4ExtMulLowI16x8U: {
|
||||
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhuw, punpcklwd);
|
||||
__ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputSimd128Register(1), kScratchDoubleReg,
|
||||
/*low=*/true, /*is_signed=*/false);
|
||||
break;
|
||||
}
|
||||
case kIA32I32x4ExtMulHighI16x8U: {
|
||||
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhuw, punpckhwd);
|
||||
__ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputSimd128Register(1), kScratchDoubleReg,
|
||||
/*low=*/false, /*is_signed=*/false);
|
||||
break;
|
||||
}
|
||||
case kIA32I16x8ExtMulLowI8x16S: {
|
||||
ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(Pmovsxbw);
|
||||
__ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputSimd128Register(1), kScratchDoubleReg,
|
||||
/*low=*/true, /*is_signed=*/true);
|
||||
break;
|
||||
}
|
||||
case kIA32I16x8ExtMulHighI8x16S: {
|
||||
ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovsxbw);
|
||||
__ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputSimd128Register(1), kScratchDoubleReg,
|
||||
/*low=*/false, /*is_signed=*/true);
|
||||
break;
|
||||
}
|
||||
case kIA32I16x8ExtMulLowI8x16U: {
|
||||
ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(Pmovzxbw);
|
||||
__ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputSimd128Register(1), kScratchDoubleReg,
|
||||
/*low=*/true, /*is_signed=*/false);
|
||||
break;
|
||||
}
|
||||
case kIA32I16x8ExtMulHighI8x16U: {
|
||||
ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovzxbw);
|
||||
__ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputSimd128Register(1), kScratchDoubleReg,
|
||||
/*low=*/false, /*is_signed=*/false);
|
||||
break;
|
||||
}
|
||||
case kIA32I64x2SplatI32Pair: {
|
||||
@ -5350,10 +5306,6 @@ void CodeGenerator::AssembleJumpTable(Label** targets, size_t target_count) {
|
||||
#undef ASSEMBLE_SIMD_SHIFT
|
||||
#undef ASSEMBLE_SIMD_PINSR
|
||||
#undef ASSEMBLE_SIMD_SIGN_SELECT
|
||||
#undef ASSEMBLE_SIMD_I64X2_EXT_MUL
|
||||
#undef ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW
|
||||
#undef ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH
|
||||
#undef ASSEMBLE_SIMD_I32X4_EXT_MUL
|
||||
|
||||
} // namespace compiler
|
||||
} // namespace internal
|
||||
|
Loading…
Reference in New Issue
Block a user