[wasm-simd][x64] Convert ext mul macros into macro-assembler functions

This will make these functions usable from Liftoff when we later
implement extended multiply instructions in Liftoff.

Bug: v8:11262
Change-Id: I5fb105bc0184675eb60cd8ae63cc13955b0f767d
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2601876
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71885}
This commit is contained in:
Zhi An Ng 2020-12-29 00:50:44 +00:00 committed by Commit Bot
parent 506c09797c
commit 7ddcd92ea9
3 changed files with 116 additions and 82 deletions

View File

@ -2114,6 +2114,77 @@ void TurboAssembler::I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src) {
} }
} }
// Extended multiply of 32-bit lanes of src1 and src2 into 64-bit lanes of dst.
// |low| selects the low or the high pair of 32-bit lanes of the inputs and
// |is_signed| selects the signed (pmuldq) or unsigned (pmuludq) multiply.
// Clobbers kScratchDoubleReg.
//  1. Spread the selected lanes of src1 into the even 32-bit lanes of scratch.
//  2. Spread the selected lanes of src2 into the even 32-bit lanes of dst.
//  3. Multiply scratch with dst, leaving the result in dst.
// Without AVX the non-destructive pshufd is used instead of
// punpckldq/punpckhdq, so both sources are read before dst is written.
void TurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
                                 XMMRegister src2, bool low, bool is_signed) {
  if (!CpuFeatures::IsSupported(AVX)) {
    // 0x50 selects lanes {0, 0, 1, 1}; 0xFA selects lanes {2, 2, 3, 3}.
    const uint8_t shuffle = low ? 0x50 : 0xFA;
    pshufd(kScratchDoubleReg, src1, shuffle);
    pshufd(dst, src2, shuffle);
    if (!is_signed) {
      pmuludq(dst, kScratchDoubleReg);
      return;
    }
    // Only the signed multiply is emitted under an SSE4_1 feature scope.
    CpuFeatureScope sse_scope(this, SSE4_1);
    pmuldq(dst, kScratchDoubleReg);
    return;
  }

  CpuFeatureScope avx_scope(this, AVX);
  if (!low) {
    vpunpckhdq(kScratchDoubleReg, src1, src1);
    vpunpckhdq(dst, src2, src2);
  } else {
    vpunpckldq(kScratchDoubleReg, src1, src1);
    vpunpckldq(dst, src2, src2);
  }
  if (!is_signed) {
    vpmuludq(dst, kScratchDoubleReg, dst);
  } else {
    vpmuldq(dst, kScratchDoubleReg, dst);
  }
}
// Extended multiply of 16-bit lanes of src1 and src2 into 32-bit lanes of dst.
// |low| selects whether the low or the high four products are kept and
// |is_signed| selects the signed (pmulhw) or unsigned (pmulhuw) high multiply.
// Clobbers kScratchDoubleReg. Requires dst == src1 if AVX is not supported.
//
// AVX:
//  1. Multiply low word into scratch.
//  2. Multiply high word (can be signed or unsigned) into dst.
//  3. Unpack and interleave scratch and dst into dst.
// Non-AVX (destructive two-operand forms):
//  1. Copy src1 into scratch.
//  2. Multiply high word (can be signed or unsigned) into scratch.
//  3. Multiply low word into dst.
//  4. Unpack and interleave dst and scratch into dst.
void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
                                 XMMRegister src2, bool low, bool is_signed) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpmullw(kScratchDoubleReg, src1, src2);
    is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
    low ? vpunpcklwd(dst, kScratchDoubleReg, dst)
        : vpunpckhwd(dst, kScratchDoubleReg, dst);
  } else {
    DCHECK_EQ(dst, src1);
    movdqu(kScratchDoubleReg, src1);
    // The high-half multiply must be emitted before dst is overwritten:
    // src2 may be the same register as dst (both inputs the same value), in
    // which case multiplying into dst first would destroy src2 before the
    // high-half multiply reads it.
    is_signed ? pmulhw(kScratchDoubleReg, src2)
              : pmulhuw(kScratchDoubleReg, src2);
    pmullw(dst, src2);
    low ? punpcklwd(dst, kScratchDoubleReg) : punpckhwd(dst, kScratchDoubleReg);
  }
}
// Extended multiply of 8-bit lanes of src1 and src2 into 16-bit lanes of dst.
// |low| selects the low or the high eight bytes of the inputs and |is_signed|
// selects sign extension (Pmovsxbw) or zero extension (Pmovzxbw) before the
// 16-bit multiply. Clobbers kScratchDoubleReg.
void TurboAssembler::I16x8ExtMul(XMMRegister dst, XMMRegister src1,
                                 XMMRegister src2, bool low, bool is_signed) {
  if (!low) {
    // Bring the high eight bytes of each input down into the low half of the
    // target register with Palignr, then widen them in place.
    Palignr(kScratchDoubleReg, src1, uint8_t{8});
    if (is_signed) {
      Pmovsxbw(kScratchDoubleReg, kScratchDoubleReg);
    } else {
      Pmovzxbw(kScratchDoubleReg, kScratchDoubleReg);
    }
    Palignr(dst, src2, uint8_t{8});
    if (is_signed) {
      Pmovsxbw(dst, dst);
    } else {
      Pmovzxbw(dst, dst);
    }
  } else if (is_signed) {
    Pmovsxbw(kScratchDoubleReg, src1);
    Pmovsxbw(dst, src2);
  } else {
    Pmovzxbw(kScratchDoubleReg, src1);
    Pmovzxbw(dst, src2);
  }
  // Both operands are now widened to 16-bit lanes; multiply them into dst.
  Pmullw(dst, kScratchDoubleReg);
}
void TurboAssembler::Psrld(XMMRegister dst, byte imm8) { void TurboAssembler::Psrld(XMMRegister dst, byte imm8) {
Psrld(dst, dst, imm8); Psrld(dst, dst, imm8);
} }

View File

@ -571,11 +571,20 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
// These Wasm SIMD ops do not have direct lowerings on x64. These // These Wasm SIMD ops do not have direct lowerings on x64. These
// helpers are optimized to produce the fastest and smallest codegen. // helpers are optimized to produce the fastest and smallest codegen.
// Defined here to allow usage on both TurboFan and Liftoff.
void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src); void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src);
void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src); void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src);
void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src); void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src);
void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src); void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src);
// Extended multiply of 32-bit lanes into 64-bit lanes. |low| selects the low
// or the high half of the inputs, |is_signed| selects signed or unsigned
// multiplication. Clobbers kScratchDoubleReg.
void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
bool low, bool is_signed);
// Extended multiply of 16-bit lanes into 32-bit lanes. |low| selects the low
// or the high half of the products, |is_signed| selects signed or unsigned
// multiplication. Clobbers kScratchDoubleReg.
// Requires that dst == src1 if AVX is not supported.
void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
bool low, bool is_signed);
// Extended multiply of 8-bit lanes into 16-bit lanes. |low| selects the low
// or the high half of the inputs, |is_signed| selects sign or zero extension
// of the inputs before multiplying. Clobbers kScratchDoubleReg.
void I16x8ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
bool low, bool is_signed);
void CompareRoot(Register with, RootIndex index); void CompareRoot(Register with, RootIndex index);
void CompareRoot(Operand with, RootIndex index); void CompareRoot(Operand with, RootIndex index);

View File

@ -697,72 +697,6 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen,
} \ } \
} while (false) } while (false)
// Emits an I16x8 extended multiply of the low halves of the two inputs.
// EXTEND_MACRO_INSTR (Pmovsxbw or Pmovzxbw at the call sites) widens input 0
// into kScratchDoubleReg and input 1 into the output register; Pmullw then
// multiplies the two, leaving the result in the output register.
#define ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(EXTEND_MACRO_INSTR) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
__ EXTEND_MACRO_INSTR(kScratchDoubleReg, i.InputSimd128Register(0)); \
__ EXTEND_MACRO_INSTR(dst, i.InputSimd128Register(1)); \
__ Pmullw(dst, kScratchDoubleReg); \
} while (false)
// Emits an I16x8 extended multiply of the high halves of the two inputs.
// Each input is first passed through Palignr with a byte offset of 8 (into
// kScratchDoubleReg for input 0, into the output register for input 1), then
// widened in place with EXTEND_MACRO_INSTR (Pmovsxbw or Pmovzxbw at the call
// sites). Pmullw multiplies the two, leaving the result in the output
// register.
#define ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(EXTEND_MACRO_INSTR) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
__ Palignr(kScratchDoubleReg, i.InputSimd128Register(0), uint8_t{8}); \
__ EXTEND_MACRO_INSTR(kScratchDoubleReg, kScratchDoubleReg); \
__ Palignr(dst, i.InputSimd128Register(1), uint8_t{8}); \
__ EXTEND_MACRO_INSTR(dst, dst); \
__ Pmullw(dst, kScratchDoubleReg); \
} while (false)
// Emits an I32x4 extended multiply of the 16-bit lanes of the two inputs.
// 1. Multiply low word into scratch.
// 2. Multiply high word (can be signed or unsigned) into dst.
// 3. Unpack and interleave scratch and dst into dst.
// MUL_HIGH_INSTR is pmulhw or pmulhuw and UNPACK_INSTR is punpcklwd or
// punpckhwd at the call sites; the AVX path token-pastes a "v" prefix onto
// both. The non-AVX path requires dst == src0, interleaves into scratch and
// then copies scratch back into dst.
#define ASSEMBLE_SIMD_I32X4_EXT_MUL(MUL_HIGH_INSTR, UNPACK_INSTR) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
XMMRegister src0 = i.InputSimd128Register(0); \
XMMRegister src1 = i.InputSimd128Register(1); \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope avx_scope(tasm(), AVX); \
__ vpmullw(kScratchDoubleReg, src0, src1); \
__ v##MUL_HIGH_INSTR(dst, src0, src1); \
__ v##UNPACK_INSTR(dst, kScratchDoubleReg, dst); \
} else { \
DCHECK_EQ(dst, src0); \
__ movdqu(kScratchDoubleReg, src0); \
__ pmullw(kScratchDoubleReg, src1); \
__ MUL_HIGH_INSTR(dst, src1); \
__ UNPACK_INSTR(kScratchDoubleReg, dst); \
__ movdqu(dst, kScratchDoubleReg); \
} \
} while (false)
// Emits an I64x2 extended multiply of the 32-bit lanes of the two inputs.
// 1. Unpack src0, src0 into even-number elements of scratch.
// 2. Unpack src1, src1 into even-number elements of dst.
// 3. Multiply 1. with 2.
// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
// We only need SSE4_1 for pmuldq (signed ext mul), but to reduce macro
// duplication we enable it in all cases.
// UNPACK_INSTR (punpckldq or punpckhdq at the call sites) is only used on the
// AVX path, with a "v" prefix pasted on; SHUFFLE_CONST (0x50 or 0xFA at the
// call sites) is only used by the non-AVX pshufd path. MUL_INSTR is pmuldq or
// pmuludq.
#define ASSEMBLE_SIMD_I64X2_EXT_MUL(UNPACK_INSTR, MUL_INSTR, SHUFFLE_CONST) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
XMMRegister src0 = i.InputSimd128Register(0); \
XMMRegister src1 = i.InputSimd128Register(1); \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope avx_scope(tasm(), AVX); \
__ v##UNPACK_INSTR(kScratchDoubleReg, src0, src0); \
__ v##UNPACK_INSTR(dst, src1, src1); \
__ v##MUL_INSTR(dst, kScratchDoubleReg, dst); \
} else { \
CpuFeatureScope avx_scope(tasm(), SSE4_1); \
DCHECK_EQ(dst, src0); \
__ pshufd(kScratchDoubleReg, src0, SHUFFLE_CONST); \
__ pshufd(dst, src1, SHUFFLE_CONST); \
__ MUL_INSTR(dst, kScratchDoubleReg); \
} \
} while (false)
void CodeGenerator::AssembleDeconstructFrame() { void CodeGenerator::AssembleDeconstructFrame() {
unwinding_info_writer_.MarkFrameDeconstructed(__ pc_offset()); unwinding_info_writer_.MarkFrameDeconstructed(__ pc_offset());
__ movq(rsp, rbp); __ movq(rsp, rbp);
@ -2917,19 +2851,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break; break;
} }
case kX64I64x2ExtMulLowI32x4S: { case kX64I64x2ExtMulLowI32x4S: {
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckldq, pmuldq, 0x50); __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/true,
/*is_signed=*/true);
break; break;
} }
case kX64I64x2ExtMulHighI32x4S: { case kX64I64x2ExtMulHighI32x4S: {
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckhdq, pmuldq, 0xFA); __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/false,
/*is_signed=*/true);
break; break;
} }
case kX64I64x2ExtMulLowI32x4U: { case kX64I64x2ExtMulLowI32x4U: {
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckldq, pmuludq, 0x50); __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/true,
/*is_signed=*/false);
break; break;
} }
case kX64I64x2ExtMulHighI32x4U: { case kX64I64x2ExtMulHighI32x4U: {
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckhdq, pmuludq, 0xFA); __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/false,
/*is_signed=*/false);
break; break;
} }
case kX64I32x4Splat: { case kX64I32x4Splat: {
@ -3346,19 +3288,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break; break;
} }
case kX64I16x8ExtMulLowI8x16S: { case kX64I16x8ExtMulLowI8x16S: {
ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(Pmovsxbw); __ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/true,
/*is_signed=*/true);
break; break;
} }
case kX64I16x8ExtMulHighI8x16S: { case kX64I16x8ExtMulHighI8x16S: {
ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovsxbw); __ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/false,
/*is_signed=*/true);
break; break;
} }
case kX64I16x8ExtMulLowI8x16U: { case kX64I16x8ExtMulLowI8x16U: {
ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(Pmovzxbw); __ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/true,
/*is_signed=*/false);
break; break;
} }
case kX64I16x8ExtMulHighI8x16U: { case kX64I16x8ExtMulHighI8x16U: {
ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovzxbw); __ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/false,
/*is_signed=*/false);
break; break;
} }
case kX64I16x8ExtAddPairwiseI8x16S: { case kX64I16x8ExtAddPairwiseI8x16S: {
@ -3711,19 +3661,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break; break;
} }
case kX64I32x4ExtMulLowI16x8S: { case kX64I32x4ExtMulLowI16x8S: {
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhw, punpcklwd); __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/true,
/*is_signed=*/true);
break; break;
} }
case kX64I32x4ExtMulHighI16x8S: { case kX64I32x4ExtMulHighI16x8S: {
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhw, punpckhwd); __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/false,
/*is_signed=*/true);
break; break;
} }
case kX64I32x4ExtMulLowI16x8U: { case kX64I32x4ExtMulLowI16x8U: {
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhuw, punpcklwd); __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/true,
/*is_signed=*/false);
break; break;
} }
case kX64I32x4ExtMulHighI16x8U: { case kX64I32x4ExtMulHighI16x8U: {
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhuw, punpckhwd); __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/false,
/*is_signed=*/false);
break; break;
} }
case kX64I64x2SignSelect: { case kX64I64x2SignSelect: {
@ -4353,10 +4311,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
return kSuccess; return kSuccess;
} // NOLINT(readability/fn_size) } // NOLINT(readability/fn_size)
#undef ASSEMBLE_SIMD_I64X2_EXT_MUL
#undef ASSEMBLE_SIMD_I32X4_EXT_MUL
#undef ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH
#undef ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW
#undef ASSEMBLE_PINSR #undef ASSEMBLE_PINSR
#undef ASSEMBLE_UNOP #undef ASSEMBLE_UNOP
#undef ASSEMBLE_BINOP #undef ASSEMBLE_BINOP