From afd3f63e30bae9daaf7755c816dbebe4b062da49 Mon Sep 17 00:00:00 2001 From: Zhi An Ng Date: Tue, 29 Dec 2020 05:04:14 +0000 Subject: [PATCH] [wasm-simd][ia32] Convert ext mul macros into macro-assembler functions This will make these functions usable from Liftoff when we later implement extended multiply instructions in Liftoff. This is similar to the x64 versions, except that it takes a scratch register as a parameter. Bug: v8:11262 Change-Id: Ief3d8cdde59da9e05a468286315bcae6d13863d9 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2603768 Reviewed-by: Bill Budge Commit-Queue: Zhi An Ng Cr-Commit-Position: refs/heads/master@{#71907} --- src/codegen/ia32/macro-assembler-ia32.cc | 70 ++++++++++ src/codegen/ia32/macro-assembler-ia32.h | 11 ++ .../backend/ia32/code-generator-ia32.cc | 120 ++++++------------ 3 files changed, 117 insertions(+), 84 deletions(-) diff --git a/src/codegen/ia32/macro-assembler-ia32.cc b/src/codegen/ia32/macro-assembler-ia32.cc index b3f7e8604d..b18a76b387 100644 --- a/src/codegen/ia32/macro-assembler-ia32.cc +++ b/src/codegen/ia32/macro-assembler-ia32.cc @@ -650,6 +650,76 @@ void TurboAssembler::Roundpd(XMMRegister dst, XMMRegister src, } } +// 1. Unpack src0, src1 into even-number elements of scratch. +// 2. Unpack src1, src0 into even-number elements of dst. +// 3. Multiply 1. with 2. +// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq. +void TurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1, + XMMRegister src2, XMMRegister scratch, + bool low, bool is_signed) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + if (low) { + vpunpckldq(scratch, src1, src1); + vpunpckldq(dst, src2, src2); + } else { + vpunpckhdq(scratch, src1, src1); + vpunpckhdq(dst, src2, src2); + } + if (is_signed) { + vpmuldq(dst, scratch, dst); + } else { + vpmuludq(dst, scratch, dst); + } + } else { + uint8_t mask = low ? 0x50 : 0xFA; + pshufd(scratch, src1, mask); + pshufd(dst, src2, mask); + if (is_signed) { + CpuFeatureScope sse4_scope(this, SSE4_1); + pmuldq(dst, scratch); + } else { + pmuludq(dst, scratch); + } + } +} + +// 1. Multiply low word into scratch. +// 2. Multiply high word (can be signed or unsigned) into dst. +// 3. Unpack and interleave scratch and dst into dst. +void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1, + XMMRegister src2, XMMRegister scratch, + bool low, bool is_signed) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vpmullw(scratch, src1, src2); + is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2); + low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst); + } else { + DCHECK_EQ(dst, src1); + movdqu(scratch, src1); + pmullw(dst, src2); + is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2); + low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch); + } +} + +void TurboAssembler::I16x8ExtMul(XMMRegister dst, XMMRegister src1, + XMMRegister src2, XMMRegister scratch, + bool low, bool is_signed) { + if (low) { + is_signed ? Pmovsxbw(scratch, src1) : Pmovzxbw(scratch, src1); + is_signed ? Pmovsxbw(dst, src2) : Pmovzxbw(dst, src2); + Pmullw(dst, scratch); + } else { + Palignr(scratch, src1, uint8_t{8}); + is_signed ? Pmovsxbw(scratch, scratch) : Pmovzxbw(scratch, scratch); + Palignr(dst, src2, uint8_t{8}); + is_signed ? 
Pmovsxbw(dst, dst) : Pmovzxbw(dst, dst); + Pmullw(dst, scratch); + } +} + void TurboAssembler::ShlPair(Register high, Register low, uint8_t shift) { DCHECK_GE(63, shift); if (shift >= 32) { diff --git a/src/codegen/ia32/macro-assembler-ia32.h b/src/codegen/ia32/macro-assembler-ia32.h index 798a2fedc4..f9e6cb77d5 100644 --- a/src/codegen/ia32/macro-assembler-ia32.h +++ b/src/codegen/ia32/macro-assembler-ia32.h @@ -611,6 +611,17 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { void Roundps(XMMRegister dst, XMMRegister src, RoundingMode mode); void Roundpd(XMMRegister dst, XMMRegister src, RoundingMode mode); + // These Wasm SIMD ops do not have direct lowerings on IA32. These + // helpers are optimized to produce the fastest and smallest codegen. + // Defined here to allow usage on both TurboFan and Liftoff. + void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2, + XMMRegister scratch, bool low, bool is_signed); + // Requires that dst == src1 if AVX is not supported. + void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2, + XMMRegister scratch, bool low, bool is_signed); + void I16x8ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2, + XMMRegister scratch, bool low, bool is_signed); + void Push(Register src) { push(src); } void Push(Operand src) { push(src); } void Push(Immediate value); diff --git a/src/compiler/backend/ia32/code-generator-ia32.cc b/src/compiler/backend/ia32/code-generator-ia32.cc index c0f363f5c3..cde04a9463 100644 --- a/src/compiler/backend/ia32/code-generator-ia32.cc +++ b/src/compiler/backend/ia32/code-generator-ia32.cc @@ -561,74 +561,6 @@ class OutOfLineRecordWrite final : public OutOfLineCode { } \ } while (false) -// 1. Unpack src0, src1 into even-number elements of scratch. -// 2. Unpack src1, src0 into even-number elements of dst. -// 3. Multiply 1. with 2. -// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq. -// We only need SSE4_1 for pmuldq (singed ext mul), but enable in both signed -// and unsigned cases to reduce macro duplication. -#define ASSEMBLE_SIMD_I64X2_EXT_MUL(UNPACK_INSTR, MUL_INSTR, SHUFFLE_CONST) \ - do { \ - XMMRegister dst = i.OutputSimd128Register(); \ - XMMRegister src0 = i.InputSimd128Register(0); \ - Operand src1 = i.InputOperand(1); \ - if (CpuFeatures::IsSupported(AVX)) { \ - CpuFeatureScope avx_scope(tasm(), AVX); \ - __ movdqu(kScratchDoubleReg, src1); \ - __ v##UNPACK_INSTR(kScratchDoubleReg, kScratchDoubleReg, \ - kScratchDoubleReg); \ - __ v##UNPACK_INSTR(dst, src0, src0); \ - __ v##MUL_INSTR(dst, kScratchDoubleReg, dst); \ - } else { \ - CpuFeatureScope sse4_scope(tasm(), SSE4_1); \ - DCHECK_EQ(dst, src0); \ - __ pshufd(kScratchDoubleReg, src0, SHUFFLE_CONST); \ - __ pshufd(dst, src1, SHUFFLE_CONST); \ - __ MUL_INSTR(dst, kScratchDoubleReg); \ - } \ - } while (false) - -// 1. Multiply low word into scratch. -// 2. Multiply high word (can be signed or unsigned) into dst. -// 3. Unpack and interleave scratch and dst into dst. 
-#define ASSEMBLE_SIMD_I32X4_EXT_MUL(MUL_HIGH_INSTR, UNPACK_INSTR) \ - do { \ - XMMRegister dst = i.OutputSimd128Register(); \ - XMMRegister src0 = i.InputSimd128Register(0); \ - Operand src1 = i.InputOperand(1); \ - if (CpuFeatures::IsSupported(AVX)) { \ - CpuFeatureScope avx_scope(tasm(), AVX); \ - __ vpmullw(kScratchDoubleReg, src0, src1); \ - __ v##MUL_HIGH_INSTR(dst, src0, src1); \ - __ v##UNPACK_INSTR(dst, kScratchDoubleReg, dst); \ - } else { \ - DCHECK_EQ(dst, src0); \ - __ movdqu(kScratchDoubleReg, src0); \ - __ pmullw(kScratchDoubleReg, src1); \ - __ MUL_HIGH_INSTR(dst, src1); \ - __ UNPACK_INSTR(kScratchDoubleReg, dst); \ - __ movdqu(dst, kScratchDoubleReg); \ - } \ - } while (false) - -#define ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(EXTEND_MACRO_INSTR) \ - do { \ - XMMRegister dst = i.OutputSimd128Register(); \ - __ EXTEND_MACRO_INSTR(kScratchDoubleReg, i.InputSimd128Register(0)); \ - __ EXTEND_MACRO_INSTR(dst, i.InputOperand(1)); \ - __ Pmullw(dst, kScratchDoubleReg); \ - } while (false) - -#define ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(EXTEND_MACRO_INSTR) \ - do { \ - XMMRegister dst = i.OutputSimd128Register(); \ - __ Palignr(kScratchDoubleReg, i.InputSimd128Register(0), uint8_t{8}); \ - __ EXTEND_MACRO_INSTR(kScratchDoubleReg, kScratchDoubleReg); \ - __ Palignr(dst, i.InputOperand(1), uint8_t{8}); \ - __ EXTEND_MACRO_INSTR(dst, dst); \ - __ Pmullw(dst, kScratchDoubleReg); \ - } while (false) - void CodeGenerator::AssembleDeconstructFrame() { __ mov(esp, ebp); __ pop(ebp); @@ -2160,51 +2092,75 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( break; } case kIA32I64x2ExtMulLowI32x4S: { - ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckldq, pmuldq, 0x50); + __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/true, /*is_signed=*/true); break; } case kIA32I64x2ExtMulHighI32x4S: { - ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckhdq, pmuldq, 0xFA); + __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/false, /*is_signed=*/true); break; } case kIA32I64x2ExtMulLowI32x4U: { - ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckldq, pmuludq, 0x50); + __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/true, /*is_signed=*/false); break; } case kIA32I64x2ExtMulHighI32x4U: { - ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckhdq, pmuludq, 0xFA); + __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/false, /*is_signed=*/false); break; } case kIA32I32x4ExtMulLowI16x8S: { - ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhw, punpcklwd); + __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/true, /*is_signed=*/true); break; } case kIA32I32x4ExtMulHighI16x8S: { - ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhw, punpckhwd); + __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/false, /*is_signed=*/true); break; } case kIA32I32x4ExtMulLowI16x8U: { - ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhuw, punpcklwd); + __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/true, /*is_signed=*/false); break; } case kIA32I32x4ExtMulHighI16x8U: { - ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhuw, punpckhwd); + __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), + 
i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/false, /*is_signed=*/false); break; } case kIA32I16x8ExtMulLowI8x16S: { - ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(Pmovsxbw); + __ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/true, /*is_signed=*/true); break; } case kIA32I16x8ExtMulHighI8x16S: { - ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovsxbw); + __ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/false, /*is_signed=*/true); break; } case kIA32I16x8ExtMulLowI8x16U: { - ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(Pmovzxbw); + __ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/true, /*is_signed=*/false); break; } case kIA32I16x8ExtMulHighI8x16U: { - ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovzxbw); + __ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/false, /*is_signed=*/false); break; } case kIA32I64x2SplatI32Pair: { @@ -5350,10 +5306,6 @@ void CodeGenerator::AssembleJumpTable(Label** targets, size_t target_count) { #undef ASSEMBLE_SIMD_SHIFT #undef ASSEMBLE_SIMD_PINSR #undef ASSEMBLE_SIMD_SIGN_SELECT -#undef ASSEMBLE_SIMD_I64X2_EXT_MUL -#undef ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW -#undef ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH -#undef ASSEMBLE_SIMD_I32X4_EXT_MUL } // namespace compiler } // namespace internal
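
For reference, here is a minimal scalar sketch of the extended-multiply semantics that the new I64x2ExtMul helper lowers (the same pattern applies to I32x4ExtMul and I16x8ExtMul with narrower lanes). This is an illustration only; the function name I64x2ExtMulScalar and the sample values are invented for the example and are not part of the patch.

#include <cstdint>
#include <cstdio>

// Models i64x2.extmul_{low,high}_i32x4_{s,u}: pick the low or high half of
// the two i32x4 inputs, widen each lane (signed or unsigned), and multiply
// pairwise into two 64-bit lanes.
void I64x2ExtMulScalar(const uint32_t a[4], const uint32_t b[4],
                       uint64_t out[2], bool low, bool is_signed) {
  int base = low ? 0 : 2;  // low half = lanes 0..1, high half = lanes 2..3
  for (int i = 0; i < 2; ++i) {
    if (is_signed) {
      int64_t x = static_cast<int32_t>(a[base + i]);
      int64_t y = static_cast<int32_t>(b[base + i]);
      out[i] = static_cast<uint64_t>(x * y);
    } else {
      out[i] = static_cast<uint64_t>(a[base + i]) * b[base + i];
    }
  }
}

int main() {
  uint32_t a[4] = {0xFFFFFFFFu, 2, 3, 4};
  uint32_t b[4] = {5, 6, 7, 8};
  uint64_t s[2], u[2];
  I64x2ExtMulScalar(a, b, s, /*low=*/true, /*is_signed=*/true);
  I64x2ExtMulScalar(a, b, u, /*low=*/true, /*is_signed=*/false);
  // Signed: lane 0 is -1 * 5 = -5; unsigned: lane 0 is 0xFFFFFFFF * 5.
  std::printf("signed lane0 = %lld, unsigned lane0 = %llu\n",
              static_cast<long long>(s[0]),
              static_cast<unsigned long long>(u[0]));
  return 0;
}

The lanes are widened before the multiply, which is why the signed and unsigned variants of the same inputs can differ; this is the distinction the low/is_signed parameters of the new helpers select between, and it is what the unpack/shuffle steps in the patch implement in terms of SSE/AVX instructions.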