From 6acd0e4ab05e47c6f77b422da6b144ebbaaf3683 Mon Sep 17 00:00:00 2001
From: Ng Zhi An
Date: Thu, 25 Mar 2021 11:21:02 -0700
Subject: [PATCH] [wasm-simd] Move extmul into SharedTurboAssembler

Left i16x8.extmul_low in the arch-specific macro-assemblers because it
relies on other functions defined in the same file. We can come back
and move it later.

Bug: v8:11589
Change-Id: I2ea81c50ed52cc3e59e001b5e80aaf6b93a6572c
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2786280
Reviewed-by: Bill Budge
Commit-Queue: Zhi An Ng
Cr-Commit-Position: refs/heads/master@{#73688}
---
 src/codegen/ia32/macro-assembler-ia32.cc      | 124 -----------------
 src/codegen/ia32/macro-assembler-ia32.h       |   9 --
 .../macro-assembler-shared-ia32-x64.cc        | 126 ++++++++++++++++++
 .../macro-assembler-shared-ia32-x64.h         |   9 ++
 src/codegen/x64/macro-assembler-x64.cc        | 124 -----------------
 src/codegen/x64/macro-assembler-x64.h         |   8 +-
 .../backend/x64/code-generator-x64.cc         |  24 ++--
 src/wasm/baseline/x64/liftoff-assembler-x64.h |  20 +--
 8 files changed, 161 insertions(+), 283 deletions(-)

diff --git a/src/codegen/ia32/macro-assembler-ia32.cc b/src/codegen/ia32/macro-assembler-ia32.cc
index 58d2298347..18e9560b73 100644
--- a/src/codegen/ia32/macro-assembler-ia32.cc
+++ b/src/codegen/ia32/macro-assembler-ia32.cc
@@ -642,60 +642,6 @@ void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1,
   }
 }
 
-// 1. Unpack src0, src1 into even-number elements of scratch.
-// 2. Unpack src1, src0 into even-number elements of dst.
-// 3. Multiply 1. with 2.
-// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
-void TurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
-                                 XMMRegister src2, XMMRegister scratch,
-                                 bool low, bool is_signed) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    if (low) {
-      vpunpckldq(scratch, src1, src1);
-      vpunpckldq(dst, src2, src2);
-    } else {
-      vpunpckhdq(scratch, src1, src1);
-      vpunpckhdq(dst, src2, src2);
-    }
-    if (is_signed) {
-      vpmuldq(dst, scratch, dst);
-    } else {
-      vpmuludq(dst, scratch, dst);
-    }
-  } else {
-    uint8_t mask = low ? 0x50 : 0xFA;
-    pshufd(scratch, src1, mask);
-    pshufd(dst, src2, mask);
-    if (is_signed) {
-      CpuFeatureScope sse4_scope(this, SSE4_1);
-      pmuldq(dst, scratch);
-    } else {
-      pmuludq(dst, scratch);
-    }
-  }
-}
-
-// 1. Multiply low word into scratch.
-// 2. Multiply high word (can be signed or unsigned) into dst.
-// 3. Unpack and interleave scratch and dst into dst.
-void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
-                                 XMMRegister src2, XMMRegister scratch,
-                                 bool low, bool is_signed) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vpmullw(scratch, src1, src2);
-    is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
-    low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst);
-  } else {
-    DCHECK_EQ(dst, src1);
-    movaps(scratch, src1);
-    pmullw(dst, src2);
-    is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2);
-    low ?
punpcklwd(dst, scratch) : punpckhwd(dst, scratch); - } -} - void TurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister scratch, bool is_signed) { @@ -704,76 +650,6 @@ void TurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, Pmullw(dst, scratch); } -void TurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, - XMMRegister src2, XMMRegister scratch) { - if (CpuFeatures::IsSupported(AVX)) { - CpuFeatureScope avx_scope(this, AVX); - vpunpckhbw(scratch, src1, src1); - vpsraw(scratch, scratch, 8); - vpunpckhbw(dst, src2, src2); - vpsraw(dst, dst, 8); - vpmullw(dst, dst, scratch); - } else { - if (dst != src1) { - movaps(dst, src1); - } - movaps(scratch, src2); - punpckhbw(dst, dst); - psraw(dst, 8); - punpckhbw(scratch, scratch); - psraw(scratch, 8); - pmullw(dst, scratch); - } -} - -void TurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, - XMMRegister src2, XMMRegister scratch) { - // The logic here is slightly complicated to handle all the cases of register - // aliasing. This allows flexibility for callers in TurboFan and Liftoff. - if (CpuFeatures::IsSupported(AVX)) { - CpuFeatureScope avx_scope(this, AVX); - if (src1 == src2) { - vpxor(scratch, scratch, scratch); - vpunpckhbw(dst, src1, scratch); - vpmullw(dst, dst, dst); - } else { - if (dst == src2) { - // We overwrite dst, then use src2, so swap src1 and src2. - std::swap(src1, src2); - } - vpxor(scratch, scratch, scratch); - vpunpckhbw(dst, src1, scratch); - vpunpckhbw(scratch, src2, scratch); - vpmullw(dst, dst, scratch); - } - } else { - if (src1 == src2) { - xorps(scratch, scratch); - if (dst != src1) { - movaps(dst, src1); - } - punpckhbw(dst, scratch); - pmullw(dst, scratch); - } else { - // When dst == src1, nothing special needs to be done. - // When dst == src2, swap src1 and src2, since we overwrite dst. - // When dst is unique, copy src1 to dst first. - if (dst == src2) { - std::swap(src1, src2); - // Now, dst == src1. - } else if (dst != src1) { - // dst != src1 && dst != src2. - movaps(dst, src1); - } - xorps(scratch, scratch); - punpckhbw(dst, scratch); - punpckhbw(scratch, src2); - psrlw(scratch, 8); - pmullw(dst, scratch); - } - } -} - void TurboAssembler::S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1, XMMRegister src2, XMMRegister scratch) { diff --git a/src/codegen/ia32/macro-assembler-ia32.h b/src/codegen/ia32/macro-assembler-ia32.h index e90bba5338..b06f3ff8a1 100644 --- a/src/codegen/ia32/macro-assembler-ia32.h +++ b/src/codegen/ia32/macro-assembler-ia32.h @@ -712,17 +712,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler { // These Wasm SIMD ops do not have direct lowerings on IA32. These // helpers are optimized to produce the fastest and smallest codegen. // Defined here to allow usage on both TurboFan and Liftoff. - void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2, - XMMRegister scratch, bool low, bool is_signed); - // Requires that dst == src1 if AVX is not supported. - void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2, - XMMRegister scratch, bool low, bool is_signed); void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister scratch, bool is_signed); - void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2, - XMMRegister scratch); - void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2, - XMMRegister scratch); // Requires dst == mask when AVX is not supported. 
void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1, XMMRegister src2, XMMRegister scratch); diff --git a/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc b/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc index 2626b84246..7e00637107 100644 --- a/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc +++ b/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc @@ -18,6 +18,78 @@ namespace v8 { namespace internal { +void SharedTurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, + XMMRegister src2, + XMMRegister scratch) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vpunpckhbw(scratch, src1, src1); + vpsraw(scratch, scratch, 8); + vpunpckhbw(dst, src2, src2); + vpsraw(dst, dst, 8); + vpmullw(dst, dst, scratch); + } else { + if (dst != src1) { + movaps(dst, src1); + } + movaps(scratch, src2); + punpckhbw(dst, dst); + psraw(dst, 8); + punpckhbw(scratch, scratch); + psraw(scratch, 8); + pmullw(dst, scratch); + } +} + +void SharedTurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, + XMMRegister src2, + XMMRegister scratch) { + // The logic here is slightly complicated to handle all the cases of register + // aliasing. This allows flexibility for callers in TurboFan and Liftoff. + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + if (src1 == src2) { + vpxor(scratch, scratch, scratch); + vpunpckhbw(dst, src1, scratch); + vpmullw(dst, dst, dst); + } else { + if (dst == src2) { + // We overwrite dst, then use src2, so swap src1 and src2. + std::swap(src1, src2); + } + vpxor(scratch, scratch, scratch); + vpunpckhbw(dst, src1, scratch); + vpunpckhbw(scratch, src2, scratch); + vpmullw(dst, dst, scratch); + } + } else { + if (src1 == src2) { + xorps(scratch, scratch); + if (dst != src1) { + movaps(dst, src1); + } + punpckhbw(dst, scratch); + pmullw(dst, scratch); + } else { + // When dst == src1, nothing special needs to be done. + // When dst == src2, swap src1 and src2, since we overwrite dst. + // When dst is unique, copy src1 to dst first. + if (dst == src2) { + std::swap(src1, src2); + // Now, dst == src1. + } else if (dst != src1) { + // dst != src1 && dst != src2. + movaps(dst, src1); + } + xorps(scratch, scratch); + punpckhbw(dst, scratch); + punpckhbw(scratch, src2); + psrlw(scratch, 8); + pmullw(dst, scratch); + } + } +} + void SharedTurboAssembler::I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src) { if (CpuFeatures::IsSupported(AVX)) { @@ -65,6 +137,26 @@ void SharedTurboAssembler::I16x8UConvertI8x16High(XMMRegister dst, } } +// 1. Multiply low word into scratch. +// 2. Multiply high word (can be signed or unsigned) into dst. +// 3. Unpack and interleave scratch and dst into dst. +void SharedTurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1, + XMMRegister src2, XMMRegister scratch, + bool low, bool is_signed) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + vpmullw(scratch, src1, src2); + is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2); + low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst); + } else { + DCHECK_EQ(dst, src1); + movaps(scratch, src1); + pmullw(dst, src2); + is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2); + low ? 
punpcklwd(dst, scratch) : punpckhwd(dst, scratch); + } +} + void SharedTurboAssembler::I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src) { if (CpuFeatures::IsSupported(AVX)) { @@ -112,6 +204,40 @@ void SharedTurboAssembler::I32x4UConvertI16x8High(XMMRegister dst, } } +// 1. Unpack src0, src1 into even-number elements of scratch. +// 2. Unpack src1, src0 into even-number elements of dst. +// 3. Multiply 1. with 2. +// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq. +void SharedTurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1, + XMMRegister src2, XMMRegister scratch, + bool low, bool is_signed) { + if (CpuFeatures::IsSupported(AVX)) { + CpuFeatureScope avx_scope(this, AVX); + if (low) { + vpunpckldq(scratch, src1, src1); + vpunpckldq(dst, src2, src2); + } else { + vpunpckhdq(scratch, src1, src1); + vpunpckhdq(dst, src2, src2); + } + if (is_signed) { + vpmuldq(dst, scratch, dst); + } else { + vpmuludq(dst, scratch, dst); + } + } else { + uint8_t mask = low ? 0x50 : 0xFA; + pshufd(scratch, src1, mask); + pshufd(dst, src2, mask); + if (is_signed) { + CpuFeatureScope sse4_scope(this, SSE4_1); + pmuldq(dst, scratch); + } else { + pmuludq(dst, scratch); + } + } +} + void SharedTurboAssembler::I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src) { if (CpuFeatures::IsSupported(AVX)) { diff --git a/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h b/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h index 8c599547ec..baf3096961 100644 --- a/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h +++ b/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h @@ -23,12 +23,21 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase { public: using TurboAssemblerBase::TurboAssemblerBase; + void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2, + XMMRegister scratch); + void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2, + XMMRegister scratch); void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src); void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src, XMMRegister scratch); + // Requires that dst == src1 if AVX is not supported. + void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2, + XMMRegister scratch, bool low, bool is_signed); void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src); void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src, XMMRegister scratch); + void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2, + XMMRegister scratch, bool low, bool is_signed); void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src); void I64x2UConvertI32x4High(XMMRegister dst, XMMRegister src, XMMRegister scratch); diff --git a/src/codegen/x64/macro-assembler-x64.cc b/src/codegen/x64/macro-assembler-x64.cc index 8a195ecb02..390f900c30 100644 --- a/src/codegen/x64/macro-assembler-x64.cc +++ b/src/codegen/x64/macro-assembler-x64.cc @@ -2132,60 +2132,6 @@ void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1, } } -// 1. Unpack src0, src0 into even-number elements of scratch. -// 2. Unpack src1, src1 into even-number elements of dst. -// 3. Multiply 1. with 2. -// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq. 
-void TurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1, - XMMRegister src2, bool low, bool is_signed) { - if (CpuFeatures::IsSupported(AVX)) { - CpuFeatureScope avx_scope(this, AVX); - if (low) { - vpunpckldq(kScratchDoubleReg, src1, src1); - vpunpckldq(dst, src2, src2); - } else { - vpunpckhdq(kScratchDoubleReg, src1, src1); - vpunpckhdq(dst, src2, src2); - } - if (is_signed) { - vpmuldq(dst, kScratchDoubleReg, dst); - } else { - vpmuludq(dst, kScratchDoubleReg, dst); - } - } else { - uint8_t mask = low ? 0x50 : 0xFA; - pshufd(kScratchDoubleReg, src1, mask); - pshufd(dst, src2, mask); - if (is_signed) { - CpuFeatureScope avx_scope(this, SSE4_1); - pmuldq(dst, kScratchDoubleReg); - } else { - pmuludq(dst, kScratchDoubleReg); - } - } -} - -// 1. Multiply low word into scratch. -// 2. Multiply high word (can be signed or unsigned) into dst. -// 3. Unpack and interleave scratch and dst into dst. -void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1, - XMMRegister src2, bool low, bool is_signed) { - if (CpuFeatures::IsSupported(AVX)) { - CpuFeatureScope avx_scope(this, AVX); - vpmullw(kScratchDoubleReg, src1, src2); - is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2); - low ? vpunpcklwd(dst, kScratchDoubleReg, dst) - : vpunpckhwd(dst, kScratchDoubleReg, dst); - } else { - DCHECK_EQ(dst, src1); - movaps(kScratchDoubleReg, src1); - pmullw(dst, src2); - is_signed ? pmulhw(kScratchDoubleReg, src2) - : pmulhuw(kScratchDoubleReg, src2); - low ? punpcklwd(dst, kScratchDoubleReg) : punpckhwd(dst, kScratchDoubleReg); - } -} - void TurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_signed) { is_signed ? Pmovsxbw(kScratchDoubleReg, src1) @@ -2194,76 +2140,6 @@ void TurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, Pmullw(dst, kScratchDoubleReg); } -void TurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, - XMMRegister src2) { - if (CpuFeatures::IsSupported(AVX)) { - CpuFeatureScope avx_scope(this, AVX); - vpunpckhbw(kScratchDoubleReg, src1, src1); - vpsraw(kScratchDoubleReg, kScratchDoubleReg, 8); - vpunpckhbw(dst, src2, src2); - vpsraw(dst, dst, 8); - vpmullw(dst, dst, kScratchDoubleReg); - } else { - if (dst != src1) { - movaps(dst, src1); - } - movaps(kScratchDoubleReg, src2); - punpckhbw(dst, dst); - psraw(dst, 8); - punpckhbw(kScratchDoubleReg, kScratchDoubleReg); - psraw(kScratchDoubleReg, 8); - pmullw(dst, kScratchDoubleReg); - } -} - -void TurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, - XMMRegister src2) { - // The logic here is slightly complicated to handle all the cases of register - // aliasing. This allows flexibility for callers in TurboFan and Liftoff. - if (CpuFeatures::IsSupported(AVX)) { - CpuFeatureScope avx_scope(this, AVX); - if (src1 == src2) { - vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg); - vpunpckhbw(dst, src1, kScratchDoubleReg); - vpmullw(dst, dst, dst); - } else { - if (dst == src2) { - // We overwrite dst, then use src2, so swap src1 and src2. 
- std::swap(src1, src2); - } - vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg); - vpunpckhbw(dst, src1, kScratchDoubleReg); - vpunpckhbw(kScratchDoubleReg, src2, kScratchDoubleReg); - vpmullw(dst, dst, kScratchDoubleReg); - } - } else { - if (src1 == src2) { - xorps(kScratchDoubleReg, kScratchDoubleReg); - if (dst != src1) { - movaps(dst, src1); - } - punpckhbw(dst, kScratchDoubleReg); - pmullw(dst, kScratchDoubleReg); - } else { - // When dst == src1, nothing special needs to be done. - // When dst == src2, swap src1 and src2, since we overwrite dst. - // When dst is unique, copy src1 to dst first. - if (dst == src2) { - std::swap(src1, src2); - // Now, dst == src1. - } else if (dst != src1) { - // dst != src1 && dst != src2. - movaps(dst, src1); - } - xorps(kScratchDoubleReg, kScratchDoubleReg); - punpckhbw(dst, kScratchDoubleReg); - punpckhbw(kScratchDoubleReg, src2); - psrlw(kScratchDoubleReg, 8); - pmullw(dst, kScratchDoubleReg); - } - } -} - void TurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2) { // k = i16x8.splat(0x8000) diff --git a/src/codegen/x64/macro-assembler-x64.h b/src/codegen/x64/macro-assembler-x64.h index 784f5da306..07c13f30a1 100644 --- a/src/codegen/x64/macro-assembler-x64.h +++ b/src/codegen/x64/macro-assembler-x64.h @@ -612,15 +612,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler { void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1, XMMRegister src2); - void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2, - bool low, bool is_signed); - // Requires that dst == src1 if AVX is not supported. - void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2, - bool low, bool is_signed); + // TODO(zhin): Move this into shared-ia32-x64-macro-assembler. 
void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_signed); - void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2); - void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2); void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2); diff --git a/src/compiler/backend/x64/code-generator-x64.cc b/src/compiler/backend/x64/code-generator-x64.cc index 73ba2fb51c..97fadf4695 100644 --- a/src/compiler/backend/x64/code-generator-x64.cc +++ b/src/compiler/backend/x64/code-generator-x64.cc @@ -2884,25 +2884,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( } case kX64I64x2ExtMulLowI32x4S: { __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), - i.InputSimd128Register(1), /*low=*/true, + i.InputSimd128Register(1), kScratchDoubleReg, /*low=*/true, /*is_signed=*/true); break; } case kX64I64x2ExtMulHighI32x4S: { __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), - i.InputSimd128Register(1), /*low=*/false, + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/false, /*is_signed=*/true); break; } case kX64I64x2ExtMulLowI32x4U: { __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), - i.InputSimd128Register(1), /*low=*/true, + i.InputSimd128Register(1), kScratchDoubleReg, /*low=*/true, /*is_signed=*/false); break; } case kX64I64x2ExtMulHighI32x4U: { __ I64x2ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), - i.InputSimd128Register(1), /*low=*/false, + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/false, /*is_signed=*/false); break; } @@ -3320,7 +3322,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( } case kX64I16x8ExtMulHighI8x16S: { __ I16x8ExtMulHighS(i.OutputSimd128Register(), i.InputSimd128Register(0), - i.InputSimd128Register(1)); + i.InputSimd128Register(1), kScratchDoubleReg); break; } case kX64I16x8ExtMulLowI8x16U: { @@ -3330,7 +3332,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( } case kX64I16x8ExtMulHighI8x16U: { __ I16x8ExtMulHighU(i.OutputSimd128Register(), i.InputSimd128Register(0), - i.InputSimd128Register(1)); + i.InputSimd128Register(1), kScratchDoubleReg); break; } case kX64I16x8ExtAddPairwiseI8x16S: { @@ -3632,25 +3634,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( } case kX64I32x4ExtMulLowI16x8S: { __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), - i.InputSimd128Register(1), /*low=*/true, + i.InputSimd128Register(1), kScratchDoubleReg, /*low=*/true, /*is_signed=*/true); break; } case kX64I32x4ExtMulHighI16x8S: { __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), - i.InputSimd128Register(1), /*low=*/false, + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/false, /*is_signed=*/true); break; } case kX64I32x4ExtMulLowI16x8U: { __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), - i.InputSimd128Register(1), /*low=*/true, + i.InputSimd128Register(1), kScratchDoubleReg, /*low=*/true, /*is_signed=*/false); break; } case kX64I32x4ExtMulHighI16x8U: { __ I32x4ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0), - i.InputSimd128Register(1), /*low=*/false, + i.InputSimd128Register(1), kScratchDoubleReg, + /*low=*/false, /*is_signed=*/false); break; } diff --git a/src/wasm/baseline/x64/liftoff-assembler-x64.h b/src/wasm/baseline/x64/liftoff-assembler-x64.h index f9253219ff..eea5fa76e0 100644 --- a/src/wasm/baseline/x64/liftoff-assembler-x64.h +++ 
b/src/wasm/baseline/x64/liftoff-assembler-x64.h
@@ -9,6 +9,7 @@
 #include "src/codegen/assembler.h"
 #include "src/codegen/cpu-features.h"
 #include "src/codegen/machine-type.h"
+#include "src/codegen/x64/register-x64.h"
 #include "src/heap/memory-chunk.h"
 #include "src/wasm/baseline/liftoff-assembler.h"
 #include "src/wasm/simd-shuffle.h"
@@ -3233,13 +3234,13 @@ void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_u(LiftoffRegister dst,
 void LiftoffAssembler::emit_i16x8_extmul_high_i8x16_s(LiftoffRegister dst,
                                                       LiftoffRegister src1,
                                                       LiftoffRegister src2) {
-  I16x8ExtMulHighS(dst.fp(), src1.fp(), src2.fp());
+  I16x8ExtMulHighS(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg);
 }
 
 void LiftoffAssembler::emit_i16x8_extmul_high_i8x16_u(LiftoffRegister dst,
                                                       LiftoffRegister src1,
                                                       LiftoffRegister src2) {
-  I16x8ExtMulHighU(dst.fp(), src1.fp(), src2.fp());
+  I16x8ExtMulHighU(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg);
 }
 
 void LiftoffAssembler::emit_i16x8_q15mulr_sat_s(LiftoffRegister dst,
@@ -3380,16 +3381,16 @@ inline void I32x4ExtMulHelper(LiftoffAssembler* assm, XMMRegister dst,
                               bool is_signed) {
   // I32x4ExtMul requires dst == src1 if AVX is not supported.
   if (CpuFeatures::IsSupported(AVX) || dst == src1) {
-    assm->I32x4ExtMul(dst, src1, src2, low, is_signed);
+    assm->I32x4ExtMul(dst, src1, src2, kScratchDoubleReg, low, is_signed);
   } else if (dst != src2) {
    // dst != src1 && dst != src2
     assm->movaps(dst, src1);
-    assm->I32x4ExtMul(dst, dst, src2, low, is_signed);
+    assm->I32x4ExtMul(dst, dst, src2, kScratchDoubleReg, low, is_signed);
   } else {
     // dst == src2
     // Extended multiplication is commutative,
     assm->movaps(dst, src2);
-    assm->I32x4ExtMul(dst, dst, src1, low, is_signed);
+    assm->I32x4ExtMul(dst, dst, src1, kScratchDoubleReg, low, is_signed);
   }
 }
 }  // namespace liftoff
@@ -3521,27 +3522,28 @@ void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
 void LiftoffAssembler::emit_i64x2_extmul_low_i32x4_s(LiftoffRegister dst,
                                                      LiftoffRegister src1,
                                                      LiftoffRegister src2) {
-  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/true, /*is_signed=*/true);
+  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg, /*low=*/true,
+              /*is_signed=*/true);
 }
 
 void LiftoffAssembler::emit_i64x2_extmul_low_i32x4_u(LiftoffRegister dst,
                                                      LiftoffRegister src1,
                                                      LiftoffRegister src2) {
-  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/true,
+  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg, /*low=*/true,
               /*is_signed=*/false);
 }
 
 void LiftoffAssembler::emit_i64x2_extmul_high_i32x4_s(LiftoffRegister dst,
                                                       LiftoffRegister src1,
                                                       LiftoffRegister src2) {
-  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/false,
+  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg, /*low=*/false,
               /*is_signed=*/true);
 }
 
 void LiftoffAssembler::emit_i64x2_extmul_high_i32x4_u(LiftoffRegister dst,
                                                       LiftoffRegister src1,
                                                       LiftoffRegister src2) {
-  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/false,
+  I64x2ExtMul(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg, /*low=*/false,
               /*is_signed=*/false);
 }
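
A few standalone sketches of what the moved helpers compute and how they lower, for readers reviewing the patch. None of the code below is part of the diff, and the function names are made up for illustration. First, the semantics: every extmul instruction widens one half of the input lanes and multiplies pairwise, so each output lane is the full-width product of two narrower lanes and cannot overflow. A minimal scalar reference for i64x2.extmul_{low,high}_i32x4_{s,u}, assuming nothing beyond standard C++:

#include <cstddef>
#include <cstdint>

// Scalar reference for i64x2.extmul_{low,high}_i32x4_{s,u}: widen two i32
// lanes from the chosen half of each input, then multiply into i64 lanes.
void I64x2ExtMulRef(int64_t dst[2], const int32_t a[4], const int32_t b[4],
                    bool low, bool is_signed) {
  size_t base = low ? 0 : 2;  // low half = lanes 0..1, high half = lanes 2..3
  for (size_t i = 0; i < 2; ++i) {
    if (is_signed) {
      dst[i] = static_cast<int64_t>(a[base + i]) *
               static_cast<int64_t>(b[base + i]);
    } else {
      uint64_t ua = static_cast<uint32_t>(a[base + i]);
      uint64_t ub = static_cast<uint32_t>(b[base + i]);
      dst[i] = static_cast<int64_t>(ua * ub);
    }
  }
}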
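The I16x8ExtMulHighS helper moved into macro-assembler-shared-ia32-x64.cc extracts sign-extended 16-bit lanes from the high i8x16 half with punpckhbw + psraw (duplicate each byte into both halves of a word, then arithmetic-shift right by 8); a plain 16-bit multiply is then exact because the product of two sign-extended bytes fits in 16 bits. A sketch of the same idea with standard SSE2 intrinsics, not the macro-assembler API:

#include <emmintrin.h>  // SSE2

// i16x8.extmul_high_i8x16_s, expressed with intrinsics:
// unpackhi(a, a) duplicates each high byte into a 16-bit lane, srai by 8
// sign-extends it, and mullo then produces the exact 16-bit products.
__m128i ExtMulHighI8x16S(__m128i a, __m128i b) {
  __m128i a16 = _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8);
  __m128i b16 = _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8);
  return _mm_mullo_epi16(a16, b16);
}

The register-aliasing cases handled in I16x8ExtMulHighU disappear here because intrinsics let the compiler allocate temporaries; the macro-assembler has to reuse dst and scratch explicitly.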
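I32x4ExtMul relies on the identity product32 = (high16 << 16) | low16: pmullw/pmulhw compute the two halves of every 16x16 product, and punpcklwd/punpckhwd interleave them back into full 32-bit lanes. A minimal SSE2 intrinsics sketch of the signed low-half case (illustrative only; the unsigned and high-half variants swap in _mm_mulhi_epu16 and _mm_unpackhi_epi16, matching the is_signed/low flags of the helper):

#include <emmintrin.h>  // SSE2

// i32x4.extmul_low_i16x8_s: interleave the low and high halves of each
// 16x16 -> 32 product to rebuild the full 32-bit results.
__m128i ExtMulLowI16x8S(__m128i a, __m128i b) {
  __m128i lo = _mm_mullo_epi16(a, b);  // low 16 bits of each product
  __m128i hi = _mm_mulhi_epi16(a, b);  // high 16 bits (signed)
  return _mm_unpacklo_epi16(lo, hi);   // (hi << 16) | lo for lanes 0..3
}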
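Finally, the non-AVX path of I64x2ExtMul uses the pshufd masks 0x50 and 0xFA to copy the wanted i32 lanes ({0,0,1,1} or {2,2,3,3}) into place, because pmuldq/pmuludq only read the even-numbered 32-bit lanes and write 64-bit products. A sketch of the signed low-half case with intrinsics, assuming SSE4.1 for _mm_mul_epi32 just as the helper scopes SSE4_1 for pmuldq:

#include <smmintrin.h>  // SSE4.1 for _mm_mul_epi32 (pmuldq)

// i64x2.extmul_low_i32x4_s: shuffle mask 0x50 selects lanes {0,0,1,1};
// pmuldq then multiplies lanes 0 and 2 into two signed 64-bit products.
// Use 0xFA ({2,2,3,3}) for the high half and _mm_mul_epu32 for unsigned.
__m128i ExtMulLowI32x4S(__m128i a, __m128i b) {
  __m128i a_dup = _mm_shuffle_epi32(a, 0x50);
  __m128i b_dup = _mm_shuffle_epi32(b, 0x50);
  return _mm_mul_epi32(a_dup, b_dup);
}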