[wasm-simd][liftoff][ia32] Move v128.select into macro-assembler

This allows us to reuse this optimized code sequence in Liftoff.

This is similar to the x64 implementation, except that the
macro-assembler function takes an additional scratch register.

Change-Id: Ieaa5899cd1be65abee1c6e0c0908a357777afcd9
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2610510
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71996}
Zhi An Ng <zhin@chromium.org>, 2021-01-06 01:32:25 +00:00, committed by Commit Bot
parent aef1be398f
commit 2aa3e64f54
7 changed files with 41 additions and 42 deletions
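
As background for the diffs below: v128.select is a bitwise blend, where each result bit is taken from the first input where the corresponding mask bit is 1, and from the second input where it is 0. A minimal scalar sketch of that identity, with one 64-bit lane standing in for the 128-bit vector (illustrative only, not code from this CL):

    #include <cstdint>

    // Bitwise select: mask bit 1 -> take the bit from src1,
    // mask bit 0 -> take the bit from src2.
    uint64_t SelectLane(uint64_t mask, uint64_t src1, uint64_t src2) {
      return (src1 & mask) | (src2 & ~mask);
    }

Every code sequence touched by this CL computes exactly this, 128 bits at a time.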

src/codegen/ia32/macro-assembler-ia32.cc

@@ -720,6 +720,24 @@ void TurboAssembler::I16x8ExtMul(XMMRegister dst, XMMRegister src1,
   }
 }
 
+void TurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
+                                XMMRegister src1, XMMRegister src2,
+                                XMMRegister scratch) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope scope(this, AVX);
+    vpandn(scratch, mask, src2);
+    vpand(dst, src1, mask);
+    vpor(dst, dst, scratch);
+  } else {
+    DCHECK_EQ(dst, mask);
+    // Use float ops as they are 1 byte shorter than int ops.
+    movaps(scratch, dst);
+    andnps(scratch, src2);
+    andps(dst, src1);
+    orps(dst, scratch);
+  }
+}
+
 void TurboAssembler::ShlPair(Register high, Register low, uint8_t shift) {
   DCHECK_GE(63, shift);
   if (shift >= 32) {
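
For readers less familiar with the SSE encodings, here is a rough standalone equivalent of the sequence above written with SSE2 intrinsics; this is an illustrative sketch, not code from the CL. It also shows why the non-AVX path needs dst == mask: the mask register is consumed in place.

    #include <emmintrin.h>  // SSE2

    __m128i S128SelectSketch(__m128i mask, __m128i src1, __m128i src2) {
      __m128i scratch = _mm_andnot_si128(mask, src2);  // ~mask & src2
      __m128i dst = _mm_and_si128(mask, src1);         // mask & src1, mask consumed
      return _mm_or_si128(dst, scratch);               // combine both halves
    }

The production code prefers the float forms (andnps/andps/orps) only because, as its comment notes, their encodings are one byte shorter; the bitwise result is identical.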

src/codegen/ia32/macro-assembler-ia32.h

@@ -621,6 +621,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
                    XMMRegister scratch, bool low, bool is_signed);
   void I16x8ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                    XMMRegister scratch, bool low, bool is_signed);
+  // Requires dst == mask when AVX is not supported.
+  void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
+                  XMMRegister src2, XMMRegister scratch);
 
   void Push(Register src) { push(src); }
   void Push(Operand src) { push(src); }
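
The new comment encodes a calling contract: without AVX, the caller must stage the mask in the destination register before the call. An illustrative call sequence (register choices invented for this example, not from the CL):

    // With AVX, dst can be any register and the inputs are left intact:
    S128Select(xmm0, xmm1, xmm2, xmm3, xmm7);
    // Without AVX, copy the mask into dst first, then pass dst as the mask:
    movaps(xmm0, xmm1);                        // dst <- mask
    S128Select(xmm0, xmm0, xmm2, xmm3, xmm7);  // dst == mask; xmm7 is scratch

This is essentially the dance the Liftoff change at the end of this CL performs.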

src/compiler/backend/ia32/code-generator-ia32.cc

@@ -6,6 +6,7 @@
 #include "src/codegen/assembler-inl.h"
 #include "src/codegen/callable.h"
 #include "src/codegen/ia32/assembler-ia32.h"
+#include "src/codegen/ia32/register-ia32.h"
 #include "src/codegen/macro-assembler.h"
 #include "src/codegen/optimized-compilation-info.h"
 #include "src/compiler/backend/code-generator-impl.h"
@@ -3835,24 +3836,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                 i.InputOperand(1));
       break;
     }
-    case kSSES128Select: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      // Mask used here is stored in dst.
-      XMMRegister dst = i.OutputSimd128Register();
-      // Use float ops as they are 1 byte shorter than int ops.
-      __ movaps(kScratchDoubleReg, i.InputSimd128Register(0));
-      __ andnps(kScratchDoubleReg, i.InputSimd128Register(2));
-      __ andps(dst, i.InputSimd128Register(1));
-      __ orps(dst, kScratchDoubleReg);
-      break;
-    }
-    case kAVXS128Select: {
-      CpuFeatureScope avx_scope(tasm(), AVX);
-      XMMRegister dst = i.OutputSimd128Register();
-      XMMRegister mask = i.InputSimd128Register(0);
-      __ vpandn(kScratchDoubleReg, mask, i.InputSimd128Register(2));
-      __ vpand(dst, i.InputSimd128Register(1), mask);
-      __ vpor(dst, dst, kScratchDoubleReg);
+    case kIA32S128Select: {
+      __ S128Select(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                    i.InputSimd128Register(1), i.InputSimd128Register(2),
+                    kScratchDoubleReg);
       break;
     }
     case kIA32S128AndNot: {

src/compiler/backend/ia32/instruction-codes-ia32.h

@@ -362,8 +362,7 @@ namespace compiler {
   V(AVXS128Or)                     \
   V(SSES128Xor)                    \
   V(AVXS128Xor)                    \
-  V(SSES128Select)                 \
-  V(AVXS128Select)                 \
+  V(IA32S128Select)                \
   V(IA32S128AndNot)                \
   V(IA32I8x16Swizzle)              \
   V(IA32I8x16Shuffle)              \

src/compiler/backend/ia32/instruction-scheduler-ia32.cc

@@ -344,8 +344,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kAVXS128Or:
     case kSSES128Xor:
     case kAVXS128Xor:
-    case kSSES128Select:
-    case kAVXS128Select:
+    case kIA32S128Select:
     case kIA32S128AndNot:
     case kIA32I8x16Swizzle:
     case kIA32I8x16Shuffle:

src/compiler/backend/ia32/instruction-selector-ia32.cc

@@ -2476,16 +2476,10 @@ void InstructionSelector::VisitS128Zero(Node* node) {
 
 void InstructionSelector::VisitS128Select(Node* node) {
   IA32OperandGenerator g(this);
-  InstructionOperand operand0 = g.UseRegister(node->InputAt(0));
-  InstructionOperand operand1 = g.UseRegister(node->InputAt(1));
-  InstructionOperand operand2 = g.UseRegister(node->InputAt(2));
-  if (IsSupported(AVX)) {
-    Emit(kAVXS128Select, g.DefineAsRegister(node), operand0, operand1,
-         operand2);
-  } else {
-    Emit(kSSES128Select, g.DefineSameAsFirst(node), operand0, operand1,
-         operand2);
-  }
+  InstructionOperand dst =
+      IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
+  Emit(kIA32S128Select, dst, g.UseRegister(node->InputAt(0)),
+       g.UseRegister(node->InputAt(1)), g.UseRegister(node->InputAt(2)));
 }
 
 void InstructionSelector::VisitS128AndNot(Node* node) {
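
Note how the operand constraint is what makes the macro-assembler's precondition hold: DefineSameAsFirst tells the register allocator to assign the output the same register as input 0, the mask, so the non-AVX path's DCHECK_EQ(dst, mask) is satisfied by construction, with no extra move needed. Condensed, the choice reads (paraphrasing the new code above):

    InstructionOperand dst =
        IsSupported(AVX) ? g.DefineAsRegister(node)    // any output register
                         : g.DefineSameAsFirst(node);  // output aliases the mask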

src/wasm/baseline/ia32/liftoff-assembler-ia32.h

@@ -3147,17 +3147,16 @@ void LiftoffAssembler::emit_s128_select(LiftoffRegister dst,
                                         LiftoffRegister src1,
                                         LiftoffRegister src2,
                                         LiftoffRegister mask) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vxorps(liftoff::kScratchDoubleReg, src1.fp(), src2.fp());
-    vandps(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, mask.fp());
-    vxorps(dst.fp(), liftoff::kScratchDoubleReg, src2.fp());
-  } else {
-    movaps(liftoff::kScratchDoubleReg, src1.fp());
-    xorps(liftoff::kScratchDoubleReg, src2.fp());
-    andps(liftoff::kScratchDoubleReg, mask.fp());
-    if (dst.fp() != src2.fp()) movaps(dst.fp(), src2.fp());
-    xorps(dst.fp(), liftoff::kScratchDoubleReg);
-  }
+  // Ensure that we don't overwrite any inputs with the movdqu below.
+  DCHECK_NE(dst, src1);
+  DCHECK_NE(dst, src2);
+  if (!CpuFeatures::IsSupported(AVX) && dst != mask) {
+    movdqu(dst.fp(), mask.fp());
+    S128Select(dst.fp(), dst.fp(), src1.fp(), src2.fp(),
+               liftoff::kScratchDoubleReg);
+  } else {
+    S128Select(dst.fp(), mask.fp(), src1.fp(), src2.fp(),
+               liftoff::kScratchDoubleReg);
+  }
 }
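
Unlike TurboFan, Liftoff cannot ask a register allocator to pre-constrain dst == mask, so the precondition is established dynamically with the movdqu copy; the two DCHECK_NEs guarantee that the copy cannot clobber an input. A small self-contained model of the hazard those checks rule out, again with a 32-bit lane standing in for an XMM register (illustrative only):

    #include <cassert>
    #include <cstdint>

    uint32_t Select(uint32_t mask, uint32_t s1, uint32_t s2) {
      return (s1 & mask) | (s2 & ~mask);
    }

    int main() {
      uint32_t mask = 0xFFFF0000u, src1 = 0x11111111u, src2 = 0x22222222u;
      // Correct: dst is distinct from the inputs, so copying the mask into
      // dst first is safe.
      uint32_t dst = mask;  // models movdqu(dst, mask)
      assert(Select(dst, src1, src2) == 0x11112222u);
      // Hazard: if dst aliased src1, the mask copy would destroy src1 before
      // the select reads it, yielding the wrong result.
      uint32_t aliased_src1 = mask;  // src1's value is already gone
      assert(Select(mask, aliased_src1, src2) == 0xFFFF2222u);  // not 0x11112222
    }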