[wasm-simd][x64] Bitmask instructions
Implement i8x16.bitmask, i16x8.bitmask, i32x4.bitmask on x64.

Bug: v8:10308
Change-Id: Id47cb229de77d80d0a7ec91f4862a91258ff1979
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2127317
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/master@{#67022}
parent dfdf66cbe8
commit 043ac205ec
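For orientation (commentary, not part of the commit): each bitmask instruction collapses a 128-bit vector into a scalar i32 by gathering the top (sign) bit of every lane. A minimal C++ sketch of the intended semantics, written against plain arrays rather than V8's internal types:

#include <cstdint>

// i8x16.bitmask: bit i of the result is the most significant bit of lane i.
uint32_t I8x16BitMask(const int8_t lanes[16]) {
  uint32_t mask = 0;
  for (int i = 0; i < 16; ++i) {
    if (lanes[i] < 0) mask |= uint32_t{1} << i;
  }
  return mask;
}

// i16x8.bitmask and i32x4.bitmask are identical apart from lane count/width.
uint32_t I16x8BitMask(const int16_t lanes[8]) {
  uint32_t mask = 0;
  for (int i = 0; i < 8; ++i) {
    if (lanes[i] < 0) mask |= uint32_t{1} << i;
  }
  return mask;
}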
src/codegen/x64/assembler-x64.cc:

@@ -3441,6 +3441,15 @@ void Assembler::movmskps(Register dst, XMMRegister src) {
   emit_sse_operand(dst, src);
 }
 
+void Assembler::pmovmskb(Register dst, XMMRegister src) {
+  EnsureSpace ensure_space(this);
+  emit_optional_rex_32(dst, src);
+  emit(0x66);
+  emit(0x0F);
+  emit(0xD7);
+  emit_sse_operand(dst, src);
+}
+
 // AVX instructions
 
 void Assembler::vmovddup(XMMRegister dst, XMMRegister src) {
@@ -3634,6 +3643,15 @@ void Assembler::vucomiss(XMMRegister dst, Operand src) {
   emit_sse_operand(dst, src);
 }
 
+void Assembler::vpmovmskb(Register dst, XMMRegister src) {
+  XMMRegister idst = XMMRegister::from_code(dst.code());
+  DCHECK(IsEnabled(AVX));
+  EnsureSpace ensure_space(this);
+  emit_vex_prefix(idst, xmm0, src, kL128, k66, k0F, kWIG);
+  emit(0xD7);
+  emit_sse_operand(idst, src);
+}
+
 void Assembler::vss(byte op, XMMRegister dst, XMMRegister src1,
                     XMMRegister src2) {
   DCHECK(IsEnabled(AVX));
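Commentary: the SSE2 form hand-emits the 66 0F D7 encoding (PMOVMSKB), and the AVX form wraps the same 0xD7 opcode in a VEX prefix. The idst re-typing exists only because emit_vex_prefix takes XMM operands, so the general-purpose destination is recast by register code. At the intrinsics level the instruction behaves like this (an illustration, not V8 code):

#include <immintrin.h>
#include <cstdint>

// PMOVMSKB (66 0F D7): gather the top bit of each of the 16 bytes of an
// XMM register into the low 16 bits of a general-purpose register.
uint32_t ByteSignMask(__m128i v) {
  return static_cast<uint32_t>(_mm_movemask_epi8(v));
}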
src/codegen/x64/assembler-x64.h:

@@ -1124,6 +1124,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
 
   void movmskpd(Register dst, XMMRegister src);
 
+  void pmovmskb(Register dst, XMMRegister src);
+
   // SSE 4.1 instruction
   void insertps(XMMRegister dst, XMMRegister src, byte imm8);
   void insertps(XMMRegister dst, Operand src, byte imm8);
@@ -1393,6 +1395,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
     XMMRegister idst = XMMRegister::from_code(dst.code());
     vpd(0x50, idst, xmm0, src);
   }
+  void vpmovmskb(Register dst, XMMRegister src);
   void vcmpps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int8_t cmp) {
     vps(0xC2, dst, src1, src2);
     emit(cmp);
src/codegen/x64/macro-assembler-x64.h:

@@ -141,6 +141,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   AVX_OP(Movups, movups)
   AVX_OP(Movmskps, movmskps)
   AVX_OP(Movmskpd, movmskpd)
+  AVX_OP(Pmovmskb, pmovmskb)
   AVX_OP(Movss, movss)
   AVX_OP(Movsd, movsd)
   AVX_OP(Movdqu, movdqu)
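Commentary: AVX_OP generates the capitalized macro-assembler entry point (here Pmovmskb) that selects the VEX-encoded vpmovmskb when AVX is enabled and falls back to the SSE pmovmskb otherwise. Roughly, and simplified from what the real macro machinery expands to, the generated wrapper behaves like this hypothetical sketch (not compilable outside V8):

// Hypothetical expansion sketch; the real AVX_OP helper in V8 is generic
// over operand types.
void Pmovmskb(Register dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vpmovmskb(dst, src);  // VEX-encoded form
  } else {
    pmovmskb(dst, src);   // legacy SSE2 encoding
  }
}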
@ -2634,11 +2634,15 @@ void InstructionSelector::VisitI64x2MinU(Node* node) { UNIMPLEMENTED(); }
|
||||
void InstructionSelector::VisitI64x2MaxU(Node* node) { UNIMPLEMENTED(); }
|
||||
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_S390X
|
||||
|
||||
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_IA32
|
||||
// TODO(v8:10308) Bitmask operations are in prototype now, we can remove these
|
||||
// guards when they go into the proposal.
|
||||
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_IA32 && \
|
||||
!V8_TARGET_ARCH_X64
|
||||
void InstructionSelector::VisitI8x16BitMask(Node* node) { UNIMPLEMENTED(); }
|
||||
void InstructionSelector::VisitI16x8BitMask(Node* node) { UNIMPLEMENTED(); }
|
||||
void InstructionSelector::VisitI32x4BitMask(Node* node) { UNIMPLEMENTED(); }
|
||||
#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_IA32
|
||||
// && !V8_TARGET_ARCH_X64
|
||||
|
||||
void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }
|
||||
|
||||
|
src/compiler/backend/x64/code-generator-x64.cc:

@@ -3085,6 +3085,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ Pabsd(i.OutputSimd128Register(), i.InputSimd128Register(0));
       break;
     }
+    case kX64I32x4BitMask: {
+      __ Movmskps(i.OutputRegister(), i.InputSimd128Register(0));
+      break;
+    }
     case kX64S128Zero: {
       XMMRegister dst = i.OutputSimd128Register();
       __ Xorps(dst, dst);
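Commentary: i32x4.bitmask needs no new instruction at all. MOVMSKPS is nominally a float instruction, but it simply copies the four 32-bit sign bits into bits 0-3 of a general-purpose register, which is exactly the bitmask semantics. An intrinsics-level equivalent (illustrative only):

#include <immintrin.h>
#include <cstdint>

// i32x4.bitmask: MOVMSKPS reads only the sign bit of each 32-bit lane,
// so it works equally well on integer data after a bit-cast.
uint32_t I32x4BitMask(__m128i v) {
  return static_cast<uint32_t>(_mm_movemask_ps(_mm_castsi128_ps(v)));
}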
@@ -3273,6 +3277,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ Pabsw(i.OutputSimd128Register(), i.InputSimd128Register(0));
       break;
     }
+    case kX64I16x8BitMask: {
+      Register dst = i.OutputRegister();
+      XMMRegister tmp = i.TempSimd128Register(0);
+      __ Packsswb(tmp, i.InputSimd128Register(0));
+      __ Pmovmskb(dst, tmp);
+      __ shrq(dst, Immediate(8));
+      break;
+    }
     case kX64I8x16Splat: {
       XMMRegister dst = i.OutputSimd128Register();
       if (HasRegisterInput(instr, 0)) {
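Commentary: x64 has no 16-bit movemask, so i16x8.bitmask is synthesized. PACKSSWB narrows words to bytes with signed saturation, which preserves every sign bit; PMOVMSKB then yields a 16-bit byte mask. Because packsswb dst, src packs dst's own words into the low eight bytes and src's words into the high eight, the wanted bits land in bits 8..15, hence the final shift right by 8. An intrinsics sketch of the same trick, where a zeroed low half stands in for the scratch register's don't-care contents:

#include <immintrin.h>
#include <cstdint>

// i16x8.bitmask via PACKSSWB + PMOVMSKB + SHR, mirroring the sequence above.
uint32_t I16x8BitMask(__m128i v) {
  // Low 8 result bytes come from the first operand (don't-care here); the
  // high 8 bytes are v's words saturated to bytes, sign bits intact.
  __m128i packed = _mm_packs_epi16(_mm_setzero_si128(), v);
  uint32_t mask16 = static_cast<uint32_t>(_mm_movemask_epi8(packed));
  return mask16 >> 8;  // keep only the bits derived from v's eight words
}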
@@ -3542,6 +3554,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ Pabsb(i.OutputSimd128Register(), i.InputSimd128Register(0));
       break;
     }
+    case kX64I8x16BitMask: {
+      __ Pmovmskb(i.OutputRegister(), i.InputSimd128Register(0));
+      break;
+    }
     case kX64S128And: {
       __ Pand(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
src/compiler/backend/x64/instruction-codes-x64.h:

@@ -241,6 +241,7 @@ namespace compiler {
   V(X64I32x4GtU)               \
   V(X64I32x4GeU)               \
   V(X64I32x4Abs)               \
+  V(X64I32x4BitMask)           \
   V(X64I16x8Splat)             \
   V(X64I16x8ExtractLaneU)      \
   V(X64I16x8ExtractLaneS)      \
@@ -275,6 +276,7 @@ namespace compiler {
   V(X64I16x8GeU)               \
   V(X64I16x8RoundingAverageU)  \
   V(X64I16x8Abs)               \
+  V(X64I16x8BitMask)           \
   V(X64I8x16Splat)             \
   V(X64I8x16ExtractLaneU)      \
   V(X64I8x16ExtractLaneS)      \
@@ -304,6 +306,7 @@ namespace compiler {
   V(X64I8x16GeU)               \
   V(X64I8x16RoundingAverageU)  \
   V(X64I8x16Abs)               \
+  V(X64I8x16BitMask)           \
   V(X64S128Zero)               \
   V(X64S128Not)                \
   V(X64S128And)                \
src/compiler/backend/x64/instruction-scheduler-x64.cc:

@@ -213,6 +213,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I32x4GtU:
     case kX64I32x4GeU:
     case kX64I32x4Abs:
+    case kX64I32x4BitMask:
    case kX64I16x8Splat:
    case kX64I16x8ExtractLaneU:
    case kX64I16x8ExtractLaneS:
@@ -247,6 +248,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64I16x8GeU:
    case kX64I16x8RoundingAverageU:
    case kX64I16x8Abs:
+    case kX64I16x8BitMask:
    case kX64I8x16Splat:
    case kX64I8x16ExtractLaneU:
    case kX64I8x16ExtractLaneS:
@@ -276,6 +278,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kX64I8x16GeU:
    case kX64I8x16RoundingAverageU:
    case kX64I8x16Abs:
+    case kX64I8x16BitMask:
    case kX64S128And:
    case kX64S128Or:
    case kX64S128Xor:
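Commentary: the two lists above are mechanical registrations. instruction-codes-x64.h declares the new architecture-specific opcodes, and adding them to the existing case list in the instruction scheduler gives them the same scheduling treatment as the neighboring pure SIMD operations they sit beside.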
src/compiler/backend/x64/instruction-selector-x64.cc:

@@ -2730,6 +2730,7 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I32x4UConvertI16x8Low)   \
   V(I32x4UConvertI16x8High)  \
   V(I32x4Abs)                \
+  V(I32x4BitMask)            \
   V(I16x8SConvertI8x16Low)   \
   V(I16x8SConvertI8x16High)  \
   V(I16x8Neg)                \
@@ -2738,6 +2739,7 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I16x8Abs)                \
   V(I8x16Neg)                \
   V(I8x16Abs)                \
+  V(I8x16BitMask)            \
   V(S128Not)
 
 #define SIMD_SHIFT_OPCODES(V) \
@@ -3033,6 +3035,13 @@ void InstructionSelector::VisitI16x8UConvertI32x4(Node* node) {
        g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
 }
 
+void InstructionSelector::VisitI16x8BitMask(Node* node) {
+  X64OperandGenerator g(this);
+  InstructionOperand temps[] = {g.TempSimd128Register()};
+  Emit(kX64I16x8BitMask, g.DefineAsRegister(node),
+       g.UseUniqueRegister(node->InputAt(0)), arraysize(temps), temps);
+}
+
 void InstructionSelector::VisitI8x16UConvertI16x8(Node* node) {
   X64OperandGenerator g(this);
   Emit(kX64I8x16UConvertI16x8, g.DefineSameAsFirst(node),
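Commentary: only i16x8.bitmask gets a dedicated visitor because its lowering clobbers a scratch XMM register. The selector requests a TempSimd128Register and pins the input with UseUniqueRegister so the temp cannot share a register with the input, which might otherwise be destroyed while it still has other uses. The i8x16 and i32x4 variants are single-instruction unops and go through the generic SIMD_UNOP_OPCODES table instead.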
src/diagnostics/x64/disasm-x64.cc:

@@ -1490,6 +1490,10 @@ int DisassemblerX64::AVXInstruction(byte* data) {
       current += PrintRightXMMOperand(current);
       AppendToBuffer(",0x%x", *current++);
       break;
+    case 0xD7:
+      AppendToBuffer("vpmovmskb %s,", NameOfCPURegister(regop));
+      current += PrintRightXMMOperand(current);
+      break;
 #define DECLARE_SSE_AVX_DIS_CASE(instruction, notUsed1, notUsed2, opcode) \
   case 0x##opcode: {                                                      \
     AppendToBuffer("v" #instruction " %s,%s,", NameOfXMMRegister(regop),  \
@@ -2124,7 +2128,10 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
     } else {
       UnimplementedInstruction();
     }
-    AppendToBuffer("%s %s,", mnemonic, NameOfXMMRegister(regop));
+    // Not every opcode here has an XMM register as the dst operand.
+    const char* regop_reg = opcode == 0xD7 ? NameOfCPURegister(regop)
+                                           : NameOfXMMRegister(regop);
+    AppendToBuffer("%s %s,", mnemonic, regop_reg);
     current += PrintRightXMMOperand(current);
     if (opcode == 0xC2) {
       const char* const pseudo_op[] = {"eq", "lt", "le", "unord",
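Commentary: the disassembler changes mirror the encoder. In the two-byte-opcode path, 0xD7 is special-cased because pmovmskb is one of the few instructions there whose destination is a general-purpose register rather than an XMM register; without the regop_reg distinction the output would name an XMM register where, say, rdx was meant.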
test/cctest/test-disasm-x64.cc:

@@ -435,6 +435,8 @@ TEST(DisasmX64) {
 
   __ ucomisd(xmm0, xmm1);
 
+  __ pmovmskb(rdx, xmm9);
+
   __ pcmpeqd(xmm1, xmm0);
 
   __ punpckldq(xmm1, xmm11);
@@ -650,6 +652,7 @@ TEST(DisasmX64) {
   __ vmovupd(xmm0, Operand(rbx, rcx, times_4, 10000));
   __ vmovupd(Operand(rbx, rcx, times_4, 10000), xmm0);
   __ vmovmskpd(r9, xmm4);
+  __ vpmovmskb(r10, xmm9);
 
   __ vmovups(xmm5, xmm1);
   __ vmovups(xmm5, Operand(rdx, 4));
test/cctest/wasm/test-run-wasm-simd.cc:

@@ -1661,7 +1661,8 @@ WASM_SIMD_TEST(I16x8ReplaceLane) {
   }
 }
 
-#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_IA32
+#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_IA32 || \
+    V8_TARGET_ARCH_X64
 WASM_SIMD_TEST_NO_LOWERING(I8x16BitMask) {
   FLAG_SCOPE(wasm_simd_post_mvp);
   WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
@@ -1721,7 +1722,8 @@ WASM_SIMD_TEST_NO_LOWERING(I32x4BitMask) {
     CHECK_EQ(actual, expected);
   }
 }
-#endif  // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_IA32
+#endif  // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_IA32 ||
+        // V8_TARGET_ARCH_X64
 
 WASM_SIMD_TEST(I8x16Splat) {
   WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);