[wasm-simd][x64] Bitmask instructions

Implement i8x16.bitmask, i16x8.bitmask, i32x4.bitmask on x64.

Bug: v8:10308
Change-Id: Id47cb229de77d80d0a7ec91f4862a91258ff1979
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2127317
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/master@{#67022}
Author: Ng Zhi An <zhin@chromium.org>
Date: 2020-04-06 09:42:49 -07:00
Committed by: Commit Bot
parent dfdf66cbe8
commit 043ac205ec
11 changed files with 73 additions and 4 deletions
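
For context: each bitmask operation gathers the top (sign) bit of every lane into the low bits of a scalar i32, one bit per lane. A minimal scalar sketch of the i8x16.bitmask semantics (illustrative only, not code from this CL):

#include <cstdint>

// Reference semantics: bit i of the result is the sign bit of lane i.
uint32_t I8x16BitMaskRef(const int8_t lanes[16]) {
  uint32_t mask = 0;
  for (int i = 0; i < 16; ++i) {
    if (lanes[i] < 0) mask |= uint32_t{1} << i;
  }
  return mask;
}

The 16x8 and 32x4 variants are analogous, producing 8 and 4 result bits respectively.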


@@ -3441,6 +3441,15 @@ void Assembler::movmskps(Register dst, XMMRegister src) {
emit_sse_operand(dst, src);
}
void Assembler::pmovmskb(Register dst, XMMRegister src) {
EnsureSpace ensure_space(this);
emit(0x66);
emit_optional_rex_32(dst, src);
emit(0x0F);
emit(0xD7);
emit_sse_operand(dst, src);
}
// AVX instructions
void Assembler::vmovddup(XMMRegister dst, XMMRegister src) {
@@ -3634,6 +3643,15 @@ void Assembler::vucomiss(XMMRegister dst, Operand src) {
emit_sse_operand(dst, src);
}
void Assembler::vpmovmskb(Register dst, XMMRegister src) {
XMMRegister idst = XMMRegister::from_code(dst.code());
DCHECK(IsEnabled(AVX));
EnsureSpace ensure_space(this);
emit_vex_prefix(idst, xmm0, src, kL128, k66, k0F, kWIG);
emit(0xD7);
emit_sse_operand(idst, src);
}
void Assembler::vss(byte op, XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
DCHECK(IsEnabled(AVX));

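Both forms encode the same operation as the SSE2 intrinsic _mm_movemask_epi8, which is a convenient way to sanity-check expected results outside V8 (standard intrinsics, nothing from this CL):

#include <emmintrin.h>  // SSE2
#include <cstdint>

// pmovmskb/vpmovmskb: one result bit per byte lane, taken from the lane's MSB.
uint32_t ByteMask(__m128i v) {
  return static_cast<uint32_t>(_mm_movemask_epi8(v));  // 66 0F D7 /r
}
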

@@ -1124,6 +1124,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void movmskpd(Register dst, XMMRegister src);
void pmovmskb(Register dst, XMMRegister src);
// SSE 4.1 instruction
void insertps(XMMRegister dst, XMMRegister src, byte imm8);
void insertps(XMMRegister dst, Operand src, byte imm8);
@@ -1393,6 +1395,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
XMMRegister idst = XMMRegister::from_code(dst.code());
vpd(0x50, idst, xmm0, src);
}
void vpmovmskb(Register dst, XMMRegister src);
void vcmpps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int8_t cmp) {
vps(0xC2, dst, src1, src2);
emit(cmp);


@@ -141,6 +141,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Movups, movups)
AVX_OP(Movmskps, movmskps)
AVX_OP(Movmskpd, movmskpd)
AVX_OP(Pmovmskb, pmovmskb)
AVX_OP(Movss, movss)
AVX_OP(Movsd, movsd)
AVX_OP(Movdqu, movdqu)

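The AVX_OP entry gives TurboAssembler a Pmovmskb macro instruction that prefers the VEX form when AVX is available. In spirit, the generated wrapper behaves like this simplified sketch (not the literal macro expansion):

void Pmovmskb(Register dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vpmovmskb(dst, src);  // VEX-encoded AVX form
  } else {
    pmovmskb(dst, src);   // SSE2 form
  }
}
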

@@ -2634,11 +2634,15 @@ void InstructionSelector::VisitI64x2MinU(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MaxU(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_S390X
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_IA32
// TODO(v8:10308) Bitmask operations are in prototype now, we can remove these
// guards when they go into the proposal.
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_IA32 && \
!V8_TARGET_ARCH_X64
void InstructionSelector::VisitI8x16BitMask(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI16x8BitMask(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI32x4BitMask(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_IA32
// && !V8_TARGET_ARCH_X64
void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }


@@ -3085,6 +3085,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pabsd(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kX64I32x4BitMask: {
__ Movmskps(i.OutputRegister(), i.InputSimd128Register(0));
break;
}
case kX64S128Zero: {
XMMRegister dst = i.OutputSimd128Register();
__ Xorps(dst, dst);
@@ -3273,6 +3277,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pabsw(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kX64I16x8BitMask: {
Register dst = i.OutputRegister();
XMMRegister tmp = i.TempSimd128Register(0);
__ Packsswb(tmp, i.InputSimd128Register(0));
__ Pmovmskb(dst, tmp);
__ shrq(dst, Immediate(8));
break;
}
case kX64I8x16Splat: {
XMMRegister dst = i.OutputSimd128Register();
if (HasRegisterInput(instr, 0)) {
@@ -3542,6 +3554,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pabsb(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kX64I8x16BitMask: {
__ Pmovmskb(i.OutputRegister(), i.InputSimd128Register(0));
break;
}
case kX64S128And: {
__ Pand(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;

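All three lowerings can be cross-checked against standard SSE/SSE2 intrinsics. A sketch of the equivalences (plain intrinsics, not V8 code):

#include <emmintrin.h>  // SSE2
#include <xmmintrin.h>  // SSE

// kX64I8x16BitMask: pmovmskb reads the 16 byte-lane sign bits directly.
int I8x16BitMask(__m128i v) { return _mm_movemask_epi8(v); }

// kX64I16x8BitMask: packsswb saturates the 8 words of v into the high 8
// bytes of tmp while preserving signs, so bits 8..15 of the byte mask are
// the word sign bits; the shift by 8 discards the low (don't-care) half.
int I16x8BitMask(__m128i v) {
  __m128i tmp = _mm_packs_epi16(_mm_setzero_si128(), v);
  return _mm_movemask_epi8(tmp) >> 8;
}

// kX64I32x4BitMask: movmskps reads the four 32-bit lane MSBs.
int I32x4BitMask(__m128i v) {
  return _mm_movemask_ps(_mm_castsi128_ps(v));
}
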

@@ -241,6 +241,7 @@ namespace compiler {
V(X64I32x4GtU) \
V(X64I32x4GeU) \
V(X64I32x4Abs) \
V(X64I32x4BitMask) \
V(X64I16x8Splat) \
V(X64I16x8ExtractLaneU) \
V(X64I16x8ExtractLaneS) \
@@ -275,6 +276,7 @@ namespace compiler {
V(X64I16x8GeU) \
V(X64I16x8RoundingAverageU) \
V(X64I16x8Abs) \
V(X64I16x8BitMask) \
V(X64I8x16Splat) \
V(X64I8x16ExtractLaneU) \
V(X64I8x16ExtractLaneS) \
@@ -304,6 +306,7 @@ namespace compiler {
V(X64I8x16GeU) \
V(X64I8x16RoundingAverageU) \
V(X64I8x16Abs) \
V(X64I8x16BitMask) \
V(X64S128Zero) \
V(X64S128Not) \
V(X64S128And) \


@@ -213,6 +213,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I32x4GtU:
case kX64I32x4GeU:
case kX64I32x4Abs:
case kX64I32x4BitMask:
case kX64I16x8Splat:
case kX64I16x8ExtractLaneU:
case kX64I16x8ExtractLaneS:
@@ -247,6 +248,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I16x8GeU:
case kX64I16x8RoundingAverageU:
case kX64I16x8Abs:
case kX64I16x8BitMask:
case kX64I8x16Splat:
case kX64I8x16ExtractLaneU:
case kX64I8x16ExtractLaneS:
@@ -276,6 +278,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I8x16GeU:
case kX64I8x16RoundingAverageU:
case kX64I8x16Abs:
case kX64I8x16BitMask:
case kX64S128And:
case kX64S128Or:
case kX64S128Xor:


@@ -2730,6 +2730,7 @@ VISIT_ATOMIC_BINOP(Xor)
V(I32x4UConvertI16x8Low) \
V(I32x4UConvertI16x8High) \
V(I32x4Abs) \
V(I32x4BitMask) \
V(I16x8SConvertI8x16Low) \
V(I16x8SConvertI8x16High) \
V(I16x8Neg) \
@@ -2738,6 +2739,7 @@ VISIT_ATOMIC_BINOP(Xor)
V(I16x8Abs) \
V(I8x16Neg) \
V(I8x16Abs) \
V(I8x16BitMask) \
V(S128Not)
#define SIMD_SHIFT_OPCODES(V) \
@@ -3033,6 +3035,13 @@ void InstructionSelector::VisitI16x8UConvertI32x4(Node* node) {
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
}
void InstructionSelector::VisitI16x8BitMask(Node* node) {
X64OperandGenerator g(this);
InstructionOperand temps[] = {g.TempSimd128Register()};
Emit(kX64I16x8BitMask, g.DefineAsRegister(node),
g.UseUniqueRegister(node->InputAt(0)), arraysize(temps), temps);
}
void InstructionSelector::VisitI8x16UConvertI16x8(Node* node) {
X64OperandGenerator g(this);
Emit(kX64I8x16UConvertI16x8, g.DefineSameAsFirst(node),

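One note on the operand constraints in VisitI16x8BitMask (my reading of the code, not stated in the CL): the two-operand SSE form of packsswb overwrites its destination, so the pack must target a scratch register, and UseUniqueRegister keeps the allocator from assigning the input to that same scratch. Roughly:

// If the temp aliased the input, the pack would clobber a value that may
// still be live at other uses of the input node:
//   packsswb xmm_in, xmm_in    // input destroyed
// With a distinct temp the input survives:
//   packsswb xmm_tmp, xmm_in   // tmp <- pack(tmp, in); in unchanged
//   pmovmskb r_dst,  xmm_tmp
//   shr      r_dst,  8
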

@@ -1490,6 +1490,10 @@ int DisassemblerX64::AVXInstruction(byte* data) {
current += PrintRightXMMOperand(current);
AppendToBuffer(",0x%x", *current++);
break;
case 0xD7:
AppendToBuffer("vpmovmskb %s,", NameOfCPURegister(regop));
current += PrintRightXMMOperand(current);
break;
#define DECLARE_SSE_AVX_DIS_CASE(instruction, notUsed1, notUsed2, opcode) \
case 0x##opcode: { \
AppendToBuffer("v" #instruction " %s,%s,", NameOfXMMRegister(regop), \
@@ -2124,7 +2128,10 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
} else {
UnimplementedInstruction();
}
AppendToBuffer("%s %s,", mnemonic, NameOfXMMRegister(regop));
// Not every opcode here has an XMM register as the dst operand.
const char* regop_reg = opcode == 0xD7 ? NameOfCPURegister(regop)
: NameOfXMMRegister(regop);
AppendToBuffer("%s %s,", mnemonic, regop_reg);
current += PrintRightXMMOperand(current);
if (opcode == 0xC2) {
const char* const pseudo_op[] = {"eq", "lt", "le", "unord",

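With these changes the disassembler renders opcode 0xD7 with a general-purpose destination register. For the instructions added to the test below, the expected output lines would look like the following (inferred from the AppendToBuffer format strings, not copied from the test expectations):

pmovmskb rdx,xmm9
vpmovmskb r10,xmm9
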

@@ -435,6 +435,8 @@ TEST(DisasmX64) {
__ ucomisd(xmm0, xmm1);
__ pmovmskb(rdx, xmm9);
__ pcmpeqd(xmm1, xmm0);
__ punpckldq(xmm1, xmm11);
@@ -650,6 +652,7 @@ TEST(DisasmX64) {
__ vmovupd(xmm0, Operand(rbx, rcx, times_4, 10000));
__ vmovupd(Operand(rbx, rcx, times_4, 10000), xmm0);
__ vmovmskpd(r9, xmm4);
__ vpmovmskb(r10, xmm9);
__ vmovups(xmm5, xmm1);
__ vmovups(xmm5, Operand(rdx, 4));


@@ -1661,7 +1661,8 @@ WASM_SIMD_TEST(I16x8ReplaceLane) {
}
}
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_IA32
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_IA32 || \
V8_TARGET_ARCH_X64
WASM_SIMD_TEST_NO_LOWERING(I8x16BitMask) {
FLAG_SCOPE(wasm_simd_post_mvp);
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
@@ -1721,7 +1722,8 @@ WASM_SIMD_TEST_NO_LOWERING(I32x4BitMask) {
CHECK_EQ(actual, expected);
}
}
#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_IA32
#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_IA32 ||
// V8_TARGET_ARCH_X64
WASM_SIMD_TEST(I8x16Splat) {
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);