Reland "[wasm-simd][ia32] Prototype store lane"

This is a reland of a69b7ef2ff

Original change's description:
> [wasm-simd][ia32] Prototype store lane
>
> Prototype v128.store{8,16,32,64}_lane on IA32.
>
> Drive by fix for wrong disassembly of movlps.
>
> Also added more test cases for StoreLane, covering more alignments and offsets.
>
> Bug: v8:10975
> Change-Id: I0e16f1b5be824b6fc818d02d0fd84ebc0dff4174
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2557068
> Commit-Queue: Zhi An Ng <zhin@chromium.org>
> Reviewed-by: Bill Budge <bbudge@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#71511}

Bug: v8:10975
Change-Id: I2c9b219b9ab9d78a83d1bf32ad1271d717471c19
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2567317
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71547}
This commit is contained in:
Zhi An Ng 2020-12-01 02:53:08 +00:00 committed by Commit Bot
parent 83d289b87f
commit 257b266ebf
11 changed files with 238 additions and 24 deletions

View File

@ -2421,6 +2421,13 @@ void Assembler::movlps(XMMRegister dst, Operand src) {
emit_sse_operand(dst, src);
}
// Store form of movlps: the destination is a memory operand and the source is
// an XMM register. Emits the two opcode bytes 0x0F 0x13 followed by the
// operand encoding.
void Assembler::movlps(Operand dst, XMMRegister src) {
EnsureSpace ensure_space(this);
EMIT(0x0F);
EMIT(0x13);
// Note the argument order: the XMM register is passed first, then the memory
// operand.
emit_sse_operand(src, dst);
}
void Assembler::movhps(XMMRegister dst, Operand src) {
EnsureSpace ensure_space(this);
EMIT(0x0F);
@ -2428,6 +2435,13 @@ void Assembler::movhps(XMMRegister dst, Operand src) {
emit_sse_operand(dst, src);
}
// Store form of movhps: the destination is a memory operand and the source is
// an XMM register. Emits the two opcode bytes 0x0F 0x17 followed by the
// operand encoding.
void Assembler::movhps(Operand dst, XMMRegister src) {
EnsureSpace ensure_space(this);
EMIT(0x0F);
EMIT(0x17);
// Note the argument order: the XMM register is passed first, then the memory
// operand.
emit_sse_operand(src, dst);
}
void Assembler::movdqa(Operand dst, XMMRegister src) {
EnsureSpace ensure_space(this);
EMIT(0x66);
@ -2518,6 +2532,18 @@ void Assembler::movd(Operand dst, XMMRegister src) {
emit_sse_operand(src, dst);
}
// extractps with a memory destination. SSE4_1 must be enabled on this
// assembler and imm8 must fit in 8 bits (both are DCHECKed).
// Emits 0x66 0x0F 0x3A 0x17, the operand encoding, then the immediate byte.
void Assembler::extractps(Operand dst, XMMRegister src, byte imm8) {
DCHECK(IsEnabled(SSE4_1));
DCHECK(is_uint8(imm8));
EnsureSpace ensure_space(this);
EMIT(0x66);
EMIT(0x0F);
EMIT(0x3A);
EMIT(0x17);
// The XMM source is passed first, then the memory destination.
emit_sse_operand(src, dst);
// The immediate is the last byte of the instruction.
EMIT(imm8);
}
void Assembler::extractps(Register dst, XMMRegister src, byte imm8) {
DCHECK(IsEnabled(SSE4_1));
DCHECK(is_uint8(imm8));
@ -2853,10 +2879,18 @@ void Assembler::vmovlps(XMMRegister dst, XMMRegister src1, Operand src2) {
vinstr(0x12, dst, src1, src2, kNone, k0F, kWIG);
}
// AVX counterpart of the movlps store form (opcode 0x13, memory destination).
// xmm0 is passed as the middle register argument of vinstr; presumably it is a
// placeholder because this form has no second source register -- confirm
// against vinstr.
void Assembler::vmovlps(Operand dst, XMMRegister src) {
vinstr(0x13, src, xmm0, dst, kNone, k0F, kWIG);
}
void Assembler::vmovhps(XMMRegister dst, XMMRegister src1, Operand src2) {
vinstr(0x16, dst, src1, src2, kNone, k0F, kWIG);
}
// AVX counterpart of the movhps store form (opcode 0x17, memory destination).
// xmm0 is passed as the middle register argument of vinstr; presumably it is a
// placeholder because this form has no second source register -- confirm
// against vinstr.
void Assembler::vmovhps(Operand dst, XMMRegister src) {
vinstr(0x17, src, xmm0, dst, kNone, k0F, kWIG);
}
void Assembler::vcmpps(XMMRegister dst, XMMRegister src1, Operand src2,
uint8_t cmp) {
vps(0xC2, dst, src1, src2);
@ -3023,6 +3057,11 @@ void Assembler::vpmovmskb(Register dst, XMMRegister src) {
emit_sse_operand(dst, src);
}
// AVX form of extractps with a memory destination: opcode 0x17 with the k66
// prefix in the k0F3A opcode map, followed by the immediate byte.
void Assembler::vextractps(Operand dst, XMMRegister src, byte imm8) {
vinstr(0x17, src, xmm0, dst, k66, k0F3A, VexW::kWIG);
// The immediate trails the bytes emitted by vinstr.
EMIT(imm8);
}
void Assembler::bmi1(byte op, Register reg, Register vreg, Operand rm) {
DCHECK(IsEnabled(BMI1));
EnsureSpace ensure_space(this);

View File

@ -862,7 +862,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void shufpd(XMMRegister dst, XMMRegister src, byte imm8);
void movlps(XMMRegister dst, Operand src);
void movlps(Operand dst, XMMRegister src);
void movhps(XMMRegister dst, Operand src);
void movhps(Operand dst, XMMRegister src);
void maxss(XMMRegister dst, XMMRegister src) { maxss(dst, Operand(src)); }
void maxss(XMMRegister dst, Operand src);
@ -1004,6 +1006,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void movss(XMMRegister dst, Operand src);
void movss(Operand dst, XMMRegister src);
void movss(XMMRegister dst, XMMRegister src) { movss(dst, Operand(src)); }
void extractps(Operand dst, XMMRegister src, byte imm8);
void extractps(Register dst, XMMRegister src, byte imm8);
void psllw(XMMRegister reg, uint8_t shift);
@ -1355,6 +1359,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vmovsd(XMMRegister dst, Operand src) {
vinstr(0x10, dst, xmm0, src, kF2, k0F, kWIG);
}
void vextractps(Operand dst, XMMRegister src, byte imm8);
void vmovaps(XMMRegister dst, XMMRegister src) { vmovaps(dst, Operand(src)); }
void vmovaps(XMMRegister dst, Operand src) { vps(0x28, dst, xmm0, src); }
void vmovapd(XMMRegister dst, XMMRegister src) { vmovapd(dst, Operand(src)); }
@ -1373,7 +1380,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vshufpd(XMMRegister dst, XMMRegister src1, Operand src2, byte imm8);
void vmovlps(XMMRegister dst, XMMRegister src1, Operand src2);
void vmovlps(Operand dst, XMMRegister src);
void vmovhps(XMMRegister dst, XMMRegister src1, Operand src2);
void vmovhps(Operand dst, XMMRegister src);
void vpsllw(XMMRegister dst, XMMRegister src, uint8_t imm8);
void vpslld(XMMRegister dst, XMMRegister src, uint8_t imm8);

View File

@ -1648,6 +1648,18 @@ void TurboAssembler::Palignr(XMMRegister dst, Operand src, uint8_t imm8) {
FATAL("no AVX or SSE3 support");
}
// Emits a pextrb with a memory destination. Uses the AVX encoding (vpextrb)
// when AVX is supported; otherwise emits pextrb under an SSE4_1 feature scope.
void TurboAssembler::Pextrb(Operand dst, XMMRegister src, uint8_t imm8) {
  if (!CpuFeatures::IsSupported(AVX)) {
    // Non-AVX path: SSE4_1 is expected to be available here.
    DCHECK(CpuFeatures::IsSupported(SSE4_1));
    CpuFeatureScope sse41_scope(this, SSE4_1);
    pextrb(dst, src, imm8);
    return;
  }
  CpuFeatureScope avx_scope(this, AVX);
  vpextrb(dst, src, imm8);
}
void TurboAssembler::Pextrb(Register dst, XMMRegister src, uint8_t imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
@ -1662,6 +1674,18 @@ void TurboAssembler::Pextrb(Register dst, XMMRegister src, uint8_t imm8) {
FATAL("no AVX or SSE4.1 support");
}
// Emits a pextrw with a memory destination. Uses the AVX encoding (vpextrw)
// when AVX is supported; otherwise emits pextrw under an SSE4_1 feature scope.
void TurboAssembler::Pextrw(Operand dst, XMMRegister src, uint8_t imm8) {
  if (!CpuFeatures::IsSupported(AVX)) {
    // Non-AVX path: SSE4_1 is expected to be available here.
    DCHECK(CpuFeatures::IsSupported(SSE4_1));
    CpuFeatureScope sse41_scope(this, SSE4_1);
    pextrw(dst, src, imm8);
    return;
  }
  CpuFeatureScope avx_scope(this, AVX);
  vpextrw(dst, src, imm8);
}
void TurboAssembler::Pextrw(Register dst, XMMRegister src, uint8_t imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
@ -1766,6 +1790,17 @@ void TurboAssembler::Vbroadcastss(XMMRegister dst, Operand src) {
shufps(dst, dst, static_cast<byte>(0));
}
// Emits an extractps with a memory destination. Uses the AVX encoding
// (vextractps) when AVX is supported; otherwise emits extractps under an
// SSE4_1 feature scope.
void TurboAssembler::Extractps(Operand dst, XMMRegister src, uint8_t imm8) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vextractps(dst, src, imm8);
    // Exactly one instruction must be emitted: without this return the SSE4.1
    // encoding below would be emitted as well, performing the store twice.
    return;
  }
  DCHECK(CpuFeatures::IsSupported(SSE4_1));
  CpuFeatureScope sse_scope(this, SSE4_1);
  extractps(dst, src, imm8);
}
void TurboAssembler::Lzcnt(Register dst, Operand src) {
if (CpuFeatures::IsSupported(LZCNT)) {
CpuFeatureScope scope(this, LZCNT);

View File

@ -313,6 +313,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP2_WITH_TYPE(Pmovmskb, pmovmskb, Register, XMMRegister)
AVX_OP2_WITH_TYPE(Movmskpd, movmskpd, Register, XMMRegister)
AVX_OP2_WITH_TYPE(Movmskps, movmskps, Register, XMMRegister)
AVX_OP2_WITH_TYPE(Movlps, movlps, Operand, XMMRegister)
// Store form of Movhps (XMM register -> memory operand). The instruction name
// must be movhps; wiring it to movlps would emit the wrong instruction.
AVX_OP2_WITH_TYPE(Movhps, movhps, Operand, XMMRegister)
#undef AVX_OP2_WITH_TYPE
@ -519,7 +521,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
}
void Palignr(XMMRegister dst, Operand src, uint8_t imm8);
void Pextrb(Operand dst, XMMRegister src, uint8_t imm8);
void Pextrb(Register dst, XMMRegister src, uint8_t imm8);
void Pextrw(Operand dst, XMMRegister src, uint8_t imm8);
void Pextrw(Register dst, XMMRegister src, uint8_t imm8);
void Pextrd(Register dst, XMMRegister src, uint8_t imm8);
void Pinsrb(XMMRegister dst, Register src, int8_t imm8) {
@ -535,6 +539,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
}
void Pinsrw(XMMRegister dst, Operand src, int8_t imm8);
void Vbroadcastss(XMMRegister dst, Operand src);
void Extractps(Operand dst, XMMRegister src, uint8_t imm8);
// Expression support
// cvtsi2sd instruction only writes to the low 64-bit of dst register, which

View File

@ -3213,15 +3213,55 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kIA32Movlps: {
DCHECK(instr->HasOutput()); // Move to memory unimplemented for now.
__ Movlps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.MemoryOperand(2));
// With an output register, the memory operand (starting at input 2) is a
// source and the result goes to the output register. Without an output, the
// memory operand is the destination and the SIMD register that follows the
// address inputs is the value being stored.
if (instr->HasOutput()) {
__ Movlps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.MemoryOperand(2));
} else {
size_t index = 0;
Operand dst = i.MemoryOperand(&index);
__ Movlps(dst, i.InputSimd128Register(index));
}
break;
}
case kIA32Movhps: {
DCHECK(instr->HasOutput()); // Move to memory unimplemented for now.
__ Movhps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.MemoryOperand(2));
// With an output register, the memory operand (starting at input 2) is a
// source and the result goes to the output register. Without an output, the
// memory operand is the destination and the SIMD register that follows the
// address inputs is the value being stored.
if (instr->HasOutput()) {
__ Movhps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.MemoryOperand(2));
} else {
size_t index = 0;
Operand dst = i.MemoryOperand(&index);
__ Movhps(dst, i.InputSimd128Register(index));
}
break;
}
case kIA32Pextrb: {
// TODO(zhin): Move i8x16 extract lane u into this opcode.
// Memory-destination form only: the instruction must carry an addressing
// mode.
DCHECK(HasAddressingMode(instr));
size_t index = 0;
// MemoryOperand takes &index; presumably it advances index past the address
// inputs, so the SIMD value is input `index` and the lane immediate is input
// `index + 1` -- confirm against MemoryOperand.
Operand operand = i.MemoryOperand(&index);
__ Pextrb(operand, i.InputSimd128Register(index),
i.InputUint8(index + 1));
break;
}
case kIA32Pextrw: {
// TODO(zhin): Move i16x8 extract lane u into this opcode.
// Memory-destination form only: the instruction must carry an addressing
// mode.
DCHECK(HasAddressingMode(instr));
size_t index = 0;
// MemoryOperand takes &index; presumably it advances index past the address
// inputs, so the SIMD value is input `index` and the lane immediate is input
// `index + 1` -- confirm against MemoryOperand.
Operand operand = i.MemoryOperand(&index);
__ Pextrw(operand, i.InputSimd128Register(index),
i.InputUint8(index + 1));
break;
}
case kIA32S128Store32Lane: {
  // Stores a single 32-bit lane of a SIMD register to memory. The inputs are
  // the address operands, then the SIMD value, then the lane index immediate.
  size_t index = 0;
  Operand operand = i.MemoryOperand(&index);
  uint8_t laneidx = i.InputUint8(index + 1);
  if (laneidx == 0) {
    // Lane 0 uses Movss, which takes no lane immediate.
    __ Movss(operand, i.InputSimd128Register(index));
  } else {
    DCHECK_GE(3, laneidx);
    // Pass the requested lane through; a hard-coded immediate would store
    // lane 1 for lanes 2 and 3 as well.
    __ Extractps(operand, i.InputSimd128Register(index), laneidx);
  }
  break;
}
case kSSEI8x16SConvertI16x8: {

View File

@ -302,6 +302,9 @@ namespace compiler {
V(IA32Pinsrb) \
V(IA32Pinsrw) \
V(IA32Pinsrd) \
V(IA32Pextrb) \
V(IA32Pextrw) \
V(IA32S128Store32Lane) \
V(SSEI8x16SConvertI16x8) \
V(AVXI8x16SConvertI16x8) \
V(IA32I8x16Neg) \

View File

@ -281,6 +281,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32Pinsrb:
case kIA32Pinsrw:
case kIA32Pinsrd:
case kIA32Pextrb:
case kIA32Pextrw:
case kIA32S128Store32Lane:
case kSSEI8x16SConvertI16x8:
case kAVXI8x16SConvertI16x8:
case kIA32I8x16Neg:

View File

@ -410,8 +410,6 @@ void InstructionSelector::VisitLoadLane(Node* node) {
Emit(opcode, 1, outputs, input_count, inputs);
}
void InstructionSelector::VisitStoreLane(Node* node) {}
void InstructionSelector::VisitLoadTransform(Node* node) {
LoadTransformParameters params = LoadTransformParametersOf(node->op());
InstructionCode opcode;
@ -616,6 +614,41 @@ void InstructionSelector::VisitProtectedStore(Node* node) {
UNIMPLEMENTED();
}
// Selects the IA32 instruction for a wasm store-lane node. The opcode is
// chosen from the lane's machine representation (and, for 64-bit lanes, the
// lane index). The emitted instruction has no outputs; its inputs are the
// effective-address operands, the value register (node input 2) and the lane
// index as an immediate.
void InstructionSelector::VisitStoreLane(Node* node) {
  IA32OperandGenerator g(this);
  StoreLaneParameters params = StoreLaneParametersOf(node->op());

  InstructionCode opcode = kArchNop;
  switch (params.rep) {
    case MachineRepresentation::kWord8:
      opcode = kIA32Pextrb;
      break;
    case MachineRepresentation::kWord16:
      opcode = kIA32Pextrw;
      break;
    case MachineRepresentation::kWord32:
      opcode = kIA32S128Store32Lane;
      break;
    case MachineRepresentation::kWord64:
      // 64-bit lanes map directly onto the low/high half store instructions.
      if (params.laneidx == 0) {
        opcode = kIA32Movlps;
      } else {
        DCHECK_EQ(1, params.laneidx);
        opcode = kIA32Movhps;
      }
      break;
    default:
      UNREACHABLE();
  }

  InstructionOperand inputs[4];
  size_t input_count = 0;
  // Address operands come first; the helper fills them in and bumps
  // input_count.
  const AddressingMode mode =
      g.GetEffectiveAddressMemoryOperand(node, inputs, &input_count);
  opcode |= AddressingModeField::encode(mode);

  // Then the SIMD value to read the lane from, then the lane index.
  inputs[input_count++] = g.UseRegister(node->InputAt(2));
  inputs[input_count++] = g.UseImmediate(params.laneidx);
  DCHECK_GE(4, input_count);
  Emit(opcode, 0, nullptr, input_count, inputs);
}
// Architecture supports unaligned access, therefore VisitLoad is used instead
void InstructionSelector::VisitUnalignedLoad(Node* node) { UNREACHABLE(); }

View File

@ -1107,10 +1107,20 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
AppendToBuffer("vmovlps %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
break;
// Opcode 0x13: vmovlps store form. The right-hand (r/m) operand is printed
// first as the destination, followed by the XMM register named by regop.
case 0x13:
AppendToBuffer("vmovlps ");
current += PrintRightXMMOperand(current);
AppendToBuffer(",%s", NameOfXMMRegister(regop));
break;
case 0x16:
AppendToBuffer("vmovhps %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
break;
// Opcode 0x17: vmovhps store form. The right-hand (r/m) operand is printed
// first as the destination, followed by the XMM register named by regop.
case 0x17:
AppendToBuffer("vmovhps ");
current += PrintRightXMMOperand(current);
AppendToBuffer(",%s", NameOfXMMRegister(regop));
break;
case 0x28:
AppendToBuffer("vmovaps %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
@ -1835,12 +1845,22 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
get_modrm(*(data + 2), &mod, &regop, &rm);
if (f0byte == 0x12) {
data += 2;
AppendToBuffer("movlps %s,%s", NameOfXMMRegister(regop),
NameOfXMMRegister(rm));
AppendToBuffer("movlps %s,", NameOfXMMRegister(regop));
data += PrintRightXMMOperand(data);
} else if (f0byte == 0x13) {
data += 2;
AppendToBuffer("movlps ");
data += PrintRightXMMOperand(data);
AppendToBuffer(",%s", NameOfXMMRegister(regop));
} else if (f0byte == 0x16) {
data += 2;
AppendToBuffer("movhps %s,%s", NameOfXMMRegister(regop),
NameOfXMMRegister(rm));
AppendToBuffer("movhps %s,", NameOfXMMRegister(regop));
data += PrintRightXMMOperand(data);
} else if (f0byte == 0x17) {
data += 2;
AppendToBuffer("movhps ");
data += PrintRightXMMOperand(data);
AppendToBuffer(",%s", NameOfXMMRegister(regop));
} else if (f0byte == 0x18) {
data += 2;
const char* suffix[] = {"nta", "1", "2", "3"};

View File

@ -397,7 +397,9 @@ TEST(DisasmIa320) {
__ movq(xmm0, Operand(edx, 4));
__ movlps(xmm0, Operand(ebx, ecx, times_4, 10000));
__ movlps(Operand(ebx, ecx, times_4, 10000), xmm0);
__ movhps(xmm0, Operand(ebx, ecx, times_4, 10000));
__ movhps(Operand(ebx, ecx, times_4, 10000), xmm0);
// logic operation
__ andps(xmm0, xmm1);
@ -703,7 +705,9 @@ TEST(DisasmIa320) {
__ vhaddps(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovlps(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovlps(Operand(ebx, ecx, times_4, 10000), xmm0);
__ vmovhps(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovhps(Operand(ebx, ecx, times_4, 10000), xmm0);
__ vcmpeqps(xmm5, xmm4, xmm1);
__ vcmpeqps(xmm5, xmm4, Operand(ebx, ecx, times_4, 10000));

View File

@ -4077,9 +4077,7 @@ WASM_SIMD_TEST_NO_LOWERING(S128Load64Lane) {
RunLoadLaneTest<int64_t>(execution_tier, lower_simd, kExprS128Load64Lane,
kExprI64x2Splat);
}
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32
#if V8_TARGET_ARCH_X64
template <typename T>
void RunStoreLaneTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode store_op, WasmOpcode splat_op) {
@ -4090,23 +4088,24 @@ void RunStoreLaneTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
}
constexpr int lanes = kSimd128Size / sizeof(T);
constexpr int mem_index = 16; // Store from mem index 16 (bytes).
constexpr int mem_index = 16; // Store to mem index 16 (bytes).
constexpr int splat_value = 33;
WasmOpcode const_op =
splat_op == kExprI64x2Splat ? kExprI64Const : kExprI32Const;
for (int lane_index = 0; lane_index < lanes; lane_index++) {
WasmRunner<int32_t> r(execution_tier, lower_simd);
T* memory = r.builder().AddMemoryElems<T>(kWasmPageSize / sizeof(T));
T* memory; // Will be set by build_fn.
// Splat splat_value, then only Store and replace a single lane with the
auto build_fn = [=, &memory](WasmRunner<int32_t>& r, int mem_index,
int lane_index, int alignment, int offset) {
memory = r.builder().AddMemoryElems<T>(kWasmPageSize / sizeof(T));
// Splat splat_value, then only Store and replace a single lane.
BUILD(r, WASM_I32V(mem_index), const_op, splat_value,
WASM_SIMD_OP(splat_op), WASM_SIMD_OP(store_op), ZERO_ALIGNMENT,
ZERO_OFFSET, lane_index, WASM_ONE);
WASM_SIMD_OP(splat_op), WASM_SIMD_OP(store_op), alignment, offset,
lane_index, WASM_ONE);
r.builder().BlankMemory();
r.Call();
};
auto check_results = [=](WasmRunner<int32_t>& r, T* memory) {
for (int i = 0; i < lanes; i++) {
CHECK_EQ(0, r.builder().ReadMemory(&memory[i]));
}
@ -4116,6 +4115,30 @@ void RunStoreLaneTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
for (int i = lanes + 1; i < lanes * 2; i++) {
CHECK_EQ(0, r.builder().ReadMemory(&memory[i]));
}
};
for (int lane_index = 0; lane_index < lanes; lane_index++) {
WasmRunner<int32_t> r(execution_tier, lower_simd);
build_fn(r, mem_index, lane_index, ZERO_ALIGNMENT, ZERO_OFFSET);
r.Call();
check_results(r, memory);
}
// Check all possible alignments.
constexpr int max_alignment = base::bits::CountTrailingZeros(sizeof(T));
for (byte alignment = 0; alignment <= max_alignment; ++alignment) {
WasmRunner<int32_t> r(execution_tier, lower_simd);
build_fn(r, mem_index, /*lane_index=*/0, alignment, ZERO_OFFSET);
r.Call();
check_results(r, memory);
}
{
// Use memarg for offset.
WasmRunner<int32_t> r(execution_tier, lower_simd);
build_fn(r, /*mem_index=*/0, /*lane_index=*/0, ZERO_ALIGNMENT, mem_index);
r.Call();
check_results(r, memory);
}
// OOB stores
@ -4154,7 +4177,7 @@ WASM_SIMD_TEST_NO_LOWERING(S128Store64Lane) {
kExprI64x2Splat);
}
#endif // V8_TARGET_ARCH_X64
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32
#define WASM_SIMD_ANYTRUE_TEST(format, lanes, max, param_type) \
WASM_SIMD_TEST(S##format##AnyTrue) { \