From 379cb976004138fae9b869d705b38fd3e97f569e Mon Sep 17 00:00:00 2001 From: Milad Fa Date: Wed, 8 Sep 2021 11:15:41 -0400 Subject: [PATCH] S390 [simd]: Implement vector store lane This CL takes advantage of the z15 `store byte reverse element` instructions to optimize Simd StoreLane opcodes. On the simulator we only run `store element` as reversing is not required. Change-Id: I723f6db535799470c46a1e298a9c1af7574ad5b6 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3144373 Reviewed-by: Junliang Yan Commit-Queue: Milad Fa Cr-Commit-Position: refs/heads/main@{#76734} --- src/codegen/s390/constants-s390.h | 22 +++++--- src/codegen/s390/macro-assembler-s390.cc | 56 ++++++++++++++++--- src/codegen/s390/macro-assembler-s390.h | 4 ++ .../backend/s390/code-generator-s390.cc | 23 ++++++++ .../backend/s390/instruction-codes-s390.h | 4 ++ .../s390/instruction-scheduler-s390.cc | 4 ++ .../backend/s390/instruction-selector-s390.cc | 28 +++++++++- src/execution/s390/simulator-s390.cc | 30 ++++++++++ 8 files changed, 153 insertions(+), 18 deletions(-) diff --git a/src/codegen/s390/constants-s390.h b/src/codegen/s390/constants-s390.h index 20480d2ec4..23e77c93d7 100644 --- a/src/codegen/s390/constants-s390.h +++ b/src/codegen/s390/constants-s390.h @@ -1553,14 +1553,7 @@ using SixByteInstr = uint64_t; V(vlrep, VLREP, 0xE705) /* type = VRX VECTOR LOAD AND REPLICATE */ \ V(vl, VL, 0xE706) /* type = VRX VECTOR LOAD */ \ V(vlbb, VLBB, 0xE707) /* type = VRX VECTOR LOAD TO BLOCK BOUNDARY */ \ - V(vsteb, VSTEB, 0xE708) /* type = VRX VECTOR STORE ELEMENT (8) */ \ - V(vsteh, VSTEH, 0xE709) /* type = VRX VECTOR STORE ELEMENT (16) */ \ - V(vsteg, VSTEG, 0xE70A) /* type = VRX VECTOR STORE ELEMENT (64) */ \ - V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \ - V(vst, VST, 0xE70E) /* type = VRX VECTOR STORE */ \ V(vlbr, VLBR, 0xE606) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENTS */ \ - V(vstbr, VSTBR, \ - 0xE60E) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENTS */ \ V(vlbrrep, VLBRREP, \ 0xE605) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT AND REPLICATE */ \ V(vlebrh, VLEBRH, \ @@ -1568,7 +1561,20 @@ using SixByteInstr = uint64_t; V(vlebrf, VLEBRF, \ 0xE603) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (32) */ \ V(vlebrg, VLEBRG, \ - 0xE602) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (64) */ + 0xE602) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (64) */ \ + V(vsteb, VSTEB, 0xE708) /* type = VRX VECTOR STORE ELEMENT (8) */ \ + V(vsteh, VSTEH, 0xE709) /* type = VRX VECTOR STORE ELEMENT (16) */ \ + V(vsteg, VSTEG, 0xE70A) /* type = VRX VECTOR STORE ELEMENT (64) */ \ + V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \ + V(vst, VST, 0xE70E) /* type = VRX VECTOR STORE */ \ + V(vstbr, VSTBR, \ + 0xE60E) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENTS */ \ + V(vstebrh, VSTEBRH, \ + 0xE609) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENT (16) */ \ + V(vstebrf, VSTEBRF, \ + 0xE60B) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENT (32) */ \ + V(vstebrg, VSTEBRG, \ + 0xE60A) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENT (64) */ #define S390_RIE_G_OPCODE_LIST(V) \ V(lochi, LOCHI, \ diff --git a/src/codegen/s390/macro-assembler-s390.cc b/src/codegen/s390/macro-assembler-s390.cc index a5a42a7e5d..a6c55746f8 100644 --- a/src/codegen/s390/macro-assembler-s390.cc +++ b/src/codegen/s390/macro-assembler-s390.cc @@ -4005,6 +4005,30 @@ LOAD_LANE_LIST(LOAD_LANE) #undef LOAD_LANE #undef LOAD_LANE_LIST +void TurboAssembler::StoreLane8LE(Simd128Register src, const 
MemOperand& mem, + int index) { + vsteb(src, mem, Condition(index)); +} +#define STORE_LANE_LIST(V) \ + V(64, vstebrg, StoreU64LE, 3) \ + V(32, vstebrf, StoreU32LE, 2) \ + V(16, vstebrh, StoreU16LE, 1) + +#define STORE_LANE(name, vector_instr, scalar_instr, condition) \ + void TurboAssembler::StoreLane##name##LE(Simd128Register src, \ + const MemOperand& mem, int lane) { \ + if (CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2) && \ + is_uint12(mem.offset())) { \ + vector_instr(src, mem, Condition(lane)); \ + return; \ + } \ + vlgv(r1, src, MemOperand(r0, lane), Condition(condition)); \ + scalar_instr(r1, mem); \ + } +STORE_LANE_LIST(STORE_LANE) +#undef STORE_LANE +#undef STORE_LANE_LIST + #else void TurboAssembler::LoadU64LE(Register dst, const MemOperand& mem, Register scratch) { @@ -4112,6 +4136,16 @@ LOAD_EXTEND_LIST(LOAD_EXTEND) #undef LOAD_EXTEND #undef LOAD_EXTEND +void TurboAssembler::LoadV32ZeroLE(Simd128Register dst, const MemOperand& mem) { + vx(dst, dst, dst, Condition(0), Condition(0), Condition(0)); + vlef(dst, mem, Condition(3)); +} + +void TurboAssembler::LoadV64ZeroLE(Simd128Register dst, const MemOperand& mem) { + vx(dst, dst, dst, Condition(0), Condition(0), Condition(0)); + vleg(dst, mem, Condition(1)); +} + #define LOAD_LANE_LIST(V) \ V(64, vleg) \ V(32, vlef) \ @@ -4128,15 +4162,21 @@ LOAD_LANE_LIST(LOAD_LANE) #undef LOAD_LANE #undef LOAD_LANE_LIST -void TurboAssembler::LoadV32ZeroLE(Simd128Register dst, const MemOperand& mem) { - vx(dst, dst, dst, Condition(0), Condition(0), Condition(0)); - vlef(dst, mem, Condition(3)); -} +#define STORE_LANE_LIST(V) \ + V(64, vsteg) \ + V(32, vstef) \ + V(16, vsteh) \ + V(8, vsteb) -void TurboAssembler::LoadV64ZeroLE(Simd128Register dst, const MemOperand& mem) { - vx(dst, dst, dst, Condition(0), Condition(0), Condition(0)); - vleg(dst, mem, Condition(1)); -} +#define STORE_LANE(name, vector_instr) \ + void TurboAssembler::StoreLane##name##LE(Simd128Register src, \ + const MemOperand& mem, int lane) { \ + DCHECK(is_uint12(mem.offset())); \ + vector_instr(src, mem, Condition(lane)); \ + } +STORE_LANE_LIST(STORE_LANE) +#undef STORE_LANE +#undef STORE_LANE_LIST #endif diff --git a/src/codegen/s390/macro-assembler-s390.h b/src/codegen/s390/macro-assembler-s390.h index 18ab24bf3c..b7123d5960 100644 --- a/src/codegen/s390/macro-assembler-s390.h +++ b/src/codegen/s390/macro-assembler-s390.h @@ -409,6 +409,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { void LoadLane16LE(Simd128Register dst, const MemOperand& mem, int lane); void LoadLane32LE(Simd128Register dst, const MemOperand& mem, int lane); void LoadLane64LE(Simd128Register dst, const MemOperand& mem, int lane); + void StoreLane8LE(Simd128Register src, const MemOperand& mem, int lane); + void StoreLane16LE(Simd128Register src, const MemOperand& mem, int lane); + void StoreLane32LE(Simd128Register src, const MemOperand& mem, int lane); + void StoreLane64LE(Simd128Register src, const MemOperand& mem, int lane); // Load And Test void LoadAndTest32(Register dst, Register src); diff --git a/src/compiler/backend/s390/code-generator-s390.cc b/src/compiler/backend/s390/code-generator-s390.cc index 6ce4e23ef4..3c2c3d6c06 100644 --- a/src/compiler/backend/s390/code-generator-s390.cc +++ b/src/compiler/backend/s390/code-generator-s390.cc @@ -3478,6 +3478,29 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( break; } #undef LOAD_LANE +#define STORE_LANE(type, lane) \ + AddressingMode mode = kMode_None; \ + size_t index = 2; \ + MemOperand operand 
= i.MemoryOperand(&mode, &index); \
+ Simd128Register src = i.InputSimd128Register(0); \
+ __ StoreLane##type##LE(src, operand, lane);
+ case kS390_S128Store8Lane: {
+ STORE_LANE(8, 15 - i.InputUint8(1));
+ break;
+ }
+ case kS390_S128Store16Lane: {
+ STORE_LANE(16, 7 - i.InputUint8(1));
+ break;
+ }
+ case kS390_S128Store32Lane: {
+ STORE_LANE(32, 3 - i.InputUint8(1));
+ break;
+ }
+ case kS390_S128Store64Lane: {
+ STORE_LANE(64, 1 - i.InputUint8(1));
+ break;
+ }
+#undef STORE_LANE
 case kS390_StoreCompressTagged: {
 CHECK(!instr->HasOutput());
 size_t index = 0;
diff --git a/src/compiler/backend/s390/instruction-codes-s390.h b/src/compiler/backend/s390/instruction-codes-s390.h
index 15ac9078c4..03806b57b1 100644
--- a/src/compiler/backend/s390/instruction-codes-s390.h
+++ b/src/compiler/backend/s390/instruction-codes-s390.h
@@ -388,6 +388,10 @@ namespace compiler {
 V(S390_S128Load16Lane) \
 V(S390_S128Load32Lane) \
 V(S390_S128Load64Lane) \
+ V(S390_S128Store8Lane) \
+ V(S390_S128Store16Lane) \
+ V(S390_S128Store32Lane) \
+ V(S390_S128Store64Lane) \
 V(S390_StoreSimd128) \
 V(S390_LoadSimd128) \
 V(S390_StoreCompressTagged) \
diff --git a/src/compiler/backend/s390/instruction-scheduler-s390.cc b/src/compiler/backend/s390/instruction-scheduler-s390.cc
index 232de4d873..d7046507c7 100644
--- a/src/compiler/backend/s390/instruction-scheduler-s390.cc
+++ b/src/compiler/backend/s390/instruction-scheduler-s390.cc
@@ -392,6 +392,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
 case kS390_Push:
 case kS390_PushFrame:
 case kS390_StoreToStackSlot:
+ case kS390_S128Store8Lane:
+ case kS390_S128Store16Lane:
+ case kS390_S128Store32Lane:
+ case kS390_S128Store64Lane:
 return kHasSideEffect;
 
 case kS390_Word64AtomicExchangeUint64:
diff --git a/src/compiler/backend/s390/instruction-selector-s390.cc b/src/compiler/backend/s390/instruction-selector-s390.cc
index d141714352..489065e65f 100644
--- a/src/compiler/backend/s390/instruction-selector-s390.cc
+++ b/src/compiler/backend/s390/instruction-selector-s390.cc
@@ -2862,8 +2862,31 @@ void InstructionSelector::VisitLoadTransform(Node* node) {
 }
 
 void InstructionSelector::VisitStoreLane(Node* node) {
- // We should never reach here, see http://crrev.com/c/2577820
- UNREACHABLE();
+ StoreLaneParameters params = StoreLaneParametersOf(node->op());
+ InstructionCode opcode;
+ if (params.rep == MachineRepresentation::kWord8) {
+ opcode = kS390_S128Store8Lane;
+ } else if (params.rep == MachineRepresentation::kWord16) {
+ opcode = kS390_S128Store16Lane;
+ } else if (params.rep == MachineRepresentation::kWord32) {
+ opcode = kS390_S128Store32Lane;
+ } else if (params.rep == MachineRepresentation::kWord64) {
+ opcode = kS390_S128Store64Lane;
+ } else {
+ UNREACHABLE();
+ }
+
+ S390OperandGenerator g(this);
+ InstructionOperand inputs[5];
+ size_t input_count = 0;
+
+ inputs[input_count++] = g.UseRegister(node->InputAt(2));
+ inputs[input_count++] = g.UseImmediate(params.laneidx);
+
+ AddressingMode mode =
+ g.GetEffectiveAddressMemoryOperand(node, inputs, &input_count);
+ opcode |= AddressingModeField::encode(mode);
+ Emit(opcode, 0, nullptr, input_count, inputs);
 }
 
 void InstructionSelector::VisitTruncateFloat32ToInt32(Node* node) {
diff --git a/src/execution/s390/simulator-s390.cc b/src/execution/s390/simulator-s390.cc
index fd21af1374..31a03eed4e 100644
--- a/src/execution/s390/simulator-s390.cc
+++ b/src/execution/s390/simulator-s390.cc
@@ -757,7 +757,10 @@ void Simulator::EvalTableInit() {
 V(vlrep, 
VLREP, 0xE705) /* type = VRX VECTOR LOAD AND REPLICATE */ \
 V(vrepi, VREPI, 0xE745) /* type = VRI_A VECTOR REPLICATE IMMEDIATE */ \
 V(vlr, VLR, 0xE756) /* type = VRR_A VECTOR LOAD */ \
+ V(vsteb, VSTEB, 0xE708) /* type = VRX VECTOR STORE ELEMENT (8) */ \
+ V(vsteh, VSTEH, 0xE709) /* type = VRX VECTOR STORE ELEMENT (16) */ \
 V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \
+ V(vsteg, VSTEG, 0xE70A) /* type = VRX VECTOR STORE ELEMENT (64) */ \
 V(vleb, VLEB, 0xE700) /* type = VRX VECTOR LOAD ELEMENT (8) */ \
 V(vleh, VLEH, 0xE701) /* type = VRX VECTOR LOAD ELEMENT (16) */ \
 V(vlef, VLEF, 0xE703) /* type = VRX VECTOR LOAD ELEMENT (32) */ \
@@ -3186,6 +3189,24 @@ EVALUATE(VLR) {
 return length;
 }
 
+EVALUATE(VSTEB) {
+ DCHECK_OPCODE(VSTEB);
+ DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
+ intptr_t addr = GET_ADDRESS(x2, b2, d2);
+ int8_t value = get_simd_register_by_lane<int8_t>(r1, m3);
+ WriteB(addr, value);
+ return length;
+}
+
+EVALUATE(VSTEH) {
+ DCHECK_OPCODE(VSTEH);
+ DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
+ intptr_t addr = GET_ADDRESS(x2, b2, d2);
+ int16_t value = get_simd_register_by_lane<int16_t>(r1, m3);
+ WriteH(addr, value);
+ return length;
+}
+
 EVALUATE(VSTEF) {
 DCHECK_OPCODE(VSTEF);
 DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
@@ -3195,6 +3216,15 @@ EVALUATE(VSTEF) {
 return length;
 }
 
+EVALUATE(VSTEG) {
+ DCHECK_OPCODE(VSTEG);
+ DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
+ intptr_t addr = GET_ADDRESS(x2, b2, d2);
+ int64_t value = get_simd_register_by_lane<int64_t>(r1, m3);
+ WriteDW(addr, value);
+ return length;
+}
+
 EVALUATE(VLEB) {
 DCHECK_OPCODE(VLEB);
 DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
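
Notes for reviewers (not part of the patch itself):

WebAssembly numbers v128 lanes starting from the least significant byte, while s390 numbers vector elements starting from the most significant byte. That is why the code-generator cases above rewrite a little-endian lane index l as (lane_count - 1) - l (15 - l for bytes, 7 - l for halfwords, and so on) before calling the StoreLane*LE helpers. A standalone sketch of that mapping, in plain C++ with no V8 APIs (all names here are illustrative):

#include <cstdint>
#include <cstdio>

// Mirrors the reversal in code-generator-s390.cc: Store8Lane uses 15 - lane,
// Store16Lane 7 - lane, Store32Lane 3 - lane, Store64Lane 1 - lane.
int BigEndianElement(int lane_count, int le_lane) {
  return (lane_count - 1) - le_lane;
}

int main() {
  // A v128 holding bytes 0..15, viewed as 16 little-endian byte lanes.
  uint8_t lanes_le[16];
  for (int i = 0; i < 16; ++i) lanes_le[i] = static_cast<uint8_t>(i);

  // The same value as an s390 vector register image: element 0 is the most
  // significant byte, so LE lane l lives in BE element 15 - l.
  uint8_t elements_be[16];
  for (int l = 0; l < 16; ++l) {
    elements_be[BigEndianElement(16, l)] = lanes_le[l];
  }

  // StoreLane(lane = 3) must store lanes_le[3]; the emitted vsteb is handed
  // element index 15 - 3 = 12 and stores the identical byte.
  int lane = 3;
  int element = BigEndianElement(16, lane);
  printf("lane %d -> element %d: %d == %d\n", lane, element,
         lanes_le[lane], elements_be[element]);
  return 0;
}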
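The two paths in the macro-assembler's STORE_LANE expansion store the same bytes: on z15 (vector-enhancements facility 2) a single vstebr* writes the selected element byte-reversed, while the fallback moves the element into r1 with vlgv and relies on the byte-reversing scalar store (StoreU16LE/StoreU32LE/StoreU64LE) for the same reversal. A hedged, standalone illustration of that equivalence for the 32-bit case (plain C++, illustrative names, not V8 code):

#include <cstdint>
#include <cstdio>
#include <cstring>

// A 16-byte vector register image; element 0 is leftmost (big-endian order).
using V128 = uint8_t[16];

// vstebrf-style path: store 32-bit element `e` with its bytes reversed.
void StoreElementByteReversed(const V128 reg, int e, uint8_t out[4]) {
  for (int b = 0; b < 4; ++b) out[b] = reg[e * 4 + (3 - b)];
}

// Fallback path: vlgv-style extract into a scalar (big-endian read), then a
// little-endian scalar store, as StoreU32LE would perform.
void ExtractThenStoreLE(const V128 reg, int e, uint8_t out[4]) {
  uint32_t scalar = 0;
  for (int b = 0; b < 4; ++b) scalar = (scalar << 8) | reg[e * 4 + b];
  for (int b = 0; b < 4; ++b) out[b] = static_cast<uint8_t>(scalar >> (8 * b));
}

int main() {
  V128 reg;
  for (int i = 0; i < 16; ++i) reg[i] = static_cast<uint8_t>(0x10 + i);
  uint8_t fast[4], fallback[4];
  StoreElementByteReversed(reg, 2, fast);
  ExtractThenStoreLE(reg, 2, fallback);
  printf("paths agree: %s\n", memcmp(fast, fallback, 4) == 0 ? "yes" : "no");
  return 0;
}

Either way, memory receives the element in little-endian byte order, which is what the WebAssembly store lane semantics require.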