S390 [simd]: Implement vector store lane

This CL takes advantage of the z15 `store byte reverse element`
instructions to optimize Simd StoreLane opcodes.

On the simulator we only run `store element`, because byte reversal is
not required there.

Change-Id: I723f6db535799470c46a1e298a9c1af7574ad5b6
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3144373
Reviewed-by: Junliang Yan <junyan@redhat.com>
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Cr-Commit-Position: refs/heads/main@{#76734}
This commit is contained in:
Milad Fa 2021-09-08 11:15:41 -04:00 committed by V8 LUCI CQ
parent b435c60bda
commit 379cb97600
8 changed files with 153 additions and 18 deletions

View File

@ -1553,14 +1553,7 @@ using SixByteInstr = uint64_t;
V(vlrep, VLREP, 0xE705) /* type = VRX VECTOR LOAD AND REPLICATE */ \
V(vl, VL, 0xE706) /* type = VRX VECTOR LOAD */ \
V(vlbb, VLBB, 0xE707) /* type = VRX VECTOR LOAD TO BLOCK BOUNDARY */ \
V(vsteb, VSTEB, 0xE708) /* type = VRX VECTOR STORE ELEMENT (8) */ \
V(vsteh, VSTEH, 0xE709) /* type = VRX VECTOR STORE ELEMENT (16) */ \
V(vsteg, VSTEG, 0xE70A) /* type = VRX VECTOR STORE ELEMENT (64) */ \
V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \
V(vst, VST, 0xE70E) /* type = VRX VECTOR STORE */ \
V(vlbr, VLBR, 0xE606) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENTS */ \
V(vstbr, VSTBR, \
0xE60E) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENTS */ \
V(vlbrrep, VLBRREP, \
0xE605) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT AND REPLICATE */ \
V(vlebrh, VLEBRH, \
@ -1568,7 +1561,20 @@ using SixByteInstr = uint64_t;
V(vlebrf, VLEBRF, \
0xE603) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (32) */ \
V(vlebrg, VLEBRG, \
0xE602) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (64) */
0xE602) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (64) */ \
V(vsteb, VSTEB, 0xE708) /* type = VRX VECTOR STORE ELEMENT (8) */ \
V(vsteh, VSTEH, 0xE709) /* type = VRX VECTOR STORE ELEMENT (16) */ \
V(vsteg, VSTEG, 0xE70A) /* type = VRX VECTOR STORE ELEMENT (64) */ \
V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \
V(vst, VST, 0xE70E) /* type = VRX VECTOR STORE */ \
V(vstbr, VSTBR, \
0xE60E) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENTS */ \
V(vstebrh, VSTEBRH, \
0xE609) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENT (16) */ \
V(vstebrf, VSTEBRF, \
0xE60B) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENT (32) */ \
V(vstebrg, VSTEBRG, \
0xE60A) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENT (64) */
#define S390_RIE_G_OPCODE_LIST(V) \
V(lochi, LOCHI, \

View File

@ -4005,6 +4005,30 @@ LOAD_LANE_LIST(LOAD_LANE)
#undef LOAD_LANE
#undef LOAD_LANE_LIST
// Stores the 8-bit lane `lane` of `src` to `mem` with a single `vsteb`.
// Unlike the 16/32/64-bit StoreLane helpers there is no byte-reversing
// variant or scalar fallback here: the store-element instruction is always
// emitted.
void TurboAssembler::StoreLane8LE(Simd128Register src, const MemOperand& mem,
                                  int lane) {
  // Same displacement guard the other StoreLane helpers apply before emitting
  // a vector store-element instruction.
  DCHECK(is_uint12(mem.offset()));
  vsteb(src, mem, Condition(lane));
}
// StoreLane{16,32,64}LE: store one lane of `src` to `mem`.
// Each STORE_LANE_LIST row is
//   (lane width in bits, byte-reversing vector store instruction,
//    scalar little-endian store helper, element-size code given to vlgv).
#define STORE_LANE_LIST(V) \
  V(16, vstebrh, StoreU16LE, 1) \
  V(32, vstebrf, StoreU32LE, 2) \
  V(64, vstebrg, StoreU64LE, 3)

// The vector instruction is used only when VECTOR_ENHANCE_FACILITY_2 is
// supported and the displacement passes is_uint12. Otherwise the lane is
// extracted into r1 with vlgv (clobbering r1) and written with the scalar
// helper.
#define STORE_LANE(name, vector_instr, scalar_instr, condition) \
  void TurboAssembler::StoreLane##name##LE(Simd128Register src, \
                                           const MemOperand& mem, int lane) { \
    const bool use_vector_store = \
        CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2) && \
        is_uint12(mem.offset()); \
    if (!use_vector_store) { \
      vlgv(r1, src, MemOperand(r0, lane), Condition(condition)); \
      scalar_instr(r1, mem); \
      return; \
    } \
    vector_instr(src, mem, Condition(lane)); \
  }
STORE_LANE_LIST(STORE_LANE)
#undef STORE_LANE
#undef STORE_LANE_LIST
#else
void TurboAssembler::LoadU64LE(Register dst, const MemOperand& mem,
Register scratch) {
@ -4112,6 +4136,16 @@ LOAD_EXTEND_LIST(LOAD_EXTEND)
// Retire both the per-entry macro and the list macro once they have been
// expanded, so neither leaks into the rest of the translation unit.
#undef LOAD_EXTEND
#undef LOAD_EXTEND_LIST
// Combines `dst` with itself via `vx` and then loads a 32-bit element from
// `mem` into element 3 of `dst`.
// NOTE(review): `vx` with identical operands is presumably a vector XOR that
// clears `dst`, leaving every other element zero — confirm in the assembler.
void TurboAssembler::LoadV32ZeroLE(Simd128Register dst, const MemOperand& mem) {
  const Condition zero_field = Condition(0);
  vx(dst, dst, dst, zero_field, zero_field, zero_field);

  const Condition target_element = Condition(3);
  vlef(dst, mem, target_element);
}
// Combines `dst` with itself via `vx` and then loads a 64-bit element from
// `mem` into element 1 of `dst`.
// NOTE(review): `vx` with identical operands is presumably a vector XOR that
// clears `dst`, leaving the other element zero — confirm in the assembler.
void TurboAssembler::LoadV64ZeroLE(Simd128Register dst, const MemOperand& mem) {
  const Condition zero_field = Condition(0);
  vx(dst, dst, dst, zero_field, zero_field, zero_field);

  const Condition target_element = Condition(1);
  vleg(dst, mem, target_element);
}
#define LOAD_LANE_LIST(V) \
V(64, vleg) \
V(32, vlef) \
@ -4128,15 +4162,21 @@ LOAD_LANE_LIST(LOAD_LANE)
#undef LOAD_LANE
#undef LOAD_LANE_LIST
// StoreLane{8,16,32,64}LE: store one lane of `src` to `mem` with the plain
// vector store-element instruction of matching width.
// Each STORE_LANE_LIST row is (lane width in bits, vector instruction).
#define STORE_LANE_LIST(V) \
  V(64, vsteg) \
  V(32, vstef) \
  V(16, vsteh) \
  V(8, vsteb)

// The displacement is only asserted to pass is_uint12; there is no fallback
// path for larger offsets in this variant.
#define STORE_LANE(name, vector_instr) \
  void TurboAssembler::StoreLane##name##LE(Simd128Register src, \
                                           const MemOperand& mem, int lane) { \
    DCHECK(is_uint12(mem.offset())); \
    vector_instr(src, mem, Condition(lane)); \
  }
STORE_LANE_LIST(STORE_LANE)
#undef STORE_LANE
#undef STORE_LANE_LIST
#endif

View File

@ -409,6 +409,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void LoadLane16LE(Simd128Register dst, const MemOperand& mem, int lane);
void LoadLane32LE(Simd128Register dst, const MemOperand& mem, int lane);
void LoadLane64LE(Simd128Register dst, const MemOperand& mem, int lane);
// Store a single lane of `src` to `mem`. The number in the name is the lane
// width in bits; `lane` selects which lane is written and is forwarded
// unchanged to the underlying vector store instruction.
// NOTE(review): callers appear to pass `lane` already converted to the
// instruction's own element numbering (e.g. `15 - laneidx`) — confirm.
void StoreLane8LE(Simd128Register src, const MemOperand& mem, int lane);
void StoreLane16LE(Simd128Register src, const MemOperand& mem, int lane);
void StoreLane32LE(Simd128Register src, const MemOperand& mem, int lane);
void StoreLane64LE(Simd128Register src, const MemOperand& mem, int lane);
// Load And Test
void LoadAndTest32(Register dst, Register src);

View File

@ -3478,6 +3478,29 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
#undef LOAD_LANE
// S128Store{8,16,32,64}Lane: input 0 is the vector to store from, input 1 is
// the lane index immediate, and the address operands start at input 2.
// `lane_count` is the number of lanes of the given width in the 128-bit
// register; the lane index is mirrored ((lane_count - 1) - index) before it
// is handed to the macro assembler.
#define STORE_LANE(type, lane_count) \
  AddressingMode addressing_mode = kMode_None; \
  size_t address_input = 2; \
  MemOperand target = i.MemoryOperand(&addressing_mode, &address_input); \
  Simd128Register value = i.InputSimd128Register(0); \
  __ StoreLane##type##LE(value, target, ((lane_count)-1) - i.InputUint8(1));
    case kS390_S128Store8Lane: {
      STORE_LANE(8, 16);
      break;
    }
    case kS390_S128Store16Lane: {
      STORE_LANE(16, 8);
      break;
    }
    case kS390_S128Store32Lane: {
      STORE_LANE(32, 4);
      break;
    }
    case kS390_S128Store64Lane: {
      STORE_LANE(64, 2);
      break;
    }
#undef STORE_LANE
case kS390_StoreCompressTagged: {
CHECK(!instr->HasOutput());
size_t index = 0;

View File

@ -388,6 +388,10 @@ namespace compiler {
V(S390_S128Load16Lane) \
V(S390_S128Load32Lane) \
V(S390_S128Load64Lane) \
V(S390_S128Store8Lane) \
V(S390_S128Store16Lane) \
V(S390_S128Store32Lane) \
V(S390_S128Store64Lane) \
V(S390_StoreSimd128) \
V(S390_LoadSimd128) \
V(S390_StoreCompressTagged) \

View File

@ -392,6 +392,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kS390_Push:
case kS390_PushFrame:
case kS390_StoreToStackSlot:
case kS390_S128Store8Lane:
case kS390_S128Store16Lane:
case kS390_S128Store32Lane:
case kS390_S128Store64Lane:
return kHasSideEffect;
case kS390_Word64AtomicExchangeUint64:

View File

@ -2862,8 +2862,32 @@ void InstructionSelector::VisitLoadTransform(Node* node) {
}
// Selects the S390 store-lane opcode that matches the lane representation of
// `node` and emits it. The emitted instruction's inputs are, in order: the
// vector value (node input 2) in a register, the lane index as an immediate,
// and the effective-address operands appended by
// GetEffectiveAddressMemoryOperand. The addressing mode is encoded into the
// opcode.
void InstructionSelector::VisitStoreLane(Node* node) {
  StoreLaneParameters params = StoreLaneParametersOf(node->op());
  InstructionCode opcode;
  switch (params.rep) {
    case MachineRepresentation::kWord8:
      opcode = kS390_S128Store8Lane;
      break;
    case MachineRepresentation::kWord16:
      opcode = kS390_S128Store16Lane;
      break;
    case MachineRepresentation::kWord32:
      opcode = kS390_S128Store32Lane;
      break;
    case MachineRepresentation::kWord64:
      opcode = kS390_S128Store64Lane;
      break;
    default:
      // Only 8/16/32/64-bit lanes are supported.
      UNREACHABLE();
  }

  S390OperandGenerator g(this);
  // NOTE(review): a store would not normally define a value; confirm that an
  // output tied to the first input is really intended here.
  InstructionOperand outputs[] = {g.DefineSameAsFirst(node)};
  InstructionOperand inputs[5];
  size_t input_count = 0;

  inputs[input_count++] = g.UseRegister(node->InputAt(2));
  inputs[input_count++] = g.UseImmediate(params.laneidx);

  AddressingMode mode =
      g.GetEffectiveAddressMemoryOperand(node, inputs, &input_count);
  opcode |= AddressingModeField::encode(mode);
  Emit(opcode, 1, outputs, input_count, inputs);
}
void InstructionSelector::VisitTruncateFloat32ToInt32(Node* node) {

View File

@ -757,7 +757,10 @@ void Simulator::EvalTableInit() {
V(vlrep, VLREP, 0xE705) /* type = VRX VECTOR LOAD AND REPLICATE */ \
V(vrepi, VREPI, 0xE745) /* type = VRI_A VECTOR REPLICATE IMMEDIATE */ \
V(vlr, VLR, 0xE756) /* type = VRR_A VECTOR LOAD */ \
V(vsteb, VSTEB, 0xE708) /* type = VRX VECTOR STORE ELEMENT (8) */ \
V(vsteh, VSTEH, 0xE709) /* type = VRX VECTOR STORE ELEMENT (16) */ \
V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \
V(vsteg, VSTEG, 0xE70A) /* type = VRX VECTOR STORE ELEMENT (64) */ \
V(vleb, VLEB, 0xE701) /* type = VRX VECTOR LOAD ELEMENT (8) */ \
V(vleh, VLEH, 0xE701) /* type = VRX VECTOR LOAD ELEMENT (16) */ \
V(vlef, VLEF, 0xE703) /* type = VRX VECTOR LOAD ELEMENT (32) */ \
@ -3186,6 +3189,24 @@ EVALUATE(VLR) {
return length;
}
// Simulates VSTEB: writes the 8-bit element of vector register r1 selected
// by m3 to the address computed from x2, b2 and d2.
EVALUATE(VSTEB) {
  DCHECK_OPCODE(VSTEB);
  DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
  const intptr_t target = GET_ADDRESS(x2, b2, d2);
  const int8_t element = get_simd_register_by_lane<int8_t>(r1, m3);
  WriteB(target, element);
  return length;
}
// Simulates VSTEH: writes the 16-bit element of vector register r1 selected
// by m3 to the address computed from x2, b2 and d2.
EVALUATE(VSTEH) {
  DCHECK_OPCODE(VSTEH);
  DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
  const intptr_t target = GET_ADDRESS(x2, b2, d2);
  const int16_t element = get_simd_register_by_lane<int16_t>(r1, m3);
  WriteH(target, element);
  return length;
}
EVALUATE(VSTEF) {
DCHECK_OPCODE(VSTEF);
DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
@ -3195,6 +3216,15 @@ EVALUATE(VSTEF) {
return length;
}
// Simulates VSTEG: writes the 64-bit element of vector register r1 selected
// by m3 to the address computed from x2, b2 and d2.
EVALUATE(VSTEG) {
  DCHECK_OPCODE(VSTEG);
  DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
  const intptr_t target = GET_ADDRESS(x2, b2, d2);
  const int64_t element = get_simd_register_by_lane<int64_t>(r1, m3);
  WriteDW(target, element);
  return length;
}
EVALUATE(VLEB) {
DCHECK_OPCODE(VLEB);
DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);