S390 [simd]: Implement vector store lane
This CL takes advantage of the z15 `store byte reverse element` instructions
to optimize Simd StoreLane opcodes. On the simulator we only run
`store element` as reversing is not required.

Change-Id: I723f6db535799470c46a1e298a9c1af7574ad5b6
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3144373
Reviewed-by: Junliang Yan <junyan@redhat.com>
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Cr-Commit-Position: refs/heads/main@{#76734}
parent b435c60bda · commit 379cb97600
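In outline: on z15 (VECTOR ENHANCEMENTS FACILITY 2) a lane store becomes a
single byte-reversed element store, while older hardware falls back to a lane
extract plus a byte-reversed scalar store. A sketch of the two paths for a
32-bit lane, in TurboAssembler-style C++ (`masm` and the register choices are
illustrative; the real macro is in the macro-assembler hunk below):

// Sketch of the two emission strategies StoreLane32LE chooses between.
if (CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2) &&
    is_uint12(mem.offset())) {
  // z15: VECTOR STORE BYTE REVERSED ELEMENT (32) selects the lane and
  // swaps bytes in a single instruction.
  masm->vstebrf(src, mem, Condition(lane));
} else {
  // Older hardware: extract the lane into a GPR, then store byte-reversed.
  masm->vlgv(r1, src, MemOperand(r0, lane), Condition(2));
  masm->StoreU32LE(r1, mem);
}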
@@ -1553,14 +1553,7 @@ using SixByteInstr = uint64_t;
   V(vlrep, VLREP, 0xE705) /* type = VRX VECTOR LOAD AND REPLICATE */ \
   V(vl, VL, 0xE706) /* type = VRX VECTOR LOAD */ \
   V(vlbb, VLBB, 0xE707) /* type = VRX VECTOR LOAD TO BLOCK BOUNDARY */ \
-  V(vsteb, VSTEB, 0xE708) /* type = VRX VECTOR STORE ELEMENT (8) */ \
-  V(vsteh, VSTEH, 0xE709) /* type = VRX VECTOR STORE ELEMENT (16) */ \
-  V(vsteg, VSTEG, 0xE70A) /* type = VRX VECTOR STORE ELEMENT (64) */ \
-  V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \
-  V(vst, VST, 0xE70E) /* type = VRX VECTOR STORE */ \
   V(vlbr, VLBR, 0xE606) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENTS */ \
-  V(vstbr, VSTBR, \
-    0xE60E) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENTS */ \
   V(vlbrrep, VLBRREP, \
     0xE605) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT AND REPLICATE */ \
   V(vlebrh, VLEBRH, \
@@ -1568,7 +1561,20 @@ using SixByteInstr = uint64_t;
   V(vlebrf, VLEBRF, \
     0xE603) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (32) */ \
   V(vlebrg, VLEBRG, \
-    0xE602) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (64) */
+    0xE602) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (64) */ \
+  V(vsteb, VSTEB, 0xE708) /* type = VRX VECTOR STORE ELEMENT (8) */ \
+  V(vsteh, VSTEH, 0xE709) /* type = VRX VECTOR STORE ELEMENT (16) */ \
+  V(vsteg, VSTEG, 0xE70A) /* type = VRX VECTOR STORE ELEMENT (64) */ \
+  V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \
+  V(vst, VST, 0xE70E) /* type = VRX VECTOR STORE */ \
+  V(vstbr, VSTBR, \
+    0xE60E) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENTS */ \
+  V(vstebrh, VSTEBRH, \
+    0xE609) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENT (16) */ \
+  V(vstebrf, VSTEBRF, \
+    0xE60B) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENT (32) */ \
+  V(vstebrg, VSTEBRG, \
+    0xE60A) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENT (64) */
 
 #define S390_RIE_G_OPCODE_LIST(V) \
   V(lochi, LOCHI, \
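For readers unfamiliar with the pattern, each opcode table above is an
X-macro: the same list is instantiated with different definitions of `V` to
generate enums, emitters, and disassembler entries from one source of truth.
A minimal self-contained sketch of the idea — `DEMO_VRX_OPCODE_LIST` and both
expansions are illustrative assumptions, not V8's real ones:

#include <cstdint>
#include <cstdio>

#define DEMO_VRX_OPCODE_LIST(V)                                    \
  V(vsteb, VSTEB, 0xE708)     /* VECTOR STORE ELEMENT (8) */       \
  V(vstebrf, VSTEBRF, 0xE60B) /* VECTOR STORE BYTE REV ELEM (32) */

// Expansion 1: an enum of opcode values.
enum class Op : uint16_t {
#define DECLARE_ENUM(name, op_name, op_value) op_name = op_value,
  DEMO_VRX_OPCODE_LIST(DECLARE_ENUM)
#undef DECLARE_ENUM
};

// Expansion 2: a name table for disassembler-style lookup.
const char* OpName(Op op) {
#define DECLARE_CASE(name, op_name, op_value) \
  if (op == Op::op_name) return #name;
  DEMO_VRX_OPCODE_LIST(DECLARE_CASE)
#undef DECLARE_CASE
  return "unknown";
}

int main() { std::printf("%s\n", OpName(Op::VSTEBRF)); }  // prints "vstebrf"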
@@ -4005,6 +4005,30 @@ LOAD_LANE_LIST(LOAD_LANE)
 #undef LOAD_LANE
 #undef LOAD_LANE_LIST
 
+void TurboAssembler::StoreLane8LE(Simd128Register src, const MemOperand& mem,
+                                  int index) {
+  vsteb(src, mem, Condition(index));
+}
+#define STORE_LANE_LIST(V)      \
+  V(64, vstebrg, StoreU64LE, 3) \
+  V(32, vstebrf, StoreU32LE, 2) \
+  V(16, vstebrh, StoreU16LE, 1)
+
+#define STORE_LANE(name, vector_instr, scalar_instr, condition)               \
+  void TurboAssembler::StoreLane##name##LE(Simd128Register src,               \
+                                           const MemOperand& mem, int lane) { \
+    if (CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2) &&                \
+        is_uint12(mem.offset())) {                                            \
+      vector_instr(src, mem, Condition(lane));                                \
+      return;                                                                 \
+    }                                                                         \
+    vlgv(r1, src, MemOperand(r0, lane), Condition(condition));                \
+    scalar_instr(r1, mem);                                                    \
+  }
+STORE_LANE_LIST(STORE_LANE)
+#undef STORE_LANE
+#undef STORE_LANE_LIST
+
 #else
 void TurboAssembler::LoadU64LE(Register dst, const MemOperand& mem,
                                Register scratch) {
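Two details worth noting in the hunk above. StoreLane8LE needs no facility
check: a single byte has no internal byte order, so plain `vsteb` is
endian-correct on any hardware. And the macro's `condition` argument is the
VLGV element-size code, i.e. log2 of the element width in bytes (3 for
64-bit, 2 for 32-bit, 1 for 16-bit). Hand-expanding
`STORE_LANE(16, vstebrh, StoreU16LE, 1)` makes the shape concrete (inferred
from the macro, not copied from V8):

void TurboAssembler::StoreLane16LE(Simd128Register src, const MemOperand& mem,
                                   int lane) {
  if (CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2) &&
      is_uint12(mem.offset())) {
    vstebrh(src, mem, Condition(lane));  // z15 single-instruction path
    return;
  }
  vlgv(r1, src, MemOperand(r0, lane), Condition(1));  // extract 16-bit lane
  StoreU16LE(r1, mem);                                // byte-reversed store
}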
@@ -4112,6 +4136,16 @@ LOAD_EXTEND_LIST(LOAD_EXTEND)
 #undef LOAD_EXTEND
 #undef LOAD_EXTEND_LIST
 
+void TurboAssembler::LoadV32ZeroLE(Simd128Register dst, const MemOperand& mem) {
+  vx(dst, dst, dst, Condition(0), Condition(0), Condition(0));
+  vlef(dst, mem, Condition(3));
+}
+
+void TurboAssembler::LoadV64ZeroLE(Simd128Register dst, const MemOperand& mem) {
+  vx(dst, dst, dst, Condition(0), Condition(0), Condition(0));
+  vleg(dst, mem, Condition(1));
+}
+
 #define LOAD_LANE_LIST(V) \
   V(64, vleg)             \
   V(32, vlef)             \
@@ -4128,15 +4162,21 @@ LOAD_LANE_LIST(LOAD_LANE)
 #undef LOAD_LANE
 #undef LOAD_LANE_LIST
 
-void TurboAssembler::LoadV32ZeroLE(Simd128Register dst, const MemOperand& mem) {
-  vx(dst, dst, dst, Condition(0), Condition(0), Condition(0));
-  vlef(dst, mem, Condition(3));
-}
+#define STORE_LANE_LIST(V) \
+  V(64, vsteg)             \
+  V(32, vstef)             \
+  V(16, vsteh)             \
+  V(8, vsteb)
 
-void TurboAssembler::LoadV64ZeroLE(Simd128Register dst, const MemOperand& mem) {
-  vx(dst, dst, dst, Condition(0), Condition(0), Condition(0));
-  vleg(dst, mem, Condition(1));
-}
+#define STORE_LANE(name, vector_instr)                                        \
+  void TurboAssembler::StoreLane##name##LE(Simd128Register src,               \
+                                           const MemOperand& mem, int lane) { \
+    DCHECK(is_uint12(mem.offset()));                                          \
+    vector_instr(src, mem, Condition(lane));                                  \
+  }
+STORE_LANE_LIST(STORE_LANE)
+#undef STORE_LANE
+#undef STORE_LANE_LIST
 
 #endif
@@ -409,6 +409,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   void LoadLane16LE(Simd128Register dst, const MemOperand& mem, int lane);
   void LoadLane32LE(Simd128Register dst, const MemOperand& mem, int lane);
   void LoadLane64LE(Simd128Register dst, const MemOperand& mem, int lane);
+  void StoreLane8LE(Simd128Register src, const MemOperand& mem, int lane);
+  void StoreLane16LE(Simd128Register src, const MemOperand& mem, int lane);
+  void StoreLane32LE(Simd128Register src, const MemOperand& mem, int lane);
+  void StoreLane64LE(Simd128Register src, const MemOperand& mem, int lane);
 
   // Load And Test
   void LoadAndTest32(Register dst, Register src);
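A minimal usage sketch for the new declarations (the wrapper function and
register names here are assumptions; the real call sites are in the
code-generator hunk below):

void EmitStoreLaneExample(TurboAssembler* tasm, Simd128Register src,
                          Register base) {
  // Stores machine element 1 of the 32-bit lanes of `src` to [base + 8].
  // The caller passes a machine element index: the code generator mirrors
  // the wasm lane index first (e.g. 3 - laneidx for 32-bit lanes).
  tasm->StoreLane32LE(src, MemOperand(base, 8), 1);
}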
@@ -3478,6 +3478,29 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
 #undef LOAD_LANE
+#define STORE_LANE(type, lane)                         \
+  AddressingMode mode = kMode_None;                    \
+  size_t index = 2;                                    \
+  MemOperand operand = i.MemoryOperand(&mode, &index); \
+  Simd128Register src = i.InputSimd128Register(0);     \
+  __ StoreLane##type##LE(src, operand, lane);
+    case kS390_S128Store8Lane: {
+      STORE_LANE(8, 15 - i.InputUint8(1));
+      break;
+    }
+    case kS390_S128Store16Lane: {
+      STORE_LANE(16, 7 - i.InputUint8(1));
+      break;
+    }
+    case kS390_S128Store32Lane: {
+      STORE_LANE(32, 3 - i.InputUint8(1));
+      break;
+    }
+    case kS390_S128Store64Lane: {
+      STORE_LANE(64, 1 - i.InputUint8(1));
+      break;
+    }
+#undef STORE_LANE
     case kS390_StoreCompressTagged: {
       CHECK(!instr->HasOutput());
       size_t index = 0;
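The `15 -`, `7 -`, `3 -`, `1 -` arithmetic above mirrors the wasm lane index
into the machine element index: wasm numbers lanes from the little-endian
end, while element 0 of an s390 vector register is the most significant. A
tiny self-contained check of that mapping (illustrative, not V8 code):

#include <cassert>

// Wasm lane index -> s390 vector element index for a 128-bit register
// split into num_lanes elements (16, 8, 4, or 2).
int MirrorLane(int num_lanes, int wasm_lane) {
  return (num_lanes - 1) - wasm_lane;
}

int main() {
  assert(MirrorLane(16, 0) == 15);  // Store8Lane: wasm lane 0 -> element 15
  assert(MirrorLane(8, 7) == 0);    // Store16Lane: wasm lane 7 -> element 0
  assert(MirrorLane(4, 1) == 2);    // Store32Lane: wasm lane 1 -> element 2
  assert(MirrorLane(2, 0) == 1);    // Store64Lane: wasm lane 0 -> element 1
  return 0;
}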
@@ -388,6 +388,10 @@ namespace compiler {
   V(S390_S128Load16Lane)           \
   V(S390_S128Load32Lane)           \
   V(S390_S128Load64Lane)           \
+  V(S390_S128Store8Lane)           \
+  V(S390_S128Store16Lane)          \
+  V(S390_S128Store32Lane)          \
+  V(S390_S128Store64Lane)          \
   V(S390_StoreSimd128)             \
   V(S390_LoadSimd128)              \
   V(S390_StoreCompressTagged)      \
@@ -392,6 +392,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kS390_Push:
     case kS390_PushFrame:
     case kS390_StoreToStackSlot:
+    case kS390_S128Store8Lane:
+    case kS390_S128Store16Lane:
+    case kS390_S128Store32Lane:
+    case kS390_S128Store64Lane:
       return kHasSideEffect;
 
     case kS390_Word64AtomicExchangeUint64:
@@ -2862,8 +2862,32 @@ void InstructionSelector::VisitLoadTransform(Node* node) {
 }
 
 void InstructionSelector::VisitStoreLane(Node* node) {
-  // We should never reach here, see http://crrev.com/c/2577820
-  UNREACHABLE();
+  StoreLaneParameters params = StoreLaneParametersOf(node->op());
+  InstructionCode opcode;
+  if (params.rep == MachineRepresentation::kWord8) {
+    opcode = kS390_S128Store8Lane;
+  } else if (params.rep == MachineRepresentation::kWord16) {
+    opcode = kS390_S128Store16Lane;
+  } else if (params.rep == MachineRepresentation::kWord32) {
+    opcode = kS390_S128Store32Lane;
+  } else if (params.rep == MachineRepresentation::kWord64) {
+    opcode = kS390_S128Store64Lane;
+  } else {
+    UNREACHABLE();
+  }
+
+  S390OperandGenerator g(this);
+  InstructionOperand outputs[] = {g.DefineSameAsFirst(node)};
+  InstructionOperand inputs[5];
+  size_t input_count = 0;
+
+  inputs[input_count++] = g.UseRegister(node->InputAt(2));
+  inputs[input_count++] = g.UseImmediate(params.laneidx);
+
+  AddressingMode mode =
+      g.GetEffectiveAddressMemoryOperand(node, inputs, &input_count);
+  opcode |= AddressingModeField::encode(mode);
+  Emit(opcode, 1, outputs, input_count, inputs);
 }
 
 void InstructionSelector::VisitTruncateFloat32ToInt32(Node* node) {
@@ -757,7 +757,10 @@ void Simulator::EvalTableInit() {
   V(vlrep, VLREP, 0xE705) /* type = VRX VECTOR LOAD AND REPLICATE */ \
   V(vrepi, VREPI, 0xE745) /* type = VRI_A VECTOR REPLICATE IMMEDIATE */ \
   V(vlr, VLR, 0xE756) /* type = VRR_A VECTOR LOAD */ \
+  V(vsteb, VSTEB, 0xE708) /* type = VRX VECTOR STORE ELEMENT (8) */ \
+  V(vsteh, VSTEH, 0xE709) /* type = VRX VECTOR STORE ELEMENT (16) */ \
   V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \
+  V(vsteg, VSTEG, 0xE70A) /* type = VRX VECTOR STORE ELEMENT (64) */ \
   V(vleb, VLEB, 0xE700) /* type = VRX VECTOR LOAD ELEMENT (8) */ \
   V(vleh, VLEH, 0xE701) /* type = VRX VECTOR LOAD ELEMENT (16) */ \
   V(vlef, VLEF, 0xE703) /* type = VRX VECTOR LOAD ELEMENT (32) */ \
@@ -3186,6 +3189,24 @@ EVALUATE(VLR) {
   return length;
 }
 
+EVALUATE(VSTEB) {
+  DCHECK_OPCODE(VSTEB);
+  DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
+  intptr_t addr = GET_ADDRESS(x2, b2, d2);
+  int8_t value = get_simd_register_by_lane<int8_t>(r1, m3);
+  WriteB(addr, value);
+  return length;
+}
+
+EVALUATE(VSTEH) {
+  DCHECK_OPCODE(VSTEH);
+  DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
+  intptr_t addr = GET_ADDRESS(x2, b2, d2);
+  int16_t value = get_simd_register_by_lane<int16_t>(r1, m3);
+  WriteH(addr, value);
+  return length;
+}
+
 EVALUATE(VSTEF) {
   DCHECK_OPCODE(VSTEF);
   DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
@@ -3195,6 +3216,15 @@ EVALUATE(VSTEF) {
   return length;
 }
 
+EVALUATE(VSTEG) {
+  DCHECK_OPCODE(VSTEG);
+  DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
+  intptr_t addr = GET_ADDRESS(x2, b2, d2);
+  int64_t value = get_simd_register_by_lane<int64_t>(r1, m3);
+  WriteDW(addr, value);
+  return length;
+}
+
 EVALUATE(VLEB) {
   DCHECK_OPCODE(VLEB);
   DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
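All four simulator handlers above share one shape: decode the VRX fields,
form the effective address, read the selected element, write it to memory. A
stripped-down model of that flow (plain C++; the register layout and helper
names are simplifications of the simulator's own get_simd_register_by_lane /
WriteW machinery, not V8 code):

#include <cstdint>
#include <cstring>

// Simplified stand-in for the simulator's vector register file. Layout
// assumption for illustration: element 0 occupies the lowest-indexed bytes.
struct Simd128 {
  uint8_t bytes[16];
};

template <typename T>
T GetLane(const Simd128& reg, int lane) {
  T value;
  std::memcpy(&value, reg.bytes + lane * sizeof(T), sizeof(T));
  return value;
}

// Models EVALUATE(VSTEF): store the 32-bit element selected by m3 to addr.
void SimulateVstef(const Simd128& r1, uint8_t* addr, int m3) {
  uint32_t value = GetLane<uint32_t>(r1, m3);
  std::memcpy(addr, &value, sizeof(value));  // WriteW equivalent
}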