S390 [simd]: Implement vector load and splat

This CL takes advantage of the z15 `load byte reversed element and
replicate` (vlbrrep) instruction to optimize the Simd LoadSplat opcodes.

On the simulator we only emit `load and replicate` (vlrep), as byte
reversal is not required.

We will need to implement the rest of the `load transform` ops
before enabling this from wasm-compiler on BE machines.
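
As a rough scalar model (not part of this CL; the helper name and the
32-bit width are illustrative), a byte-reversed load-and-splat fetches one
element, reverses its bytes, and copies it into every lane of a 128-bit
vector:

#include <cstdint>
#include <cstring>

// Scalar sketch of what vlbrrep does for 32-bit elements. Assumes a
// compiler providing __builtin_bswap32 (GCC/Clang).
void LoadAndSplat32x4Model(uint32_t dst[4], const void* mem) {
  uint32_t elem;
  std::memcpy(&elem, mem, sizeof(elem));  // one element load
  elem = __builtin_bswap32(elem);         // byte reversal (LE <-> BE)
  for (int lane = 0; lane < 4; ++lane) dst[lane] = elem;  // replicate
}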

Change-Id: I81ffedf51c3d35dbbc2a6455a2756cad25434127
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3115142
Reviewed-by: Junliang Yan <junyan@redhat.com>
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Cr-Commit-Position: refs/heads/main@{#76490}
Milad Fa 2021-08-24 18:02:56 -04:00 committed by V8 LUCI CQ
parent 409e02c1dd
commit 3489bdf8d1
7 changed files with 97 additions and 4 deletions

src/codegen/s390/constants-s390.h

@@ -1559,8 +1559,10 @@ using SixByteInstr = uint64_t;
   V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \
   V(vst, VST, 0xE70E) /* type = VRX VECTOR STORE */ \
   V(vlbr, VLBR, 0xE606) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENTS */ \
-  V(vstbr, VSTBR, 0xE60E) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENTS \
-                           */
+  V(vstbr, VSTBR, \
+    0xE60E) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENTS */ \
+  V(vlbrrep, VLBRREP, \
+    0xE605) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT AND REPLICATE */
 
 #define S390_RIE_G_OPCODE_LIST(V) \
   V(lochi, LOCHI, \

src/codegen/s390/macro-assembler-s390.cc

@@ -3924,6 +3924,31 @@ void TurboAssembler::StoreV128LE(Simd128Register src, const MemOperand& mem,
   }
 }
+
+void TurboAssembler::LoadAndSplat8x16LE(Simd128Register dst,
+                                        const MemOperand& mem) {
+  vlrep(dst, mem, Condition(0));
+}
+
+#define LOAD_SPLAT_LIST(V) \
+  V(64x2, LoadU64LE, 3)    \
+  V(32x4, LoadU32LE, 2)    \
+  V(16x8, LoadU16LE, 1)
+#define LOAD_SPLAT(name, scalar_instr, condition)                      \
+  void TurboAssembler::LoadAndSplat##name##LE(Simd128Register dst,     \
+                                              const MemOperand& mem) { \
+    if (CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2) &&         \
+        is_uint12(mem.offset())) {                                     \
+      vlbrrep(dst, mem, Condition(condition));                         \
+      return;                                                          \
+    }                                                                  \
+    scalar_instr(r1, mem);                                             \
+    vlvg(dst, r1, MemOperand(r0, 0), Condition(condition));            \
+    vrep(dst, dst, Operand(0), Condition(condition));                  \
+  }
+LOAD_SPLAT_LIST(LOAD_SPLAT)
+#undef LOAD_SPLAT
+#undef LOAD_SPLAT_LIST
+
 #else
 void TurboAssembler::LoadU64LE(Register dst, const MemOperand& mem,
                                Register scratch) {
@@ -3996,6 +4021,21 @@ void TurboAssembler::StoreV128LE(Simd128Register src, const MemOperand& mem,
   StoreV128(src, mem, scratch1);
 }
+
+#define LOAD_SPLAT_LIST(V) \
+  V(64x2, 3)               \
+  V(32x4, 2)               \
+  V(16x8, 1)               \
+  V(8x16, 0)
+
+#define LOAD_SPLAT(name, condition)                                    \
+  void TurboAssembler::LoadAndSplat##name##LE(Simd128Register dst,     \
+                                              const MemOperand& mem) { \
+    vlrep(dst, mem, Condition(condition));                             \
+  }
+LOAD_SPLAT_LIST(LOAD_SPLAT)
+#undef LOAD_SPLAT
+#undef LOAD_SPLAT_LIST
 #endif
 
 // Load And Test (Reg <- Reg)
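
As a reading aid, here is what one instantiation of the big-endian
LOAD_SPLAT macro above expands to (expansion written out by hand; the
comments are mine, not in the source):

// LOAD_SPLAT(64x2, LoadU64LE, 3) expanded:
void TurboAssembler::LoadAndSplat64x2LE(Simd128Register dst,
                                        const MemOperand& mem) {
  if (CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2) &&
      is_uint12(mem.offset())) {
    // z15 fast path: one instruction loads, byte-reverses and replicates.
    vlbrrep(dst, mem, Condition(3));
    return;
  }
  // Fallback: byte-reversing scalar load, insert into lane 0, then
  // replicate lane 0 across the vector.
  LoadU64LE(r1, mem);
  vlvg(dst, r1, MemOperand(r0, 0), Condition(3));
  vrep(dst, dst, Operand(0), Condition(3));
}

The Condition argument is the instruction's element-size code (0 = byte,
1 = halfword, 2 = word, 3 = doubleword), which is why the list pairs 64x2
with 3, 32x4 with 2, and so on; 8x16 needs no byte reversal, so both
endianness paths use a plain vlrep for it.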

src/codegen/s390/macro-assembler-s390.h

@@ -392,6 +392,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
                  Register scratch1);
   void LoadF64LE(DoubleRegister dst, const MemOperand& opnd, Register scratch);
   void LoadF32LE(DoubleRegister dst, const MemOperand& opnd, Register scratch);
+  void LoadAndSplat64x2LE(Simd128Register dst, const MemOperand& mem);
+  void LoadAndSplat32x4LE(Simd128Register dst, const MemOperand& mem);
+  void LoadAndSplat16x8LE(Simd128Register dst, const MemOperand& mem);
+  void LoadAndSplat8x16LE(Simd128Register dst, const MemOperand& mem);
 
   // Load And Test
   void LoadAndTest32(Register dst, Register src);

src/compiler/backend/s390/code-generator-s390.cc

@@ -2218,6 +2218,28 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ vl(i.OutputSimd128Register(), operand, Condition(0));
       break;
     }
+#define LOAD_SPLAT(type)                               \
+  AddressingMode mode = kMode_None;                    \
+  MemOperand operand = i.MemoryOperand(&mode);         \
+  Simd128Register dst = i.OutputSimd128Register();     \
+  __ LoadAndSplat##type##LE(dst, operand);
+    case kS390_S128Load8Splat: {
+      LOAD_SPLAT(8x16);
+      break;
+    }
+    case kS390_S128Load16Splat: {
+      LOAD_SPLAT(16x8);
+      break;
+    }
+    case kS390_S128Load32Splat: {
+      LOAD_SPLAT(32x4);
+      break;
+    }
+    case kS390_S128Load64Splat: {
+      LOAD_SPLAT(64x2);
+      break;
+    }
+#undef LOAD_SPLAT
     case kS390_StoreWord8:
      ASSEMBLE_STORE_INTEGER(StoreU8);
      break;
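
For reference, substituting the macro by hand, one of the new cases above
is equivalent to (expansion mine):

case kS390_S128Load64Splat: {
  AddressingMode mode = kMode_None;
  MemOperand operand = i.MemoryOperand(&mode);
  Simd128Register dst = i.OutputSimd128Register();
  __ LoadAndSplat64x2LE(dst, operand);
  break;
}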

src/compiler/backend/s390/instruction-codes-s390.h

@@ -372,6 +372,10 @@ namespace compiler {
   V(S390_S128Not)                  \
   V(S390_S128Select)               \
   V(S390_S128AndNot)               \
+  V(S390_S128Load8Splat)           \
+  V(S390_S128Load16Splat)          \
+  V(S390_S128Load32Splat)          \
+  V(S390_S128Load64Splat)          \
   V(S390_StoreSimd128)             \
   V(S390_LoadSimd128)              \
   V(S390_StoreCompressTagged)      \

src/compiler/backend/s390/instruction-scheduler-s390.cc

@@ -359,6 +359,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kS390_LoadDecompressTaggedSigned:
     case kS390_LoadDecompressTaggedPointer:
     case kS390_LoadDecompressAnyTagged:
+    case kS390_S128Load8Splat:
+    case kS390_S128Load16Splat:
+    case kS390_S128Load32Splat:
+    case kS390_S128Load64Splat:
       return kIsLoadOperation;
 
     case kS390_StoreWord8:

src/compiler/backend/s390/instruction-selector-s390.cc

@@ -2793,8 +2793,25 @@ void InstructionSelector::VisitLoadLane(Node* node) {
 }
 
 void InstructionSelector::VisitLoadTransform(Node* node) {
-  // We should never reach here, see http://crrev.com/c/2050811
-  UNREACHABLE();
+  LoadTransformParameters params = LoadTransformParametersOf(node->op());
+  ArchOpcode opcode;
+  switch (params.transformation) {
+    case LoadTransformation::kS128Load8Splat:
+      opcode = kS390_S128Load8Splat;
+      break;
+    case LoadTransformation::kS128Load16Splat:
+      opcode = kS390_S128Load16Splat;
+      break;
+    case LoadTransformation::kS128Load32Splat:
+      opcode = kS390_S128Load32Splat;
+      break;
+    case LoadTransformation::kS128Load64Splat:
+      opcode = kS390_S128Load64Splat;
+      break;
+    default:
+      UNREACHABLE();
+  }
+  VisitLoad(node, node, opcode);
 }
 
 void InstructionSelector::VisitStoreLane(Node* node) {