[wasm-simd] Implement load splat and extends on arm64

Bug: v8:9886
Change-Id: I88a4364596ef529c3873f4c80f36e0bfbe71e022
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1919695
Reviewed-by: Bill Budge <bbudge@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65045}
Author: Ng Zhi An, 2019-11-18 14:35:31 -08:00 (committed by Commit Bot)
parent f067ed8315
commit a8c28fa1bc
6 changed files with 129 additions and 4 deletions
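
Editor's note on semantics: the new operations are the SIMD load-splat and load-extend memory instructions. A load splat reads a single element and replicates it into every lane of the 128-bit result; a load extend reads 64 bits and widens each element to twice its width, signed or unsigned. A minimal scalar model, purely illustrative and not V8 code (the function names below are made up for the example):

#include <cstdint>
#include <cstring>

// S16x8LoadSplat: read one 16-bit value and replicate it into all 8 lanes.
void S16x8LoadSplat(const uint8_t* mem, uint16_t lanes[8]) {
  uint16_t v;
  std::memcpy(&v, mem, sizeof(v));
  for (int i = 0; i < 8; ++i) lanes[i] = v;
}

// I16x8Load8x8S: read 8 bytes and sign-extend each one to a 16-bit lane.
void I16x8Load8x8S(const uint8_t* mem, int16_t lanes[8]) {
  for (int i = 0; i < 8; ++i) {
    lanes[i] = static_cast<int16_t>(static_cast<int8_t>(mem[i]));
  }
}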


@@ -2495,6 +2495,52 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Add(i.OutputRegister32(), i.OutputRegister32(), 1);
break;
}
case kArm64S8x16LoadSplat: {
__ ld1r(i.OutputSimd128Register().V16B(), i.MemoryOperand(0));
break;
}
case kArm64S16x8LoadSplat: {
__ ld1r(i.OutputSimd128Register().V8H(), i.MemoryOperand(0));
break;
}
case kArm64S32x4LoadSplat: {
__ ld1r(i.OutputSimd128Register().V4S(), i.MemoryOperand(0));
break;
}
case kArm64S64x2LoadSplat: {
__ ld1r(i.OutputSimd128Register().V2D(), i.MemoryOperand(0));
break;
}
case kArm64I16x8Load8x8S: {
__ ld1(i.OutputSimd128Register().V8B(), i.MemoryOperand(0));
__ Sxtl(i.OutputSimd128Register().V8H(), i.OutputSimd128Register().V8B());
break;
}
case kArm64I16x8Load8x8U: {
__ ld1(i.OutputSimd128Register().V8B(), i.MemoryOperand(0));
__ Uxtl(i.OutputSimd128Register().V8H(), i.OutputSimd128Register().V8B());
break;
}
case kArm64I32x4Load16x4S: {
__ ld1(i.OutputSimd128Register().V4H(), i.MemoryOperand(0));
__ Sxtl(i.OutputSimd128Register().V4S(), i.OutputSimd128Register().V4H());
break;
}
case kArm64I32x4Load16x4U: {
__ ld1(i.OutputSimd128Register().V4H(), i.MemoryOperand(0));
__ Uxtl(i.OutputSimd128Register().V4S(), i.OutputSimd128Register().V4H());
break;
}
case kArm64I64x2Load32x2S: {
__ ld1(i.OutputSimd128Register().V2S(), i.MemoryOperand(0));
__ Sxtl(i.OutputSimd128Register().V2D(), i.OutputSimd128Register().V2S());
break;
}
case kArm64I64x2Load32x2U: {
__ ld1(i.OutputSimd128Register().V2S(), i.MemoryOperand(0));
__ Uxtl(i.OutputSimd128Register().V2D(), i.OutputSimd128Register().V2S());
break;
}
#define SIMD_REDUCE_OP_CASE(Op, Instr, format, FORMAT) \
case Op: { \
UseScratchRegisterScope scope(tasm()); \

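Editor's note: the splat cases above map onto a single ld1r, which loads one element and replicates it across all lanes; the extend cases load 64 bits into the low half of the destination with ld1 and then widen in-register with sxtl (signed) or uxtl (unsigned). The same pairing expressed with NEON intrinsics, shown only to illustrate the instruction pattern (the code generator emits the instructions directly rather than going through intrinsics):

#include <arm_neon.h>

// I16x8Load8x8S: ld1 {v.8b} followed by sxtl v.8h, v.8b.
int16x8_t Load8x8S(const int8_t* mem) {
  int8x8_t bytes = vld1_s8(mem);  // load 8 bytes into a 64-bit vector
  return vmovl_s8(bytes);         // sign-extend each byte to a 16-bit lane
}

// S16x8LoadSplat: a single ld1r {v.8h}.
uint16x8_t Load16x8Splat(const uint16_t* mem) {
  return vld1q_dup_u16(mem);      // load one 16-bit value and replicate it
}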

@@ -355,6 +355,16 @@ namespace compiler {
V(Arm64S1x8AllTrue) \
V(Arm64S1x16AnyTrue) \
V(Arm64S1x16AllTrue) \
V(Arm64S8x16LoadSplat) \
V(Arm64S16x8LoadSplat) \
V(Arm64S32x4LoadSplat) \
V(Arm64S64x2LoadSplat) \
V(Arm64I16x8Load8x8S) \
V(Arm64I16x8Load8x8U) \
V(Arm64I32x4Load16x4S) \
V(Arm64I32x4Load16x4U) \
V(Arm64I64x2Load32x2S) \
V(Arm64I64x2Load32x2U) \
V(Arm64Word64AtomicLoadUint8) \
V(Arm64Word64AtomicLoadUint16) \
V(Arm64Word64AtomicLoadUint32) \


@@ -345,6 +345,16 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64LdrDecompressTaggedPointer:
case kArm64LdrDecompressAnyTagged:
case kArm64Peek:
case kArm64S8x16LoadSplat:
case kArm64S16x8LoadSplat:
case kArm64S32x4LoadSplat:
case kArm64S64x2LoadSplat:
case kArm64I16x8Load8x8S:
case kArm64I16x8Load8x8U:
case kArm64I32x4Load16x4S:
case kArm64I32x4Load16x4U:
case kArm64I64x2Load32x2S:
case kArm64I64x2Load32x2U:
return kIsLoadOperation;
case kArm64Claim:


@@ -592,6 +592,66 @@ void EmitLoad(InstructionSelector* selector, Node* node, InstructionCode opcode,
selector->Emit(opcode, arraysize(outputs), outputs, input_count, inputs);
}
void InstructionSelector::VisitLoadTransform(Node* node) {
LoadTransformParameters params = LoadTransformParametersOf(node->op());
InstructionCode opcode = kArchNop;
switch (params.transformation) {
case LoadTransformation::kS8x16LoadSplat:
opcode = kArm64S8x16LoadSplat;
break;
case LoadTransformation::kS16x8LoadSplat:
opcode = kArm64S16x8LoadSplat;
break;
case LoadTransformation::kS32x4LoadSplat:
opcode = kArm64S32x4LoadSplat;
break;
case LoadTransformation::kS64x2LoadSplat:
opcode = kArm64S64x2LoadSplat;
break;
case LoadTransformation::kI16x8Load8x8S:
opcode = kArm64I16x8Load8x8S;
break;
case LoadTransformation::kI16x8Load8x8U:
opcode = kArm64I16x8Load8x8U;
break;
case LoadTransformation::kI32x4Load16x4S:
opcode = kArm64I32x4Load16x4S;
break;
case LoadTransformation::kI32x4Load16x4U:
opcode = kArm64I32x4Load16x4U;
break;
case LoadTransformation::kI64x2Load32x2S:
opcode = kArm64I64x2Load32x2S;
break;
case LoadTransformation::kI64x2Load32x2U:
opcode = kArm64I64x2Load32x2U;
break;
default:
UNIMPLEMENTED();
}
// ARM64 supports unaligned loads
DCHECK_NE(params.kind, LoadKind::kUnaligned);
Arm64OperandGenerator g(this);
Node* base = node->InputAt(0);
Node* index = node->InputAt(1);
InstructionOperand inputs[2];
InstructionOperand outputs[1];
inputs[0] = g.UseRegister(base);
inputs[1] = g.UseRegister(index);
outputs[0] = g.DefineAsRegister(node);
// ld1r uses post-index, so construct address first.
// TODO(v8:9886) If index can be immediate, use vldr without this add.
InstructionOperand addr = g.TempRegister();
Emit(kArm64Add, 1, &addr, 2, inputs);
inputs[0] = addr;
inputs[1] = g.TempImmediate(0);
opcode |= AddressingModeField::encode(kMode_MRI);
Emit(opcode, 1, outputs, 2, inputs);
}
void InstructionSelector::VisitLoad(Node* node) {
InstructionCode opcode = kArchNop;
ImmediateMode immediate_mode = kNoImmediate;

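Editor's note on the addressing-mode detail: ld1r and ld1 only accept a plain base register (or post-index), so VisitLoadTransform cannot fold the index into the load. It first emits a kArm64Add to materialise base + index in a temporary, then issues the load with a zero immediate offset (kMode_MRI). For an I32x4Load16x4S with the base in x0 and the index in x1, the resulting machine code is roughly (placeholder registers, illustrative only):

// add  x2, x0, x1      // kArm64Add: base + index into a temp register
// ld1  {v0.4h}, [x2]   // the transforming load, kMode_MRI with a #0 offset
// sxtl v0.4s, v0.4h    // sign-extend each 16-bit element to 32 bits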

@@ -2614,7 +2614,6 @@ void InstructionSelector::VisitI64x2ReplaceLaneI32Pair(Node* node) {
#endif // !V8_TARGET_ARCH_IA32
#if !V8_TARGET_ARCH_X64
void InstructionSelector::VisitLoadTransform(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2SConvertI64x2(Node* node) {
UNIMPLEMENTED();
}
@@ -2622,6 +2621,7 @@ void InstructionSelector::VisitF64x2UConvertI64x2(Node* node) {
UNIMPLEMENTED();
}
#if !V8_TARGET_ARCH_ARM64
void InstructionSelector::VisitLoadTransform(Node* node) { UNIMPLEMENTED(); }
#if !V8_TARGET_ARCH_IA32
void InstructionSelector::VisitF64x2Min(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Max(Node* node) { UNIMPLEMENTED(); }


@@ -3273,7 +3273,7 @@ WASM_SIMD_TEST(SimdLoadStoreLoadMemargOffset) {
}
}
#if V8_TARGET_ARCH_X64
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
template <typename T>
void RunLoadSplatTest(ExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode op) {
@@ -3352,7 +3352,6 @@ WASM_SIMD_TEST_NO_LOWERING(I16x8Load8x8S) {
RunLoadExtendTest<int8_t, int16_t>(execution_tier, lower_simd,
kExprI16x8Load8x8S);
}
WASM_SIMD_TEST_NO_LOWERING(I32x4Load16x4U) {
RunLoadExtendTest<uint16_t, uint32_t>(execution_tier, lower_simd,
kExprI32x4Load16x4U);
@@ -3372,7 +3371,7 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2Load32x2S) {
RunLoadExtendTest<int32_t, int64_t>(execution_tier, lower_simd,
kExprI64x2Load32x2S);
}
#endif // V8_TARGET_ARCH_X64
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64 || \
V8_TARGET_ARCH_ARM
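
Editor's note: the tests enabled above follow a simple pattern: RunLoadSplatTest writes one scalar into wasm memory, runs a module that performs the splat load, and checks that every lane of the result equals that scalar; RunLoadExtendTest does the analogous check for each narrow/wide type pair. A stripped-down sketch of the lane check only, not the actual cctest harness:

#include <cassert>

// Conceptual per-lane check after a splat load: every lane must equal the
// scalar that was placed in memory. This stands in for the cctest helpers;
// it is not the real RunLoadSplatTest body.
template <typename T, int kLanes>
void CheckSplatLanes(T expected, const T (&lanes)[kLanes]) {
  for (int i = 0; i < kLanes; ++i) assert(lanes[i] == expected);
}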