From 7f770766d8b898943c5bb9c63b8cd0474b898d44 Mon Sep 17 00:00:00 2001
From: Zhi An Ng <zhin@chromium.org>
Date: Fri, 20 Nov 2020 01:06:13 +0000
Subject: [PATCH] [wasm-simd][arm64] Prototype prefetch

Prototype two prefetch instructions (temporal and non-temporal) on
arm64 and in the interpreter. Add prfm to the assembler, and use
MiscField to encode the two versions. Small tweak to the simulator to
handle these new instructions (no-op). The interpreter implementation
just pops the memory index and does nothing.

Simple test cases are added for the two new instructions, as well as a
prefetch with an OOB index, which should not trap.

Bug: v8:11168
Change-Id: Ieced8081615d07f950d6d4c1128d1bc6a75839fd
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2543167
Reviewed-by: Bill Budge <bbudge@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Reviewed-by: Tobias Tebbi <tebbi@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71353}
---
 src/codegen/arm64/assembler-arm64.cc          | 31 ++++++++++
 src/codegen/arm64/assembler-arm64.h           |  3 +
 src/codegen/arm64/constants-arm64.h           | 25 ++++
 .../backend/arm64/code-generator-arm64.cc     |  4 ++
 .../backend/arm64/instruction-codes-arm64.h   |  1 +
 .../arm64/instruction-scheduler-arm64.cc      |  1 +
 .../arm64/instruction-selector-arm64.cc       | 20 +++++++
 src/compiler/backend/instruction-selector.cc  | 12 ++++
 src/compiler/machine-operator.cc              | 12 ++++
 src/compiler/machine-operator.h               |  3 +
 src/compiler/opcodes.h                        |  2 +
 src/compiler/wasm-compiler.cc                 | 11 ++++
 src/compiler/wasm-compiler.h                  |  2 +
 src/diagnostics/arm64/disasm-arm64.cc         | 17 ++++--
 src/execution/arm64/simulator-arm64.cc        | 14 ++++-
 src/wasm/baseline/liftoff-compiler.cc         |  6 ++
 src/wasm/function-body-decoder-impl.h         | 23 +++++++-
 src/wasm/graph-builder-interface.cc           |  6 ++
 src/wasm/wasm-opcodes-inl.h                   |  3 +
 src/wasm/wasm-opcodes.h                       |  5 +-
 test/cctest/test-assembler-arm64.cc           | 57 +++++++++++++++++++
 test/cctest/test-disasm-arm64.cc              | 18 ++++++
 test/cctest/wasm/test-run-wasm-simd.cc        | 54 ++++++++++++++++++
 test/common/wasm/wasm-interpreter.cc          | 10 ++++
 24 files changed, 331 insertions(+), 9 deletions(-)

diff --git a/src/codegen/arm64/assembler-arm64.cc b/src/codegen/arm64/assembler-arm64.cc
index 4aaa413d2d..e825a01429 100644
--- a/src/codegen/arm64/assembler-arm64.cc
+++ b/src/codegen/arm64/assembler-arm64.cc
@@ -1414,6 +1414,37 @@ void Assembler::stlxrh(const Register& rs, const Register& rt,
   Emit(STLXR_h | Rs(rs) | Rt2(x31) | RnSP(rn) | Rt(rt));
 }
 
+void Assembler::prfm(int prfop, const MemOperand& addr) {
+  // Restricted support for prfm, only register offset.
+  // This can probably be merged with Assembler::LoadStore as we expand
+  // support.
+  DCHECK(addr.IsRegisterOffset());
+  DCHECK(is_uint5(prfop));
+  Instr memop = PRFM | prfop | RnSP(addr.base());
+
+  Extend ext = addr.extend();
+  Shift shift = addr.shift();
+  unsigned shift_amount = addr.shift_amount();
+
+  // LSL is encoded in the option field as UXTX.
+  if (shift == LSL) {
+    ext = UXTX;
+  }
+
+  // Shifts are encoded in one bit, indicating a left shift by the memory
+  // access size.
+  DCHECK((shift_amount == 0) ||
+         (shift_amount == static_cast<unsigned>(CalcLSDataSize(PRFM))));
+
+  Emit(LoadStoreRegisterOffsetFixed | memop | Rm(addr.regoffset()) |
+       ExtendMode(ext) | ImmShiftLS((shift_amount > 0) ? 1 : 0));
+}
+
+void Assembler::prfm(PrefetchOperation prfop, const MemOperand& addr) {
+  // Restricted support for prfm, only register offset.
+  // This can probably be merged with Assembler::LoadStore as we expand
+  // support.
+  prfm(static_cast<int>(prfop), addr);
+}
+
 void Assembler::NEON3DifferentL(const VRegister& vd, const VRegister& vn,
                                 const VRegister& vm, NEON3DifferentOp vop) {
   DCHECK(AreSameFormat(vn, vm));
diff --git a/src/codegen/arm64/assembler-arm64.h b/src/codegen/arm64/assembler-arm64.h
index 6bebb83786..481ecc41c7 100644
--- a/src/codegen/arm64/assembler-arm64.h
+++ b/src/codegen/arm64/assembler-arm64.h
@@ -880,6 +880,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   // Store-release exclusive half-word.
   void stlxrh(const Register& rs, const Register& rt, const Register& rn);
 
+  void prfm(int prfop, const MemOperand& addr);
+  void prfm(PrefetchOperation prfop, const MemOperand& addr);
+
   // Move instructions. The default shift of -1 indicates that the move
   // instruction will calculate an appropriate 16-bit immediate and left shift
   // that is equal to the 64-bit immediate argument. If an explicit left shift
diff --git a/src/codegen/arm64/constants-arm64.h b/src/codegen/arm64/constants-arm64.h
index 52790b9faf..940216fc94 100644
--- a/src/codegen/arm64/constants-arm64.h
+++ b/src/codegen/arm64/constants-arm64.h
@@ -159,6 +159,9 @@ using float16 = uint16_t;
   /* store second source. */                           \
   V_(Rs, 20, 16, Bits) /* Store-exclusive status */    \
   V_(PrefetchMode, 4, 0, Bits)                         \
+  V_(PrefetchHint, 4, 3, Bits)                         \
+  V_(PrefetchTarget, 2, 1, Bits)                       \
+  V_(PrefetchStream, 0, 0, Bits)                       \
                                                        \
   /* Common bits */                                    \
   V_(SixtyFourBits, 31, 31, Bits)                      \
@@ -216,6 +219,7 @@ using float16 = uint16_t;
   V_(LSOpc, 23, 22, Bits)                              \
   V_(LSVector, 26, 26, Bits)                           \
   V_(LSSize, 31, 30, Bits)                             \
+  V_(ImmPrefetchOperation, 4, 0, Bits)                 \
                                                        \
   /* NEON generic fields */                            \
   V_(NEONQ, 30, 30, Bits)                              \
@@ -443,6 +447,27 @@ enum SystemRegister {
   ImmSystemRegister_offset
 };
 
+enum PrefetchOperation {
+  PLDL1KEEP = 0x00,
+  PLDL1STRM = 0x01,
+  PLDL2KEEP = 0x02,
+  PLDL2STRM = 0x03,
+  PLDL3KEEP = 0x04,
+  PLDL3STRM = 0x05,
+  PLIL1KEEP = 0x08,
+  PLIL1STRM = 0x09,
+  PLIL2KEEP = 0x0a,
+  PLIL2STRM = 0x0b,
+  PLIL3KEEP = 0x0c,
+  PLIL3STRM = 0x0d,
+  PSTL1KEEP = 0x10,
+  PSTL1STRM = 0x11,
+  PSTL2KEEP = 0x12,
+  PSTL2STRM = 0x13,
+  PSTL3KEEP = 0x14,
+  PSTL3STRM = 0x15,
+};
+
 // Instruction enumerations.
 //
 // These are the masks that define a class of instructions, and the list of
diff --git a/src/compiler/backend/arm64/code-generator-arm64.cc b/src/compiler/backend/arm64/code-generator-arm64.cc
index 11dd5d4411..8a4965bdf4 100644
--- a/src/compiler/backend/arm64/code-generator-arm64.cc
+++ b/src/compiler/backend/arm64/code-generator-arm64.cc
@@ -1442,6 +1442,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       }
       break;
     }
+    case kArm64Prfm: {
+      __ prfm(MiscField::decode(opcode), i.MemoryOperand(0));
+      break;
+    }
     case kArm64Clz:
       __ Clz(i.OutputRegister64(), i.InputRegister64(0));
       break;
diff --git a/src/compiler/backend/arm64/instruction-codes-arm64.h b/src/compiler/backend/arm64/instruction-codes-arm64.h
index 7eafb7f39e..102003c88d 100644
--- a/src/compiler/backend/arm64/instruction-codes-arm64.h
+++ b/src/compiler/backend/arm64/instruction-codes-arm64.h
@@ -93,6 +93,7 @@ namespace compiler {
   V(Arm64Poke)                     \
   V(Arm64PokePair)                 \
   V(Arm64Peek)                     \
+  V(Arm64Prfm)                     \
   V(Arm64Float32Cmp)               \
   V(Arm64Float32Add)               \
   V(Arm64Float32Sub)               \
diff --git a/src/compiler/backend/arm64/instruction-scheduler-arm64.cc b/src/compiler/backend/arm64/instruction-scheduler-arm64.cc
index ce96f7c35f..d75f53f34f 100644
--- a/src/compiler/backend/arm64/instruction-scheduler-arm64.cc
+++ b/src/compiler/backend/arm64/instruction-scheduler-arm64.cc
@@ -383,6 +383,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kArm64StrCompressTagged:
     case kArm64DmbIsh:
     case kArm64DsbIsb:
+    case kArm64Prfm:
       return kHasSideEffect;
 
     case kArm64Word64AtomicLoadUint8:
diff --git a/src/compiler/backend/arm64/instruction-selector-arm64.cc b/src/compiler/backend/arm64/instruction-selector-arm64.cc
index 737deba2de..bb866e68d4 100644
--- a/src/compiler/backend/arm64/instruction-selector-arm64.cc
+++ b/src/compiler/backend/arm64/instruction-selector-arm64.cc
@@ -617,6 +617,26 @@ void EmitLoad(InstructionSelector* selector, Node* node, InstructionCode opcode,
   selector->Emit(opcode, arraysize(outputs), outputs, input_count, inputs);
 }
 
+void InstructionSelector::VisitPrefetchTemporal(Node* node) {
+  Arm64OperandGenerator g(this);
+  InstructionOperand inputs[2] = {g.UseRegister(node->InputAt(0)),
+                                  g.UseRegister(node->InputAt(1))};
+  InstructionCode opcode = kArm64Prfm;
+  opcode |= AddressingModeField::encode(kMode_MRR);
+  opcode |= MiscField::encode(PLDL1KEEP);
+  Emit(opcode, 0, nullptr, 2, inputs);
+}
+
+void InstructionSelector::VisitPrefetchNonTemporal(Node* node) {
+  Arm64OperandGenerator g(this);
+  InstructionOperand inputs[2] = {g.UseRegister(node->InputAt(0)),
+                                  g.UseRegister(node->InputAt(1))};
+  InstructionCode opcode = kArm64Prfm;
+  opcode |= AddressingModeField::encode(kMode_MRR);
+  opcode |= MiscField::encode(PLDL1STRM);
+  Emit(opcode, 0, nullptr, 2, inputs);
+}
+
 void InstructionSelector::VisitLoadTransform(Node* node) {
   LoadTransformParameters params = LoadTransformParametersOf(node->op());
   InstructionCode opcode = kArchNop;
diff --git a/src/compiler/backend/instruction-selector.cc b/src/compiler/backend/instruction-selector.cc
index 166a3e77f2..4e385dfe9b 100644
--- a/src/compiler/backend/instruction-selector.cc
+++ b/src/compiler/backend/instruction-selector.cc
@@ -1416,6 +1416,12 @@ void InstructionSelector::VisitNode(Node* node) {
       MarkAsRepresentation(MachineRepresentation::kSimd128, node);
       return VisitLoadTransform(node);
     }
+    case IrOpcode::kPrefetchTemporal: {
+      return VisitPrefetchTemporal(node);
+    }
+    case IrOpcode::kPrefetchNonTemporal: {
+      return VisitPrefetchNonTemporal(node);
+    }
     case IrOpcode::kLoadLane: {
       MarkAsRepresentation(MachineRepresentation::kSimd128, node);
       return VisitLoadLane(node);
@@ -2795,6 +2801,12 @@ void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16S(Node* node) {
 void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16U(Node* node) {
   UNIMPLEMENTED();
 }
+
+// TODO(v8:11168): Prototyping prefetch.
+void InstructionSelector::VisitPrefetchTemporal(Node* node) { UNIMPLEMENTED(); }
+void InstructionSelector::VisitPrefetchNonTemporal(Node* node) {
+  UNIMPLEMENTED();
+}
 #endif  // !V8_TARGET_ARCH_ARM64
 
 #if !V8_TARGET_ARCH_X64
diff --git a/src/compiler/machine-operator.cc b/src/compiler/machine-operator.cc
index 8436b1af3d..9630b259fb 100644
--- a/src/compiler/machine-operator.cc
+++ b/src/compiler/machine-operator.cc
@@ -1160,6 +1160,18 @@ std::ostream& operator<<(std::ostream& os, TruncateKind kind) {
 MACHINE_PURE_OP_LIST(PURE)
 #undef PURE
 
+const Operator* MachineOperatorBuilder::PrefetchTemporal() {
+  return GetCachedOperator<
+      CachedOperator<IrOpcode::kPrefetchTemporal, 2, 1, 1, 0, 1, 0>>(
+      Operator::kNoDeopt | Operator::kNoThrow, "PrefetchTemporal");
+}
+
+const Operator* MachineOperatorBuilder::PrefetchNonTemporal() {
+  return GetCachedOperator<
+      CachedOperator<IrOpcode::kPrefetchNonTemporal, 2, 1, 1, 0, 1, 0>>(
+      Operator::kNoDeopt | Operator::kNoThrow, "PrefetchNonTemporal");
+}
+
 const Operator* MachineOperatorBuilder::Load(LoadRepresentation rep) {
 #define LOAD(Type)                  \
   if (rep == MachineType::Type()) { \
diff --git a/src/compiler/machine-operator.h b/src/compiler/machine-operator.h
index 7912c55de5..8b00ee1dd7 100644
--- a/src/compiler/machine-operator.h
+++ b/src/compiler/machine-operator.h
@@ -828,6 +828,9 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
   const Operator* LoadTransform(MemoryAccessKind kind,
                                 LoadTransformation transform);
 
+  const Operator* PrefetchTemporal();
+  const Operator* PrefetchNonTemporal();
+
   // SIMD load: replace a specified lane with [base + index].
   const Operator* LoadLane(MemoryAccessKind kind, LoadRepresentation rep,
                            uint8_t laneidx);
diff --git a/src/compiler/opcodes.h b/src/compiler/opcodes.h
index 046f3daf91..fe2aa4e87a 100644
--- a/src/compiler/opcodes.h
+++ b/src/compiler/opcodes.h
@@ -979,6 +979,8 @@
   V(V8x16AnyTrue)        \
   V(V8x16AllTrue)        \
   V(LoadTransform)       \
+  V(PrefetchTemporal)    \
+  V(PrefetchNonTemporal) \
   V(LoadLane)            \
   V(StoreLane)
diff --git a/src/compiler/wasm-compiler.cc b/src/compiler/wasm-compiler.cc
index e540af596c..2a21c8eecc 100644
--- a/src/compiler/wasm-compiler.cc
+++ b/src/compiler/wasm-compiler.cc
@@ -4158,6 +4158,17 @@ Node* WasmGraphBuilder::LoadTransform(wasm::ValueType type, MachineType memtype,
   return load;
 }
 
+Node* WasmGraphBuilder::Prefetch(Node* index, uint64_t offset,
+                                 uint32_t alignment, bool temporal) {
+  uintptr_t capped_offset = static_cast<uintptr_t>(offset);
+  const Operator* prefetchOp =
+      temporal ? mcgraph()->machine()->PrefetchTemporal()
+               : mcgraph()->machine()->PrefetchNonTemporal();
+  Node* prefetch = SetEffect(graph()->NewNode(
+      prefetchOp, MemBuffer(capped_offset), index, effect(), control()));
+  return prefetch;
+}
+
 Node* WasmGraphBuilder::LoadMem(wasm::ValueType type, MachineType memtype,
                                 Node* index, uint64_t offset,
                                 uint32_t alignment,
diff --git a/src/compiler/wasm-compiler.h b/src/compiler/wasm-compiler.h
index d6e4cb84aa..ae58d0663a 100644
--- a/src/compiler/wasm-compiler.h
+++ b/src/compiler/wasm-compiler.h
@@ -301,6 +301,8 @@ class WasmGraphBuilder {
   Node* CurrentMemoryPages();
   Node* TraceMemoryOperation(bool is_store, MachineRepresentation, Node* index,
                              uintptr_t offset, wasm::WasmCodePosition);
+  Node* Prefetch(Node* index, uint64_t offset, uint32_t alignment,
+                 bool temporal);
   Node* LoadMem(wasm::ValueType type, MachineType memtype, Node* index,
                 uint64_t offset, uint32_t alignment,
                 wasm::WasmCodePosition position);
diff --git a/src/diagnostics/arm64/disasm-arm64.cc b/src/diagnostics/arm64/disasm-arm64.cc
index 89afda4f2f..456f0b5f66 100644
--- a/src/diagnostics/arm64/disasm-arm64.cc
+++ b/src/diagnostics/arm64/disasm-arm64.cc
@@ -7,6 +7,8 @@
 #include <assert.h>
 #include <stdio.h>
 
+#include <bitset>
+
 #if V8_TARGET_ARCH_ARM64
 
 #include "src/base/platform/platform.h"
@@ -4266,12 +4268,19 @@ int DisassemblingDecoder::SubstitutePrefetchField(Instruction* instr,
   USE(format);
 
   int prefetch_mode = instr->PrefetchMode();
+  const std::array<std::string, 3> hints = {"ld", "li", "st"};
+  unsigned hint = instr->PrefetchHint();
+  unsigned target = instr->PrefetchTarget() + 1;
 
-  const char* ls = (prefetch_mode & 0x10) ? "st" : "ld";
-  int level = (prefetch_mode >> 1) + 1;
-  const char* ks = (prefetch_mode & 1) ? "strm" : "keep";
+  if (hint >= hints.size() || target > 3) {
+    std::bitset<5> prefetch_mode(instr->ImmPrefetchOperation());
+    AppendToOutput("#0b%s", prefetch_mode.to_string().c_str());
+  } else {
+    const char* ks = (prefetch_mode & 1) ? "strm" : "keep";
+
+    AppendToOutput("p%sl%d%s", hints[hint].c_str(), target, ks);
+  }
 
-  AppendToOutput("p%sl%d%s", ls, level, ks);
   return 6;
 }
diff --git a/src/execution/arm64/simulator-arm64.cc b/src/execution/arm64/simulator-arm64.cc
index cca6e0d090..7b49ee4673 100644
--- a/src/execution/arm64/simulator-arm64.cc
+++ b/src/execution/arm64/simulator-arm64.cc
@@ -1799,14 +1799,17 @@ void Simulator::LoadStoreHelper(Instruction* instr, int64_t offset,
   unsigned addr_reg = instr->Rn();
   uintptr_t address = LoadStoreAddress(addr_reg, offset, addrmode);
   uintptr_t stack = 0;
+  LoadStoreOp op = static_cast<LoadStoreOp>(instr->Mask(LoadStoreMask));
 
   {
     base::MutexGuard lock_guard(&GlobalMonitor::Get()->mutex);
     if (instr->IsLoad()) {
       local_monitor_.NotifyLoad();
-    } else {
+    } else if (instr->IsStore()) {
       local_monitor_.NotifyStore();
       GlobalMonitor::Get()->NotifyStore_Locked(&global_monitor_processor_);
+    } else {
+      DCHECK_EQ(op, PRFM);
     }
   }
 
@@ -1825,7 +1828,6 @@ void Simulator::LoadStoreHelper(Instruction* instr, int64_t offset,
     stack = sp();
   }
 
-  LoadStoreOp op = static_cast<LoadStoreOp>(instr->Mask(LoadStoreMask));
   switch (op) {
     // Use _no_log variants to suppress the register trace (LOG_REGS,
    // LOG_VREGS). We will print a more detailed log.
@@ -1900,6 +1902,10 @@ void Simulator::LoadStoreHelper(Instruction* instr, int64_t offset,
       MemoryWrite<qreg_t>(address, qreg(srcdst));
       break;
 
+    // Do nothing for prefetch.
+    case PRFM:
+      break;
+
     default:
       UNIMPLEMENTED();
   }
@@ -1915,7 +1921,7 @@ void Simulator::LoadStoreHelper(Instruction* instr, int64_t offset,
     } else {
       LogRead(address, srcdst, GetPrintRegisterFormatForSize(access_size));
     }
-  } else {
+  } else if (instr->IsStore()) {
     if ((op == STR_s) || (op == STR_d)) {
       LogVWrite(address, srcdst, GetPrintRegisterFormatForSizeFP(access_size));
     } else if ((op == STR_b) || (op == STR_h) || (op == STR_q)) {
       LogVWrite(address, srcdst, GetPrintRegisterFormatForSize(access_size));
     } else {
       LogWrite(address, srcdst, GetPrintRegisterFormatForSize(access_size));
     }
+  } else {
+    DCHECK_EQ(op, PRFM);
   }
 
   // Handle the writeback for loads after the load to ensure safe pop
diff --git a/src/wasm/baseline/liftoff-compiler.cc b/src/wasm/baseline/liftoff-compiler.cc
index 7de285ebf1..587bc71771 100644
--- a/src/wasm/baseline/liftoff-compiler.cc
+++ b/src/wasm/baseline/liftoff-compiler.cc
@@ -2255,6 +2255,12 @@ class LiftoffCompiler {
     return index;
   }
 
+  void Prefetch(FullDecoder* decoder,
+                const MemoryAccessImmediate<validate>& imm,
+                const Value& index_val, bool temporal) {
+    unsupported(decoder, kSimd, "simd prefetch");
+  }
+
   void LoadMem(FullDecoder* decoder, LoadType type,
                const MemoryAccessImmediate<validate>& imm,
                const Value& index_val, Value* result) {
diff --git a/src/wasm/function-body-decoder-impl.h b/src/wasm/function-body-decoder-impl.h
index 60c50f0c75..49c7681fc8 100644
--- a/src/wasm/function-body-decoder-impl.h
+++ b/src/wasm/function-body-decoder-impl.h
@@ -1001,6 +1001,8 @@ struct ControlBase : public PcForErrors<validate> {
   F(LoadLane, LoadType type, const Value& value, const Value& index,          \
     const MemoryAccessImmediate<validate>& imm, const uint8_t laneidx,        \
    Value* result)                                                            \
+  F(Prefetch, const MemoryAccessImmediate<validate>& imm, const Value& index, \
+    bool temporal)                                                            \
   F(StoreMem, StoreType type, const MemoryAccessImmediate<validate>& imm,     \
     const Value& index, const Value& value)                                   \
   F(StoreLane, StoreType type, const MemoryAccessImmediate<validate>& imm,    \
@@ -1760,7 +1762,8 @@ class WasmDecoder : public Decoder {
 #define DECLARE_OPCODE_CASE(name, opcode, sig) case kExpr##name:
         FOREACH_SIMD_MEM_OPCODE(DECLARE_OPCODE_CASE)
 #undef DECLARE_OPCODE_CASE
-        {
+        case kExprPrefetchT:
+        case kExprPrefetchNT: {
          MemoryAccessImmediate<validate> imm(decoder, pc + length,
                                               UINT32_MAX);
           return length + imm.length;
@@ -3506,6 +3509,18 @@ class WasmFullDecoder : public WasmDecoder<validate> {
     return opcode_length + 16;
   }
 
+  uint32_t SimdPrefetch(uint32_t opcode_length, bool temporal) {
+    if (!CheckHasMemory()) return 0;
+    // Alignment doesn't matter, set to an arbitrary value.
+    uint32_t max_alignment = 4;
+    MemoryAccessImmediate<validate> imm(this, this->pc_ + opcode_length,
+                                        max_alignment);
+    ValueType index_type = this->module_->is_memory64 ? kWasmI64 : kWasmI32;
+    Value index = Pop(0, index_type);
+    CALL_INTERFACE_IF_REACHABLE(Prefetch, imm, index, temporal);
+    return opcode_length + imm.length;
+  }
+
   uint32_t DecodeSimdOpcode(WasmOpcode opcode, uint32_t opcode_length) {
     // opcode_length is the number of bytes that this SIMD-specific opcode takes
     // up in the LEB128 encoded form.
@@ -3610,6 +3625,12 @@ class WasmFullDecoder : public WasmDecoder<validate> {
       }
       case kExprS128Const:
         return SimdConstOp(opcode_length);
+      case kExprPrefetchT: {
+        return SimdPrefetch(opcode_length, /*temporal=*/true);
+      }
+      case kExprPrefetchNT: {
+        return SimdPrefetch(opcode_length, /*temporal=*/false);
+      }
       default: {
         const FunctionSig* sig = WasmOpcodes::Signature(opcode);
         if (!VALIDATE(sig != nullptr)) {
diff --git a/src/wasm/graph-builder-interface.cc b/src/wasm/graph-builder-interface.cc
index 21e655253c..22db16d0d9 100644
--- a/src/wasm/graph-builder-interface.cc
+++ b/src/wasm/graph-builder-interface.cc
@@ -419,6 +419,12 @@ class WasmGraphBuildingInterface {
     SetEnv(if_block->false_env);
   }
 
+  void Prefetch(FullDecoder* decoder,
+                const MemoryAccessImmediate<validate>& imm, const Value& index,
+                bool temporal) {
+    BUILD(Prefetch, index.node, imm.offset, imm.alignment, temporal);
+  }
+
   void LoadMem(FullDecoder* decoder, LoadType type,
                const MemoryAccessImmediate<validate>& imm, const Value& index,
               Value* result) {
diff --git a/src/wasm/wasm-opcodes-inl.h b/src/wasm/wasm-opcodes-inl.h
index 5e0f172bd5..a138a66bd8 100644
--- a/src/wasm/wasm-opcodes-inl.h
+++ b/src/wasm/wasm-opcodes-inl.h
@@ -360,6 +360,9 @@ constexpr const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
     CASE_SIGN_OP(I32x4, ExtAddPairwiseI16x8, "extadd_pairwise_i16x8")
     CASE_SIGN_OP(I16x8, ExtAddPairwiseI8x16, "extadd_pairwise_i8x6")
 
+    CASE_OP(PrefetchT, "prefetch_t")
+    CASE_OP(PrefetchNT, "prefetch_nt")
+
     // Atomic operations.
     CASE_OP(AtomicNotify, "atomic.notify")
     CASE_INT_OP(AtomicWait, "atomic.wait")
diff --git a/src/wasm/wasm-opcodes.h b/src/wasm/wasm-opcodes.h
index 42fceb7a4f..bf9b0514ef 100644
--- a/src/wasm/wasm-opcodes.h
+++ b/src/wasm/wasm-opcodes.h
@@ -470,7 +470,9 @@ bool V8_EXPORT_PRIVATE IsJSCompatibleSignature(const FunctionSig* sig,
   V(S128Store8Lane, 0xfd5c, v_is)  \
   V(S128Store16Lane, 0xfd5d, v_is) \
   V(S128Store32Lane, 0xfd5e, v_is) \
-  V(S128Store64Lane, 0xfd5f, v_is)
+  V(S128Store64Lane, 0xfd5f, v_is) \
+  V(PrefetchT, 0xfdc5, v_i)        \
+  V(PrefetchNT, 0xfdc6, v_i)
 
 #define FOREACH_SIMD_POST_MVP_OPCODE(V) \
   V(I8x16Mul, 0xfd75, s_ss)             \
@@ -706,6 +708,7 @@ bool V8_EXPORT_PRIVATE IsJSCompatibleSignature(const FunctionSig* sig,
   V(d_f, kWasmF64, kWasmF32)             \
   V(d_i, kWasmF64, kWasmI32)             \
   V(d_l, kWasmF64, kWasmI64)             \
+  V(v_i, kWasmStmt, kWasmI32)            \
   V(v_ii, kWasmStmt, kWasmI32, kWasmI32) \
   V(v_id, kWasmStmt, kWasmI32, kWasmF64) \
   V(d_id, kWasmF64, kWasmI32, kWasmF64)  \
diff --git a/test/cctest/test-assembler-arm64.cc b/test/cctest/test-assembler-arm64.cc
index 7dbe07c924..4deb43673f 100644
--- a/test/cctest/test-assembler-arm64.cc
+++ b/test/cctest/test-assembler-arm64.cc
@@ -6832,6 +6832,63 @@ TEST(ldr_literal_range_max_dist_no_emission_2) {
 #endif
 
+static const PrefetchOperation kPrfmOperations[] = {
+    PLDL1KEEP, PLDL1STRM, PLDL2KEEP, PLDL2STRM, PLDL3KEEP, PLDL3STRM,
+
+    PLIL1KEEP, PLIL1STRM, PLIL2KEEP, PLIL2STRM, PLIL3KEEP, PLIL3STRM,
+
+    PSTL1KEEP, PSTL1STRM, PSTL2KEEP, PSTL2STRM, PSTL3KEEP, PSTL3STRM};
+
+TEST(prfm_regoffset_assem) {
+  INIT_V8();
+  SETUP();
+
+  START();
+  // The address used in prfm doesn't have to be valid.
+  __ Mov(x0, 0x0123456789abcdef);
+
+  CPURegList inputs(CPURegister::kRegister, kXRegSizeInBits, 10, 18);
+  __ Mov(x10, 0);
+  __ Mov(x11, 1);
+  __ Mov(x12, 8);
+  __ Mov(x13, 255);
+  __ Mov(x14, -0);
+  __ Mov(x15, -1);
+  __ Mov(x16, -8);
+  __ Mov(x17, -255);
+  __ Mov(x18, 0xfedcba9876543210);
+
+  for (int op = 0; op < (1 << ImmPrefetchOperation_width); op++) {
+    // Unallocated prefetch operations are ignored, so test all of them.
+    // We have to use the Assembler directly for this.
+    CPURegList loop = inputs;
+    while (!loop.IsEmpty()) {
+      __ prfm(op, MemOperand(x0, Register::Create(loop.PopLowestIndex().code(),
+                                                  kXRegSizeInBits)));
+    }
+  }
+
+  for (PrefetchOperation op : kPrfmOperations) {
+    // Also test named operations.
+    CPURegList loop = inputs;
+    while (!loop.IsEmpty()) {
+      Register input =
+          Register::Create(loop.PopLowestIndex().code(), kXRegSizeInBits);
+      __ prfm(op, MemOperand(x0, input, UXTW));
+      __ prfm(op, MemOperand(x0, input, UXTW, 3));
+      __ prfm(op, MemOperand(x0, input, LSL));
+      __ prfm(op, MemOperand(x0, input, LSL, 3));
+      __ prfm(op, MemOperand(x0, input, SXTW));
+      __ prfm(op, MemOperand(x0, input, SXTW, 3));
+      __ prfm(op, MemOperand(x0, input, SXTX));
+      __ prfm(op, MemOperand(x0, input, SXTX, 3));
+    }
+  }
+
+  END();
+  RUN();
+}
+
 TEST(add_sub_imm) {
   INIT_V8();
   SETUP();
diff --git a/test/cctest/test-disasm-arm64.cc b/test/cctest/test-disasm-arm64.cc
index 551488ab21..441ae53f32 100644
--- a/test/cctest/test-disasm-arm64.cc
+++ b/test/cctest/test-disasm-arm64.cc
@@ -1518,6 +1518,24 @@ TEST_(load_literal) {
   CLEANUP();
 }
 
+TEST(prfm_regoffset) {
+  SET_UP_ASM();
+
+  COMPARE(prfm(PLIL1KEEP, MemOperand(x1, x2)), "prfm plil1keep, [x1, x2]");
+  COMPARE(prfm(PLIL1STRM, MemOperand(x3, w4, SXTW)),
+          "prfm plil1strm, [x3, w4, sxtw]");
+  COMPARE(prfm(PLIL2KEEP, MemOperand(x5, x6, LSL, 3)),
+          "prfm plil2keep, [x5, x6, lsl #3]");
+
+  COMPARE(prfm(PLIL2STRM, MemOperand(sp, xzr)), "prfm plil2strm, [sp, xzr]");
+  COMPARE(prfm(PLIL3KEEP, MemOperand(sp, wzr, SXTW)),
+          "prfm plil3keep, [sp, wzr, sxtw]");
+  COMPARE(prfm(PLIL3STRM, MemOperand(sp, xzr, LSL, 3)),
+          "prfm plil3strm, [sp, xzr, lsl #3]");
+
+  CLEANUP();
+}
+
 TEST_(cond_select) {
   SET_UP_ASM();
diff --git a/test/cctest/wasm/test-run-wasm-simd.cc b/test/cctest/wasm/test-run-wasm-simd.cc
index ae78ef150c..d75d4d3cd7 100644
--- a/test/cctest/wasm/test-run-wasm-simd.cc
+++ b/test/cctest/wasm/test-run-wasm-simd.cc
@@ -3565,6 +3565,60 @@ WASM_SIMD_TEST(SimdF32x4SetGlobal) {
   CHECK_EQ(GetScalar(global, 3), 65.0f);
 }
 
+#if V8_TARGET_ARCH_ARM64
+// TODO(v8:11168): Prototyping prefetch.
+WASM_SIMD_TEST(SimdPrefetch) {
+  FLAG_SCOPE(wasm_simd_post_mvp);
+
+  {
+    // Test PrefetchT.
+    WasmRunner<int32_t> r(execution_tier, lower_simd);
+    int32_t* memory =
+        r.builder().AddMemoryElems<int32_t>(kWasmPageSize / sizeof(int32_t));
+    BUILD(r, WASM_ZERO, WASM_SIMD_OP(kExprPrefetchT), ZERO_ALIGNMENT,
+          ZERO_OFFSET,
+          WASM_SIMD_I32x4_EXTRACT_LANE(0, WASM_SIMD_LOAD_MEM(WASM_ZERO)));
+
+    FOR_INT32_INPUTS(i) {
+      r.builder().WriteMemory(&memory[0], i);
+      CHECK_EQ(i, r.Call());
+    }
+  }
+
+  {
+    // Test PrefetchNT.
+    WasmRunner<int32_t> r(execution_tier, lower_simd);
+    int32_t* memory =
+        r.builder().AddMemoryElems<int32_t>(kWasmPageSize / sizeof(int32_t));
+    BUILD(r, WASM_ZERO, WASM_SIMD_OP(kExprPrefetchNT), ZERO_ALIGNMENT,
+          ZERO_OFFSET,
+          WASM_SIMD_I32x4_EXTRACT_LANE(0, WASM_SIMD_LOAD_MEM(WASM_ZERO)));
+
+    FOR_INT32_INPUTS(i) {
+      r.builder().WriteMemory(&memory[0], i);
+      CHECK_EQ(i, r.Call());
+    }
+  }
+
+  {
+    // Test OOB.
+    WasmRunner<int32_t> r(execution_tier, lower_simd);
+    int32_t* memory =
+        r.builder().AddMemoryElems<int32_t>(kWasmPageSize / sizeof(int32_t));
+
+    // Prefetch kWasmPageSize+1 but still load from 0.
+    BUILD(r, WASM_I32V(kWasmPageSize + 1), WASM_SIMD_OP(kExprPrefetchNT),
+          ZERO_ALIGNMENT, ZERO_OFFSET,
+          WASM_SIMD_I32x4_EXTRACT_LANE(0, WASM_SIMD_LOAD_MEM(WASM_ZERO)));
+
+    FOR_INT32_INPUTS(i) {
+      r.builder().WriteMemory(&memory[0], i);
+      CHECK_EQ(i, r.Call());
+    }
+  }
+}
+#endif  // V8_TARGET_ARCH_ARM64
+
 WASM_SIMD_TEST(SimdLoadStoreLoad) {
   WasmRunner<int32_t> r(execution_tier, lower_simd);
   int32_t* memory =
diff --git a/test/common/wasm/wasm-interpreter.cc b/test/common/wasm/wasm-interpreter.cc
index 3dd0e05d39..9be4d94ae2 100644
--- a/test/common/wasm/wasm-interpreter.cc
+++ b/test/common/wasm/wasm-interpreter.cc
@@ -2801,6 +2801,16 @@ class WasmInterpreterInternals {
       case kExprI16x8ExtAddPairwiseI8x16U: {
         return DoSimdExtAddPairwise();
       }
+      case kExprPrefetchT:
+      case kExprPrefetchNT: {
+        // Max alignment doesn't matter, use an arbitrary value.
+        MemoryAccessImmediate<Decoder::kNoValidate> imm(
+            decoder, code->at(pc + *len), 4);
+        // Pop address and do nothing.
+        Pop().to<uint32_t>();
+        *len += imm.length;
+        return true;
+      }
       default:
         return false;
     }
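
Usage sketch (not part of the change): the instruction selector above encodes
PLDL1KEEP for the temporal and PLDL1STRM for the non-temporal wasm opcode into
MiscField, and the code generator feeds that straight into the new assembler
entry point with a register-offset MemOperand (the only addressing mode the
restricted prfm support accepts). Assuming a live MacroAssembler and the __
convention from the tests, with x0/x1 standing in for the memory base and
index, the emitted instructions look like this:

    // Mirrors what kArm64Prfm produces for prefetch.t / prefetch.nt.
    __ prfm(PLDL1KEEP, MemOperand(x0, x1));  // temporal: keep in L1
    __ prfm(PLDL1STRM, MemOperand(x0, x1));  // non-temporal: streaming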