diff --git a/src/codegen/arm/assembler-arm.cc b/src/codegen/arm/assembler-arm.cc index bdc74ab098..ba6ff3422e 100644 --- a/src/codegen/arm/assembler-arm.cc +++ b/src/codegen/arm/assembler-arm.cc @@ -4338,7 +4338,6 @@ void Assembler::vmull(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src1, src2.split_code(&vm, &m); int size = NeonSz(dt); int u = NeonU(dt); - if (!u) UNIMPLEMENTED(); emit(0xFU * B28 | B25 | u * B24 | B23 | d * B22 | size * B20 | vn * B16 | vd * B12 | 0xC * B8 | n * B7 | m * B5 | vm); } diff --git a/src/compiler/backend/arm/code-generator-arm.cc b/src/compiler/backend/arm/code-generator-arm.cc index 667bbda0b4..f3b9249e87 100644 --- a/src/compiler/backend/arm/code-generator-arm.cc +++ b/src/compiler/backend/arm/code-generator-arm.cc @@ -2434,6 +2434,19 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( __ VmovLow(dst, tmp2.low()); break; } + case kArmI32x4DotI16x8S: { + Simd128Register dst = i.OutputSimd128Register(); + Simd128Register lhs = i.InputSimd128Register(0); + Simd128Register rhs = i.InputSimd128Register(1); + Simd128Register tmp1 = i.TempSimd128Register(0); + UseScratchRegisterScope temps(tasm()); + Simd128Register scratch = temps.AcquireQ(); + __ vmull(NeonS16, tmp1, lhs.low(), rhs.low()); + __ vmull(NeonS16, scratch, lhs.high(), rhs.high()); + __ vpadd(Neon32, dst.low(), tmp1.low(), tmp1.high()); + __ vpadd(Neon32, dst.high(), scratch.low(), scratch.high()); + break; + } case kArmI16x8Splat: { __ vdup(Neon16, i.OutputSimd128Register(), i.InputRegister(0)); break; diff --git a/src/compiler/backend/arm/instruction-codes-arm.h b/src/compiler/backend/arm/instruction-codes-arm.h index 39ed658fc4..dd80472bc8 100644 --- a/src/compiler/backend/arm/instruction-codes-arm.h +++ b/src/compiler/backend/arm/instruction-codes-arm.h @@ -207,6 +207,7 @@ namespace compiler { V(ArmI32x4GeU) \ V(ArmI32x4Abs) \ V(ArmI32x4BitMask) \ + V(ArmI32x4DotI16x8S) \ V(ArmI16x8Splat) \ V(ArmI16x8ExtractLaneS) \ V(ArmI16x8ReplaceLane) \ diff 
--git a/src/compiler/backend/arm/instruction-scheduler-arm.cc b/src/compiler/backend/arm/instruction-scheduler-arm.cc index 196aa1ce6c..845934095c 100644 --- a/src/compiler/backend/arm/instruction-scheduler-arm.cc +++ b/src/compiler/backend/arm/instruction-scheduler-arm.cc @@ -187,6 +187,7 @@ int InstructionScheduler::GetTargetInstructionFlags( case kArmI32x4GeU: case kArmI32x4Abs: case kArmI32x4BitMask: + case kArmI32x4DotI16x8S: case kArmI16x8Splat: case kArmI16x8ExtractLaneS: case kArmI16x8ReplaceLane: diff --git a/src/compiler/backend/arm/instruction-selector-arm.cc b/src/compiler/backend/arm/instruction-selector-arm.cc index a8bfc4bdbd..ba9c744ccf 100644 --- a/src/compiler/backend/arm/instruction-selector-arm.cc +++ b/src/compiler/backend/arm/instruction-selector-arm.cc @@ -2633,6 +2633,14 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) { V(S128Xor, kArmS128Xor) \ V(S128AndNot, kArmS128AndNot) +void InstructionSelector::VisitI32x4DotI16x8S(Node* node) { + ArmOperandGenerator g(this); + InstructionOperand temps[] = {g.TempSimd128Register()}; + Emit(kArmI32x4DotI16x8S, g.DefineAsRegister(node), + g.UseUniqueRegister(node->InputAt(0)), + g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps); +} + void InstructionSelector::VisitS128Zero(Node* node) { ArmOperandGenerator g(this); Emit(kArmS128Zero, g.DefineAsRegister(node)); diff --git a/src/compiler/backend/instruction-selector.cc b/src/compiler/backend/instruction-selector.cc index 5fd0617902..0b98250d32 100644 --- a/src/compiler/backend/instruction-selector.cc +++ b/src/compiler/backend/instruction-selector.cc @@ -2705,10 +2705,12 @@ void InstructionSelector::VisitF64x2NearestInt(Node* node) { UNIMPLEMENTED(); } #endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_S390X // && !V8_TARGET_ARCH_IA32 -#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64 +#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64 && \ + 
!V8_TARGET_ARCH_ARM // TODO(v8:10583) Prototype i32x4.dot_i16x8_s void InstructionSelector::VisitI32x4DotI16x8S(Node* node) { UNIMPLEMENTED(); } #endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64 + // && !V8_TARGET_ARCH_ARM void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); } diff --git a/src/diagnostics/arm/disasm-arm.cc b/src/diagnostics/arm/disasm-arm.cc index 617b3bef53..0cd4016479 100644 --- a/src/diagnostics/arm/disasm-arm.cc +++ b/src/diagnostics/arm/disasm-arm.cc @@ -2065,6 +2065,16 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) { out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, "vshr.s%d q%d, q%d, #%d", size, Vd, Vm, shift); + } else if (instr->Bits(11, 8) == 0xC && instr->Bit(6) == 0 && + instr->Bit(4) == 0) { + // vmull.s<size> Qd, Dn, Dm + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vn = instr->VFPNRegValue(kDoublePrecision); + int Vm = instr->VFPMRegValue(kDoublePrecision); + int size = 8 << instr->Bits(21, 20); + out_buffer_pos_ += + SNPrintF(out_buffer_ + out_buffer_pos_, "vmull.s%d q%d, d%d, d%d", + size, Vd, Vn, Vm); } else { Unknown(instr); } diff --git a/src/execution/arm/simulator-arm.cc b/src/execution/arm/simulator-arm.cc index 5f23e07c81..33d5928a9a 100644 --- a/src/execution/arm/simulator-arm.cc +++ b/src/execution/arm/simulator-arm.cc @@ -4241,6 +4241,28 @@ void RoundingAverageUnsigned(Simulator* simulator, int Vd, int Vm, int Vn) { simulator->set_neon_register(Vd, src1); } +template <typename NarrowType, typename WideType> +void MultiplyLong(Simulator* simulator, int Vd, int Vn, int Vm) { + DCHECK_EQ(sizeof(WideType), 2 * sizeof(NarrowType)); + static const int kElems = kSimd128Size / sizeof(WideType); + NarrowType src1[kElems], src2[kElems]; + WideType dst[kElems]; + + // Get the entire d reg, then memcpy it to an array so we can address the + // underlying datatype easily. 
+ uint64_t tmp; + simulator->get_d_register(Vn, &tmp); + memcpy(src1, &tmp, sizeof(tmp)); + simulator->get_d_register(Vm, &tmp); + memcpy(src2, &tmp, sizeof(tmp)); + + for (int i = 0; i < kElems; i++) { + dst[i] = WideType{src1[i]} * WideType{src2[i]}; + } + + simulator->set_neon_register(Vd, dst); +} + void Simulator::DecodeSpecialCondition(Instruction* instr) { switch (instr->SpecialValue()) { case 4: { @@ -4686,6 +4708,21 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { ArithmeticShiftRight(this, Vd, Vm, shift); break; } + } else if (instr->Bits(11, 8) == 0xC && instr->Bit(6) == 0 && + instr->Bit(4) == 0) { + // vmull.s<size> Qd, Dn, Dm + NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20)); + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vn = instr->VFPNRegValue(kDoublePrecision); + int Vm = instr->VFPMRegValue(kDoublePrecision); + switch (size) { + case Neon16: { + MultiplyLong<int16_t, int32_t>(this, Vd, Vn, Vm); + break; + } + default: + UNIMPLEMENTED(); + } } else { UNIMPLEMENTED(); } @@ -5579,18 +5616,17 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { instr->Bit(4) == 0) { // vmull.u<size> Qd, Dn, Dm NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20)); - if (size != Neon32) UNIMPLEMENTED(); - int Vd = instr->VFPDRegValue(kSimd128Precision); int Vn = instr->VFPNRegValue(kDoublePrecision); int Vm = instr->VFPMRegValue(kDoublePrecision); - uint64_t src1, src2, dst[2]; - - get_d_register(Vn, &src1); - get_d_register(Vm, &src2); - dst[0] = (src1 & 0xFFFFFFFFULL) * (src2 & 0xFFFFFFFFULL); - dst[1] = (src1 >> 32) * (src2 >> 32); - set_neon_register(Vd, dst); + switch (size) { + case Neon32: { + MultiplyLong<uint32_t, uint64_t>(this, Vd, Vn, Vm); + break; + } + default: + UNIMPLEMENTED(); + } } else { UNIMPLEMENTED(); } diff --git a/test/cctest/wasm/test-run-wasm-simd.cc b/test/cctest/wasm/test-run-wasm-simd.cc index d625144b31..c9698ed74c 100644 --- a/test/cctest/wasm/test-run-wasm-simd.cc +++ b/test/cctest/wasm/test-run-wasm-simd.cc @@ -2313,7 +2313,8 @@ 
WASM_SIMD_TEST(I16x8RoundingAverageU) { } // TODO(v8:10583) Prototype i32x4.dot_i16x8_s -#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64 +#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64 || \ + V8_TARGET_ARCH_ARM WASM_SIMD_TEST_NO_LOWERING(I32x4DotI16x8S) { FLAG_SCOPE(wasm_simd_post_mvp); @@ -2340,7 +2341,8 @@ WASM_SIMD_TEST_NO_LOWERING(I32x4DotI16x8S) { } } } -#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64 +#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64 || + // V8_TARGET_ARCH_ARM void RunI16x8ShiftOpTest(ExecutionTier execution_tier, LowerSimd lower_simd, WasmOpcode opcode, Int16ShiftOp expected_op) {