diff --git a/src/arm/assembler-arm.cc b/src/arm/assembler-arm.cc
index f748d64f84..2eadd4f81e 100644
--- a/src/arm/assembler-arm.cc
+++ b/src/arm/assembler-arm.cc
@@ -4241,6 +4241,60 @@ void Assembler::vmul(NeonSize size, QwNeonRegister dst,
        n * B7 | B6 | m * B5 | B4 | vm);
 }
 
+void Assembler::vrecpe(const QwNeonRegister dst, const QwNeonRegister src) {
+  DCHECK(IsEnabled(NEON));
+  // Qd = vrecpe(Qm) SIMD reciprocal estimate.
+  // Instruction details available in ARM DDI 0406C.b, A8-1024.
+  int vd, d;
+  dst.split_code(&vd, &d);
+  int vm, m;
+  src.split_code(&vm, &m);
+  emit(0x1E7U * B23 | d * B22 | 0x3B * B16 | vd * B12 | 0x5 * B8 | B6 | m * B5 |
+       vm);
+}
+
+void Assembler::vrsqrte(const QwNeonRegister dst, const QwNeonRegister src) {
+  DCHECK(IsEnabled(NEON));
+  // Qd = vrsqrte(Qm) SIMD reciprocal square root estimate.
+  // Instruction details available in ARM DDI 0406C.b, A8-1038.
+  int vd, d;
+  dst.split_code(&vd, &d);
+  int vm, m;
+  src.split_code(&vm, &m);
+  emit(0x1E7U * B23 | d * B22 | 0x3B * B16 | vd * B12 | 0x5 * B8 | 0x3 * B6 |
+       m * B5 | vm);
+}
+
+void Assembler::vrecps(const QwNeonRegister dst, const QwNeonRegister src1,
+                       const QwNeonRegister src2) {
+  DCHECK(IsEnabled(NEON));
+  // Qd = vrecps(Qn, Qm) SIMD reciprocal refinement step.
+  // Instruction details available in ARM DDI 0406C.b, A8-1026.
+  int vd, d;
+  dst.split_code(&vd, &d);
+  int vn, n;
+  src1.split_code(&vn, &n);
+  int vm, m;
+  src2.split_code(&vm, &m);
+  emit(0x1E4U * B23 | d * B22 | vn * B16 | vd * B12 | 0xF * B8 | n * B7 | B6 |
+       m * B5 | B4 | vm);
+}
+
+void Assembler::vrsqrts(const QwNeonRegister dst, const QwNeonRegister src1,
+                        const QwNeonRegister src2) {
+  DCHECK(IsEnabled(NEON));
+  // Qd = vrsqrts(Qn, Qm) SIMD reciprocal square root refinement step.
+  // Instruction details available in ARM DDI 0406C.b, A8-1040.
+  int vd, d;
+  dst.split_code(&vd, &d);
+  int vn, n;
+  src1.split_code(&vn, &n);
+  int vm, m;
+  src2.split_code(&vm, &m);
+  emit(0x1E4U * B23 | d * B22 | B21 | vn * B16 | vd * B12 | 0xF * B8 | n * B7 |
+       B6 | m * B5 | B4 | vm);
+}
+
 void Assembler::vtst(NeonSize size, QwNeonRegister dst,
                      const QwNeonRegister src1, const QwNeonRegister src2) {
   DCHECK(IsEnabled(NEON));
diff --git a/src/arm/assembler-arm.h b/src/arm/assembler-arm.h
index e9cab65649..4f1197e1f8 100644
--- a/src/arm/assembler-arm.h
+++ b/src/arm/assembler-arm.h
@@ -1380,6 +1380,13 @@ class Assembler : public AssemblerBase {
             const QwNeonRegister src2);
   void vmul(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
             const QwNeonRegister src2);
+  // vrecpe and vrsqrte only support floating point lanes.
+  void vrecpe(const QwNeonRegister dst, const QwNeonRegister src);
+  void vrsqrte(const QwNeonRegister dst, const QwNeonRegister src);
+  void vrecps(const QwNeonRegister dst, const QwNeonRegister src1,
+              const QwNeonRegister src2);
+  void vrsqrts(const QwNeonRegister dst, const QwNeonRegister src1,
+               const QwNeonRegister src2);
   void vtst(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
             const QwNeonRegister src2);
   void vceq(const QwNeonRegister dst, const QwNeonRegister src1,
diff --git a/src/arm/disasm-arm.cc b/src/arm/disasm-arm.cc
index 30ea43fde8..22fc380040 100644
--- a/src/arm/disasm-arm.cc
+++ b/src/arm/disasm-arm.cc
@@ -1899,6 +1899,15 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
         // vceq.f32 Qd, Qm, Qn.
         out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
                                     "vceq.f32 q%d, q%d, q%d", Vd, Vn, Vm);
+      } else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xf &&
+                 instr->Bit(6) == 1 && instr->Bit(4) == 1) {
+        int Vd = instr->VFPDRegValue(kSimd128Precision);
+        int Vm = instr->VFPMRegValue(kSimd128Precision);
+        int Vn = instr->VFPNRegValue(kSimd128Precision);
+        const char* op = instr->Bit(21) == 0 ? "vrecps" : "vrsqrts";
+        // vrecps/vrsqrts.f32 Qd, Qn, Qm.
+        out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+                                    "%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm);
       } else {
         Unknown(instr);
       }
@@ -2081,6 +2090,13 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
           } else {
             Unknown(instr);
           }
+        } else if (instr->Bits(19, 18) == 0x2 && instr->Bits(11, 8) == 0x5) {
+          int Vd = instr->VFPDRegValue(kSimd128Precision);
+          int Vm = instr->VFPMRegValue(kSimd128Precision);
+          const char* op = instr->Bit(7) == 0 ? "vrecpe" : "vrsqrte";
+          // vrecpe/vrsqrte.f32 Qd, Qm.
+          out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+                                      "%s.f32 q%d, q%d", op, Vd, Vm);
         } else {
           Unknown(instr);
         }
diff --git a/src/arm/simulator-arm.cc b/src/arm/simulator-arm.cc
index 2e9bfbcfdf..a7135dc39d 100644
--- a/src/arm/simulator-arm.cc
+++ b/src/arm/simulator-arm.cc
@@ -3993,7 +3993,26 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
           dst[i] = (src1[i] == src2[i]) ? 0xFFFFFFFF : 0;
         }
         set_q_register(Vd, dst);
-
+      } else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xf &&
+                 instr->Bit(6) == 1 && instr->Bit(4) == 1) {
+        int Vd = instr->VFPDRegValue(kSimd128Precision);
+        int Vm = instr->VFPMRegValue(kSimd128Precision);
+        int Vn = instr->VFPNRegValue(kSimd128Precision);
+        float src1[4], src2[4];
+        get_q_register(Vn, src1);
+        get_q_register(Vm, src2);
+        if (instr->Bit(21) == 0) {
+          // vrecps.f32 Qd, Qn, Qm.
+          for (int i = 0; i < 4; i++) {
+            src1[i] = 2.0f - src1[i] * src2[i];
+          }
+        } else {
+          // vrsqrts.f32 Qd, Qn, Qm.
+          for (int i = 0; i < 4; i++) {
+            src1[i] = (3.0f - src1[i] * src2[i]) * 0.5f;
+          }
+        }
+        set_q_register(Vd, src1);
       } else {
         UNIMPLEMENTED();
       }
@@ -4526,6 +4545,33 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
           } else {
             UNIMPLEMENTED();
           }
+        } else if (instr->Bits(19, 18) == 0x2 && instr->Bits(11, 8) == 0x5) {
+          // vrecpe/vrsqrte.f32 Qd, Qm.
+          int Vd = instr->VFPDRegValue(kSimd128Precision);
+          int Vm = instr->VFPMRegValue(kSimd128Precision);
+          uint32_t src[4];
+          get_q_register(Vm, src);
+          if (instr->Bit(7) == 0) {
+            // Flag division by zero if any lane (not just the last) is zero.
+            bool any_zero_denom = false;
+            for (int i = 0; i < 4; i++) {
+              float denom = bit_cast<float>(src[i]);
+              any_zero_denom |= (denom == 0);
+              float result = 1.0f / denom;
+              result = canonicalizeNaN(result);
+              src[i] = bit_cast<uint32_t>(result);
+            }
+            div_zero_vfp_flag_ = any_zero_denom;
+          } else {
+            lazily_initialize_fast_sqrt(isolate_);
+            for (int i = 0; i < 4; i++) {
+              float radicand = bit_cast<float>(src[i]);
+              float result = 1.0f / fast_sqrt(radicand, isolate_);
+              result = canonicalizeNaN(result);
+              src[i] = bit_cast<uint32_t>(result);
+            }
+          }
+          set_q_register(Vd, src);
         } else {
           UNIMPLEMENTED();
         }
diff --git a/test/cctest/test-assembler-arm.cc b/test/cctest/test-assembler-arm.cc
index f2782339d0..3f19220e15 100644
--- a/test/cctest/test-assembler-arm.cc
+++ b/test/cctest/test-assembler-arm.cc
@@ -1289,6 +1289,7 @@ TEST(15) {
     uint32_t vadd8[4], vadd16[4], vadd32[4];
     uint32_t vsub8[4], vsub16[4], vsub32[4];
     uint32_t vmul8[4], vmul16[4], vmul32[4];
+    float vrecpe[4], vrecps[4], vrsqrte[4], vrsqrts[4];
     uint32_t vtst[4], vceq[4], vceqf[4], vbsl[4];
     uint32_t vext[4];
     uint32_t vzip8a[4], vzip8b[4], vzip16a[4], vzip16b[4], vzip32a[4],
@@ -1482,6 +1483,34 @@ TEST(15) {
     __ vmul(q1, q1, q0);
     __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmulf))));
     __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    // vrecpe.
+    __ vmov(s4, 2.0);
+    __ vdup(q0, s4);
+    __ vrecpe(q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrecpe))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    // vrecps.
+    __ vmov(s4, 2.0);
+    __ vdup(q0, s4);
+    __ vmov(s4, 1.5);
+    __ vdup(q1, s4);
+    __ vrecps(q1, q0, q1);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrecps))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    // vrsqrte.
+    __ vmov(s4, 4.0);
+    __ vdup(q0, s4);
+    __ vrsqrte(q1, q0);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrsqrte))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+    // vrsqrts.
+    __ vmov(s4, 2.0);
+    __ vdup(q0, s4);
+    __ vmov(s4, 2.5);
+    __ vdup(q1, s4);
+    __ vrsqrts(q1, q0, q1);
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrsqrts))));
+    __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
     // vceq (float).
     __ vmov(s4, 1.0);
     __ vdup(q0, s4);
@@ -1750,6 +1779,12 @@ TEST(15) {
     CHECK_EQ_SPLAT(vaddf, 2.0);
     CHECK_EQ_SPLAT(vsubf, -1.0);
     CHECK_EQ_SPLAT(vmulf, 4.0);
+    // NOTE(review): vrecpe/vrsqrte are estimates; the exact matches below rely
+    // on the simulator computing 1/x and 1/sqrt(x) - confirm on real hardware.
+    CHECK_EQ_SPLAT(vrecpe, 0.5f);    // 1 / 2
+    CHECK_EQ_SPLAT(vrecps, -1.0f);   // 2 - (2 * 1.5)
+    CHECK_EQ_SPLAT(vrsqrte, 0.5f);   // 1 / sqrt(4)
+    CHECK_EQ_SPLAT(vrsqrts, -1.0f);  // (3 - (2 * 2.5)) / 2
     CHECK_EQ_SPLAT(vceqf, 0xffffffffu);
     CHECK_EQ_SPLAT(vadd8, 0x03030303u);
     CHECK_EQ_SPLAT(vadd16, 0x00030003u);
diff --git a/test/cctest/test-disasm-arm.cc b/test/cctest/test-disasm-arm.cc
index 21e68b91c5..c0acb8df50 100644
--- a/test/cctest/test-disasm-arm.cc
+++ b/test/cctest/test-disasm-arm.cc
@@ -1049,6 +1049,14 @@ TEST(Neon) {
               "f2142970 vmul.i16 q1, q2, q8");
       COMPARE(vmul(Neon32, q15, q0, q8),
               "f260e970 vmul.i32 q15, q0, q8");
+      COMPARE(vrecpe(q15, q0),
+              "f3fbe540 vrecpe.f32 q15, q0");
+      COMPARE(vrecps(q15, q0, q8),
+              "f240ef70 vrecps.f32 q15, q0, q8");
+      COMPARE(vrsqrte(q15, q0),
+              "f3fbe5c0 vrsqrte.f32 q15, q0");
+      COMPARE(vrsqrts(q15, q0, q8),
+              "f260ef70 vrsqrts.f32 q15, q0, q8");
       COMPARE(vtst(Neon8, q0, q1, q2),
               "f2020854 vtst.i8 q0, q1, q2");
       COMPARE(vtst(Neon16, q1, q2, q8),