[ARM] Add vrecpe, vrecps, vrsqrte, vrsqrts instructions to assembler.
- Disassembler, simulator support too. LOG=N BUG=v8:4124 Review-Url: https://codereview.chromium.org/2600153002 Cr-Commit-Position: refs/heads/master@{#42176}
This commit is contained in:
parent
fc86d4329b
commit
8dfea24e3d
@ -4241,6 +4241,60 @@ void Assembler::vmul(NeonSize size, QwNeonRegister dst,
|
||||
n * B7 | B6 | m * B5 | B4 | vm);
|
||||
}
|
||||
|
||||
// Emits the NEON vrecpe.f32 instruction for quad registers: dst receives the
// reciprocal estimate of src. Requires NEON to be enabled.
void Assembler::vrecpe(const QwNeonRegister dst, const QwNeonRegister src) {
|
||||
DCHECK(IsEnabled(NEON));
|
||||
// Qd = vrecpe(Qm) SIMD reciprocal estimate.
|
||||
// Instruction details available in ARM DDI 0406C.b, A8-1024.
|
||||
// Split each register code into the two values that are placed at separate
// bit positions in the encoding below (vd at B12 / d at B22, vm at bit 0 /
// m at B5).
int vd, d;
|
||||
dst.split_code(&vd, &d);
|
||||
int vm, m;
|
||||
src.split_code(&vm, &m);
|
||||
// Fixed opcode bits plus the register fields. B6 is set and B7 is clear here;
// vrsqrte uses the same pattern with B7 also set.
emit(0x1E7U * B23 | d * B22 | 0x3B * B16 | vd * B12 | 0x5 * B8 | B6 | m * B5 |
|
||||
vm);
|
||||
}
|
||||
|
||||
// Emits the NEON vrsqrte.f32 instruction for quad registers: dst receives the
// reciprocal square root estimate of src. Requires NEON to be enabled.
void Assembler::vrsqrte(const QwNeonRegister dst, const QwNeonRegister src) {
|
||||
DCHECK(IsEnabled(NEON));
|
||||
// Qd = vrsqrte(Qm) SIMD reciprocal square root estimate.
|
||||
// Instruction details available in ARM DDI 0406C.b, A8-1038.
|
||||
// Split each register code into the two values that are placed at separate
// bit positions in the encoding below (vd at B12 / d at B22, vm at bit 0 /
// m at B5).
int vd, d;
|
||||
dst.split_code(&vd, &d);
|
||||
int vm, m;
|
||||
src.split_code(&vm, &m);
|
||||
// Same pattern as vrecpe except that 0x3 * B6 sets both B6 and B7; the
// disassembler tells the two instructions apart by bit 7.
emit(0x1E7U * B23 | d * B22 | 0x3B * B16 | vd * B12 | 0x5 * B8 | 0x3 * B6 |
|
||||
m * B5 | vm);
|
||||
}
|
||||
|
||||
// Emits the NEON vrecps.f32 instruction for quad registers: the reciprocal
// refinement step taking src1 (encoded as Qn) and src2 (encoded as Qm).
// Requires NEON to be enabled.
void Assembler::vrecps(const QwNeonRegister dst, const QwNeonRegister src1,
|
||||
const QwNeonRegister src2) {
|
||||
DCHECK(IsEnabled(NEON));
|
||||
// Qd = vrecps(Qn, Qm) SIMD reciprocal refinement step.
|
||||
// Instruction details available in ARM DDI 0406C.b, A8-1026.
|
||||
// Split each register code into the two values that are placed at separate
// bit positions in the encoding below (vd/d, vn/n and vm/m).
int vd, d;
|
||||
dst.split_code(&vd, &d);
|
||||
int vn, n;
|
||||
src1.split_code(&vn, &n);
|
||||
int vm, m;
|
||||
src2.split_code(&vm, &m);
|
||||
// B21 is left clear here; vrsqrts uses the same pattern with B21 set.
emit(0x1E4U * B23 | d * B22 | vn * B16 | vd * B12 | 0xF * B8 | n * B7 | B6 |
|
||||
m * B5 | B4 | vm);
|
||||
}
|
||||
|
||||
// Emits the NEON vrsqrts.f32 instruction for quad registers: the reciprocal
// square root refinement step taking src1 (encoded as Qn) and src2 (encoded
// as Qm). Requires NEON to be enabled.
void Assembler::vrsqrts(const QwNeonRegister dst, const QwNeonRegister src1,
|
||||
const QwNeonRegister src2) {
|
||||
DCHECK(IsEnabled(NEON));
|
||||
// Qd = vrsqrts(Qn, Qm) SIMD reciprocal square root refinement step.
|
||||
// Instruction details available in ARM DDI 0406C.b, A8-1040.
|
||||
// Split each register code into the two values that are placed at separate
// bit positions in the encoding below (vd/d, vn/n and vm/m).
int vd, d;
|
||||
dst.split_code(&vd, &d);
|
||||
int vn, n;
|
||||
src1.split_code(&vn, &n);
|
||||
int vm, m;
|
||||
src2.split_code(&vm, &m);
|
||||
// Same pattern as vrecps with B21 additionally set; the disassembler tells
// the two instructions apart by bit 21.
emit(0x1E4U * B23 | d * B22 | B21 | vn * B16 | vd * B12 | 0xF * B8 | n * B7 |
|
||||
B6 | m * B5 | B4 | vm);
|
||||
}
|
||||
|
||||
void Assembler::vtst(NeonSize size, QwNeonRegister dst,
|
||||
const QwNeonRegister src1, const QwNeonRegister src2) {
|
||||
DCHECK(IsEnabled(NEON));
|
||||
|
@ -1380,6 +1380,13 @@ class Assembler : public AssemblerBase {
|
||||
const QwNeonRegister src2);
|
||||
void vmul(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
|
||||
const QwNeonRegister src2);
|
||||
// vrecpe and vrsqrte only support floating point lanes.
|
||||
// Reciprocal estimate of src into dst (the simulator computes 1.0f / src for
// each of the four lanes).
void vrecpe(const QwNeonRegister dst, const QwNeonRegister src);
|
||||
// Reciprocal square root estimate of src into dst (the simulator computes
// 1.0f / sqrt(src) for each of the four lanes).
void vrsqrte(const QwNeonRegister dst, const QwNeonRegister src);
|
||||
// Reciprocal refinement step (the simulator computes 2.0f - src1 * src2 for
// each of the four lanes).
void vrecps(const QwNeonRegister dst, const QwNeonRegister src1,
|
||||
const QwNeonRegister src2);
|
||||
// Reciprocal square root refinement step (the simulator computes
// (3.0f - src1 * src2) * 0.5f for each of the four lanes).
void vrsqrts(const QwNeonRegister dst, const QwNeonRegister src1,
|
||||
const QwNeonRegister src2);
|
||||
void vtst(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
|
||||
const QwNeonRegister src2);
|
||||
void vceq(const QwNeonRegister dst, const QwNeonRegister src1,
|
||||
|
@ -1899,6 +1899,15 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
|
||||
// vceq.f32 Qd, Qm, Qn.
|
||||
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
|
||||
"vceq.f32 q%d, q%d, q%d", Vd, Vn, Vm);
|
||||
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xf &&
|
||||
instr->Bit(6) == 1 && instr->Bit(4) == 1) {
|
||||
int Vd = instr->VFPDRegValue(kSimd128Precision);
|
||||
int Vm = instr->VFPMRegValue(kSimd128Precision);
|
||||
int Vn = instr->VFPNRegValue(kSimd128Precision);
|
||||
const char* op = instr->Bit(21) == 0 ? "vrecps" : "vrsqrts";
|
||||
// vrecps/vrsqrts.f32 Qd, Qm, Qn.
|
||||
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
|
||||
"%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm);
|
||||
} else {
|
||||
Unknown(instr);
|
||||
}
|
||||
@ -2081,6 +2090,13 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
|
||||
} else {
|
||||
Unknown(instr);
|
||||
}
|
||||
} else if (instr->Bits(19, 18) == 0x2 && instr->Bits(11, 8) == 0x5) {
|
||||
int Vd = instr->VFPDRegValue(kSimd128Precision);
|
||||
int Vm = instr->VFPMRegValue(kSimd128Precision);
|
||||
const char* op = instr->Bit(7) == 0 ? "vrecpe" : "vrsqrte";
|
||||
// vrecpe/vrsqrte.f32 Qd, Qm.
|
||||
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
|
||||
"%s.f32 q%d, q%d", op, Vd, Vm);
|
||||
} else {
|
||||
Unknown(instr);
|
||||
}
|
||||
|
@ -3993,7 +3993,26 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
|
||||
dst[i] = (src1[i] == src2[i]) ? 0xFFFFFFFF : 0;
|
||||
}
|
||||
set_q_register(Vd, dst);
|
||||
|
||||
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xf &&
|
||||
instr->Bit(6) == 1 && instr->Bit(4) == 1) {
|
||||
int Vd = instr->VFPDRegValue(kSimd128Precision);
|
||||
int Vm = instr->VFPMRegValue(kSimd128Precision);
|
||||
int Vn = instr->VFPNRegValue(kSimd128Precision);
|
||||
float src1[4], src2[4];
|
||||
get_q_register(Vn, src1);
|
||||
get_q_register(Vm, src2);
|
||||
if (instr->Bit(21) == 0) {
|
||||
// vrecps.f32 Qd, Qm, Qn.
|
||||
for (int i = 0; i < 4; i++) {
|
||||
src1[i] = 2.0f - src1[i] * src2[i];
|
||||
}
|
||||
} else {
|
||||
// vrsqrts.f32 Qd, Qm, Qn.
|
||||
for (int i = 0; i < 4; i++) {
|
||||
src1[i] = (3.0f - src1[i] * src2[i]) * 0.5f;
|
||||
}
|
||||
}
|
||||
set_q_register(Vd, src1);
|
||||
} else {
|
||||
UNIMPLEMENTED();
|
||||
}
|
||||
@ -4526,6 +4545,30 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
|
||||
} else {
|
||||
UNIMPLEMENTED();
|
||||
}
|
||||
} else if (instr->Bits(19, 18) == 0x2 && instr->Bits(11, 8) == 0x5) {
|
||||
// vrecpe/vrsqrte.f32 Qd, Qm.
|
||||
int Vd = instr->VFPDRegValue(kSimd128Precision);
|
||||
int Vm = instr->VFPMRegValue(kSimd128Precision);
|
||||
uint32_t src[4];
|
||||
get_q_register(Vm, src);
|
||||
if (instr->Bit(7) == 0) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
float denom = bit_cast<float>(src[i]);
|
||||
div_zero_vfp_flag_ = (denom == 0);
|
||||
float result = 1.0f / denom;
|
||||
result = canonicalizeNaN(result);
|
||||
src[i] = bit_cast<uint32_t>(result);
|
||||
}
|
||||
} else {
|
||||
lazily_initialize_fast_sqrt(isolate_);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
float radicand = bit_cast<float>(src[i]);
|
||||
float result = 1.0f / fast_sqrt(radicand, isolate_);
|
||||
result = canonicalizeNaN(result);
|
||||
src[i] = bit_cast<uint32_t>(result);
|
||||
}
|
||||
}
|
||||
set_q_register(Vd, src);
|
||||
} else {
|
||||
UNIMPLEMENTED();
|
||||
}
|
||||
|
@ -1289,6 +1289,7 @@ TEST(15) {
|
||||
uint32_t vadd8[4], vadd16[4], vadd32[4];
|
||||
uint32_t vsub8[4], vsub16[4], vsub32[4];
|
||||
uint32_t vmul8[4], vmul16[4], vmul32[4];
|
||||
float vrecpe[4], vrecps[4], vrsqrte[4], vrsqrts[4];
|
||||
uint32_t vtst[4], vceq[4], vceqf[4], vbsl[4];
|
||||
uint32_t vext[4];
|
||||
uint32_t vzip8a[4], vzip8b[4], vzip16a[4], vzip16b[4], vzip32a[4],
|
||||
@ -1482,6 +1483,34 @@ TEST(15) {
|
||||
__ vmul(q1, q1, q0);
|
||||
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmulf))));
|
||||
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
|
||||
// vrecpe.
|
||||
__ vmov(s4, 2.0);
|
||||
__ vdup(q0, s4);
|
||||
__ vrecpe(q1, q0);
|
||||
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrecpe))));
|
||||
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
|
||||
// vrecps.
|
||||
__ vmov(s4, 2.0);
|
||||
__ vdup(q0, s4);
|
||||
__ vmov(s4, 1.5);
|
||||
__ vdup(q1, s4);
|
||||
__ vrecps(q1, q0, q1);
|
||||
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrecps))));
|
||||
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
|
||||
// vrsqrte.
|
||||
__ vmov(s4, 4.0);
|
||||
__ vdup(q0, s4);
|
||||
__ vrsqrte(q1, q0);
|
||||
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrsqrte))));
|
||||
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
|
||||
// vrsqrts.
|
||||
__ vmov(s4, 2.0);
|
||||
__ vdup(q0, s4);
|
||||
__ vmov(s4, 2.5);
|
||||
__ vdup(q1, s4);
|
||||
__ vrsqrts(q1, q0, q1);
|
||||
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrsqrts))));
|
||||
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
|
||||
// vceq (float).
|
||||
__ vmov(s4, 1.0);
|
||||
__ vdup(q0, s4);
|
||||
@ -1750,6 +1779,10 @@ TEST(15) {
|
||||
CHECK_EQ_SPLAT(vaddf, 2.0);
|
||||
CHECK_EQ_SPLAT(vsubf, -1.0);
|
||||
CHECK_EQ_SPLAT(vmulf, 4.0);
|
||||
CHECK_EQ_SPLAT(vrecpe, 0.5f); // 1 / 2
|
||||
CHECK_EQ_SPLAT(vrecps, -1.0f); // 2 - (2 * 1.5)
|
||||
CHECK_EQ_SPLAT(vrsqrte, 0.5f); // 1 / sqrt(4)
|
||||
CHECK_EQ_SPLAT(vrsqrts, -1.0f); // (3 - (2 * 2.5)) / 2
|
||||
CHECK_EQ_SPLAT(vceqf, 0xffffffffu);
|
||||
CHECK_EQ_SPLAT(vadd8, 0x03030303u);
|
||||
CHECK_EQ_SPLAT(vadd16, 0x00030003u);
|
||||
|
@ -1049,6 +1049,14 @@ TEST(Neon) {
|
||||
"f2142970 vmul.i16 q1, q2, q8");
|
||||
COMPARE(vmul(Neon32, q15, q0, q8),
|
||||
"f260e970 vmul.i32 q15, q0, q8");
|
||||
COMPARE(vrecpe(q15, q0),
|
||||
"f3fbe540 vrecpe.f32 q15, q0");
|
||||
COMPARE(vrecps(q15, q0, q8),
|
||||
"f240ef70 vrecps.f32 q15, q0, q8");
|
||||
COMPARE(vrsqrte(q15, q0),
|
||||
"f3fbe5c0 vrsqrte.f32 q15, q0");
|
||||
COMPARE(vrsqrts(q15, q0, q8),
|
||||
"f260ef70 vrsqrts.f32 q15, q0, q8");
|
||||
COMPARE(vtst(Neon8, q0, q1, q2),
|
||||
"f2020854 vtst.i8 q0, q1, q2");
|
||||
COMPARE(vtst(Neon16, q1, q2, q8),
|
||||
|
Loading…
Reference in New Issue
Block a user