[ARM] Add vrecpe, vrecps, vrsqrte, vrsqrts instructions to assembler.

- Disassembler, simulator support too.
LOG=N
BUG=v8:4124

Review-Url: https://codereview.chromium.org/2600153002
Cr-Commit-Position: refs/heads/master@{#42176}
This commit is contained in:
bbudge 2017-01-10 04:36:59 -08:00 committed by Commit bot
parent fc86d4329b
commit 8dfea24e3d
6 changed files with 162 additions and 1 deletions

View File

@ -4241,6 +4241,60 @@ void Assembler::vmul(NeonSize size, QwNeonRegister dst,
n * B7 | B6 | m * B5 | B4 | vm);
}
// Emits the quadword NEON vrecpe.f32 instruction: |dst| receives the
// reciprocal estimate of |src|. Requires NEON to be enabled.
void Assembler::vrecpe(const QwNeonRegister dst, const QwNeonRegister src) {
  DCHECK(IsEnabled(NEON));
  // Qd = vrecpe(Qm) SIMD reciprocal estimate.
  // Instruction details available in ARM DDI 0406C.b, A8-1024.
  int vd, d;
  dst.split_code(&vd, &d);
  int vm, m;
  src.split_code(&vm, &m);
  // Bit 7 is left clear here; vrsqrte uses the same pattern with bit 7 set.
  emit(0x1E7U * B23 | d * B22 | 0x3B * B16 | vd * B12 | 0x5 * B8 | B6 | m * B5 |
       vm);
}
void Assembler::vrsqrte(const QwNeonRegister dst, const QwNeonRegister src) {
DCHECK(IsEnabled(NEON));
// Qd = vadd(Qn, Qm) SIMD reciprocal square root estimate.
// Instruction details available in ARM DDI 0406C.b, A8-1038.
int vd, d;
dst.split_code(&vd, &d);
int vm, m;
src.split_code(&vm, &m);
emit(0x1E7U * B23 | d * B22 | 0x3B * B16 | vd * B12 | 0x5 * B8 | 0x3 * B6 |
m * B5 | vm);
}
// Emits the quadword NEON vrecps.f32 instruction: |dst| receives the
// reciprocal refinement step computed from |src1| and |src2|. Requires NEON
// to be enabled.
void Assembler::vrecps(const QwNeonRegister dst, const QwNeonRegister src1,
                       const QwNeonRegister src2) {
  DCHECK(IsEnabled(NEON));
  // Qd = vrecps(Qn, Qm) SIMD reciprocal refinement step.
  // Instruction details available in ARM DDI 0406C.b, A8-1026.
  int vd, d;
  dst.split_code(&vd, &d);
  int vn, n;
  src1.split_code(&vn, &n);
  int vm, m;
  src2.split_code(&vm, &m);
  // Bit 21 is left clear here; vrsqrts uses the same pattern with bit 21 set.
  emit(0x1E4U * B23 | d * B22 | vn * B16 | vd * B12 | 0xF * B8 | n * B7 | B6 |
       m * B5 | B4 | vm);
}
// Emits the quadword NEON vrsqrts.f32 instruction: |dst| receives the
// reciprocal square root refinement step computed from |src1| and |src2|.
// Requires NEON to be enabled.
void Assembler::vrsqrts(const QwNeonRegister dst, const QwNeonRegister src1,
                        const QwNeonRegister src2) {
  DCHECK(IsEnabled(NEON));
  // Qd = vrsqrts(Qn, Qm) SIMD reciprocal square root refinement step.
  // Instruction details available in ARM DDI 0406C.b, A8-1040.
  int vd, d;
  dst.split_code(&vd, &d);
  int vn, n;
  src1.split_code(&vn, &n);
  int vm, m;
  src2.split_code(&vm, &m);
  // Same pattern as vrecps, with bit 21 additionally set to select the
  // square root variant.
  emit(0x1E4U * B23 | d * B22 | B21 | vn * B16 | vd * B12 | 0xF * B8 | n * B7 |
       B6 | m * B5 | B4 | vm);
}
void Assembler::vtst(NeonSize size, QwNeonRegister dst,
const QwNeonRegister src1, const QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));

View File

@ -1380,6 +1380,13 @@ class Assembler : public AssemblerBase {
const QwNeonRegister src2);
void vmul(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
// Reciprocal and reciprocal square root instructions. None of these take a
// NeonSize argument: vrecpe, vrsqrte, vrecps and vrsqrts only support
// floating point lanes.
// vrecpe / vrsqrte: estimate of the reciprocal / reciprocal square root of
// src.
void vrecpe(const QwNeonRegister dst, const QwNeonRegister src);
void vrsqrte(const QwNeonRegister dst, const QwNeonRegister src);
// vrecps / vrsqrts: refinement steps taking two sources; per lane the
// simulator computes 2 - src1 * src2 and (3 - src1 * src2) / 2 respectively.
void vrecps(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vrsqrts(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vtst(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vceq(const QwNeonRegister dst, const QwNeonRegister src1,

View File

@ -1899,6 +1899,15 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
// vceq.f32 Qd, Qm, Qn.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vceq.f32 q%d, q%d, q%d", Vd, Vn, Vm);
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xf &&
instr->Bit(6) == 1 && instr->Bit(4) == 1) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
const char* op = instr->Bit(21) == 0 ? "vrecps" : "vrsqrts";
// vrecps/vrsqrts.f32 Qd, Qm, Qn.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm);
} else {
Unknown(instr);
}
@ -2081,6 +2090,13 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
} else {
Unknown(instr);
}
} else if (instr->Bits(19, 18) == 0x2 && instr->Bits(11, 8) == 0x5) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
const char* op = instr->Bit(7) == 0 ? "vrecpe" : "vrsqrte";
// vrecpe/vrsqrte.f32 Qd, Qm.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"%s.f32 q%d, q%d", op, Vd, Vm);
} else {
Unknown(instr);
}

View File

@ -3993,7 +3993,26 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
dst[i] = (src1[i] == src2[i]) ? 0xFFFFFFFF : 0;
}
set_q_register(Vd, dst);
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xf &&
instr->Bit(6) == 1 && instr->Bit(4) == 1) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
float src1[4], src2[4];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
if (instr->Bit(21) == 0) {
// vrecps.f32 Qd, Qm, Qn.
for (int i = 0; i < 4; i++) {
src1[i] = 2.0f - src1[i] * src2[i];
}
} else {
// vrsqrts.f32 Qd, Qm, Qn.
for (int i = 0; i < 4; i++) {
src1[i] = (3.0f - src1[i] * src2[i]) * 0.5f;
}
}
set_q_register(Vd, src1);
} else {
UNIMPLEMENTED();
}
@ -4526,6 +4545,30 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
} else {
UNIMPLEMENTED();
}
} else if (instr->Bits(19, 18) == 0x2 && instr->Bits(11, 8) == 0x5) {
// vrecpe/vrsqrte.f32 Qd, Qm.
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
uint32_t src[4];
get_q_register(Vm, src);
if (instr->Bit(7) == 0) {
for (int i = 0; i < 4; i++) {
float denom = bit_cast<float>(src[i]);
div_zero_vfp_flag_ = (denom == 0);
float result = 1.0f / denom;
result = canonicalizeNaN(result);
src[i] = bit_cast<uint32_t>(result);
}
} else {
lazily_initialize_fast_sqrt(isolate_);
for (int i = 0; i < 4; i++) {
float radicand = bit_cast<float>(src[i]);
float result = 1.0f / fast_sqrt(radicand, isolate_);
result = canonicalizeNaN(result);
src[i] = bit_cast<uint32_t>(result);
}
}
set_q_register(Vd, src);
} else {
UNIMPLEMENTED();
}

View File

@ -1289,6 +1289,7 @@ TEST(15) {
uint32_t vadd8[4], vadd16[4], vadd32[4];
uint32_t vsub8[4], vsub16[4], vsub32[4];
uint32_t vmul8[4], vmul16[4], vmul32[4];
float vrecpe[4], vrecps[4], vrsqrte[4], vrsqrts[4];
uint32_t vtst[4], vceq[4], vceqf[4], vbsl[4];
uint32_t vext[4];
uint32_t vzip8a[4], vzip8b[4], vzip16a[4], vzip16b[4], vzip32a[4],
@ -1482,6 +1483,34 @@ TEST(15) {
__ vmul(q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmulf))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vrecpe.
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vrecpe(q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrecpe))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vrecps.
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vmov(s4, 1.5);
__ vdup(q1, s4);
__ vrecps(q1, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrecps))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vrsqrte.
__ vmov(s4, 4.0);
__ vdup(q0, s4);
__ vrsqrte(q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrsqrte))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vrsqrts.
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vmov(s4, 2.5);
__ vdup(q1, s4);
__ vrsqrts(q1, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrsqrts))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vceq (float).
__ vmov(s4, 1.0);
__ vdup(q0, s4);
@ -1750,6 +1779,10 @@ TEST(15) {
CHECK_EQ_SPLAT(vaddf, 2.0);
CHECK_EQ_SPLAT(vsubf, -1.0);
CHECK_EQ_SPLAT(vmulf, 4.0);
CHECK_EQ_SPLAT(vrecpe, 0.5f); // 1 / 2
CHECK_EQ_SPLAT(vrecps, -1.0f); // 2 - (2 * 1.5)
CHECK_EQ_SPLAT(vrsqrte, 0.5f); // 1 / sqrt(4)
CHECK_EQ_SPLAT(vrsqrts, -1.0f); // (3 - (2 * 2.5)) / 2
CHECK_EQ_SPLAT(vceqf, 0xffffffffu);
CHECK_EQ_SPLAT(vadd8, 0x03030303u);
CHECK_EQ_SPLAT(vadd16, 0x00030003u);

View File

@ -1049,6 +1049,14 @@ TEST(Neon) {
"f2142970 vmul.i16 q1, q2, q8");
COMPARE(vmul(Neon32, q15, q0, q8),
"f260e970 vmul.i32 q15, q0, q8");
COMPARE(vrecpe(q15, q0),
"f3fbe540 vrecpe.f32 q15, q0");
COMPARE(vrecps(q15, q0, q8),
"f240ef70 vrecps.f32 q15, q0, q8");
COMPARE(vrsqrte(q15, q0),
"f3fbe5c0 vrsqrte.f32 q15, q0");
COMPARE(vrsqrts(q15, q0, q8),
"f260ef70 vrsqrts.f32 q15, q0, q8");
COMPARE(vtst(Neon8, q0, q1, q2),
"f2020854 vtst.i8 q0, q1, q2");
COMPARE(vtst(Neon16, q1, q2, q8),