[ARM] Add Neon shift instructions vshl, vshr.

LOG=N
BUG=v8:4124

Review-Url: https://codereview.chromium.org/2629223005
Cr-Commit-Position: refs/heads/master@{#42610}
This commit is contained in:
bbudge 2017-01-23 10:24:27 -08:00 committed by Commit bot
parent f9f5fc31fd
commit 67244dcef1
7 changed files with 247 additions and 9 deletions

View File

@ -4412,6 +4412,48 @@ void Assembler::vmax(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
emit(EncodeNeonBinOp(VMAX, dt, dst, src1, src2));
}
// Identifies which NEON shift-by-immediate instruction is being encoded.
enum NeonShiftOp {
  VSHL,
  VSHR
};
// Builds the instruction word for a NEON shift-by-immediate (vshl / vshr)
// operating on Q registers. |dt| determines the lane width and, for vshr,
// the U bit; |shift| is the immediate shift amount in bits.
static Instr EncodeNeonShiftOp(NeonShiftOp op, NeonDataType dt,
                               QwNeonRegister dst, QwNeonRegister src,
                               int shift) {
  int vd, d;
  dst.split_code(&vd, &d);
  int vm, m;
  src.split_code(&vm, &m);

  const int lane_bits = kBitsPerByte << NeonSz(dt);
  int imm6 = 0;
  int op_bits = 0;
  if (op != VSHL) {
    DCHECK_EQ(VSHR, op);
    // Right shifts range over [1, lane_bits] and are stored as
    // 2 * lane_bits - shift.
    DCHECK(shift > 0 && size_in_bits_check(lane_bits, shift));
    imm6 = 2 * lane_bits - shift;
    op_bits = NeonU(dt) * B24;
  } else {
    // Left shifts range over [0, lane_bits) and are stored as
    // lane_bits + shift.
    DCHECK(shift >= 0 && lane_bits > shift);
    imm6 = lane_bits + shift;
    op_bits = 0x5 * B8;
  }

  return 0x1E5U * B23 | d * B22 | imm6 * B16 | vd * B12 | B6 | m * B5 | B4 |
         vm | op_bits;
}
// Qd = vshl(Qm, bits) SIMD shift left immediate.
// Instruction details available in ARM DDI 0406C.b, A8-1046.
void Assembler::vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
                     int shift) {
  DCHECK(IsEnabled(NEON));
  const Instr instr = EncodeNeonShiftOp(VSHL, dt, dst, src, shift);
  emit(instr);
}
// Emits a NEON shift-right-by-immediate on Q registers. |dt| selects the lane
// size and signedness, |shift| is the shift amount in bits.
void Assembler::vshr(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
                     int shift) {
  DCHECK(IsEnabled(NEON));
  // Qd = vshr(Qm, bits) SIMD shift right immediate.
  // Instruction details available in ARM DDI 0406C.b, A8-1052.
  emit(EncodeNeonShiftOp(VSHR, dt, dst, src, shift));
}
static Instr EncodeNeonEstimateOp(bool is_rsqrt, QwNeonRegister dst,
QwNeonRegister src) {
int vd, d;

View File

@ -1387,6 +1387,8 @@ class Assembler : public AssemblerBase {
void vmax(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
void vmax(NeonDataType dt, QwNeonRegister dst,
QwNeonRegister src1, QwNeonRegister src2);
// vshl and vshr shift src by an immediate number of bits (shift) and write
// the result to dst; dt selects the data type of the operation.
void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift);
void vshr(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift);
// vrecpe and vrsqrte only support floating point lanes.
void vrecpe(QwNeonRegister dst, QwNeonRegister src);
void vrsqrte(QwNeonRegister dst, QwNeonRegister src);

View File

@ -1973,6 +1973,24 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vext.8 q%d, q%d, q%d, #%d",
Vd, Vn, Vm, imm4);
} else if (instr->Bits(11, 7) == 0xA && instr->Bit(4) == 1) {
// vshl.i<size> Qd, Qm, shift
int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
int shift = instr->Bits(21, 16) - size;
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vshl.i%d q%d, q%d, #%d",
size, Vd, Vm, shift);
} else if (instr->Bits(11, 7) == 0 && instr->Bit(4) == 1) {
// vshr.s<size> Qd, Qm, shift
int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
int shift = 2 * size - instr->Bits(21, 16);
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vshr.s%d q%d, q%d, #%d",
size, Vd, Vm, shift);
} else {
Unknown(instr);
}
@ -2162,15 +2180,24 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
Unknown(instr);
}
} else if (instr->Bits(19, 18) == 0x2 && instr->Bits(11, 8) == 0x5) {
// vrecpe/vrsqrte.f32 Qd, Qm.
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
const char* op = instr->Bit(7) == 0 ? "vrecpe" : "vrsqrte";
// vrecpe/vrsqrte.f32 Qd, Qm.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"%s.f32 q%d, q%d", op, Vd, Vm);
} else {
Unknown(instr);
}
} else if (instr->Bits(11, 7) == 0 && instr->Bit(4) == 1) {
// vshr.u<size> Qd, Qm, shift
int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
int shift = 2 * size - instr->Bits(21, 16);
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vshr.u%d q%d, q%d, #%d",
size, Vd, Vm, shift);
} else {
Unknown(instr);
}

View File

@ -573,7 +573,6 @@ static bool AllOnOnePage(uintptr_t start, int size) {
return start_page == end_page;
}
void Simulator::set_last_debugger_input(char* input) {
DeleteArray(last_debugger_input_);
last_debugger_input_ = input;
@ -4355,6 +4354,84 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
dst[i] = src2[i - boundary];
}
set_q_register(Vd, dst);
} else if (instr->Bits(11, 7) == 0xA && instr->Bit(4) == 1) {
// vshl.i<size> Qd, Qm, shift
int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
int shift = instr->Bits(21, 16) - size;
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
NeonSize ns = static_cast<NeonSize>(size / 16);
switch (ns) {
case Neon8: {
uint8_t src[16];
get_q_register(Vm, src);
for (int i = 0; i < 16; i++) {
src[i] <<= shift;
}
set_q_register(Vd, src);
break;
}
case Neon16: {
uint16_t src[8];
get_q_register(Vm, src);
for (int i = 0; i < 8; i++) {
src[i] <<= shift;
}
set_q_register(Vd, src);
break;
}
case Neon32: {
uint32_t src[4];
get_q_register(Vm, src);
for (int i = 0; i < 4; i++) {
src[i] <<= shift;
}
set_q_register(Vd, src);
break;
}
default:
UNREACHABLE();
break;
}
} else if (instr->Bits(11, 7) == 0 && instr->Bit(4) == 1) {
// vshr.s<size> Qd, Qm, shift
int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
int shift = 2 * size - instr->Bits(21, 16);
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
NeonSize ns = static_cast<NeonSize>(size / 16);
switch (ns) {
case Neon8: {
int8_t src[16];
get_q_register(Vm, src);
for (int i = 0; i < 16; i++) {
src[i] = ArithmeticShiftRight(src[i], shift);
}
set_q_register(Vd, src);
break;
}
case Neon16: {
int16_t src[8];
get_q_register(Vm, src);
for (int i = 0; i < 8; i++) {
src[i] = ArithmeticShiftRight(src[i], shift);
}
set_q_register(Vd, src);
break;
}
case Neon32: {
int32_t src[4];
get_q_register(Vm, src);
for (int i = 0; i < 4; i++) {
src[i] = ArithmeticShiftRight(src[i], shift);
}
set_q_register(Vd, src);
break;
}
default:
UNREACHABLE();
break;
}
} else {
UNIMPLEMENTED();
}
@ -4993,6 +5070,45 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
} else {
UNIMPLEMENTED();
}
} else if (instr->Bits(11, 7) == 0 && instr->Bit(4) == 1) {
// vshr.u<size> Qd, Qm, shift
int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
int shift = 2 * size - instr->Bits(21, 16);
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
NeonSize ns = static_cast<NeonSize>(size / 16);
switch (ns) {
case Neon8: {
uint8_t src[16];
get_q_register(Vm, src);
for (int i = 0; i < 16; i++) {
src[i] >>= shift;
}
set_q_register(Vd, src);
break;
}
case Neon16: {
uint16_t src[8];
get_q_register(Vm, src);
for (int i = 0; i < 8; i++) {
src[i] >>= shift;
}
set_q_register(Vd, src);
break;
}
case Neon32: {
uint32_t src[4];
get_q_register(Vm, src);
for (int i = 0; i < 4; i++) {
src[i] >>= shift;
}
set_q_register(Vd, src);
break;
}
default:
UNREACHABLE();
break;
}
} else {
UNIMPLEMENTED();
}

View File

@ -137,15 +137,20 @@ inline int MostSignificantBit(uint32_t x) {
return nibble + msb4[x];
}
// The C++ standard leaves the semantics of '>>' undefined for
// negative signed operands. Most implementations do the right thing,
// though.
inline int ArithmeticShiftRight(int x, int s) {
return x >> s;
// Shifts |x| right by |shift| bits, filling the vacated high bits with copies
// of the sign bit (arithmetic shift). |shift| must be non-negative. A shift
// of at least the bit width of T yields the saturated result: -1 for negative
// |x| and 0 otherwise.
template <typename T>
static T ArithmeticShiftRight(T x, int shift) {
  DCHECK_LE(0, shift);
  // Shifting by the operand width or more is undefined behavior in C++, so
  // handle it explicitly (8 is the number of bits per byte). Callers may
  // legitimately ask for a shift equal to the full width of T.
  const int bits_in_t = static_cast<int>(sizeof(T) * 8);
  if (shift >= bits_in_t) {
    return x < 0 ? static_cast<T>(-1) : static_cast<T>(0);
  }
  if (x < 0) {
    // Right shift of signed values is implementation defined. Simulate a
    // true arithmetic right shift by adding leading sign bits.
    using UnsignedT = typename std::make_unsigned<T>::type;
    // |mask| has ones in exactly the top |shift| bit positions.
    UnsignedT mask = ~(static_cast<UnsignedT>(~0) >> shift);
    return (static_cast<UnsignedT>(x) >> shift) | mask;
  } else {
    return x >> shift;
  }
}
template <typename T>
int Compare(const T& a, const T& b) {
if (a == b)

View File

@ -1301,6 +1301,8 @@ TEST(15) {
uint32_t vadd8[4], vadd16[4], vadd32[4];
uint32_t vsub8[4], vsub16[4], vsub32[4];
uint32_t vmul8[4], vmul16[4], vmul32[4];
// Result buffers for the vshl / vshr tests. Each one receives a single
// 128-bit Q register, i.e. four 32-bit words.
uint32_t vshl8[4], vshl16[4], vshl32[4];
uint32_t vshr_s8[4], vshr_u16[4], vshr_s32[4];
uint32_t vceq[4], vceqf[4], vcgef[4], vcgtf[4];
uint32_t vcge_s8[4], vcge_u16[4], vcge_s32[4];
uint32_t vcgt_s8[4], vcgt_u16[4], vcgt_s32[4];
@ -1671,6 +1673,32 @@ TEST(15) {
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmul32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vshl.
__ mov(r4, Operand(0x55));
__ vdup(Neon8, q0, r4);
__ vshl(NeonS8, q1, q0, 1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshl8))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ vshl(NeonU16, q1, q0, 9);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshl16))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ vshl(NeonS32, q1, q0, 17);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshl32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vshr.s, vshr.u.
__ mov(r4, Operand(0x80));
__ vdup(Neon8, q0, r4);
__ vshr(NeonS8, q1, q0, 1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshr_s8))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ vshr(NeonU16, q1, q0, 9);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshr_u16))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ vshr(NeonS32, q1, q0, 17);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshr_s32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vceq.
__ mov(r4, Operand(0x03));
__ vdup(Neon8, q0, r4);
@ -1926,6 +1954,12 @@ TEST(15) {
CHECK_EQ_SPLAT(vmul8, 0x04040404u);
CHECK_EQ_SPLAT(vmul16, 0x00040004u);
CHECK_EQ_SPLAT(vmul32, 0x00000004u);
CHECK_EQ_SPLAT(vshl8, 0xaaaaaaaau);
CHECK_EQ_SPLAT(vshl16, 0xaa00aa00u);
CHECK_EQ_SPLAT(vshl32, 0xaaaa0000u);
CHECK_EQ_SPLAT(vshr_s8, 0xc0c0c0c0u);
CHECK_EQ_SPLAT(vshr_u16, 0x00400040u);
CHECK_EQ_SPLAT(vshr_s32, 0xffffc040u);
CHECK_EQ_SPLAT(vceq, 0x00ff00ffu);
// [0, 3, 0, 3, ...] >= [3, 3, 3, 3, ...]
CHECK_EQ_SPLAT(vcge_s8, 0x00ff00ffu);

View File

@ -1063,6 +1063,18 @@ TEST(Neon) {
"f2142970 vmul.i16 q1, q2, q8");
COMPARE(vmul(Neon32, q15, q0, q8),
"f260e970 vmul.i32 q15, q0, q8");
COMPARE(vshl(NeonS8, q15, q0, 6),
"f2cee550 vshl.i8 q15, q0, #6");
COMPARE(vshl(NeonU16, q15, q0, 10),
"f2dae550 vshl.i16 q15, q0, #10");
COMPARE(vshl(NeonS32, q15, q0, 17),
"f2f1e550 vshl.i32 q15, q0, #17");
COMPARE(vshr(NeonS8, q15, q0, 6),
"f2cae050 vshr.s8 q15, q0, #6");
COMPARE(vshr(NeonU16, q15, q0, 10),
"f3d6e050 vshr.u16 q15, q0, #10");
COMPARE(vshr(NeonS32, q15, q0, 17),
"f2efe050 vshr.s32 q15, q0, #17");
COMPARE(vrecpe(q15, q0),
"f3fbe540 vrecpe.f32 q15, q0");
COMPARE(vrecps(q15, q0, q8),