[Turbofan] Add ARM NEON instructions for implementing SIMD.

- Adds NEON instructions to assembler, disassembler, simulator.
- Adds ExtractLane, ReplaceLane functions to macro assembler.

LOG=N
BUG=v8:4124

Review-Url: https://codereview.chromium.org/2546933002
Cr-Commit-Position: refs/heads/master@{#41737}
This commit is contained in:
bbudge 2016-12-15 10:15:23 -08:00 committed by Commit bot
parent 250e85f84a
commit 03f33f2e68
11 changed files with 1592 additions and 137 deletions

View File

@ -483,30 +483,6 @@ void NeonMemOperand::SetAlignment(int align) {
}
}
// Records |base| and translates |registers_count| (1 to 4) into the matching
// NeonListType value.
NeonListOperand::NeonListOperand(DoubleRegister base, int registers_count) {
  base_ = base;
  if (registers_count == 1) {
    type_ = nlt_1;
  } else if (registers_count == 2) {
    type_ = nlt_2;
  } else if (registers_count == 3) {
    type_ = nlt_3;
  } else if (registers_count == 4) {
    type_ = nlt_4;
  } else {
    // Any other count is a caller error; type_ is still assigned so the
    // member is never left uninitialized.
    UNREACHABLE();
    type_ = nlt_1;
  }
}
// -----------------------------------------------------------------------------
// Specific instructions, constants, and masks.
@ -2968,7 +2944,6 @@ void Assembler::vmov(const Register dst,
emit(cond | 0xE*B24 | B20 | sn*B16 | dst.code()*B12 | 0xA*B8 | n*B7 | B4);
}
// Type of data to read from or write to VFP register.
// Used as specifier in generic vcvt instruction.
enum VFPType { S32, U32, F32, F64 };
@ -3902,6 +3877,57 @@ void Assembler::vmovl(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src) {
(dt & NeonDataTypeSizeMask)*B19 | vd*B12 | 0xA*B8 | m*B5 | B4 | vm);
}
// Builds the lane-selection bits for the core register <-> scalar vmov
// instructions. The element size given by |dt| and the lane |index| are
// packed into a 4-bit opc1:opc2 value; its upper two bits are placed at B21
// and its lower two bits at B5.
static int EncodeScalar(NeonDataType dt, int index) {
  DCHECK_LE(0, index);
  int lane_selector = 0;
  switch (dt) {
    case NeonS8:
    case NeonU8: {
      // 8-bit lanes: 1xxx, index in the low three bits.
      DCHECK_GT(8, index);
      lane_selector = 0x8 | index;
      break;
    }
    case NeonS16:
    case NeonU16: {
      // 16-bit lanes: 0xx1, index in bits 2:1.
      DCHECK_GT(4, index);
      lane_selector = 0x1 | (index << 1);
      break;
    }
    case NeonS32:
    case NeonU32: {
      // 32-bit lanes: 0x00, index in bit 2.
      DCHECK_GT(2, index);
      lane_selector = index << 2;
      break;
    }
    default: {
      UNREACHABLE();
      break;
    }
  }
  const int opc1 = lane_selector >> 2;
  const int opc2 = lane_selector & 0x3;
  return opc1 * B21 | opc2 * B5;
}
void Assembler::vmov(NeonDataType dt, DwVfpRegister dst, int index,
                     Register src) {
  // vmov ARM core register to scalar: Dd[index] = Rt.
  // Instruction details available in ARM DDI 0406C.b, A8.8.940.
  // The check below only requires NEON for the 8- and 16-bit lane forms.
  DCHECK(dt == NeonS32 || dt == NeonU32 || IsEnabled(NEON));
  int dst_field, dst_bit;
  dst.split_code(&dst_field, &dst_bit);
  // Element size and lane index, already positioned in the opc1/opc2 fields.
  const int lane_bits = EncodeScalar(dt, index);
  const int register_bits = dst_field * B16 | src.code() * B12 | dst_bit * B7;
  emit(0xEEu * B24 | 0xB * B8 | B4 | register_bits | lane_bits);
}
void Assembler::vmov(NeonDataType dt, Register dst, DwVfpRegister src,
                     int index) {
  // Instruction details available in ARM DDI 0406C.b, A8.8.942.
  // vmov scalar to ARM core register: Rt = Dn[index].
  // The check below only requires NEON for the 8- and 16-bit lane forms.
  DCHECK(dt == NeonS32 || dt == NeonU32 || IsEnabled(NEON));
  int vn, n;
  src.split_code(&vn, &n);
  // Element size and lane index, already positioned in the opc1/opc2 fields.
  int opc1_opc2 = EncodeScalar(dt, index);
  // The U bit selects zero-extension (instead of sign-extension) of 8- and
  // 16-bit lanes. A 32-bit lane fills the whole core register, and the
  // architecture defines U == 1 with a 32-bit lane as UNDEFINED, so U is only
  // set for the narrower unsigned types. NeonU32 therefore encodes exactly
  // like NeonS32.
  const bool is_32bit_lane = (dt == NeonS32 || dt == NeonU32);
  int u = (!is_32bit_lane && (dt & NeonDataTypeUMask) != 0) ? 1 : 0;
  emit(0xEEu * B24 | u * B23 | B20 | vn * B16 | dst.code() * B12 | 0xB * B8 |
       n * B7 | B4 | opc1_opc2);
}
void Assembler::vmov(const QwNeonRegister dst, const QwNeonRegister src) {
DCHECK(IsEnabled(NEON));
// Instruction details available in ARM DDI 0406C.b, A8-938.
@ -3915,6 +3941,18 @@ void Assembler::vmov(const QwNeonRegister dst, const QwNeonRegister src) {
B6 | m * B5 | B4 | vm);
}
void Assembler::vmvn(const QwNeonRegister dst, const QwNeonRegister src) {
  // Emits the quad-register form of vmvn with |dst| as Qd and |src| as Qm.
  DCHECK(IsEnabled(NEON));
  // Instruction details available in ARM DDI 0406C.b, A8-966.
  DCHECK(VfpRegisterIsAvailable(dst));
  DCHECK(VfpRegisterIsAvailable(src));
  int dst_field, dst_bit;
  dst.split_code(&dst_field, &dst_bit);
  int src_field, src_bit;
  src.split_code(&src_field, &src_bit);
  const int register_bits =
      dst_bit * B22 | dst_field * B12 | src_bit * B5 | src_field;
  emit(0x1E7U * B23 | 3 * B20 | 0x17 * B6 | register_bits);
}
void Assembler::vswp(DwVfpRegister dst, DwVfpRegister src) {
// Instruction details available in ARM DDI 0406C.b, A8.8.418.
// 1111(31-28) | 00111(27-23) | D(22) | 110010(21-16) |
@ -3940,8 +3978,105 @@ void Assembler::vswp(QwNeonRegister dst, QwNeonRegister src) {
vm);
}
void Assembler::vdup(NeonSize size, const QwNeonRegister dst,
                     const Register src) {
  // vdup.<size> Qd, Rt. Always emitted with the 'al' condition.
  // Instruction details available in ARM DDI 0406C.b, A8-886.
  DCHECK(IsEnabled(NEON));
  // The element size is selected by two separate bits: b (at B22) is set for
  // 8-bit elements, e (at B5) for 16-bit elements, neither for 32-bit.
  int b_bit = 0;
  int e_bit = 0;
  switch (size) {
    case Neon8:
      b_bit = 1;
      break;
    case Neon16:
      e_bit = 1;
      break;
    case Neon32:
      // Both bits stay clear.
      break;
    default:
      UNREACHABLE();
      break;
  }
  int dst_field, dst_bit;
  dst.split_code(&dst_field, &dst_bit);
  const int register_bits = dst_field * B16 | src.code() * B12 | dst_bit * B7;
  emit(al | 0x1D * B23 | b_bit * B22 | B21 | 0xB * B8 | e_bit * B5 | B4 |
       register_bits);
}
void Assembler::vdup(const QwNeonRegister dst, const SwVfpRegister src) {
  // vdup Qd, Dm[index], with the s-register |src| addressed as one 32-bit
  // half of a d-register.
  // Instruction details available in ARM DDI 0406C.b, A8-884.
  DCHECK(IsEnabled(NEON));
  // s-register code 2k + i is lane i of d-register k.
  const int lane = src.code() & 1;
  const int d_code = src.code() / 2;
  const int imm4 = (lane << 3) | 4;  // esize = 32, index in bit 3.
  int dst_field, dst_bit;
  dst.split_code(&dst_field, &dst_bit);
  int src_field, src_bit;
  DwVfpRegister::from_code(d_code).split_code(&src_field, &src_bit);
  const int operand_bits = dst_bit * B22 | imm4 * B16 | dst_field * B12 |
                           src_bit * B5 | src_field;
  emit(0x1E7U * B23 | 0x3 * B20 | 0x18 * B7 | B6 | operand_bits);
}
// Encode NEON vcvt.src_type.dst_type instruction.
// Exactly one of |src_type| / |dst_type| must be F32; the other must be S32
// or U32. The 2-bit op field is: bit 1 set when converting from F32, bit 0
// set when the integer type is U32.
static Instr EncodeNeonVCVT(const VFPType dst_type, const QwNeonRegister dst,
                            const VFPType src_type, const QwNeonRegister src) {
  DCHECK(src_type != dst_type);
  DCHECK(src_type == F32 || dst_type == F32);
  // Instruction details available in ARM DDI 0406C.b, A8.8.868.
  int dst_field, dst_bit;
  dst.split_code(&dst_field, &dst_bit);
  int src_field, src_bit;
  src.split_code(&src_field, &src_bit);

  const bool from_float = (src_type == F32);
  const VFPType integer_type = from_float ? dst_type : src_type;
  DCHECK(integer_type == S32 || integer_type == U32);
  const int op = (from_float ? 2 : 0) | (integer_type == U32 ? 1 : 0);

  const int register_bits =
      dst_bit * B22 | dst_field * B12 | src_bit * B5 | src_field;
  return 0x1E7U * B23 | 0x3B * B16 | 0x3 * B9 | op * B7 | B6 | register_bits;
}
void Assembler::vcvt_f32_s32(const QwNeonRegister dst,
                             const QwNeonRegister src) {
  // Emits the vcvt encoding built for destination type F32 and source type
  // S32 on quad registers.
  DCHECK(IsEnabled(NEON));
  DCHECK(VfpRegisterIsAvailable(dst));
  DCHECK(VfpRegisterIsAvailable(src));
  const auto encoding = EncodeNeonVCVT(F32, dst, S32, src);
  emit(encoding);
}
void Assembler::vcvt_f32_u32(const QwNeonRegister dst,
                             const QwNeonRegister src) {
  // Emits the vcvt encoding built for destination type F32 and source type
  // U32 on quad registers.
  DCHECK(IsEnabled(NEON));
  DCHECK(VfpRegisterIsAvailable(dst));
  DCHECK(VfpRegisterIsAvailable(src));
  const auto encoding = EncodeNeonVCVT(F32, dst, U32, src);
  emit(encoding);
}
void Assembler::vcvt_s32_f32(const QwNeonRegister dst,
                             const QwNeonRegister src) {
  // Emits the vcvt encoding built for destination type S32 and source type
  // F32 on quad registers.
  DCHECK(IsEnabled(NEON));
  DCHECK(VfpRegisterIsAvailable(dst));
  DCHECK(VfpRegisterIsAvailable(src));
  const auto encoding = EncodeNeonVCVT(S32, dst, F32, src);
  emit(encoding);
}
void Assembler::vcvt_u32_f32(const QwNeonRegister dst,
                             const QwNeonRegister src) {
  // Emits the vcvt encoding built for destination type U32 and source type
  // F32 on quad registers.
  DCHECK(IsEnabled(NEON));
  DCHECK(VfpRegisterIsAvailable(dst));
  DCHECK(VfpRegisterIsAvailable(src));
  const auto encoding = EncodeNeonVCVT(U32, dst, F32, src);
  emit(encoding);
}
void Assembler::veor(DwVfpRegister dst, DwVfpRegister src1,
DwVfpRegister src2) {
// Dd = veor(Dn, Dm) 64 bit integer exclusive OR.
// Instruction details available in ARM DDI 0406C.b, A8.8.888.
DCHECK(IsEnabled(NEON));
int vd, d;
@ -3956,6 +4091,7 @@ void Assembler::veor(DwVfpRegister dst, DwVfpRegister src1,
void Assembler::veor(QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2) {
// Qd = veor(Qn, Qm) SIMD integer exclusive OR.
// Instruction details available in ARM DDI 0406C.b, A8.8.888.
DCHECK(IsEnabled(NEON));
int vd, d;
@ -3968,6 +4104,146 @@ void Assembler::veor(QwNeonRegister dst, QwNeonRegister src1,
m * B5 | B4 | vm);
}
void Assembler::vadd(QwNeonRegister dst, const QwNeonRegister src1,
                     const QwNeonRegister src2) {
  // Qd = vadd(Qn, Qm) SIMD floating point addition.
  // Instruction details available in ARM DDI 0406C.b, A8-830.
  DCHECK(IsEnabled(NEON));
  int dst_field, dst_bit;
  dst.split_code(&dst_field, &dst_bit);
  int lhs_field, lhs_bit;
  src1.split_code(&lhs_field, &lhs_bit);
  int rhs_field, rhs_bit;
  src2.split_code(&rhs_field, &rhs_bit);
  // Qd, Qn and Qm operand fields.
  const int register_bits = dst_bit * B22 | lhs_field * B16 | dst_field * B12 |
                            lhs_bit * B7 | rhs_bit * B5 | rhs_field;
  emit(0x1E4U * B23 | 0xD * B8 | B6 | register_bits);
}
void Assembler::vadd(NeonSize size, QwNeonRegister dst,
                     const QwNeonRegister src1, const QwNeonRegister src2) {
  // Qd = vadd(Qn, Qm) SIMD integer addition.
  // Instruction details available in ARM DDI 0406C.b, A8-828.
  DCHECK(IsEnabled(NEON));
  int dst_field, dst_bit;
  dst.split_code(&dst_field, &dst_bit);
  int lhs_field, lhs_bit;
  src1.split_code(&lhs_field, &lhs_bit);
  int rhs_field, rhs_bit;
  src2.split_code(&rhs_field, &rhs_bit);
  // The numeric value of |size| is used directly as the size field at B20.
  const int size_bits = static_cast<int>(size) * B20;
  const int register_bits = dst_bit * B22 | lhs_field * B16 | dst_field * B12 |
                            lhs_bit * B7 | rhs_bit * B5 | rhs_field;
  emit(0x1E4U * B23 | 0x8 * B8 | B6 | size_bits | register_bits);
}
void Assembler::vsub(QwNeonRegister dst, const QwNeonRegister src1,
                     const QwNeonRegister src2) {
  // Qd = vsub(Qn, Qm) SIMD floating point subtraction.
  // Instruction details available in ARM DDI 0406C.b, A8-1086.
  DCHECK(IsEnabled(NEON));
  int dst_field, dst_bit;
  dst.split_code(&dst_field, &dst_bit);
  int lhs_field, lhs_bit;
  src1.split_code(&lhs_field, &lhs_bit);
  int rhs_field, rhs_bit;
  src2.split_code(&rhs_field, &rhs_bit);
  // Qd, Qn and Qm operand fields.
  const int register_bits = dst_bit * B22 | lhs_field * B16 | dst_field * B12 |
                            lhs_bit * B7 | rhs_bit * B5 | rhs_field;
  // Same encoding as the floating point vadd except for B21.
  emit(0x1E4U * B23 | B21 | 0xD * B8 | B6 | register_bits);
}
void Assembler::vsub(NeonSize size, QwNeonRegister dst,
                     const QwNeonRegister src1, const QwNeonRegister src2) {
  // Qd = vsub(Qn, Qm) SIMD integer subtraction.
  // Instruction details available in ARM DDI 0406C.b, A8-1084.
  DCHECK(IsEnabled(NEON));
  int dst_field, dst_bit;
  dst.split_code(&dst_field, &dst_bit);
  int lhs_field, lhs_bit;
  src1.split_code(&lhs_field, &lhs_bit);
  int rhs_field, rhs_bit;
  src2.split_code(&rhs_field, &rhs_bit);
  // The numeric value of |size| is used directly as the size field at B20.
  const int size_bits = static_cast<int>(size) * B20;
  const int register_bits = dst_bit * B22 | lhs_field * B16 | dst_field * B12 |
                            lhs_bit * B7 | rhs_bit * B5 | rhs_field;
  emit(0x1E6U * B23 | 0x8 * B8 | B6 | size_bits | register_bits);
}
void Assembler::vtst(NeonSize size, QwNeonRegister dst,
                     const QwNeonRegister src1, const QwNeonRegister src2) {
  // Qd = vtst(Qn, Qm) SIMD test integer operands.
  // Instruction details available in ARM DDI 0406C.b, A8-1098.
  DCHECK(IsEnabled(NEON));
  int dst_field, dst_bit;
  dst.split_code(&dst_field, &dst_bit);
  int lhs_field, lhs_bit;
  src1.split_code(&lhs_field, &lhs_bit);
  int rhs_field, rhs_bit;
  src2.split_code(&rhs_field, &rhs_bit);
  // The numeric value of |size| is used directly as the size field at B20.
  const int size_bits = static_cast<int>(size) * B20;
  const int register_bits = dst_bit * B22 | lhs_field * B16 | dst_field * B12 |
                            lhs_bit * B7 | rhs_bit * B5 | rhs_field;
  // Same encoding as the integer vadd, with B4 additionally set.
  emit(0x1E4U * B23 | 0x8 * B8 | B6 | B4 | size_bits | register_bits);
}
void Assembler::vceq(NeonSize size, QwNeonRegister dst,
                     const QwNeonRegister src1, const QwNeonRegister src2) {
  // Qd = vceq(Qn, Qm) SIMD integer compare equal.
  // Instruction details available in ARM DDI 0406C.b, A8-844.
  DCHECK(IsEnabled(NEON));
  int dst_field, dst_bit;
  dst.split_code(&dst_field, &dst_bit);
  int lhs_field, lhs_bit;
  src1.split_code(&lhs_field, &lhs_bit);
  int rhs_field, rhs_bit;
  src2.split_code(&rhs_field, &rhs_bit);
  // The numeric value of |size| is used directly as the size field at B20.
  const int size_bits = static_cast<int>(size) * B20;
  const int register_bits = dst_bit * B22 | lhs_field * B16 | dst_field * B12 |
                            lhs_bit * B7 | rhs_bit * B5 | rhs_field;
  // Same encoding as the integer vsub, with B4 additionally set.
  emit(0x1E6U * B23 | 0x8 * B8 | B6 | B4 | size_bits | register_bits);
}
void Assembler::vbsl(QwNeonRegister dst, const QwNeonRegister src1,
                     const QwNeonRegister src2) {
  // Qd = vbsl(Qn, Qm) SIMD bitwise select.
  // Instruction details available in ARM DDI 0406C.b, A8-844.
  DCHECK(IsEnabled(NEON));
  int dst_field, dst_bit;
  dst.split_code(&dst_field, &dst_bit);
  int lhs_field, lhs_bit;
  src1.split_code(&lhs_field, &lhs_bit);
  int rhs_field, rhs_bit;
  src2.split_code(&rhs_field, &rhs_bit);
  const int kVbslOp = 1;  // Value of the op field (at B20) used for vbsl.
  const int register_bits = dst_bit * B22 | lhs_field * B16 | dst_field * B12 |
                            lhs_bit * B7 | rhs_bit * B5 | rhs_field;
  emit(0x1E6U * B23 | kVbslOp * B20 | 0x1 * B8 | B6 | B4 | register_bits);
}
// Encode NEON vtbl / vtbx instruction.
// |list| supplies the table: its base register goes into the Vn/N fields and
// its length() value into the two-bit field at B8. |vtbx| selects vtbx
// (op = 1) over vtbl (op = 0).
static Instr EncodeNeonVTB(const DwVfpRegister dst, const NeonListOperand& list,
                           const DwVfpRegister index, bool vtbx) {
  // Dd = vtbl(table, Dm) SIMD vector permute, zero at out of range indices.
  // Dd = vtbx(table, Dm) SIMD vector permute, skip out of range indices.
  // Instruction details for both available in ARM DDI 0406C.b, A8-1094.
  int dst_field, dst_bit;
  dst.split_code(&dst_field, &dst_bit);
  int table_field, table_bit;
  list.base().split_code(&table_field, &table_bit);
  int index_field, index_bit;
  index.split_code(&index_field, &index_bit);
  const int op = vtbx ? 1 : 0;  // vtbl = 0, vtbx = 1.
  const int register_bits = dst_bit * B22 | table_field * B16 |
                            dst_field * B12 | table_bit * B7 | index_bit * B5 |
                            index_field;
  return 0x1E7U * B23 | 0x3 * B20 | 0x2 * B10 | list.length() * B8 | op * B6 |
         register_bits;
}
void Assembler::vtbl(const DwVfpRegister dst, const NeonListOperand& list,
                     const DwVfpRegister index) {
  // Emits the vtbl form (vtbx flag false) of the shared vtbl/vtbx encoding.
  DCHECK(IsEnabled(NEON));
  const bool kIsVtbx = false;
  emit(EncodeNeonVTB(dst, list, index, kIsVtbx));
}
void Assembler::vtbx(const DwVfpRegister dst, const NeonListOperand& list,
                     const DwVfpRegister index) {
  // Emits the vtbx form (vtbx flag true) of the shared vtbl/vtbx encoding.
  DCHECK(IsEnabled(NEON));
  const bool kIsVtbx = true;
  emit(EncodeNeonVTB(dst, list, index, kIsVtbx));
}
// Pseudo instructions.
void Assembler::nop(int type) {
// ARMv6{K/T2} and v7 have an actual NOP instruction but it serializes

View File

@ -640,12 +640,26 @@ class NeonMemOperand BASE_EMBEDDED {
// Class NeonListOperand represents a list of NEON registers
class NeonListOperand BASE_EMBEDDED {
public:
explicit NeonListOperand(DoubleRegister base, int registers_count = 1);
explicit NeonListOperand(DoubleRegister base, int register_count = 1)
: base_(base), register_count_(register_count) {}
explicit NeonListOperand(QwNeonRegister q_reg)
: base_(q_reg.low()), register_count_(2) {}
DoubleRegister base() const { return base_; }
NeonListType type() const { return type_; }
int register_count() { return register_count_; }
int length() const { return register_count_ - 1; }
NeonListType type() const {
switch (register_count_) {
default: UNREACHABLE();
// Fall through.
case 1: return nlt_1;
case 2: return nlt_2;
case 3: return nlt_3;
case 4: return nlt_4;
}
}
private:
DoubleRegister base_;
NeonListType type_;
int register_count_;
};
@ -1149,6 +1163,8 @@ class Assembler : public AssemblerBase {
void vmov(const DwVfpRegister dst,
const DwVfpRegister src,
const Condition cond = al);
// TODO(bbudge) Replace uses of these with the more general core register to
// scalar register vmov's.
void vmov(const DwVfpRegister dst,
const VmovIndex index,
const Register src,
@ -1329,11 +1345,43 @@ class Assembler : public AssemblerBase {
const NeonMemOperand& dst);
void vmovl(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src);
// Only unconditional core <-> scalar moves are currently supported.
void vmov(NeonDataType dt, DwVfpRegister dst, int index, Register src);
void vmov(NeonDataType dt, Register dst, DwVfpRegister src, int index);
void vmov(const QwNeonRegister dst, const QwNeonRegister src);
void vmvn(const QwNeonRegister dst, const QwNeonRegister src);
void vswp(DwVfpRegister dst, DwVfpRegister src);
void vswp(QwNeonRegister dst, QwNeonRegister src);
// vdup conditional execution isn't supported.
void vdup(NeonSize size, const QwNeonRegister dst, const Register src);
void vdup(const QwNeonRegister dst, const SwVfpRegister src);
void vcvt_f32_s32(const QwNeonRegister dst, const QwNeonRegister src);
void vcvt_f32_u32(const QwNeonRegister dst, const QwNeonRegister src);
void vcvt_s32_f32(const QwNeonRegister dst, const QwNeonRegister src);
void vcvt_u32_f32(const QwNeonRegister dst, const QwNeonRegister src);
void veor(DwVfpRegister dst, DwVfpRegister src1, DwVfpRegister src2);
void veor(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
void vadd(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vadd(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vsub(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vsub(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vtst(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vceq(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vbsl(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vtbl(const DwVfpRegister dst, const NeonListOperand& list,
const DwVfpRegister index);
void vtbx(const DwVfpRegister dst, const NeonListOperand& list,
const DwVfpRegister index);
// Pseudo instructions

View File

@ -190,6 +190,7 @@ enum {
B7 = 1 << 7,
B8 = 1 << 8,
B9 = 1 << 9,
B10 = 1 << 10,
B12 = 1 << 12,
B16 = 1 << 16,
B17 = 1 << 17,
@ -218,7 +219,6 @@ enum {
kOff8Mask = (1 << 8) - 1
};
enum BarrierOption {
OSHLD = 0x1,
OSHST = 0x2,
@ -667,7 +667,7 @@ class Instruction {
private:
// Join split register codes, depending on single or double precision.
// Join split register codes, depending on register precision.
// four_bit is the position of the least-significant bit of the four
// bit specifier. one_bit is the position of the additional single bit
// specifier.

View File

@ -1419,6 +1419,9 @@ int Decoder::DecodeType7(Instruction* instr) {
// Sd = vsqrt(Sm)
// vmrs
// vmsr
// Qd = vdup.size(Qd, Rt)
// vmov.size: Dd[i] = Rt
// vmov.sign.size: Rt = Dn[i]
void Decoder::DecodeTypeVFP(Instruction* instr) {
VERIFY((instr->TypeValue() == 7) && (instr->Bit(24) == 0x0) );
VERIFY(instr->Bits(11, 9) == 0x5);
@ -1531,22 +1534,72 @@ void Decoder::DecodeTypeVFP(Instruction* instr) {
if ((instr->VCValue() == 0x0) &&
(instr->VAValue() == 0x0)) {
DecodeVMOVBetweenCoreAndSinglePrecisionRegisters(instr);
} else if ((instr->VLValue() == 0x0) &&
(instr->VCValue() == 0x1) &&
(instr->Bit(23) == 0x0)) {
} else if ((instr->VLValue() == 0x0) && (instr->VCValue() == 0x1)) {
if (instr->Bit(23) == 0) {
int opc1_opc2 = (instr->Bits(22, 21) << 2) | instr->Bits(6, 5);
if ((opc1_opc2 & 0xb) == 0) {
// NeonS32/NeonU32
if (instr->Bit(21) == 0x0) {
Format(instr, "vmov'cond.32 'Dd[0], 'rt");
} else {
Format(instr, "vmov'cond.32 'Dd[1], 'rt");
}
} else if ((instr->VLValue() == 0x1) &&
(instr->VCValue() == 0x1) &&
(instr->Bit(23) == 0x0)) {
} else {
int vd = instr->VFPNRegValue(kDoublePrecision);
int rt = instr->RtValue();
if ((opc1_opc2 & 0x8) != 0) {
// NeonS8 / NeonU8
int i = opc1_opc2 & 0x7;
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmov.8 d%d[%d], r%d", vd, i, rt);
} else if ((opc1_opc2 & 0x1) != 0) {
// NeonS16 / NeonU16
int i = (opc1_opc2 >> 1) & 0x3;
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmov.16 d%d[%d], r%d", vd, i, rt);
} else {
Unknown(instr);
}
}
} else {
int size = 32;
if (instr->Bit(5) != 0)
size = 16;
else if (instr->Bit(22) != 0)
size = 8;
int Vd = instr->VFPNRegValue(kSimd128Precision);
int Rt = instr->RtValue();
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vdup.%i q%d, r%d", size, Vd, Rt);
}
} else if ((instr->VLValue() == 0x1) && (instr->VCValue() == 0x1)) {
int opc1_opc2 = (instr->Bits(22, 21) << 2) | instr->Bits(6, 5);
if ((opc1_opc2 & 0xb) == 0) {
// NeonS32 / NeonU32
if (instr->Bit(21) == 0x0) {
Format(instr, "vmov'cond.32 'rt, 'Dd[0]");
} else {
Format(instr, "vmov'cond.32 'rt, 'Dd[1]");
}
} else {
const char* sign = instr->Bit(23) != 0 ? "u" : "s";
int rt = instr->RtValue();
int vn = instr->VFPNRegValue(kDoublePrecision);
if ((opc1_opc2 & 0x8) != 0) {
// NeonS8 / NeonU8
int i = opc1_opc2 & 0x7;
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmov.%s8 r%d, d%d[%d]", sign, rt, vn, i);
} else if ((opc1_opc2 & 0x1) != 0) {
// NeonS16 / NeonU16
int i = (opc1_opc2 >> 1) & 0x3;
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vmov.%s16 r%d, d%d[%d]",
sign, rt, vn, i);
} else {
Unknown(instr);
}
}
} else if ((instr->VCValue() == 0x0) &&
(instr->VAValue() == 0x7) &&
(instr->Bits(19, 16) == 0x1)) {
@ -1563,6 +1616,8 @@ void Decoder::DecodeTypeVFP(Instruction* instr) {
Format(instr, "vmrs'cond 'rt, FPSCR");
}
}
} else {
Unknown(instr); // Not used by V8.
}
}
}
@ -1809,6 +1864,25 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
int Vm = instr->VFPMRegValue(kSimd128Precision);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vmov q%d, q%d", Vd, Vm);
} else if (instr->Bits(11, 8) == 8) {
const char* op = (instr->Bit(4) == 0) ? "vadd" : "vtst";
int size = kBitsPerByte * (1 << instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
// vadd/vtst.i<size> Qd, Qn, Qm.
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s.i%d q%d, q%d, q%d", op,
size, Vd, Vn, Vm);
} else if (instr->Bits(11, 8) == 0xd && instr->Bit(4) == 0) {
const char* op = (instr->Bits(21, 20) == 0) ? "vadd" : "vsub";
int size = kBitsPerByte * (1 << instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
// vadd/vsub.f32 Qd, Qn, Qm.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm);
} else {
Unknown(instr);
}
@ -1828,7 +1902,28 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
}
break;
case 6:
if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 1 &&
if (instr->Bits(11, 8) == 8) {
int size = kBitsPerByte * (1 << instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
if (instr->Bit(4) == 0) {
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vsub.i%d q%d, q%d, q%d",
size, Vd, Vn, Vm);
} else {
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vceq.i%d q%d, q%d, q%d",
size, Vd, Vn, Vm);
}
} else if (instr->Bits(21, 20) == 1 && instr->Bits(11, 8) == 1 &&
instr->Bit(4) == 1) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vbsl q%d, q%d, q%d", Vd, Vn, Vm);
} else if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 1 &&
instr->Bit(4) == 1) {
if (instr->Bit(6) == 0) {
// veor Dd, Dn, Dm
@ -1860,6 +1955,35 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
int imm3 = instr->Bits(21, 19);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmovl.u%d q%d, d%d", imm3*8, Vd, Vm);
} else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0 &&
instr->Bits(11, 6) == 0x17 && instr->Bit(4) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vmvn q%d, q%d", Vd, Vm);
} else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0xB &&
instr->Bits(11, 9) == 0x3 && instr->Bit(6) == 1 &&
instr->Bit(4) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
const char* suffix = nullptr;
int op = instr->Bits(8, 7);
switch (op) {
case 0:
suffix = "f32.s32";
break;
case 1:
suffix = "f32.u32";
break;
case 2:
suffix = "s32.f32";
break;
case 3:
suffix = "u32.f32";
break;
}
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vcvt.%s q%d, q%d", suffix, Vd, Vm);
} else if ((instr->Bits(21, 16) == 0x32) && (instr->Bits(11, 7) == 0) &&
(instr->Bit(4) == 0)) {
if (instr->Bit(6) == 0) {
@ -1873,6 +1997,26 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vswp q%d, q%d", Vd, Vm);
}
} else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 7) == 0x18 &&
instr->Bit(4) == 0x0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
int index = instr->Bit(19);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vdup q%d, d%d[%d]", Vd, Vm, index);
} else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 10) == 0x2 &&
instr->Bit(4) == 0x0) {
int Vd = instr->VFPDRegValue(kDoublePrecision);
int Vn = instr->VFPNRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
int len = instr->Bits(9, 8);
NeonListOperand list(DwVfpRegister::from_code(Vn), len + 1);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s d%d, ",
instr->Bit(6) == 0 ? "vtbl.8" : "vtbx.8", Vd);
FormatNeonList(Vn, list.type());
Print(", ");
PrintDRegister(Vm);
} else {
Unknown(instr);
}

View File

@ -1081,8 +1081,8 @@ void MacroAssembler::VmovLow(DwVfpRegister dst, Register src) {
}
void MacroAssembler::VmovExtended(Register dst, int src_code) {
DCHECK_LE(32, src_code);
DCHECK_GT(64, src_code);
DCHECK_LE(SwVfpRegister::kMaxNumRegisters, src_code);
DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, src_code);
if (src_code & 0x1) {
VmovHigh(dst, DwVfpRegister::from_code(src_code / 2));
} else {
@ -1091,8 +1091,8 @@ void MacroAssembler::VmovExtended(Register dst, int src_code) {
}
void MacroAssembler::VmovExtended(int dst_code, Register src) {
DCHECK_LE(32, dst_code);
DCHECK_GT(64, dst_code);
DCHECK_LE(SwVfpRegister::kMaxNumRegisters, dst_code);
DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, dst_code);
if (dst_code & 0x1) {
VmovHigh(DwVfpRegister::from_code(dst_code / 2), src);
} else {
@ -1102,22 +1102,23 @@ void MacroAssembler::VmovExtended(int dst_code, Register src) {
void MacroAssembler::VmovExtended(int dst_code, int src_code,
Register scratch) {
if (src_code < 32 && dst_code < 32) {
if (src_code < SwVfpRegister::kMaxNumRegisters &&
dst_code < SwVfpRegister::kMaxNumRegisters) {
// src and dst are both s-registers.
vmov(SwVfpRegister::from_code(dst_code),
SwVfpRegister::from_code(src_code));
} else if (src_code < 32) {
} else if (src_code < SwVfpRegister::kMaxNumRegisters) {
// src is an s-register.
vmov(scratch, SwVfpRegister::from_code(src_code));
VmovExtended(dst_code, scratch);
} else if (dst_code < 32) {
} else if (dst_code < SwVfpRegister::kMaxNumRegisters) {
// dst is an s-register.
VmovExtended(scratch, src_code);
vmov(SwVfpRegister::from_code(dst_code), scratch);
} else {
// Neither src nor dst is an s-register.
DCHECK_GT(64, src_code);
DCHECK_GT(64, dst_code);
DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, src_code);
DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, dst_code);
VmovExtended(scratch, src_code);
VmovExtended(dst_code, scratch);
}
@ -1125,7 +1126,7 @@ void MacroAssembler::VmovExtended(int dst_code, int src_code,
void MacroAssembler::VmovExtended(int dst_code, const MemOperand& src,
Register scratch) {
if (dst_code >= 32) {
if (dst_code >= SwVfpRegister::kMaxNumRegisters) {
ldr(scratch, src);
VmovExtended(dst_code, scratch);
} else {
@ -1135,7 +1136,7 @@ void MacroAssembler::VmovExtended(int dst_code, const MemOperand& src,
void MacroAssembler::VmovExtended(const MemOperand& dst, int src_code,
Register scratch) {
if (src_code >= 32) {
if (src_code >= SwVfpRegister::kMaxNumRegisters) {
VmovExtended(scratch, src_code);
str(scratch, dst);
} else {
@ -1143,6 +1144,47 @@ void MacroAssembler::VmovExtended(const MemOperand& dst, int src_code,
}
}
// Moves lane |lane| of the Q register |src|, interpreted with element type
// |dt|, into the core register |dst|. The lane is first mapped onto one of
// the two D registers with codes src.code() * 2 and src.code() * 2 + 1.
void MacroAssembler::ExtractLane(Register dst, QwNeonRegister src,
                                 NeonDataType dt, int lane) {
  const int lane_bytes = dt & NeonDataTypeSizeMask;  // 1, 2, 4
  const int lane_bytes_log2 = lane_bytes / 2;        // 0, 1, 2
  // Byte offset of the lane within the Q register.
  const int q_byte_offset = lane << lane_bytes_log2;
  // Which D register the lane is in, and which lane of that D register.
  const int d_index = q_byte_offset >> kDoubleSizeLog2;
  const int d_byte_offset = q_byte_offset & (kDoubleSize - 1);
  const int d_lane = d_byte_offset >> lane_bytes_log2;
  DwVfpRegister d_src = DwVfpRegister::from_code(src.code() * 2 + d_index);
  vmov(dt, dst, d_src, d_lane);
}
// Copies 32-bit lane |lane| of the Q register |src| into the s-register
// |dst|. The lane is addressed by the s-register code src.code() * 4 + lane;
// |scratch| is handed to VmovExtended for its use.
void MacroAssembler::ExtractLane(SwVfpRegister dst, QwNeonRegister src,
                                 Register scratch, int lane) {
  const int src_lane_code = src.code() * 4 + lane;
  const int dst_code = dst.code();
  VmovExtended(dst_code, src_lane_code, scratch);
}
// Moves |src| into |dst|, then overwrites lane |lane| of |dst| (element type
// |dt|) with the value of the core register |src_lane|.
void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
                                 Register src_lane, NeonDataType dt, int lane) {
  Move(dst, src);
  const int lane_bytes = dt & NeonDataTypeSizeMask;  // 1, 2, 4
  const int lane_bytes_log2 = lane_bytes / 2;        // 0, 1, 2
  // Byte offset of the lane within the Q register.
  const int q_byte_offset = lane << lane_bytes_log2;
  // Which D register the lane is in, and which lane of that D register.
  const int d_index = q_byte_offset >> kDoubleSizeLog2;
  const int d_byte_offset = q_byte_offset & (kDoubleSize - 1);
  const int d_lane = d_byte_offset >> lane_bytes_log2;
  DwVfpRegister d_dst = DwVfpRegister::from_code(dst.code() * 2 + d_index);
  vmov(dt, d_dst, d_lane, src_lane);
}
// Moves |src| into |dst|, then overwrites 32-bit lane |lane| of |dst| with
// the s-register |src_lane|. The lane is addressed by the s-register code
// dst.code() * 4 + lane; |scratch| is handed to VmovExtended for its use.
void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
                                 SwVfpRegister src_lane, Register scratch,
                                 int lane) {
  Move(dst, src);
  const int dst_lane_code = dst.code() * 4 + lane;
  const int src_code = src_lane.code();
  VmovExtended(dst_lane_code, src_code, scratch);
}
void MacroAssembler::LslPair(Register dst_low, Register dst_high,
Register src_low, Register src_high,
Register scratch, Register shift) {

View File

@ -561,6 +561,14 @@ class MacroAssembler: public Assembler {
void VmovExtended(int dst_code, const MemOperand& src, Register scratch);
void VmovExtended(const MemOperand& dst, int src_code, Register scratch);
void ExtractLane(Register dst, QwNeonRegister src, NeonDataType dt, int lane);
void ExtractLane(SwVfpRegister dst, QwNeonRegister src, Register scratch,
int lane);
void ReplaceLane(QwNeonRegister dst, QwNeonRegister src, Register src_lane,
NeonDataType dt, int lane);
void ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
SwVfpRegister src_lane, Register scratch, int lane);
void LslPair(Register dst_low, Register dst_high, Register src_low,
Register src_high, Register scratch, Register shift);
void LslPair(Register dst_low, Register dst_high, Register src_low,

View File

@ -3067,6 +3067,7 @@ void Simulator::DecodeType7(Instruction* instr) {
// Dd = vsqrt(Dm)
// Sd = vsqrt(Sm)
// vmrs
// vdup.size Qd, Rt.
void Simulator::DecodeTypeVFP(Instruction* instr) {
DCHECK((instr->TypeValue() == 7) && (instr->Bit(24) == 0x0) );
DCHECK(instr->Bits(11, 9) == 0x5);
@ -3277,24 +3278,116 @@ void Simulator::DecodeTypeVFP(Instruction* instr) {
if ((instr->VCValue() == 0x0) &&
(instr->VAValue() == 0x0)) {
DecodeVMOVBetweenCoreAndSinglePrecisionRegisters(instr);
} else if ((instr->VLValue() == 0x0) &&
(instr->VCValue() == 0x1) &&
(instr->Bit(23) == 0x0)) {
} else if ((instr->VLValue() == 0x0) && (instr->VCValue() == 0x1)) {
if (instr->Bit(23) == 0) {
// vmov (ARM core register to scalar)
int vd = instr->Bits(19, 16) | (instr->Bit(7) << 4);
int vd = instr->VFPNRegValue(kDoublePrecision);
int rt = instr->RtValue();
int opc1_opc2 = (instr->Bits(22, 21) << 2) | instr->Bits(6, 5);
if ((opc1_opc2 & 0xb) == 0) {
// NeonS32/NeonU32
uint32_t data[2];
get_d_register(vd, data);
data[instr->Bit(21)] = get_register(instr->RtValue());
data[instr->Bit(21)] = get_register(rt);
set_d_register(vd, data);
} else if ((instr->VLValue() == 0x1) &&
(instr->VCValue() == 0x1) &&
(instr->Bit(23) == 0x0)) {
} else {
uint64_t data;
get_d_register(vd, &data);
uint64_t rt_value = get_register(rt);
if ((opc1_opc2 & 0x8) != 0) {
// NeonS8 / NeonU8
int i = opc1_opc2 & 0x7;
int shift = i * kBitsPerByte;
const uint64_t mask = 0xFF;
data &= ~(mask << shift);
data |= (rt_value & mask) << shift;
set_d_register(vd, &data);
} else if ((opc1_opc2 & 0x1) != 0) {
// NeonS16 / NeonU16
int i = (opc1_opc2 >> 1) & 0x3;
int shift = i * kBitsPerByte * kShortSize;
const uint64_t mask = 0xFFFF;
data &= ~(mask << shift);
data |= (rt_value & mask) << shift;
set_d_register(vd, &data);
} else {
UNREACHABLE(); // Not used by V8.
}
}
} else {
// vdup.size Qd, Rt.
NeonSize size = Neon32;
if (instr->Bit(5) != 0)
size = Neon16;
else if (instr->Bit(22) != 0)
size = Neon8;
int vd = instr->VFPNRegValue(kSimd128Precision);
int rt = instr->RtValue();
uint32_t rt_value = get_register(rt);
uint32_t q_data[4];
switch (size) {
case Neon8: {
rt_value &= 0xFF;
uint8_t* dst = reinterpret_cast<uint8_t*>(q_data);
for (int i = 0; i < 16; i++) {
dst[i] = rt_value;
}
break;
}
case Neon16: {
// Perform pairwise ops instead of casting to uint16_t.
rt_value &= 0xFFFFu;
uint32_t rt_rt = (rt_value << 16) | (rt_value & 0xFFFFu);
for (int i = 0; i < 4; i++) {
q_data[i] = rt_rt;
}
break;
}
case Neon32: {
for (int i = 0; i < 4; i++) {
q_data[i] = rt_value;
}
break;
}
default:
UNREACHABLE();
break;
}
set_q_register(vd, q_data);
}
} else if ((instr->VLValue() == 0x1) && (instr->VCValue() == 0x1)) {
// vmov (scalar to ARM core register)
int vn = instr->Bits(19, 16) | (instr->Bit(7) << 4);
int vn = instr->VFPNRegValue(kDoublePrecision);
int rt = instr->RtValue();
int opc1_opc2 = (instr->Bits(22, 21) << 2) | instr->Bits(6, 5);
if ((opc1_opc2 & 0xb) == 0) {
// NeonS32 / NeonU32
double dn_value = get_double_from_d_register(vn);
int32_t data[2];
memcpy(data, &dn_value, 8);
set_register(instr->RtValue(), data[instr->Bit(21)]);
set_register(rt, data[instr->Bit(21)]);
} else {
uint64_t data;
get_d_register(vn, &data);
bool u = instr->Bit(23) != 0;
if ((opc1_opc2 & 0x8) != 0) {
// NeonS8 / NeonU8
int i = opc1_opc2 & 0x7;
int shift = i * kBitsPerByte;
uint32_t scalar = (data >> shift) & 0xFFu;
if (!u && (scalar & 0x80) != 0) scalar |= 0xffffff00;
set_register(rt, scalar);
} else if ((opc1_opc2 & 0x1) != 0) {
// NeonS16 / NeonU16
int i = (opc1_opc2 >> 1) & 0x3;
int shift = i * kBitsPerByte * kShortSize;
uint32_t scalar = (data >> shift) & 0xFFFFu;
if (!u && (scalar & 0x8000) != 0) scalar |= 0xffff0000;
set_register(rt, scalar);
} else {
UNREACHABLE(); // Not used by V8.
}
}
} else if ((instr->VLValue() == 0x1) &&
(instr->VCValue() == 0x0) &&
(instr->VAValue() == 0x7) &&
@ -3520,6 +3613,48 @@ int VFPConversionSaturate(double val, bool unsigned_res) {
}
}
// Converts |val| to a 32-bit integer (unsigned if |unsigned_integer| is set)
// using rounding mode |mode|. Updates inv_op_vfp_flag_ and inexact_vfp_flag_
// as a side effect. When the invalid-operation flag is raised, the result of
// VFPConversionSaturate is returned instead of the rounded value.
int32_t Simulator::ConvertDoubleToInt(double val, bool unsigned_integer,
                                      VFPRoundingMode mode) {
  // Start from the plain C++ conversion and adjust it for |mode| below.
  int32_t result =
      unsigned_integer ? static_cast<uint32_t>(val) : static_cast<int32_t>(val);

  inv_op_vfp_flag_ = get_inv_op_vfp_flag(mode, val, unsigned_integer);

  // Distance between the input and the converted value.
  double abs_diff;
  if (unsigned_integer) {
    abs_diff = std::fabs(val - static_cast<uint32_t>(result));
  } else {
    abs_diff = std::fabs(val - result);
  }
  inexact_vfp_flag_ = (abs_diff != 0);

  if (inv_op_vfp_flag_) {
    return VFPConversionSaturate(val, unsigned_integer);
  }

  switch (mode) {
    case RN: {
      const int val_sign = (val > 0) ? 1 : -1;
      if (abs_diff > 0.5) {
        result += val_sign;
      } else if (abs_diff == 0.5) {
        // Round to even if exactly halfway.
        if ((result % 2) != 0) result += val_sign;
      }
      break;
    }
    case RM:
      if (result > val) result -= 1;
      break;
    case RZ:
      // Nothing to do.
      break;
    default:
      UNREACHABLE();
  }
  return result;
}
void Simulator::DecodeVCVTBetweenFloatingPointAndInteger(Instruction* instr) {
DCHECK((instr->Bit(4) == 0) && (instr->Opc1Value() == 0x7) &&
@ -3556,44 +3691,7 @@ void Simulator::DecodeVCVTBetweenFloatingPointAndInteger(Instruction* instr) {
double val = double_precision ? get_double_from_d_register(src)
: get_float_from_s_register(src);
int temp = unsigned_integer ? static_cast<uint32_t>(val)
: static_cast<int32_t>(val);
inv_op_vfp_flag_ = get_inv_op_vfp_flag(mode, val, unsigned_integer);
double abs_diff =
unsigned_integer ? std::fabs(val - static_cast<uint32_t>(temp))
: std::fabs(val - temp);
inexact_vfp_flag_ = (abs_diff != 0);
if (inv_op_vfp_flag_) {
temp = VFPConversionSaturate(val, unsigned_integer);
} else {
switch (mode) {
case RN: {
int val_sign = (val > 0) ? 1 : -1;
if (abs_diff > 0.5) {
temp += val_sign;
} else if (abs_diff == 0.5) {
// Round to even if exactly halfway.
temp = ((temp % 2) == 0) ? temp : temp + val_sign;
}
break;
}
case RM:
temp = temp > val ? temp - 1 : temp;
break;
case RZ:
// Nothing to do.
break;
default:
UNREACHABLE();
}
}
int32_t temp = ConvertDoubleToInt(val, unsigned_integer, mode);
// Update the destination register.
set_s_register_from_sinteger(dst, temp);
@ -3740,6 +3838,16 @@ void Simulator::DecodeType6CoprocessorIns(Instruction* instr) {
}
}
// Helpers for applying a 16-bit lane operation to the two halves of a 32-bit
// word without casting to uint16_t.

// Upper / lower 16 bits of a 32-bit word.
#define HIGH_16(word) ((word) >> 16)
#define LOW_16(word) ((word)&0xFFFFu)
// Packs two 16-bit results into one 32-bit word; |low| is masked so that
// carries or borrows out of the low lane do not leak into the high lane.
#define COMBINE_32(high, low) (((high) << 16) | ((low)&0xFFFFu))
// Applies the binary macro OP separately to the high halves and to the low
// halves of |a| and |b|, then recombines the two results.
#define PAIRWISE_OP(a, b, OP) \
  COMBINE_32(OP(HIGH_16(a), HIGH_16(b)), OP(LOW_16(a), LOW_16(b)))
// Per-lane operations usable as the OP argument of PAIRWISE_OP.
#define ADD_16(lhs, rhs) ((lhs) + (rhs))
#define SUB_16(lhs, rhs) ((lhs) - (rhs))
// All ones when the lanes are equal, zero otherwise.
#define CEQ_16(lhs, rhs) (((lhs) == (rhs)) ? 0xFFFFu : 0)
// All ones when the lanes share at least one set bit, zero otherwise.
#define TST_16(lhs, rhs) ((((lhs) & (rhs)) != 0) ? 0xFFFFu : 0)
void Simulator::DecodeSpecialCondition(Instruction* instr) {
switch (instr->SpecialValue()) {
@ -3752,6 +3860,91 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
uint32_t data[4];
get_q_register(Vm, data);
set_q_register(Vd, data);
} else if (instr->Bits(11, 8) == 8) {
// vadd/vtst
int size = static_cast<NeonSize>(instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
uint32_t src1[4], src2[4];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
if (instr->Bit(4) == 0) {
// vadd.i<size> Qd, Qm, Qn.
switch (size) {
case Neon8: {
uint8_t* s1 = reinterpret_cast<uint8_t*>(src1);
uint8_t* s2 = reinterpret_cast<uint8_t*>(src2);
for (int i = 0; i < 16; i++) {
s1[i] += s2[i];
}
break;
}
case Neon16: {
for (int i = 0; i < 4; i++) {
src1[i] = PAIRWISE_OP(src1[i], src2[i], ADD_16);
}
break;
}
case Neon32: {
for (int i = 0; i < 4; i++) {
src1[i] += src2[i];
}
break;
}
default:
UNREACHABLE();
break;
}
} else {
// vtst.i<size> Qd, Qm, Qn.
switch (size) {
case Neon8: {
uint8_t* s1 = reinterpret_cast<uint8_t*>(src1);
uint8_t* s2 = reinterpret_cast<uint8_t*>(src2);
for (int i = 0; i < 16; i++) {
s1[i] = (s1[i] & s2[i]) != 0 ? 0xFFu : 0;
}
break;
}
case Neon16: {
for (int i = 0; i < 4; i++) {
src1[i] = PAIRWISE_OP(src1[i], src2[i], TST_16);
}
break;
}
case Neon32: {
for (int i = 0; i < 4; i++) {
src1[i] = (src1[i] & src2[i]) != 0 ? 0xFFFFFFFFu : 0;
}
break;
}
default:
UNREACHABLE();
break;
}
}
set_q_register(Vd, src1);
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xd &&
instr->Bit(4) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
uint32_t src1[4], src2[4];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
for (int i = 0; i < 4; i++) {
if (instr->Bit(21) == 0) {
// vadd.f32 Qd, Qm, Qn.
src1[i] = bit_cast<uint32_t>(bit_cast<float>(src1[i]) +
bit_cast<float>(src2[i]));
} else {
// vsub.f32 Qd, Qm, Qn.
src1[i] = bit_cast<uint32_t>(bit_cast<float>(src1[i]) -
bit_cast<float>(src2[i]));
}
}
set_q_register(Vd, src1);
} else {
UNIMPLEMENTED();
}
@ -3781,7 +3974,91 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
}
break;
case 6:
if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 1 &&
if (instr->Bits(11, 8) == 8 && instr->Bit(4) == 0) {
// vsub.size Qd, Qm, Qn.
int size = static_cast<NeonSize>(instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
uint32_t src1[4], src2[4];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
switch (size) {
case Neon8: {
uint8_t* s1 = reinterpret_cast<uint8_t*>(src1);
uint8_t* s2 = reinterpret_cast<uint8_t*>(src2);
for (int i = 0; i < 16; i++) {
s1[i] -= s2[i];
}
break;
}
case Neon16: {
for (int i = 0; i < 4; i++) {
src1[i] = PAIRWISE_OP(src1[i], src2[i], SUB_16);
}
break;
}
case Neon32: {
for (int i = 0; i < 4; i++) {
src1[i] -= src2[i];
}
break;
}
default:
UNREACHABLE();
break;
}
set_q_register(Vd, src1);
} else if (instr->Bits(11, 8) == 8 && instr->Bit(4) == 1) {
// vceq.size Qd, Qm, Qn.
int size = static_cast<NeonSize>(instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
uint32_t src1[4], src2[4];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
switch (size) {
case Neon8: {
uint8_t* s1 = reinterpret_cast<uint8_t*>(src1);
uint8_t* s2 = reinterpret_cast<uint8_t*>(src2);
for (int i = 0; i < 16; i++) {
s1[i] = s1[i] == s2[i] ? 0xFF : 0;
}
break;
}
case Neon16: {
for (int i = 0; i < 4; i++) {
src1[i] = PAIRWISE_OP(src1[i], src2[i], CEQ_16);
}
break;
}
case Neon32: {
for (int i = 0; i < 4; i++) {
src1[i] = src1[i] == src2[i] ? 0xFFFFFFFF : 0;
}
break;
}
default:
UNREACHABLE();
break;
}
set_q_register(Vd, src1);
} else if (instr->Bits(21, 20) == 1 && instr->Bits(11, 8) == 1 &&
instr->Bit(4) == 1) {
// vbsl.size Qd, Qm, Qn.
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
uint32_t dst[4], src1[4], src2[4];
get_q_register(Vd, dst);
get_q_register(Vn, src1);
get_q_register(Vm, src2);
for (int i = 0; i < 4; i++) {
dst[i] = (dst[i] & src1[i]) | (~dst[i] & src2[i]);
}
set_q_register(Vd, dst);
} else if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 1 &&
instr->Bit(4) == 1) {
if (instr->Bit(6) == 0) {
// veor Dd, Dn, Dm
@ -3829,6 +4106,40 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
e++;
}
set_q_register(Vd, reinterpret_cast<uint64_t*>(to));
} else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0xB &&
instr->Bits(11, 9) == 0x3 && instr->Bit(6) == 1 &&
instr->Bit(4) == 0) {
// vcvt.<Td>.<Tm> Qd, Qm.
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
uint32_t q_data[4];
get_q_register(Vm, q_data);
int op = instr->Bits(8, 7);
for (int i = 0; i < 4; i++) {
switch (op) {
case 0:
// f32 <- s32, round towards nearest.
q_data[i] = bit_cast<uint32_t>(
std::round(static_cast<float>(bit_cast<int32_t>(q_data[i]))));
break;
case 1:
// f32 <- u32, round towards nearest.
q_data[i] =
bit_cast<uint32_t>(std::round(static_cast<float>(q_data[i])));
break;
case 2:
// s32 <- f32, round to zero.
q_data[i] = static_cast<uint32_t>(
ConvertDoubleToInt(bit_cast<float>(q_data[i]), false, RZ));
break;
case 3:
// u32 <- f32, round to zero.
q_data[i] = static_cast<uint32_t>(
ConvertDoubleToInt(bit_cast<float>(q_data[i]), true, RZ));
break;
}
}
set_q_register(Vd, q_data);
} else if ((instr->Bits(21, 16) == 0x32) && (instr->Bits(11, 7) == 0) &&
(instr->Bit(4) == 0)) {
if (instr->Bit(6) == 0) {
@ -3850,6 +4161,49 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
set_q_register(vm, dval);
set_q_register(vd, mval);
}
} else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 7) == 0x18 &&
instr->Bit(4) == 0x0) {
// vdup.32 Qd, Sm.
int vd = instr->VFPDRegValue(kSimd128Precision);
int vm = instr->VFPMRegValue(kDoublePrecision);
int index = instr->Bit(19);
uint32_t s_data = get_s_register(vm * 2 + index);
uint32_t q_data[4];
for (int i = 0; i < 4; i++) q_data[i] = s_data;
set_q_register(vd, q_data);
} else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0 &&
instr->Bits(11, 6) == 0x17 && instr->Bit(4) == 0) {
// vmvn Qd, Qm.
int vd = instr->VFPDRegValue(kSimd128Precision);
int vm = instr->VFPMRegValue(kSimd128Precision);
uint32_t q_data[4];
get_q_register(vm, q_data);
for (int i = 0; i < 4; i++) q_data[i] = ~q_data[i];
set_q_register(vd, q_data);
} else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 10) == 0x2 &&
instr->Bit(4) == 0x0) {
// vtb[l,x] Dd, <list>, Dm.
int vd = instr->VFPDRegValue(kDoublePrecision);
int vn = instr->VFPNRegValue(kDoublePrecision);
int vm = instr->VFPMRegValue(kDoublePrecision);
int table_len = (instr->Bits(9, 8) + 1) * kDoubleSize;
bool vtbx = instr->Bit(6) != 0; // vtbl / vtbx
uint64_t destination = 0, indices = 0, result = 0;
get_d_register(vd, &destination);
get_d_register(vm, &indices);
for (int i = 0; i < kDoubleSize; i++) {
int shift = i * kBitsPerByte;
int index = (indices >> shift) & 0xFF;
if (index < table_len) {
uint64_t table;
get_d_register(vn + index / kDoubleSize, &table);
result |= ((table >> ((index % kDoubleSize) * kBitsPerByte)) & 0xFF)
<< shift;
} else if (vtbx) {
result |= destination & (0xFFull << shift);
}
}
set_d_register(vd, &result);
} else {
UNIMPLEMENTED();
}

View File

@ -339,6 +339,8 @@ class Simulator {
void DecodeVMOVBetweenCoreAndSinglePrecisionRegisters(Instruction* instr);
void DecodeVCMP(Instruction* instr);
void DecodeVCVTBetweenDoubleAndSingle(Instruction* instr);
int32_t ConvertDoubleToInt(double val, bool unsigned_integer,
VFPRoundingMode mode);
void DecodeVCVTBetweenFloatingPointAndInteger(Instruction* instr);
// Executes one instruction.

View File

@ -1221,6 +1221,10 @@ TEST(14) {
CHECK_EQ(kArmNanLower32, bit_cast<int64_t>(t.div_result) & 0xffffffffu);
}
// Reference conversion for the integer-to-float vcvt checks: reinterprets the
// bits of |val| as int32_t, converts that value to float and applies
// std::round.
#define INT32_TO_FLOAT(val) \
std::round(static_cast<float>(bit_cast<int32_t>(val)))
// Same as INT32_TO_FLOAT, but the bits of |val| are reinterpreted as uint32_t
// before the conversion to float.
#define UINT32_TO_FLOAT(val) \
std::round(static_cast<float>(bit_cast<uint32_t>(val)))
TEST(15) {
// Test the Neon instructions.
@ -1255,8 +1259,20 @@ TEST(15) {
uint32_t dstA5;
uint32_t dstA6;
uint32_t dstA7;
uint32_t vmov_src[4], vmov_dst[4];
uint32_t veor_src[4], veor_dst[4];
uint64_t vmov_to_scalar1, vmov_to_scalar2;
uint32_t vmov_from_scalar_s8, vmov_from_scalar_u8;
uint32_t vmov_from_scalar_s16, vmov_from_scalar_u16;
uint32_t vmov_from_scalar_32;
uint32_t vmov_src[4], vmov_dst[4], vmvn[4];
int32_t vcvt_s32_f32[4];
uint32_t vcvt_u32_f32[4];
float vcvt_f32_s32[4], vcvt_f32_u32[4];
uint32_t vdup1[4], vdup2[4], vdup3[4], vdup4[4];
uint32_t veor[4];
uint32_t vadd8[4], vadd16[4], vadd32[4];
uint32_t vsub8[4], vsub16[4], vsub32[4];
uint32_t vtst[4], vceq[4], vbsl[4], vtbl[2], vtbx[2];
float vaddf[4], vsubf[4];
} T;
T t;
@ -1268,7 +1284,7 @@ TEST(15) {
if (CpuFeatures::IsSupported(NEON)) {
CpuFeatureScope scope(&assm, NEON);
__ stm(db_w, sp, r4.bit() | lr.bit());
__ stm(db_w, sp, r4.bit() | r5.bit() | lr.bit());
// Move 32 bytes with neon.
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, src0))));
__ vld1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(r4));
@ -1289,23 +1305,200 @@ TEST(15) {
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, dstA4))));
__ vst1(Neon8, NeonListOperand(d2, 2), NeonMemOperand(r4));
// Test vmov for q-registers.
// ARM core register to scalar.
__ mov(r4, Operand(0xFFFFFFF8));
__ vmov(d0, 0);
__ vmov(NeonS8, d0, 1, r4);
__ vmov(NeonS16, d0, 1, r4);
__ vmov(NeonS32, d0, 1, r4);
__ vstr(d0, r0, offsetof(T, vmov_to_scalar1));
__ vmov(d0, 0);
__ vmov(NeonS8, d0, 3, r4);
__ vmov(NeonS16, d0, 3, r4);
__ vstr(d0, r0, offsetof(T, vmov_to_scalar2));
// Scalar to ARM core register.
__ mov(r4, Operand(0xFFFFFF00));
__ mov(r5, Operand(0xFFFFFFFF));
__ vmov(d0, r4, r5);
__ vmov(NeonS8, r4, d0, 1);
__ str(r4, MemOperand(r0, offsetof(T, vmov_from_scalar_s8)));
__ vmov(NeonU8, r4, d0, 1);
__ str(r4, MemOperand(r0, offsetof(T, vmov_from_scalar_u8)));
__ vmov(NeonS16, r4, d0, 1);
__ str(r4, MemOperand(r0, offsetof(T, vmov_from_scalar_s16)));
__ vmov(NeonU16, r4, d0, 1);
__ str(r4, MemOperand(r0, offsetof(T, vmov_from_scalar_u16)));
__ vmov(NeonS32, r4, d0, 1);
__ str(r4, MemOperand(r0, offsetof(T, vmov_from_scalar_32)));
// vmov for q-registers.
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmov_src))));
__ vld1(Neon8, NeonListOperand(d0, 2), NeonMemOperand(r4));
__ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
__ vmov(q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmov_dst))));
__ vst1(Neon8, NeonListOperand(d2, 2), NeonMemOperand(r4));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// Test veor for q-registers.
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, veor_src))));
__ vld1(Neon8, NeonListOperand(d0, 2), NeonMemOperand(r4));
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, veor_dst))));
__ vld1(Neon8, NeonListOperand(d2, 2), NeonMemOperand(r4));
// vmvn.
__ mov(r4, Operand(0xFF));
__ vdup(Neon16, q0, r4);
__ vmvn(q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmvn))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vcvt for q-registers.
__ vmov(s0, -1.5);
__ vmov(s1, -1);
__ vmov(s2, 1);
__ vmov(s3, 1.5);
__ vcvt_s32_f32(q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vcvt_s32_f32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ vcvt_u32_f32(q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vcvt_u32_f32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ mov(r4, Operand(kMinInt));
__ mov(r5, Operand(kMaxInt));
__ vmov(d0, r4, r5);
__ mov(r4, Operand(kMaxUInt32));
__ mov(r5, Operand(kMinInt + 1));
__ vmov(d1, r4, r5); // q0 = [kMinInt, kMaxInt, kMaxUInt32, kMinInt + 1]
__ vcvt_f32_s32(q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vcvt_f32_s32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ vcvt_f32_u32(q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vcvt_f32_u32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// int vdup.
__ mov(r4, Operand(0xa));
__ vdup(Neon8, q0, r4);
__ vdup(Neon16, q1, r4);
__ vdup(Neon32, q2, r4);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup1))));
__ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup2))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup3))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
// float vdup.
__ vmov(s0, -1.0);
__ vdup(q0, s0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup4))));
__ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
// veor.
__ mov(r4, Operand(0x00aa));
__ vdup(Neon16, q0, r4);
__ mov(r4, Operand(0x0055));
__ vdup(Neon16, q1, r4);
__ veor(q1, q1, q0);
__ vst1(Neon8, NeonListOperand(d2, 2), NeonMemOperand(r4));
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, veor))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vadd(integer).
__ mov(r4, Operand(0x81));
__ vdup(Neon8, q0, r4);
__ mov(r4, Operand(0x82));
__ vdup(Neon8, q1, r4);
__ vadd(Neon8, q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vadd8))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ mov(r4, Operand(0x8001));
__ vdup(Neon16, q0, r4);
__ mov(r4, Operand(0x8002));
__ vdup(Neon16, q1, r4);
__ vadd(Neon16, q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vadd16))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ mov(r4, Operand(0x80000001));
__ vdup(Neon32, q0, r4);
__ mov(r4, Operand(0x80000002));
__ vdup(Neon32, q1, r4);
__ vadd(Neon32, q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vadd32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vadd(float).
__ vmov(s4, 1.0);
__ vdup(q0, s4);
__ vdup(q1, s4);
__ vadd(q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vaddf))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vsub(integer).
__ mov(r4, Operand(0x01));
__ vdup(Neon8, q0, r4);
__ mov(r4, Operand(0x02));
__ vdup(Neon8, q1, r4);
__ vsub(Neon8, q1, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsub8))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ mov(r4, Operand(0x0001));
__ vdup(Neon16, q0, r4);
__ mov(r4, Operand(0x0002));
__ vdup(Neon16, q1, r4);
__ vsub(Neon16, q1, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsub16))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ mov(r4, Operand(0x00000001));
__ vdup(Neon32, q0, r4);
__ mov(r4, Operand(0x00000002));
__ vdup(Neon32, q1, r4);
__ vsub(Neon32, q1, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsub32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vsub(float).
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vmov(s4, 1.0);
__ vdup(q1, s4);
__ vsub(q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsubf))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vceq.
__ mov(r4, Operand(0x03));
__ vdup(Neon8, q0, r4);
__ mov(r4, Operand(0x03));
__ vdup(Neon16, q1, r4);
__ vceq(Neon8, q1, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vceq))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vtst.
__ mov(r4, Operand(0x03));
__ vdup(Neon8, q0, r4);
__ mov(r4, Operand(0x02));
__ vdup(Neon16, q1, r4);
__ vtst(Neon8, q1, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vtst))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vbsl.
__ mov(r4, Operand(0x00ff));
__ vdup(Neon16, q0, r4);
__ mov(r4, Operand(0x01));
__ vdup(Neon8, q1, r4);
__ mov(r4, Operand(0x02));
__ vdup(Neon8, q2, r4);
__ vbsl(q0, q1, q2);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vbsl))));
__ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
// vtb[l/x].
__ mov(r4, Operand(0x06040200));
__ mov(r5, Operand(0xff050301));
__ vmov(d2, r4, r5); // d2 = ff05030106040200
__ vtbl(d0, NeonListOperand(d2, 1), d2);
__ vstr(d0, r0, offsetof(T, vtbl));
__ vtbx(d2, NeonListOperand(d2, 1), d2);
__ vstr(d2, r0, offsetof(T, vtbx));
// Restore and return.
__ ldm(ia_w, sp, r4.bit() | pc.bit());
__ ldm(ia_w, sp, r4.bit() | r5.bit() | pc.bit());
CodeDesc desc;
assm.GetCode(&desc);
@ -1344,10 +1537,9 @@ TEST(15) {
t.dstA7 = 0;
t.vmov_src[0] = t.vmov_src[1] = t.vmov_src[2] = t.vmov_src[3] = 1;
t.vmov_dst[0] = t.vmov_dst[1] = t.vmov_dst[2] = t.vmov_dst[3] = 0;
t.veor_src[0] = t.veor_src[1] = t.veor_src[2] = t.veor_src[3] = 0xAA;
t.veor_dst[0] = t.veor_dst[1] = t.veor_dst[2] = t.veor_dst[3] = 0x55;
Object* dummy = CALL_GENERATED_CODE(isolate, f, &t, 0, 0, 0, 0);
USE(dummy);
CHECK_EQ(0x01020304u, t.dst0);
CHECK_EQ(0x11121314u, t.dst1);
CHECK_EQ(0x21222324u, t.dst2);
@ -1364,14 +1556,57 @@ TEST(15) {
CHECK_EQ(0x00410042u, t.dstA5);
CHECK_EQ(0x00830084u, t.dstA6);
CHECK_EQ(0x00810082u, t.dstA7);
CHECK_EQ(0xfffffff8fff8f800u, t.vmov_to_scalar1);
CHECK_EQ(0xfff80000f8000000u, t.vmov_to_scalar2);
CHECK_EQ(0xFFFFFFFFu, t.vmov_from_scalar_s8);
CHECK_EQ(0xFFu, t.vmov_from_scalar_u8);
CHECK_EQ(0xFFFFFFFFu, t.vmov_from_scalar_s16);
CHECK_EQ(0xFFFFu, t.vmov_from_scalar_u16);
CHECK_EQ(0xFFFFFFFFu, t.vmov_from_scalar_32);
CHECK_EQ(1u, t.vmov_dst[0]);
CHECK_EQ(1u, t.vmov_dst[1]);
CHECK_EQ(1u, t.vmov_dst[2]);
CHECK_EQ(1u, t.vmov_dst[3]);
CHECK_EQ(0xFFu, t.veor_dst[0]);
CHECK_EQ(0xFFu, t.veor_dst[1]);
CHECK_EQ(0xFFu, t.veor_dst[2]);
CHECK_EQ(0xFFu, t.veor_dst[3]);
CHECK_EQ(-1, t.vcvt_s32_f32[0]);
CHECK_EQ(-1, t.vcvt_s32_f32[1]);
CHECK_EQ(1, t.vcvt_s32_f32[2]);
CHECK_EQ(1, t.vcvt_s32_f32[3]);
CHECK_EQ(0u, t.vcvt_u32_f32[0]);
CHECK_EQ(0u, t.vcvt_u32_f32[1]);
CHECK_EQ(1u, t.vcvt_u32_f32[2]);
CHECK_EQ(1u, t.vcvt_u32_f32[3]);
// src: [kMinInt, kMaxInt, kMaxUInt32, kMinInt + 1]
CHECK_EQ(INT32_TO_FLOAT(kMinInt), t.vcvt_f32_s32[0]);
CHECK_EQ(INT32_TO_FLOAT(kMaxInt), t.vcvt_f32_s32[1]);
CHECK_EQ(INT32_TO_FLOAT(kMaxUInt32), t.vcvt_f32_s32[2]);
CHECK_EQ(INT32_TO_FLOAT(kMinInt + 1), t.vcvt_f32_s32[3]);
CHECK_EQ(UINT32_TO_FLOAT(kMinInt), t.vcvt_f32_u32[0]);
CHECK_EQ(UINT32_TO_FLOAT(kMaxInt), t.vcvt_f32_u32[1]);
CHECK_EQ(UINT32_TO_FLOAT(kMaxUInt32), t.vcvt_f32_u32[2]);
CHECK_EQ(UINT32_TO_FLOAT(kMinInt + 1), t.vcvt_f32_u32[3]);
for (int i = 0; i < 4; i++) CHECK_EQ(0xFF00FF00, t.vmvn[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(0x0a0a0a0au, t.vdup1[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(0x000a000au, t.vdup2[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(0x0000000au, t.vdup3[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(0xbf800000u, t.vdup4[i]); // -1.0f
for (int i = 0; i < 4; i++) CHECK_EQ(0x00ff00ffu, t.veor[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(2.0, t.vaddf[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(0x03030303u, t.vadd8[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(0x00030003u, t.vadd16[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(0x00000003u, t.vadd32[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(-1.0, t.vsubf[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(0xffffffffu, t.vsub8[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(0xffffffffu, t.vsub16[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(0xffffffffu, t.vsub32[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(0x00ff00ffu, t.vceq[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(0x00ff00ffu, t.vtst[i]);
for (int i = 0; i < 4; i++) CHECK_EQ(0x02010201u, t.vbsl[i]);
CHECK_EQ(0x05010400u, t.vtbl[0]);
CHECK_EQ(0x00030602u, t.vtbl[1]);
CHECK_EQ(0x05010400u, t.vtbx[0]);
CHECK_EQ(0xff030602u, t.vtbx[1]);
}
}
@ -2963,9 +3198,9 @@ TEST(vswp) {
__ vmov(d11, r5, r5); // q5 = [-1.0, -1.0]
__ vswp(q4, q5);
__ add(r6, r0, Operand(static_cast<int32_t>(offsetof(T, vswp_q4))));
__ vst1(Neon8, NeonListOperand(d8, 2), NeonMemOperand(r6));
__ vst1(Neon8, NeonListOperand(q4), NeonMemOperand(r6));
__ add(r6, r0, Operand(static_cast<int32_t>(offsetof(T, vswp_q5))));
__ vst1(Neon8, NeonListOperand(d10, 2), NeonMemOperand(r6));
__ vst1(Neon8, NeonListOperand(q5), NeonMemOperand(r6));
__ ldm(ia_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | pc.bit());
__ bx(lr);

View File

@ -936,10 +936,45 @@ TEST(Neon) {
"f3886a11 vmovl.u8 q3, d1");
COMPARE(vmovl(NeonU8, q4, d2),
"f3888a12 vmovl.u8 q4, d2");
COMPARE(vmov(NeonS8, d0, 0, r0), "ee400b10 vmov.8 d0[0], r0");
COMPARE(vmov(NeonU8, d1, 1, r1), "ee411b30 vmov.8 d1[1], r1");
COMPARE(vmov(NeonS8, d2, 2, r2), "ee422b50 vmov.8 d2[2], r2");
COMPARE(vmov(NeonU8, d3, 3, r8), "ee438b70 vmov.8 d3[3], r8");
COMPARE(vmov(NeonS8, d4, 4, r0), "ee640b10 vmov.8 d4[4], r0");
COMPARE(vmov(NeonU8, d5, 5, r1), "ee651b30 vmov.8 d5[5], r1");
COMPARE(vmov(NeonS8, d6, 6, r2), "ee662b50 vmov.8 d6[6], r2");
COMPARE(vmov(NeonU8, d7, 7, r8), "ee678b70 vmov.8 d7[7], r8");
COMPARE(vmov(NeonS16, d0, 0, r0), "ee000b30 vmov.16 d0[0], r0");
COMPARE(vmov(NeonS16, d1, 1, r1), "ee011b70 vmov.16 d1[1], r1");
COMPARE(vmov(NeonS16, d2, 2, r2), "ee222b30 vmov.16 d2[2], r2");
COMPARE(vmov(NeonS16, d3, 3, r7), "ee237b70 vmov.16 d3[3], r7");
COMPARE(vmov(NeonS32, d0, 0, r0), "ee000b10 vmov.32 d0[0], r0");
COMPARE(vmov(NeonU32, d0, 1, r0), "ee200b10 vmov.32 d0[1], r0");
COMPARE(vmov(NeonS8, r0, d0, 0), "ee500b10 vmov.s8 r0, d0[0]");
COMPARE(vmov(NeonU8, r1, d1, 1), "eed11b30 vmov.u8 r1, d1[1]");
COMPARE(vmov(NeonS8, r2, d2, 2), "ee522b50 vmov.s8 r2, d2[2]");
COMPARE(vmov(NeonU8, r8, d3, 3), "eed38b70 vmov.u8 r8, d3[3]");
COMPARE(vmov(NeonS8, r0, d4, 4), "ee740b10 vmov.s8 r0, d4[4]");
COMPARE(vmov(NeonU8, r1, d5, 5), "eef51b30 vmov.u8 r1, d5[5]");
COMPARE(vmov(NeonS8, r2, d6, 6), "ee762b50 vmov.s8 r2, d6[6]");
COMPARE(vmov(NeonU8, r8, d7, 7), "eef78b70 vmov.u8 r8, d7[7]");
COMPARE(vmov(NeonS16, r0, d0, 0), "ee100b30 vmov.s16 r0, d0[0]");
COMPARE(vmov(NeonU16, r1, d1, 1), "ee911b70 vmov.u16 r1, d1[1]");
COMPARE(vmov(NeonS16, r2, d2, 2), "ee322b30 vmov.s16 r2, d2[2]");
COMPARE(vmov(NeonU16, r7, d3, 3), "eeb37b70 vmov.u16 r7, d3[3]");
COMPARE(vmov(NeonS32, r2, d15, 0), "ee1f2b10 vmov.32 r2, d15[0]");
COMPARE(vmov(NeonS32, r3, d14, 1), "ee3e3b10 vmov.32 r3, d14[1]");
COMPARE(vmov(q0, q15),
"f22e01fe vmov q0, q15");
COMPARE(vmov(q8, q9),
"f26201f2 vmov q8, q9");
COMPARE(vmvn(q0, q15),
"f3b005ee vmvn q0, q15");
COMPARE(vmvn(q8, q9),
"f3f005e2 vmvn q8, q9");
COMPARE(vswp(d0, d31),
"f3b2002f vswp d0, d31");
COMPARE(vswp(d16, d14),
@ -948,6 +983,24 @@ TEST(Neon) {
"f3b2006e vswp q0, q15");
COMPARE(vswp(q8, q9),
"f3f20062 vswp q8, q9");
COMPARE(vdup(Neon8, q0, r0),
"eee00b10 vdup.8 q0, r0");
COMPARE(vdup(Neon16, q1, r4),
"eea24b30 vdup.16 q1, r4");
COMPARE(vdup(Neon32, q15, r1),
"eeae1b90 vdup.32 q15, r1");
COMPARE(vdup(q0, s3),
"f3bc0c41 vdup q0, d1[1]");
COMPARE(vdup(q15, s2),
"f3f4ec41 vdup q15, d1[0]");
COMPARE(vcvt_f32_s32(q15, q1),
"f3fbe642 vcvt.f32.s32 q15, q1");
COMPARE(vcvt_f32_u32(q8, q9),
"f3fb06e2 vcvt.f32.u32 q8, q9");
COMPARE(vcvt_s32_f32(q15, q1),
"f3fbe742 vcvt.s32.f32 q15, q1");
COMPARE(vcvt_u32_f32(q8, q9),
"f3fb07e2 vcvt.u32.f32 q8, q9");
COMPARE(veor(d0, d1, d2),
"f3010112 veor d0, d1, d2");
COMPARE(veor(d0, d30, d31),
@ -956,6 +1009,54 @@ TEST(Neon) {
"f3020154 veor q0, q1, q2");
COMPARE(veor(q15, q0, q8),
"f340e170 veor q15, q0, q8");
COMPARE(vadd(q15, q0, q8),
"f240ed60 vadd.f32 q15, q0, q8");
COMPARE(vadd(Neon8, q0, q1, q2),
"f2020844 vadd.i8 q0, q1, q2");
COMPARE(vadd(Neon16, q1, q2, q8),
"f2142860 vadd.i16 q1, q2, q8");
COMPARE(vadd(Neon32, q15, q0, q8),
"f260e860 vadd.i32 q15, q0, q8");
COMPARE(vsub(q15, q0, q8),
"f260ed60 vsub.f32 q15, q0, q8");
COMPARE(vsub(Neon8, q0, q1, q2),
"f3020844 vsub.i8 q0, q1, q2");
COMPARE(vsub(Neon16, q1, q2, q8),
"f3142860 vsub.i16 q1, q2, q8");
COMPARE(vsub(Neon32, q15, q0, q8),
"f360e860 vsub.i32 q15, q0, q8");
COMPARE(vtst(Neon8, q0, q1, q2),
"f2020854 vtst.i8 q0, q1, q2");
COMPARE(vtst(Neon16, q1, q2, q8),
"f2142870 vtst.i16 q1, q2, q8");
COMPARE(vtst(Neon32, q15, q0, q8),
"f260e870 vtst.i32 q15, q0, q8");
COMPARE(vceq(Neon8, q0, q1, q2),
"f3020854 vceq.i8 q0, q1, q2");
COMPARE(vceq(Neon16, q1, q2, q8),
"f3142870 vceq.i16 q1, q2, q8");
COMPARE(vceq(Neon32, q15, q0, q8),
"f360e870 vceq.i32 q15, q0, q8");
COMPARE(vbsl(q0, q1, q2),
"f3120154 vbsl q0, q1, q2");
COMPARE(vbsl(q15, q0, q8),
"f350e170 vbsl q15, q0, q8");
COMPARE(vtbl(d0, NeonListOperand(d1, 1), d2),
"f3b10802 vtbl.8 d0, {d1}, d2");
COMPARE(vtbl(d31, NeonListOperand(d0, 2), d4),
"f3f0f904 vtbl.8 d31, {d0, d1}, d4");
COMPARE(vtbl(d15, NeonListOperand(d1, 3), d5),
"f3b1fa05 vtbl.8 d15, {d1, d2, d3}, d5");
COMPARE(vtbl(d15, NeonListOperand(d1, 4), d5),
"f3b1fb05 vtbl.8 d15, {d1, d2, d3, d4}, d5");
COMPARE(vtbx(d0, NeonListOperand(d1, 1), d2),
"f3b10842 vtbx.8 d0, {d1}, d2");
COMPARE(vtbx(d31, NeonListOperand(d0, 2), d4),
"f3f0f944 vtbx.8 d31, {d0, d1}, d4");
COMPARE(vtbx(d15, NeonListOperand(d1, 3), d5),
"f3b1fa45 vtbx.8 d15, {d1, d2, d3}, d5");
COMPARE(vtbx(d15, NeonListOperand(d1, 4), d5),
"f3b1fb45 vtbx.8 d15, {d1, d2, d3, d4}, d5");
}
VERIFY_RUN();

View File

@ -42,6 +42,7 @@ typedef void* (*F)(int x, int y, int p2, int p3, int p4);
// Shorthand for emitting code: "__ foo(...)" expands to "masm->foo(...)", so a
// variable named masm must be in scope wherever the macro is used.
#define __ masm->
// Function-pointer type taking one void* and four ints and returning Object*.
typedef Object* (*F3)(void* p0, int p1, int p2, int p3, int p4);
// Function-pointer type taking five void* arguments and returning int.
typedef int (*F5)(void*, void*, void*, void*, void*);
@ -134,4 +135,248 @@ TEST(LoadAndStoreWithRepresentation) {
CHECK(!CALL_GENERATED_CODE(isolate, f, 0, 0, 0, 0, 0));
}
// Exercises the macro assembler's ExtractLane for 32-, 16- and 8-bit integer
// lanes and for 32-bit lanes extracted into an s-register. For each lane index
// the generated code fills a q-register with a known value via vdup, extracts
// one lane, and stores it into a result struct; the stored values are checked
// after the code has run. The test does nothing when NEON is not supported.
TEST(ExtractLane) {
if (!CpuFeatures::IsSupported(NEON)) return;
// Allocate an executable page of memory.
size_t actual_size;
byte* buffer = static_cast<byte*>(v8::base::OS::Allocate(
Assembler::kMinimalBufferSize, &actual_size, true));
CHECK(buffer);
Isolate* isolate = CcTest::i_isolate();
HandleScope handles(isolate);
MacroAssembler assembler(isolate, buffer, static_cast<int>(actual_size),
v8::internal::CodeObjectRequired::kYes);
MacroAssembler* masm = &assembler; // Create a pointer for the __ macro.
// Result slots written by the generated code. Every lane gets a full 32-bit
// slot, including the 16- and 8-bit lanes, because each result is stored with
// a word-sized store at offset 4 * i. The *_low fields are filled from q1 and
// the *_high fields from q15. The f32x4 fields are int32_t because the checks
// below compare the raw stored bits with the integer that was duplicated.
typedef struct {
int32_t i32x4_low[4];
int32_t i32x4_high[4];
int32_t i16x8_low[8];
int32_t i16x8_high[8];
int32_t i8x16_low[16];
int32_t i8x16_high[16];
int32_t f32x4_low[4];
int32_t f32x4_high[4];
} T;
T t;
// Save the registers clobbered below (r4, r5) together with lr.
__ stm(db_w, sp, r4.bit() | r5.bit() | lr.bit());
// 32-bit lanes of q1: every lane holds i, and lane i is extracted both into
// the core register r5 and into the s-register with code i. r0 is used as the
// base address of the T struct (the first argument of the call below).
for (int i = 0; i < 4; i++) {
__ mov(r4, Operand(i));
__ vdup(Neon32, q1, r4);
__ ExtractLane(r5, q1, NeonS32, i);
__ str(r5, MemOperand(r0, offsetof(T, i32x4_low) + 4 * i));
SwVfpRegister si = SwVfpRegister::from_code(i);
// NOTE(review): r4 is presumably a scratch register for this overload -
// confirm against the ExtractLane declaration.
__ ExtractLane(si, q1, r4, i);
__ vstr(si, r0, offsetof(T, f32x4_low) + 4 * i);
}
// 16-bit lanes of q1.
for (int i = 0; i < 8; i++) {
__ mov(r4, Operand(i));
__ vdup(Neon16, q1, r4);
__ ExtractLane(r5, q1, NeonS16, i);
__ str(r5, MemOperand(r0, offsetof(T, i16x8_low) + 4 * i));
}
// 8-bit lanes of q1.
for (int i = 0; i < 16; i++) {
__ mov(r4, Operand(i));
__ vdup(Neon8, q1, r4);
__ ExtractLane(r5, q1, NeonS8, i);
__ str(r5, MemOperand(r0, offsetof(T, i8x16_low) + 4 * i));
}
// The same sequences on q15, emitted only when VFP32DREGS is supported. The
// duplicated value is -i here, so the checks below also require the signed
// 16- and 8-bit extractions to produce a sign-extended 32-bit result.
if (CpuFeatures::IsSupported(VFP32DREGS)) {
for (int i = 0; i < 4; i++) {
__ mov(r4, Operand(-i));
__ vdup(Neon32, q15, r4);
__ ExtractLane(r5, q15, NeonS32, i);
__ str(r5, MemOperand(r0, offsetof(T, i32x4_high) + 4 * i));
SwVfpRegister si = SwVfpRegister::from_code(i);
__ ExtractLane(si, q15, r4, i);
__ vstr(si, r0, offsetof(T, f32x4_high) + 4 * i);
}
for (int i = 0; i < 8; i++) {
__ mov(r4, Operand(-i));
__ vdup(Neon16, q15, r4);
__ ExtractLane(r5, q15, NeonS16, i);
__ str(r5, MemOperand(r0, offsetof(T, i16x8_high) + 4 * i));
}
for (int i = 0; i < 16; i++) {
__ mov(r4, Operand(-i));
__ vdup(Neon8, q15, r4);
__ ExtractLane(r5, q15, NeonS8, i);
__ str(r5, MemOperand(r0, offsetof(T, i8x16_high) + 4 * i));
}
}
// Restore r4 and r5 and return by loading pc from the slot where lr was saved.
__ ldm(ia_w, sp, r4.bit() | r5.bit() | pc.bit());
// Finalize the emitted instructions into a Code object.
CodeDesc desc;
masm->GetCode(&desc);
Handle<Code> code = isolate->factory()->NewCode(
desc, Code::ComputeFlags(Code::STUB), Handle<Code>());
// Print the generated code in debug builds.
#ifdef DEBUG
OFStream os(stdout);
code->Print(os);
#endif
// Run the generated code with &t as its first argument.
F3 f = FUNCTION_CAST<F3>(code->entry());
Object* dummy = CALL_GENERATED_CODE(isolate, f, &t, 0, 0, 0, 0);
USE(dummy);
// Each slot i must hold the value that was duplicated before lane i was
// extracted: i for the q1 cases.
for (int i = 0; i < 4; i++) {
CHECK_EQ(i, t.i32x4_low[i]);
CHECK_EQ(i, t.f32x4_low[i]);
}
for (int i = 0; i < 8; i++) {
CHECK_EQ(i, t.i16x8_low[i]);
}
for (int i = 0; i < 16; i++) {
CHECK_EQ(i, t.i8x16_low[i]);
}
// The q15 results are only written, and therefore only checked, when
// VFP32DREGS is supported; the expected value is -i.
if (CpuFeatures::IsSupported(VFP32DREGS)) {
for (int i = 0; i < 4; i++) {
CHECK_EQ(-i, t.i32x4_high[i]);
CHECK_EQ(-i, t.f32x4_high[i]);
}
for (int i = 0; i < 8; i++) {
CHECK_EQ(-i, t.i16x8_high[i]);
}
for (int i = 0; i < 16; i++) {
CHECK_EQ(-i, t.i8x16_high[i]);
}
}
}
// Exercises MacroAssembler::ReplaceLane on Q registers:
//  - the core-register overload for 32-, 16- and 8-bit integer lanes, and
//  - the S-register overload (which takes a scratch core register).
// Each vector is built lane by lane starting from zero, stored into the
// result struct T, and verified on the C++ side. q0/q1 are always tested;
// q14/q15 are tested only when VFP32DREGS is available.
TEST(ReplaceLane) {
  if (!CpuFeatures::IsSupported(NEON)) return;

  // Allocate an executable page of memory.
  size_t actual_size;
  byte* buffer = static_cast<byte*>(v8::base::OS::Allocate(
      Assembler::kMinimalBufferSize, &actual_size, true));
  CHECK(buffer);
  Isolate* isolate = CcTest::i_isolate();
  HandleScope handles(isolate);
  MacroAssembler assembler(isolate, buffer, static_cast<int>(actual_size),
                           v8::internal::CodeObjectRequired::kYes);
  MacroAssembler* masm = &assembler;  // Create a pointer for the __ macro.

  // Result buffer filled in by the generated code. Every member is exactly
  // 16 bytes, i.e. one Q register. The f32x4_* members are declared as
  // int32_t because the test moves raw integer bit patterns through the
  // S registers and compares them as integers.
  typedef struct {
    int32_t i32x4_low[4];
    int32_t i32x4_high[4];
    int16_t i16x8_low[8];
    int16_t i16x8_high[8];
    int8_t i8x16_low[16];
    int8_t i8x16_high[16];
    int32_t f32x4_low[4];
    int32_t f32x4_high[4];
  } T;
  T t;

  __ stm(db_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | lr.bit());

  // Scratch register handed to the S-register overload of ReplaceLane.
  const Register kScratch = r5;

  // First S register used to stage source values for the S-register overload.
  // s0..s3 are the four 32-bit lanes of q0 and s4..s7 those of q1, so staging
  // values there would write the expected results straight into the vectors
  // under test and hide a broken ReplaceLane. s8..s11 (q2) alias none of the
  // registers checked by this test.
  const int kFirstSourceLaneCode = 8;

  // --- Low Q registers (q0, q1). r0 is the base address of |t|, which is
  // passed as the first argument when the code is called below. ---
  __ veor(q0, q0, q0);  // Zero
  __ veor(q1, q1, q1);  // Zero
  for (int i = 0; i < 4; i++) {
    __ mov(r4, Operand(i));
    __ ReplaceLane(q0, q0, r4, NeonS32, i);
    SwVfpRegister src_lane = SwVfpRegister::from_code(kFirstSourceLaneCode + i);
    __ vmov(src_lane, r4);
    __ ReplaceLane(q1, q1, src_lane, kScratch, i);
  }
  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i32x4_low))));
  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, f32x4_low))));
  __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));

  __ veor(q0, q0, q0);  // Zero
  for (int i = 0; i < 8; i++) {
    __ mov(r4, Operand(i));
    __ ReplaceLane(q0, q0, r4, NeonS16, i);
  }
  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i16x8_low))));
  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));

  __ veor(q0, q0, q0);  // Zero
  for (int i = 0; i < 16; i++) {
    __ mov(r4, Operand(i));
    __ ReplaceLane(q0, q0, r4, NeonS8, i);
  }
  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i8x16_low))));
  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));

  // --- High Q registers (q14, q15), only emitted with VFP32DREGS. Negated
  // lane indices are used so the expected values differ from the low case. ---
  if (CpuFeatures::IsSupported(VFP32DREGS)) {
    __ veor(q14, q14, q14);  // Zero
    __ veor(q15, q15, q15);  // Zero
    for (int i = 0; i < 4; i++) {
      __ mov(r4, Operand(-i));
      __ ReplaceLane(q14, q14, r4, NeonS32, i);
      SwVfpRegister src_lane =
          SwVfpRegister::from_code(kFirstSourceLaneCode + i);
      __ vmov(src_lane, r4);
      __ ReplaceLane(q15, q15, src_lane, kScratch, i);
    }
    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i32x4_high))));
    __ vst1(Neon8, NeonListOperand(q14), NeonMemOperand(r4));
    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, f32x4_high))));
    __ vst1(Neon8, NeonListOperand(q15), NeonMemOperand(r4));

    __ veor(q14, q14, q14);  // Zero
    for (int i = 0; i < 8; i++) {
      __ mov(r4, Operand(-i));
      __ ReplaceLane(q14, q14, r4, NeonS16, i);
    }
    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i16x8_high))));
    __ vst1(Neon8, NeonListOperand(q14), NeonMemOperand(r4));

    __ veor(q14, q14, q14);  // Zero
    for (int i = 0; i < 16; i++) {
      __ mov(r4, Operand(-i));
      __ ReplaceLane(q14, q14, r4, NeonS8, i);
    }
    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i8x16_high))));
    __ vst1(Neon8, NeonListOperand(q14), NeonMemOperand(r4));
  }

  // Restore the saved registers; loading pc returns to the caller.
  __ ldm(ia_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | pc.bit());

  CodeDesc desc;
  masm->GetCode(&desc);
  Handle<Code> code = isolate->factory()->NewCode(
      desc, Code::ComputeFlags(Code::STUB), Handle<Code>());
#ifdef DEBUG
  OFStream os(stdout);
  code->Print(os);
#endif
  F3 f = FUNCTION_CAST<F3>(code->entry());
  Object* dummy = CALL_GENERATED_CODE(isolate, f, &t, 0, 0, 0, 0);
  USE(dummy);

  // Lane i of every low vector must hold i.
  for (int i = 0; i < 4; i++) {
    CHECK_EQ(i, t.i32x4_low[i]);
    CHECK_EQ(i, t.f32x4_low[i]);
  }
  for (int i = 0; i < 8; i++) {
    CHECK_EQ(i, t.i16x8_low[i]);
  }
  for (int i = 0; i < 16; i++) {
    CHECK_EQ(i, t.i8x16_low[i]);
  }
  // Lane i of every high vector must hold -i. These members are only written
  // by the generated code when VFP32DREGS is supported.
  if (CpuFeatures::IsSupported(VFP32DREGS)) {
    for (int i = 0; i < 4; i++) {
      CHECK_EQ(-i, t.i32x4_high[i]);
      CHECK_EQ(-i, t.f32x4_high[i]);
    }
    for (int i = 0; i < 8; i++) {
      CHECK_EQ(-i, t.i16x8_high[i]);
    }
    for (int i = 0; i < 16; i++) {
      CHECK_EQ(-i, t.i8x16_high[i]);
    }
  }
}
#undef __