From 03f33f2e68641903a982a025b0ba645c8ba9f9b3 Mon Sep 17 00:00:00 2001 From: bbudge Date: Thu, 15 Dec 2016 10:15:23 -0800 Subject: [PATCH] [Turbofan] Add ARM NEON instructions for implementing SIMD. - Adds NEON instructions to assembler, disassembler, simulator. - Adds ExtractLane, ReplaceLane functions to macro assembler. LOG=N BUG=v8:4124 Review-Url: https://codereview.chromium.org/2546933002 Cr-Commit-Position: refs/heads/master@{#41737} --- src/arm/assembler-arm.cc | 326 +++++++++++++++-- src/arm/assembler-arm.h | 54 ++- src/arm/constants-arm.h | 12 +- src/arm/disasm-arm.cc | 172 ++++++++- src/arm/macro-assembler-arm.cc | 64 +++- src/arm/macro-assembler-arm.h | 8 + src/arm/simulator-arm.cc | 468 +++++++++++++++++++++--- src/arm/simulator-arm.h | 2 + test/cctest/test-assembler-arm.cc | 277 ++++++++++++-- test/cctest/test-disasm-arm.cc | 101 +++++ test/cctest/test-macro-assembler-arm.cc | 245 +++++++++++++ 11 files changed, 1592 insertions(+), 137 deletions(-) diff --git a/src/arm/assembler-arm.cc b/src/arm/assembler-arm.cc index 6e0b2db281..aa6be2110b 100644 --- a/src/arm/assembler-arm.cc +++ b/src/arm/assembler-arm.cc @@ -483,30 +483,6 @@ void NeonMemOperand::SetAlignment(int align) { } } - -NeonListOperand::NeonListOperand(DoubleRegister base, int registers_count) { - base_ = base; - switch (registers_count) { - case 1: - type_ = nlt_1; - break; - case 2: - type_ = nlt_2; - break; - case 3: - type_ = nlt_3; - break; - case 4: - type_ = nlt_4; - break; - default: - UNREACHABLE(); - type_ = nlt_1; - break; - } -} - - // ----------------------------------------------------------------------------- // Specific instructions, constants, and masks. @@ -2968,7 +2944,6 @@ void Assembler::vmov(const Register dst, emit(cond | 0xE*B24 | B20 | sn*B16 | dst.code()*B12 | 0xA*B8 | n*B7 | B4); } - // Type of data to read from or write to VFP register. // Used as specifier in generic vcvt instruction. enum VFPType { S32, U32, F32, F64 }; @@ -3902,6 +3877,57 @@ void Assembler::vmovl(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src) { (dt & NeonDataTypeSizeMask)*B19 | vd*B12 | 0xA*B8 | m*B5 | B4 | vm); } +static int EncodeScalar(NeonDataType dt, int index) { + int opc1_opc2 = 0; + DCHECK_LE(0, index); + switch (dt) { + case NeonS8: + case NeonU8: + DCHECK_GT(8, index); + opc1_opc2 = 0x8 | index; + break; + case NeonS16: + case NeonU16: + DCHECK_GT(4, index); + opc1_opc2 = 0x1 | (index << 1); + break; + case NeonS32: + case NeonU32: + DCHECK_GT(2, index); + opc1_opc2 = index << 2; + break; + default: + UNREACHABLE(); + break; + } + return (opc1_opc2 >> 2) * B21 | (opc1_opc2 & 0x3) * B5; +} + +void Assembler::vmov(NeonDataType dt, DwVfpRegister dst, int index, + Register src) { + // Instruction details available in ARM DDI 0406C.b, A8.8.940. + // vmov ARM core register to scalar. + DCHECK(dt == NeonS32 || dt == NeonU32 || IsEnabled(NEON)); + int vd, d; + dst.split_code(&vd, &d); + int opc1_opc2 = EncodeScalar(dt, index); + emit(0xEEu * B24 | vd * B16 | src.code() * B12 | 0xB * B8 | d * B7 | B4 | + opc1_opc2); +} + +void Assembler::vmov(NeonDataType dt, Register dst, DwVfpRegister src, + int index) { + // Instruction details available in ARM DDI 0406C.b, A8.8.942. + // vmov Arm scalar to core register. + DCHECK(dt == NeonS32 || dt == NeonU32 || IsEnabled(NEON)); + int vn, n; + src.split_code(&vn, &n); + int opc1_opc2 = EncodeScalar(dt, index); + int u = (dt & NeonDataTypeUMask) != 0 ? 
1 : 0; + emit(0xEEu * B24 | u * B23 | B20 | vn * B16 | dst.code() * B12 | 0xB * B8 | + n * B7 | B4 | opc1_opc2); +} + void Assembler::vmov(const QwNeonRegister dst, const QwNeonRegister src) { DCHECK(IsEnabled(NEON)); // Instruction details available in ARM DDI 0406C.b, A8-938. @@ -3915,6 +3941,18 @@ void Assembler::vmov(const QwNeonRegister dst, const QwNeonRegister src) { B6 | m * B5 | B4 | vm); } +void Assembler::vmvn(const QwNeonRegister dst, const QwNeonRegister src) { + DCHECK(IsEnabled(NEON)); + // Instruction details available in ARM DDI 0406C.b, A8-966. + DCHECK(VfpRegisterIsAvailable(dst)); + DCHECK(VfpRegisterIsAvailable(src)); + int vd, d; + dst.split_code(&vd, &d); + int vm, m; + src.split_code(&vm, &m); + emit(0x1E7U * B23 | d * B22 | 3 * B20 | vd * B12 | 0x17 * B6 | m * B5 | vm); +} + void Assembler::vswp(DwVfpRegister dst, DwVfpRegister src) { // Instruction details available in ARM DDI 0406C.b, A8.8.418. // 1111(31-28) | 00111(27-23) | D(22) | 110010(21-16) | @@ -3940,8 +3978,105 @@ void Assembler::vswp(QwNeonRegister dst, QwNeonRegister src) { vm); } +void Assembler::vdup(NeonSize size, const QwNeonRegister dst, + const Register src) { + DCHECK(IsEnabled(NEON)); + // Instruction details available in ARM DDI 0406C.b, A8-886. + int B = 0, E = 0; + switch (size) { + case Neon8: + B = 1; + break; + case Neon16: + E = 1; + break; + case Neon32: + break; + default: + UNREACHABLE(); + break; + } + int vd, d; + dst.split_code(&vd, &d); + + emit(al | 0x1D * B23 | B * B22 | B21 | vd * B16 | src.code() * B12 | + 0xB * B8 | d * B7 | E * B5 | B4); +} + +void Assembler::vdup(const QwNeonRegister dst, const SwVfpRegister src) { + DCHECK(IsEnabled(NEON)); + // Instruction details available in ARM DDI 0406C.b, A8-884. + int index = src.code() & 1; + int d_reg = src.code() / 2; + int imm4 = 4 | index << 3; // esize = 32, index in bit 3. + int vd, d; + dst.split_code(&vd, &d); + int vm, m; + DwVfpRegister::from_code(d_reg).split_code(&vm, &m); + + emit(0x1E7U * B23 | d * B22 | 0x3 * B20 | imm4 * B16 | vd * B12 | 0x18 * B7 | + B6 | m * B5 | vm); +} + +// Encode NEON vcvt.src_type.dst_type instruction. +static Instr EncodeNeonVCVT(const VFPType dst_type, const QwNeonRegister dst, + const VFPType src_type, const QwNeonRegister src) { + DCHECK(src_type != dst_type); + DCHECK(src_type == F32 || dst_type == F32); + // Instruction details available in ARM DDI 0406C.b, A8.8.868. + int vd, d; + dst.split_code(&vd, &d); + int vm, m; + src.split_code(&vm, &m); + + int op = 0; + if (src_type == F32) { + DCHECK(dst_type == S32 || dst_type == U32); + op = dst_type == U32 ? 3 : 2; + } else { + DCHECK(src_type == S32 || src_type == U32); + op = src_type == U32 ? 
1 : 0; + } + + return 0x1E7U * B23 | d * B22 | 0x3B * B16 | vd * B12 | 0x3 * B9 | op * B7 | + B6 | m * B5 | vm; +} + +void Assembler::vcvt_f32_s32(const QwNeonRegister dst, + const QwNeonRegister src) { + DCHECK(IsEnabled(NEON)); + DCHECK(VfpRegisterIsAvailable(dst)); + DCHECK(VfpRegisterIsAvailable(src)); + emit(EncodeNeonVCVT(F32, dst, S32, src)); +} + +void Assembler::vcvt_f32_u32(const QwNeonRegister dst, + const QwNeonRegister src) { + DCHECK(IsEnabled(NEON)); + DCHECK(VfpRegisterIsAvailable(dst)); + DCHECK(VfpRegisterIsAvailable(src)); + emit(EncodeNeonVCVT(F32, dst, U32, src)); +} + +void Assembler::vcvt_s32_f32(const QwNeonRegister dst, + const QwNeonRegister src) { + DCHECK(IsEnabled(NEON)); + DCHECK(VfpRegisterIsAvailable(dst)); + DCHECK(VfpRegisterIsAvailable(src)); + emit(EncodeNeonVCVT(S32, dst, F32, src)); +} + +void Assembler::vcvt_u32_f32(const QwNeonRegister dst, + const QwNeonRegister src) { + DCHECK(IsEnabled(NEON)); + DCHECK(VfpRegisterIsAvailable(dst)); + DCHECK(VfpRegisterIsAvailable(src)); + emit(EncodeNeonVCVT(U32, dst, F32, src)); +} + void Assembler::veor(DwVfpRegister dst, DwVfpRegister src1, DwVfpRegister src2) { + // Dd = veor(Dn, Dm) 64 bit integer exclusive OR. // Instruction details available in ARM DDI 0406C.b, A8.8.888. DCHECK(IsEnabled(NEON)); int vd, d; @@ -3956,6 +4091,7 @@ void Assembler::veor(DwVfpRegister dst, DwVfpRegister src1, void Assembler::veor(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2) { + // Qd = veor(Qn, Qm) SIMD integer exclusive OR. // Instruction details available in ARM DDI 0406C.b, A8.8.888. DCHECK(IsEnabled(NEON)); int vd, d; @@ -3968,6 +4104,146 @@ void Assembler::veor(QwNeonRegister dst, QwNeonRegister src1, m * B5 | B4 | vm); } +void Assembler::vadd(QwNeonRegister dst, const QwNeonRegister src1, + const QwNeonRegister src2) { + DCHECK(IsEnabled(NEON)); + // Qd = vadd(Qn, Qm) SIMD floating point addition. + // Instruction details available in ARM DDI 0406C.b, A8-830. + int vd, d; + dst.split_code(&vd, &d); + int vn, n; + src1.split_code(&vn, &n); + int vm, m; + src2.split_code(&vm, &m); + emit(0x1E4U * B23 | d * B22 | vn * B16 | vd * B12 | 0xD * B8 | n * B7 | B6 | + m * B5 | vm); +} + +void Assembler::vadd(NeonSize size, QwNeonRegister dst, + const QwNeonRegister src1, const QwNeonRegister src2) { + DCHECK(IsEnabled(NEON)); + // Qd = vadd(Qn, Qm) SIMD integer addition. + // Instruction details available in ARM DDI 0406C.b, A8-828. + int vd, d; + dst.split_code(&vd, &d); + int vn, n; + src1.split_code(&vn, &n); + int vm, m; + src2.split_code(&vm, &m); + int sz = static_cast(size); + emit(0x1E4U * B23 | d * B22 | sz * B20 | vn * B16 | vd * B12 | 0x8 * B8 | + n * B7 | B6 | m * B5 | vm); +} + +void Assembler::vsub(QwNeonRegister dst, const QwNeonRegister src1, + const QwNeonRegister src2) { + DCHECK(IsEnabled(NEON)); + // Qd = vsub(Qn, Qm) SIMD floating point subtraction. + // Instruction details available in ARM DDI 0406C.b, A8-1086. + int vd, d; + dst.split_code(&vd, &d); + int vn, n; + src1.split_code(&vn, &n); + int vm, m; + src2.split_code(&vm, &m); + emit(0x1E4U * B23 | d * B22 | B21 | vn * B16 | vd * B12 | 0xD * B8 | n * B7 | + B6 | m * B5 | vm); +} + +void Assembler::vsub(NeonSize size, QwNeonRegister dst, + const QwNeonRegister src1, const QwNeonRegister src2) { + DCHECK(IsEnabled(NEON)); + // Qd = vsub(Qn, Qm) SIMD integer subtraction. + // Instruction details available in ARM DDI 0406C.b, A8-1084. 
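+  // Worked example of the field packing below (illustrative only): for
+  // vsub.i16 q1, q2, q3, size is Neon16, so sz = 1 lands in bits 21:20;
+  // split_code() divides each register number into a four-bit field
+  // (vd/vn/vm) plus a top bit (d/n/m) placed at bits 22, 7 and 5.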
+ int vd, d; + dst.split_code(&vd, &d); + int vn, n; + src1.split_code(&vn, &n); + int vm, m; + src2.split_code(&vm, &m); + int sz = static_cast(size); + emit(0x1E6U * B23 | d * B22 | sz * B20 | vn * B16 | vd * B12 | 0x8 * B8 | + n * B7 | B6 | m * B5 | vm); +} + +void Assembler::vtst(NeonSize size, QwNeonRegister dst, + const QwNeonRegister src1, const QwNeonRegister src2) { + DCHECK(IsEnabled(NEON)); + // Qd = vtst(Qn, Qm) SIMD test integer operands. + // Instruction details available in ARM DDI 0406C.b, A8-1098. + int vd, d; + dst.split_code(&vd, &d); + int vn, n; + src1.split_code(&vn, &n); + int vm, m; + src2.split_code(&vm, &m); + int sz = static_cast(size); + emit(0x1E4U * B23 | d * B22 | sz * B20 | vn * B16 | vd * B12 | 0x8 * B8 | + n * B7 | B6 | m * B5 | B4 | vm); +} + +void Assembler::vceq(NeonSize size, QwNeonRegister dst, + const QwNeonRegister src1, const QwNeonRegister src2) { + DCHECK(IsEnabled(NEON)); + // Qd = vceq(Qn, Qm) SIMD integer compare equal. + // Instruction details available in ARM DDI 0406C.b, A8-844. + int vd, d; + dst.split_code(&vd, &d); + int vn, n; + src1.split_code(&vn, &n); + int vm, m; + src2.split_code(&vm, &m); + int sz = static_cast(size); + emit(0x1E6U * B23 | d * B22 | sz * B20 | vn * B16 | vd * B12 | 0x8 * B8 | + n * B7 | B6 | m * B5 | B4 | vm); +} + +void Assembler::vbsl(QwNeonRegister dst, const QwNeonRegister src1, + const QwNeonRegister src2) { + DCHECK(IsEnabled(NEON)); + // Qd = vbsl(Qn, Qm) SIMD bitwise select. + // Instruction details available in ARM DDI 0406C.b, A8-844. + int vd, d; + dst.split_code(&vd, &d); + int vn, n; + src1.split_code(&vn, &n); + int vm, m; + src2.split_code(&vm, &m); + int op = 1; // vbsl + emit(0x1E6U * B23 | d * B22 | op * B20 | vn * B16 | vd * B12 | 0x1 * B8 | + n * B7 | B6 | m * B5 | B4 | vm); +} + +// Encode NEON vtbl / vtbx instruction. +static Instr EncodeNeonVTB(const DwVfpRegister dst, const NeonListOperand& list, + const DwVfpRegister index, bool vtbx) { + // Dd = vtbl(table, Dm) SIMD vector permute, zero at out of range indices. + // Instruction details available in ARM DDI 0406C.b, A8-1094. + // Dd = vtbx(table, Dm) SIMD vector permute, skip out of range indices. + // Instruction details available in ARM DDI 0406C.b, A8-1094. + int vd, d; + dst.split_code(&vd, &d); + int vn, n; + list.base().split_code(&vn, &n); + int vm, m; + index.split_code(&vm, &m); + int op = vtbx ? 1 : 0; // vtbl = 0, vtbx = 1. + return 0x1E7U * B23 | d * B22 | 0x3 * B20 | vn * B16 | vd * B12 | 0x2 * B10 | + list.length() * B8 | n * B7 | op * B6 | m * B5 | vm; +} + +void Assembler::vtbl(const DwVfpRegister dst, const NeonListOperand& list, + const DwVfpRegister index) { + DCHECK(IsEnabled(NEON)); + emit(EncodeNeonVTB(dst, list, index, false)); +} + +void Assembler::vtbx(const DwVfpRegister dst, const NeonListOperand& list, + const DwVfpRegister index) { + DCHECK(IsEnabled(NEON)); + emit(EncodeNeonVTB(dst, list, index, true)); +} + // Pseudo instructions. 
void Assembler::nop(int type) { // ARMv6{K/T2} and v7 have an actual NOP instruction but it serializes diff --git a/src/arm/assembler-arm.h b/src/arm/assembler-arm.h index 3ee980699d..235d80be83 100644 --- a/src/arm/assembler-arm.h +++ b/src/arm/assembler-arm.h @@ -640,12 +640,26 @@ class NeonMemOperand BASE_EMBEDDED { // Class NeonListOperand represents a list of NEON registers class NeonListOperand BASE_EMBEDDED { public: - explicit NeonListOperand(DoubleRegister base, int registers_count = 1); + explicit NeonListOperand(DoubleRegister base, int register_count = 1) + : base_(base), register_count_(register_count) {} + explicit NeonListOperand(QwNeonRegister q_reg) + : base_(q_reg.low()), register_count_(2) {} DoubleRegister base() const { return base_; } - NeonListType type() const { return type_; } + int register_count() { return register_count_; } + int length() const { return register_count_ - 1; } + NeonListType type() const { + switch (register_count_) { + default: UNREACHABLE(); + // Fall through. + case 1: return nlt_1; + case 2: return nlt_2; + case 3: return nlt_3; + case 4: return nlt_4; + } + } private: DoubleRegister base_; - NeonListType type_; + int register_count_; }; @@ -1149,6 +1163,8 @@ class Assembler : public AssemblerBase { void vmov(const DwVfpRegister dst, const DwVfpRegister src, const Condition cond = al); + // TODO(bbudge) Replace uses of these with the more general core register to + // scalar register vmov's. void vmov(const DwVfpRegister dst, const VmovIndex index, const Register src, @@ -1329,11 +1345,43 @@ class Assembler : public AssemblerBase { const NeonMemOperand& dst); void vmovl(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src); + // Only unconditional core <-> scalar moves are currently supported. + void vmov(NeonDataType dt, DwVfpRegister dst, int index, Register src); + void vmov(NeonDataType dt, Register dst, DwVfpRegister src, int index); + void vmov(const QwNeonRegister dst, const QwNeonRegister src); + void vmvn(const QwNeonRegister dst, const QwNeonRegister src); void vswp(DwVfpRegister dst, DwVfpRegister src); void vswp(QwNeonRegister dst, QwNeonRegister src); + // vdup conditional execution isn't supported. 
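+  // Usage sketch: vdup(Neon32, q0, r0) splats the 32-bit value in r0
+  // across all four lanes of q0; vdup(q0, s1) splats the float in s1.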
+ void vdup(NeonSize size, const QwNeonRegister dst, const Register src); + void vdup(const QwNeonRegister dst, const SwVfpRegister src); + + void vcvt_f32_s32(const QwNeonRegister dst, const QwNeonRegister src); + void vcvt_f32_u32(const QwNeonRegister dst, const QwNeonRegister src); + void vcvt_s32_f32(const QwNeonRegister dst, const QwNeonRegister src); + void vcvt_u32_f32(const QwNeonRegister dst, const QwNeonRegister src); + void veor(DwVfpRegister dst, DwVfpRegister src1, DwVfpRegister src2); void veor(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2); + void vadd(const QwNeonRegister dst, const QwNeonRegister src1, + const QwNeonRegister src2); + void vadd(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1, + const QwNeonRegister src2); + void vsub(const QwNeonRegister dst, const QwNeonRegister src1, + const QwNeonRegister src2); + void vsub(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1, + const QwNeonRegister src2); + void vtst(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1, + const QwNeonRegister src2); + void vceq(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1, + const QwNeonRegister src2); + void vbsl(const QwNeonRegister dst, const QwNeonRegister src1, + const QwNeonRegister src2); + void vtbl(const DwVfpRegister dst, const NeonListOperand& list, + const DwVfpRegister index); + void vtbx(const DwVfpRegister dst, const NeonListOperand& list, + const DwVfpRegister index); // Pseudo instructions diff --git a/src/arm/constants-arm.h b/src/arm/constants-arm.h index 0c2ffdf4b9..e0c91fd4bf 100644 --- a/src/arm/constants-arm.h +++ b/src/arm/constants-arm.h @@ -190,6 +190,7 @@ enum { B7 = 1 << 7, B8 = 1 << 8, B9 = 1 << 9, + B10 = 1 << 10, B12 = 1 << 12, B16 = 1 << 16, B17 = 1 << 17, @@ -218,7 +219,6 @@ enum { kOff8Mask = (1 << 8) - 1 }; - enum BarrierOption { OSHLD = 0x1, OSHST = 0x2, @@ -327,12 +327,12 @@ enum LFlag { // NEON data type enum NeonDataType { - NeonS8 = 0x1, // U = 0, imm3 = 0b001 - NeonS16 = 0x2, // U = 0, imm3 = 0b010 - NeonS32 = 0x4, // U = 0, imm3 = 0b100 + NeonS8 = 0x1, // U = 0, imm3 = 0b001 + NeonS16 = 0x2, // U = 0, imm3 = 0b010 + NeonS32 = 0x4, // U = 0, imm3 = 0b100 NeonU8 = 1 << 24 | 0x1, // U = 1, imm3 = 0b001 NeonU16 = 1 << 24 | 0x2, // U = 1, imm3 = 0b010 - NeonU32 = 1 << 24 | 0x4, // U = 1, imm3 = 0b100 + NeonU32 = 1 << 24 | 0x4, // U = 1, imm3 = 0b100 NeonDataTypeSizeMask = 0x7, NeonDataTypeUMask = 1 << 24 }; @@ -667,7 +667,7 @@ class Instruction { private: - // Join split register codes, depending on single or double precision. + // Join split register codes, depending on register precision. // four_bit is the position of the least-significant bit of the four // bit specifier. one_bit is the position of the additional single bit // specifier. 
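The NeonDataType values defined above pack the element size in bytes into the low three bits and place the unsigned flag at bit 24, mirroring the U bit of the vmovl encoding so it can be ORed into the instruction word directly. A minimal standalone sketch of how the patch pulls those fields apart (the enum values are copied verbatim from this diff; the main() harness is illustrative only):

#include <cstdio>

// Values copied from src/arm/constants-arm.h in this patch.
enum NeonDataType {
  NeonS8 = 0x1,
  NeonS16 = 0x2,
  NeonS32 = 0x4,
  NeonU8 = 1 << 24 | 0x1,
  NeonU16 = 1 << 24 | 0x2,
  NeonU32 = 1 << 24 | 0x4,
  NeonDataTypeSizeMask = 0x7,
  NeonDataTypeUMask = 1 << 24
};

int main() {
  NeonDataType dt = NeonU16;
  int bytes_per_lane = dt & NeonDataTypeSizeMask;    // 1, 2 or 4
  bool unsigned_lane = (dt & NeonDataTypeUMask) != 0;
  std::printf("%d byte(s) per lane, %s\n", bytes_per_lane,
              unsigned_lane ? "unsigned" : "signed");
  return 0;
}

The new ExtractLane/ReplaceLane helpers in the macro assembler rely on the same layout: dt & NeonDataTypeSizeMask yields the bytes per lane directly.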
diff --git a/src/arm/disasm-arm.cc b/src/arm/disasm-arm.cc index abb13dd9f0..7a42386d74 100644 --- a/src/arm/disasm-arm.cc +++ b/src/arm/disasm-arm.cc @@ -1419,6 +1419,9 @@ int Decoder::DecodeType7(Instruction* instr) { // Sd = vsqrt(Sm) // vmrs // vmsr +// Qd = vdup.size(Qd, Rt) +// vmov.size: Dd[i] = Rt +// vmov.sign.size: Rt = Dn[i] void Decoder::DecodeTypeVFP(Instruction* instr) { VERIFY((instr->TypeValue() == 7) && (instr->Bit(24) == 0x0) ); VERIFY(instr->Bits(11, 9) == 0x5); @@ -1531,21 +1534,71 @@ void Decoder::DecodeTypeVFP(Instruction* instr) { if ((instr->VCValue() == 0x0) && (instr->VAValue() == 0x0)) { DecodeVMOVBetweenCoreAndSinglePrecisionRegisters(instr); - } else if ((instr->VLValue() == 0x0) && - (instr->VCValue() == 0x1) && - (instr->Bit(23) == 0x0)) { - if (instr->Bit(21) == 0x0) { - Format(instr, "vmov'cond.32 'Dd[0], 'rt"); + } else if ((instr->VLValue() == 0x0) && (instr->VCValue() == 0x1)) { + if (instr->Bit(23) == 0) { + int opc1_opc2 = (instr->Bits(22, 21) << 2) | instr->Bits(6, 5); + if ((opc1_opc2 & 0xb) == 0) { + // NeonS32/NeonU32 + if (instr->Bit(21) == 0x0) { + Format(instr, "vmov'cond.32 'Dd[0], 'rt"); + } else { + Format(instr, "vmov'cond.32 'Dd[1], 'rt"); + } + } else { + int vd = instr->VFPNRegValue(kDoublePrecision); + int rt = instr->RtValue(); + if ((opc1_opc2 & 0x8) != 0) { + // NeonS8 / NeonU8 + int i = opc1_opc2 & 0x7; + out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, + "vmov.8 d%d[%d], r%d", vd, i, rt); + } else if ((opc1_opc2 & 0x1) != 0) { + // NeonS16 / NeonU16 + int i = (opc1_opc2 >> 1) & 0x3; + out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, + "vmov.16 d%d[%d], r%d", vd, i, rt); + } else { + Unknown(instr); + } + } } else { - Format(instr, "vmov'cond.32 'Dd[1], 'rt"); + int size = 32; + if (instr->Bit(5) != 0) + size = 16; + else if (instr->Bit(22) != 0) + size = 8; + int Vd = instr->VFPNRegValue(kSimd128Precision); + int Rt = instr->RtValue(); + out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, + "vdup.%i q%d, r%d", size, Vd, Rt); } - } else if ((instr->VLValue() == 0x1) && - (instr->VCValue() == 0x1) && - (instr->Bit(23) == 0x0)) { - if (instr->Bit(21) == 0x0) { - Format(instr, "vmov'cond.32 'rt, 'Dd[0]"); + } else if ((instr->VLValue() == 0x1) && (instr->VCValue() == 0x1)) { + int opc1_opc2 = (instr->Bits(22, 21) << 2) | instr->Bits(6, 5); + if ((opc1_opc2 & 0xb) == 0) { + // NeonS32 / NeonU32 + if (instr->Bit(21) == 0x0) { + Format(instr, "vmov'cond.32 'rt, 'Dd[0]"); + } else { + Format(instr, "vmov'cond.32 'rt, 'Dd[1]"); + } } else { - Format(instr, "vmov'cond.32 'rt, 'Dd[1]"); + const char* sign = instr->Bit(23) != 0 ? "u" : "s"; + int rt = instr->RtValue(); + int vn = instr->VFPNRegValue(kDoublePrecision); + if ((opc1_opc2 & 0x8) != 0) { + // NeonS8 / NeonU8 + int i = opc1_opc2 & 0x7; + out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, + "vmov.%s8 r%d, d%d[%d]", sign, rt, vn, i); + } else if ((opc1_opc2 & 0x1) != 0) { + // NeonS16 / NeonU16 + int i = (opc1_opc2 >> 1) & 0x3; + out_buffer_pos_ += + SNPrintF(out_buffer_ + out_buffer_pos_, "vmov.%s16 r%d, d%d[%d]", + sign, rt, vn, i); + } else { + Unknown(instr); + } } } else if ((instr->VCValue() == 0x0) && (instr->VAValue() == 0x7) && @@ -1563,6 +1616,8 @@ void Decoder::DecodeTypeVFP(Instruction* instr) { Format(instr, "vmrs'cond 'rt, FPSCR"); } } + } else { + Unknown(instr); // Not used by V8. 
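+          // The scalar-transfer cases above reassemble opc1:opc2 from
+          // instruction bits 22:21 and 6:5; e.g. vmov.8 d0[5], r0 encodes
+          // opc1_opc2 as 0b1101 (0x8 | lane index 5).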
} } } @@ -1809,6 +1864,25 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) { int Vm = instr->VFPMRegValue(kSimd128Precision); out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, "vmov q%d, q%d", Vd, Vm); + } else if (instr->Bits(11, 8) == 8) { + const char* op = (instr->Bit(4) == 0) ? "vadd" : "vtst"; + int size = kBitsPerByte * (1 << instr->Bits(21, 20)); + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vm = instr->VFPMRegValue(kSimd128Precision); + int Vn = instr->VFPNRegValue(kSimd128Precision); + // vadd/vtst.i Qd, Qm, Qn. + out_buffer_pos_ += + SNPrintF(out_buffer_ + out_buffer_pos_, "%s.i%d q%d, q%d, q%d", op, + size, Vd, Vn, Vm); + } else if (instr->Bits(11, 8) == 0xd && instr->Bit(4) == 0) { + const char* op = (instr->Bits(21, 20) == 0) ? "vadd" : "vsub"; + int size = kBitsPerByte * (1 << instr->Bits(21, 20)); + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vm = instr->VFPMRegValue(kSimd128Precision); + int Vn = instr->VFPNRegValue(kSimd128Precision); + // vadd/vsub.f32 Qd, Qm, Qn. + out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, + "%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm); } else { Unknown(instr); } @@ -1828,8 +1902,29 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) { } break; case 6: - if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 1 && - instr->Bit(4) == 1) { + if (instr->Bits(11, 8) == 8) { + int size = kBitsPerByte * (1 << instr->Bits(21, 20)); + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vm = instr->VFPMRegValue(kSimd128Precision); + int Vn = instr->VFPNRegValue(kSimd128Precision); + if (instr->Bit(4) == 0) { + out_buffer_pos_ += + SNPrintF(out_buffer_ + out_buffer_pos_, "vsub.i%d q%d, q%d, q%d", + size, Vd, Vn, Vm); + } else { + out_buffer_pos_ += + SNPrintF(out_buffer_ + out_buffer_pos_, "vceq.i%d q%d, q%d, q%d", + size, Vd, Vn, Vm); + } + } else if (instr->Bits(21, 20) == 1 && instr->Bits(11, 8) == 1 && + instr->Bit(4) == 1) { + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vm = instr->VFPMRegValue(kSimd128Precision); + int Vn = instr->VFPNRegValue(kSimd128Precision); + out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, + "vbsl q%d, q%d, q%d", Vd, Vn, Vm); + } else if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 1 && + instr->Bit(4) == 1) { if (instr->Bit(6) == 0) { // veor Dd, Dn, Dm int Vd = instr->VFPDRegValue(kDoublePrecision); @@ -1860,6 +1955,35 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) { int imm3 = instr->Bits(21, 19); out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, "vmovl.u%d q%d, d%d", imm3*8, Vd, Vm); + } else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0 && + instr->Bits(11, 6) == 0x17 && instr->Bit(4) == 0) { + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vm = instr->VFPMRegValue(kSimd128Precision); + out_buffer_pos_ += + SNPrintF(out_buffer_ + out_buffer_pos_, "vmvn q%d, q%d", Vd, Vm); + } else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0xB && + instr->Bits(11, 9) == 0x3 && instr->Bit(6) == 1 && + instr->Bit(4) == 0) { + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vm = instr->VFPMRegValue(kSimd128Precision); + const char* suffix = nullptr; + int op = instr->Bits(8, 7); + switch (op) { + case 0: + suffix = "f32.s32"; + break; + case 1: + suffix = "f32.u32"; + break; + case 2: + suffix = "s32.f32"; + break; + case 3: + suffix = "u32.f32"; + break; + } + out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, + "vcvt.%s q%d, q%d", suffix, Vd, Vm); } else if ((instr->Bits(21, 16) == 0x32) && 
(instr->Bits(11, 7) == 0) && (instr->Bit(4) == 0)) { if (instr->Bit(6) == 0) { @@ -1873,6 +1997,26 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) { out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, "vswp q%d, q%d", Vd, Vm); } + } else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 7) == 0x18 && + instr->Bit(4) == 0x0) { + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vm = instr->VFPMRegValue(kDoublePrecision); + int index = instr->Bit(19); + out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, + "vdup q%d, d%d[%d]", Vd, Vm, index); + } else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 10) == 0x2 && + instr->Bit(4) == 0x0) { + int Vd = instr->VFPDRegValue(kDoublePrecision); + int Vn = instr->VFPNRegValue(kDoublePrecision); + int Vm = instr->VFPMRegValue(kDoublePrecision); + int len = instr->Bits(9, 8); + NeonListOperand list(DwVfpRegister::from_code(Vn), len + 1); + out_buffer_pos_ += + SNPrintF(out_buffer_ + out_buffer_pos_, "%s d%d, ", + instr->Bit(6) == 0 ? "vtbl.8" : "vtbx.8", Vd); + FormatNeonList(Vn, list.type()); + Print(", "); + PrintDRegister(Vm); } else { Unknown(instr); } diff --git a/src/arm/macro-assembler-arm.cc b/src/arm/macro-assembler-arm.cc index 3211dea083..8363d5e44c 100644 --- a/src/arm/macro-assembler-arm.cc +++ b/src/arm/macro-assembler-arm.cc @@ -1081,8 +1081,8 @@ void MacroAssembler::VmovLow(DwVfpRegister dst, Register src) { } void MacroAssembler::VmovExtended(Register dst, int src_code) { - DCHECK_LE(32, src_code); - DCHECK_GT(64, src_code); + DCHECK_LE(SwVfpRegister::kMaxNumRegisters, src_code); + DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, src_code); if (src_code & 0x1) { VmovHigh(dst, DwVfpRegister::from_code(src_code / 2)); } else { @@ -1091,8 +1091,8 @@ void MacroAssembler::VmovExtended(Register dst, int src_code) { } void MacroAssembler::VmovExtended(int dst_code, Register src) { - DCHECK_LE(32, dst_code); - DCHECK_GT(64, dst_code); + DCHECK_LE(SwVfpRegister::kMaxNumRegisters, dst_code); + DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, dst_code); if (dst_code & 0x1) { VmovHigh(DwVfpRegister::from_code(dst_code / 2), src); } else { @@ -1102,22 +1102,23 @@ void MacroAssembler::VmovExtended(int dst_code, Register src) { void MacroAssembler::VmovExtended(int dst_code, int src_code, Register scratch) { - if (src_code < 32 && dst_code < 32) { + if (src_code < SwVfpRegister::kMaxNumRegisters && + dst_code < SwVfpRegister::kMaxNumRegisters) { // src and dst are both s-registers. vmov(SwVfpRegister::from_code(dst_code), SwVfpRegister::from_code(src_code)); - } else if (src_code < 32) { + } else if (src_code < SwVfpRegister::kMaxNumRegisters) { // src is an s-register. vmov(scratch, SwVfpRegister::from_code(src_code)); VmovExtended(dst_code, scratch); - } else if (dst_code < 32) { + } else if (dst_code < SwVfpRegister::kMaxNumRegisters) { // dst is an s-register. VmovExtended(scratch, src_code); vmov(SwVfpRegister::from_code(dst_code), scratch); } else { // Neither src or dst are s-registers. 
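    // Extended codes address the upper 16 d-registers as pairs of virtual
    // s-registers; e.g. code 33 denotes the high half of d16.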
- DCHECK_GT(64, src_code); - DCHECK_GT(64, dst_code); + DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, src_code); + DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, dst_code); VmovExtended(scratch, src_code); VmovExtended(dst_code, scratch); } @@ -1125,7 +1126,7 @@ void MacroAssembler::VmovExtended(int dst_code, int src_code, void MacroAssembler::VmovExtended(int dst_code, const MemOperand& src, Register scratch) { - if (dst_code >= 32) { + if (dst_code >= SwVfpRegister::kMaxNumRegisters) { ldr(scratch, src); VmovExtended(dst_code, scratch); } else { @@ -1135,7 +1136,7 @@ void MacroAssembler::VmovExtended(int dst_code, const MemOperand& src, void MacroAssembler::VmovExtended(const MemOperand& dst, int src_code, Register scratch) { - if (src_code >= 32) { + if (src_code >= SwVfpRegister::kMaxNumRegisters) { VmovExtended(scratch, src_code); str(scratch, dst); } else { @@ -1143,6 +1144,47 @@ void MacroAssembler::VmovExtended(const MemOperand& dst, int src_code, } } +void MacroAssembler::ExtractLane(Register dst, QwNeonRegister src, + NeonDataType dt, int lane) { + int bytes_per_lane = dt & NeonDataTypeSizeMask; // 1, 2, 4 + int log2_bytes_per_lane = bytes_per_lane / 2; // 0, 1, 2 + int byte = lane << log2_bytes_per_lane; + int double_word = byte >> kDoubleSizeLog2; + int double_byte = byte & (kDoubleSize - 1); + int double_lane = double_byte >> log2_bytes_per_lane; + DwVfpRegister double_source = + DwVfpRegister::from_code(src.code() * 2 + double_word); + vmov(dt, dst, double_source, double_lane); +} + +void MacroAssembler::ExtractLane(SwVfpRegister dst, QwNeonRegister src, + Register scratch, int lane) { + int s_code = src.code() * 4 + lane; + VmovExtended(dst.code(), s_code, scratch); +} + +void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src, + Register src_lane, NeonDataType dt, int lane) { + Move(dst, src); + int bytes_per_lane = dt & NeonDataTypeSizeMask; // 1, 2, 4 + int log2_bytes_per_lane = bytes_per_lane / 2; // 0, 1, 2 + int byte = lane << log2_bytes_per_lane; + int double_word = byte >> kDoubleSizeLog2; + int double_byte = byte & (kDoubleSize - 1); + int double_lane = double_byte >> log2_bytes_per_lane; + DwVfpRegister double_dst = + DwVfpRegister::from_code(dst.code() * 2 + double_word); + vmov(dt, double_dst, double_lane, src_lane); +} + +void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src, + SwVfpRegister src_lane, Register scratch, + int lane) { + Move(dst, src); + int s_code = dst.code() * 4 + lane; + VmovExtended(s_code, src_lane.code(), scratch); +} + void MacroAssembler::LslPair(Register dst_low, Register dst_high, Register src_low, Register src_high, Register scratch, Register shift) { diff --git a/src/arm/macro-assembler-arm.h b/src/arm/macro-assembler-arm.h index 02df3cef6b..5a0a2b6794 100644 --- a/src/arm/macro-assembler-arm.h +++ b/src/arm/macro-assembler-arm.h @@ -561,6 +561,14 @@ class MacroAssembler: public Assembler { void VmovExtended(int dst_code, const MemOperand& src, Register scratch); void VmovExtended(const MemOperand& dst, int src_code, Register scratch); + void ExtractLane(Register dst, QwNeonRegister src, NeonDataType dt, int lane); + void ExtractLane(SwVfpRegister dst, QwNeonRegister src, Register scratch, + int lane); + void ReplaceLane(QwNeonRegister dst, QwNeonRegister src, Register src_lane, + NeonDataType dt, int lane); + void ReplaceLane(QwNeonRegister dst, QwNeonRegister src, + SwVfpRegister src_lane, Register scratch, int lane); + void LslPair(Register dst_low, Register dst_high, Register src_low, Register 
src_high, Register scratch, Register shift); void LslPair(Register dst_low, Register dst_high, Register src_low, diff --git a/src/arm/simulator-arm.cc b/src/arm/simulator-arm.cc index 5b0c7d603e..12654834f2 100644 --- a/src/arm/simulator-arm.cc +++ b/src/arm/simulator-arm.cc @@ -3067,6 +3067,7 @@ void Simulator::DecodeType7(Instruction* instr) { // Dd = vsqrt(Dm) // Sd = vsqrt(Sm) // vmrs +// vdup.size Qd, Rt. void Simulator::DecodeTypeVFP(Instruction* instr) { DCHECK((instr->TypeValue() == 7) && (instr->Bit(24) == 0x0) ); DCHECK(instr->Bits(11, 9) == 0x5); @@ -3277,24 +3278,116 @@ void Simulator::DecodeTypeVFP(Instruction* instr) { if ((instr->VCValue() == 0x0) && (instr->VAValue() == 0x0)) { DecodeVMOVBetweenCoreAndSinglePrecisionRegisters(instr); - } else if ((instr->VLValue() == 0x0) && - (instr->VCValue() == 0x1) && - (instr->Bit(23) == 0x0)) { - // vmov (ARM core register to scalar) - int vd = instr->Bits(19, 16) | (instr->Bit(7) << 4); - uint32_t data[2]; - get_d_register(vd, data); - data[instr->Bit(21)] = get_register(instr->RtValue()); - set_d_register(vd, data); - } else if ((instr->VLValue() == 0x1) && - (instr->VCValue() == 0x1) && - (instr->Bit(23) == 0x0)) { + } else if ((instr->VLValue() == 0x0) && (instr->VCValue() == 0x1)) { + if (instr->Bit(23) == 0) { + // vmov (ARM core register to scalar) + int vd = instr->VFPNRegValue(kDoublePrecision); + int rt = instr->RtValue(); + int opc1_opc2 = (instr->Bits(22, 21) << 2) | instr->Bits(6, 5); + if ((opc1_opc2 & 0xb) == 0) { + // NeonS32/NeonU32 + uint32_t data[2]; + get_d_register(vd, data); + data[instr->Bit(21)] = get_register(rt); + set_d_register(vd, data); + } else { + uint64_t data; + get_d_register(vd, &data); + uint64_t rt_value = get_register(rt); + if ((opc1_opc2 & 0x8) != 0) { + // NeonS8 / NeonU8 + int i = opc1_opc2 & 0x7; + int shift = i * kBitsPerByte; + const uint64_t mask = 0xFF; + data &= ~(mask << shift); + data |= (rt_value & mask) << shift; + set_d_register(vd, &data); + } else if ((opc1_opc2 & 0x1) != 0) { + // NeonS16 / NeonU16 + int i = (opc1_opc2 >> 1) & 0x3; + int shift = i * kBitsPerByte * kShortSize; + const uint64_t mask = 0xFFFF; + data &= ~(mask << shift); + data |= (rt_value & mask) << shift; + set_d_register(vd, &data); + } else { + UNREACHABLE(); // Not used by V8. + } + } + } else { + // vdup.size Qd, Rt. + NeonSize size = Neon32; + if (instr->Bit(5) != 0) + size = Neon16; + else if (instr->Bit(22) != 0) + size = Neon8; + int vd = instr->VFPNRegValue(kSimd128Precision); + int rt = instr->RtValue(); + uint32_t rt_value = get_register(rt); + uint32_t q_data[4]; + switch (size) { + case Neon8: { + rt_value &= 0xFF; + uint8_t* dst = reinterpret_cast(q_data); + for (int i = 0; i < 16; i++) { + dst[i] = rt_value; + } + break; + } + case Neon16: { + // Perform pairwise ops instead of casting to uint16_t. 
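+            // e.g. rt = 0x1234 gives rt_rt = 0x12341234, so each 32-bit
+            // store below fills two identical 16-bit lanes at once.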
+ rt_value &= 0xFFFFu; + uint32_t rt_rt = (rt_value << 16) | (rt_value & 0xFFFFu); + for (int i = 0; i < 4; i++) { + q_data[i] = rt_rt; + } + break; + } + case Neon32: { + for (int i = 0; i < 4; i++) { + q_data[i] = rt_value; + } + break; + } + default: + UNREACHABLE(); + break; + } + set_q_register(vd, q_data); + } + } else if ((instr->VLValue() == 0x1) && (instr->VCValue() == 0x1)) { // vmov (scalar to ARM core register) - int vn = instr->Bits(19, 16) | (instr->Bit(7) << 4); - double dn_value = get_double_from_d_register(vn); - int32_t data[2]; - memcpy(data, &dn_value, 8); - set_register(instr->RtValue(), data[instr->Bit(21)]); + int vn = instr->VFPNRegValue(kDoublePrecision); + int rt = instr->RtValue(); + int opc1_opc2 = (instr->Bits(22, 21) << 2) | instr->Bits(6, 5); + if ((opc1_opc2 & 0xb) == 0) { + // NeonS32 / NeonU32 + double dn_value = get_double_from_d_register(vn); + int32_t data[2]; + memcpy(data, &dn_value, 8); + set_register(rt, data[instr->Bit(21)]); + } else { + uint64_t data; + get_d_register(vn, &data); + bool u = instr->Bit(23) != 0; + if ((opc1_opc2 & 0x8) != 0) { + // NeonS8 / NeonU8 + int i = opc1_opc2 & 0x7; + int shift = i * kBitsPerByte; + uint32_t scalar = (data >> shift) & 0xFFu; + if (!u && (scalar & 0x80) != 0) scalar |= 0xffffff00; + set_register(rt, scalar); + } else if ((opc1_opc2 & 0x1) != 0) { + // NeonS16 / NeonU16 + int i = (opc1_opc2 >> 1) & 0x3; + int shift = i * kBitsPerByte * kShortSize; + uint32_t scalar = (data >> shift) & 0xFFFFu; + if (!u && (scalar & 0x8000) != 0) scalar |= 0xffff0000; + set_register(rt, scalar); + } else { + UNREACHABLE(); // Not used by V8. + } + } } else if ((instr->VLValue() == 0x1) && (instr->VCValue() == 0x0) && (instr->VAValue() == 0x7) && @@ -3520,6 +3613,48 @@ int VFPConversionSaturate(double val, bool unsigned_res) { } } +int32_t Simulator::ConvertDoubleToInt(double val, bool unsigned_integer, + VFPRoundingMode mode) { + int32_t result = + unsigned_integer ? static_cast(val) : static_cast(val); + + inv_op_vfp_flag_ = get_inv_op_vfp_flag(mode, val, unsigned_integer); + + double abs_diff = unsigned_integer + ? std::fabs(val - static_cast(result)) + : std::fabs(val - result); + + inexact_vfp_flag_ = (abs_diff != 0); + + if (inv_op_vfp_flag_) { + result = VFPConversionSaturate(val, unsigned_integer); + } else { + switch (mode) { + case RN: { + int val_sign = (val > 0) ? 1 : -1; + if (abs_diff > 0.5) { + result += val_sign; + } else if (abs_diff == 0.5) { + // Round to even if exactly halfway. + result = ((result % 2) == 0) ? result : result + val_sign; + } + break; + } + + case RM: + result = result > val ? result - 1 : result; + break; + + case RZ: + // Nothing to do. + break; + + default: + UNREACHABLE(); + } + } + return result; +} void Simulator::DecodeVCVTBetweenFloatingPointAndInteger(Instruction* instr) { DCHECK((instr->Bit(4) == 0) && (instr->Opc1Value() == 0x7) && @@ -3556,44 +3691,7 @@ void Simulator::DecodeVCVTBetweenFloatingPointAndInteger(Instruction* instr) { double val = double_precision ? get_double_from_d_register(src) : get_float_from_s_register(src); - int temp = unsigned_integer ? static_cast(val) - : static_cast(val); - - inv_op_vfp_flag_ = get_inv_op_vfp_flag(mode, val, unsigned_integer); - - double abs_diff = - unsigned_integer ? std::fabs(val - static_cast(temp)) - : std::fabs(val - temp); - - inexact_vfp_flag_ = (abs_diff != 0); - - if (inv_op_vfp_flag_) { - temp = VFPConversionSaturate(val, unsigned_integer); - } else { - switch (mode) { - case RN: { - int val_sign = (val > 0) ? 
1 : -1; - if (abs_diff > 0.5) { - temp += val_sign; - } else if (abs_diff == 0.5) { - // Round to even if exactly halfway. - temp = ((temp % 2) == 0) ? temp : temp + val_sign; - } - break; - } - - case RM: - temp = temp > val ? temp - 1 : temp; - break; - - case RZ: - // Nothing to do. - break; - - default: - UNREACHABLE(); - } - } + int32_t temp = ConvertDoubleToInt(val, unsigned_integer, mode); // Update the destination register. set_s_register_from_sinteger(dst, temp); @@ -3740,6 +3838,16 @@ void Simulator::DecodeType6CoprocessorIns(Instruction* instr) { } } +#define HIGH_16(x) ((x) >> 16) +#define LOW_16(x) ((x)&0xFFFFu) +#define COMBINE_32(high, low) ((high) << 16 | (low)&0xFFFFu) +#define PAIRWISE_OP(x, y, OP) \ + COMBINE_32(OP(HIGH_16((x)), HIGH_16((y))), OP(LOW_16((x)), LOW_16((y)))) + +#define ADD_16(x, y) ((x) + (y)) +#define SUB_16(x, y) ((x) - (y)) +#define CEQ_16(x, y) ((x) == (y) ? 0xFFFFu : 0) +#define TST_16(x, y) (((x) & (y)) != 0 ? 0xFFFFu : 0) void Simulator::DecodeSpecialCondition(Instruction* instr) { switch (instr->SpecialValue()) { @@ -3752,6 +3860,91 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { uint32_t data[4]; get_q_register(Vm, data); set_q_register(Vd, data); + } else if (instr->Bits(11, 8) == 8) { + // vadd/vtst + int size = static_cast(instr->Bits(21, 20)); + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vm = instr->VFPMRegValue(kSimd128Precision); + int Vn = instr->VFPNRegValue(kSimd128Precision); + uint32_t src1[4], src2[4]; + get_q_register(Vn, src1); + get_q_register(Vm, src2); + if (instr->Bit(4) == 0) { + // vadd.i Qd, Qm, Qn. + switch (size) { + case Neon8: { + uint8_t* s1 = reinterpret_cast(src1); + uint8_t* s2 = reinterpret_cast(src2); + for (int i = 0; i < 16; i++) { + s1[i] += s2[i]; + } + break; + } + case Neon16: { + for (int i = 0; i < 4; i++) { + src1[i] = PAIRWISE_OP(src1[i], src2[i], ADD_16); + } + break; + } + case Neon32: { + for (int i = 0; i < 4; i++) { + src1[i] += src2[i]; + } + break; + } + default: + UNREACHABLE(); + break; + } + } else { + // vtst.i Qd, Qm, Qn. + switch (size) { + case Neon8: { + uint8_t* s1 = reinterpret_cast(src1); + uint8_t* s2 = reinterpret_cast(src2); + for (int i = 0; i < 16; i++) { + s1[i] = (s1[i] & s2[i]) != 0 ? 0xFFu : 0; + } + break; + } + case Neon16: { + for (int i = 0; i < 4; i++) { + src1[i] = PAIRWISE_OP(src1[i], src2[i], TST_16); + } + break; + } + case Neon32: { + for (int i = 0; i < 4; i++) { + src1[i] = (src1[i] & src2[i]) != 0 ? 0xFFFFFFFFu : 0; + } + break; + } + default: + UNREACHABLE(); + break; + } + } + set_q_register(Vd, src1); + } else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xd && + instr->Bit(4) == 0) { + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vm = instr->VFPMRegValue(kSimd128Precision); + int Vn = instr->VFPNRegValue(kSimd128Precision); + uint32_t src1[4], src2[4]; + get_q_register(Vn, src1); + get_q_register(Vm, src2); + for (int i = 0; i < 4; i++) { + if (instr->Bit(21) == 0) { + // vadd.f32 Qd, Qm, Qn. + src1[i] = bit_cast(bit_cast(src1[i]) + + bit_cast(src2[i])); + } else { + // vsub.f32 Qd, Qm, Qn. + src1[i] = bit_cast(bit_cast(src1[i]) - + bit_cast(src2[i])); + } + } + set_q_register(Vd, src1); } else { UNIMPLEMENTED(); } @@ -3781,8 +3974,92 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { } break; case 6: - if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 1 && - instr->Bit(4) == 1) { + if (instr->Bits(11, 8) == 8 && instr->Bit(4) == 0) { + // vsub.size Qd, Qm, Qn. 
+ int size = static_cast(instr->Bits(21, 20)); + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vm = instr->VFPMRegValue(kSimd128Precision); + int Vn = instr->VFPNRegValue(kSimd128Precision); + uint32_t src1[4], src2[4]; + get_q_register(Vn, src1); + get_q_register(Vm, src2); + switch (size) { + case Neon8: { + uint8_t* s1 = reinterpret_cast(src1); + uint8_t* s2 = reinterpret_cast(src2); + for (int i = 0; i < 16; i++) { + s1[i] -= s2[i]; + } + break; + } + case Neon16: { + for (int i = 0; i < 4; i++) { + src1[i] = PAIRWISE_OP(src1[i], src2[i], SUB_16); + } + break; + } + case Neon32: { + for (int i = 0; i < 4; i++) { + src1[i] -= src2[i]; + } + break; + } + default: + UNREACHABLE(); + break; + } + set_q_register(Vd, src1); + } else if (instr->Bits(11, 8) == 8 && instr->Bit(4) == 1) { + // vceq.size Qd, Qm, Qn. + int size = static_cast(instr->Bits(21, 20)); + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vm = instr->VFPMRegValue(kSimd128Precision); + int Vn = instr->VFPNRegValue(kSimd128Precision); + uint32_t src1[4], src2[4]; + get_q_register(Vn, src1); + get_q_register(Vm, src2); + switch (size) { + case Neon8: { + uint8_t* s1 = reinterpret_cast(src1); + uint8_t* s2 = reinterpret_cast(src2); + for (int i = 0; i < 16; i++) { + s1[i] = s1[i] == s2[i] ? 0xFF : 0; + } + break; + } + case Neon16: { + for (int i = 0; i < 4; i++) { + src1[i] = PAIRWISE_OP(src1[i], src2[i], CEQ_16); + } + break; + } + case Neon32: { + for (int i = 0; i < 4; i++) { + src1[i] = src1[i] == src2[i] ? 0xFFFFFFFF : 0; + } + break; + } + default: + UNREACHABLE(); + break; + } + set_q_register(Vd, src1); + } else if (instr->Bits(21, 20) == 1 && instr->Bits(11, 8) == 1 && + instr->Bit(4) == 1) { + // vbsl.size Qd, Qm, Qn. + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vm = instr->VFPMRegValue(kSimd128Precision); + int Vn = instr->VFPNRegValue(kSimd128Precision); + uint32_t dst[4], src1[4], src2[4]; + get_q_register(Vd, dst); + get_q_register(Vn, src1); + get_q_register(Vm, src2); + for (int i = 0; i < 4; i++) { + dst[i] = (dst[i] & src1[i]) | (~dst[i] & src2[i]); + } + set_q_register(Vd, dst); + } else if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 1 && + instr->Bit(4) == 1) { if (instr->Bit(6) == 0) { // veor Dd, Dn, Dm int Vd = instr->VFPDRegValue(kDoublePrecision); @@ -3829,6 +4106,40 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { e++; } set_q_register(Vd, reinterpret_cast(to)); + } else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0xB && + instr->Bits(11, 9) == 0x3 && instr->Bit(6) == 1 && + instr->Bit(4) == 0) { + // vcvt.. Qd, Qm. + int Vd = instr->VFPDRegValue(kSimd128Precision); + int Vm = instr->VFPMRegValue(kSimd128Precision); + uint32_t q_data[4]; + get_q_register(Vm, q_data); + int op = instr->Bits(8, 7); + for (int i = 0; i < 4; i++) { + switch (op) { + case 0: + // f32 <- s32, round towards nearest. + q_data[i] = bit_cast( + std::round(static_cast(bit_cast(q_data[i])))); + break; + case 1: + // f32 <- u32, round towards nearest. + q_data[i] = + bit_cast(std::round(static_cast(q_data[i]))); + break; + case 2: + // s32 <- f32, round to zero. + q_data[i] = static_cast( + ConvertDoubleToInt(bit_cast(q_data[i]), false, RZ)); + break; + case 3: + // u32 <- f32, round to zero. 
+ q_data[i] = static_cast( + ConvertDoubleToInt(bit_cast(q_data[i]), true, RZ)); + break; + } + } + set_q_register(Vd, q_data); } else if ((instr->Bits(21, 16) == 0x32) && (instr->Bits(11, 7) == 0) && (instr->Bit(4) == 0)) { if (instr->Bit(6) == 0) { @@ -3850,6 +4161,49 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { set_q_register(vm, dval); set_q_register(vd, mval); } + } else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 7) == 0x18 && + instr->Bit(4) == 0x0) { + // vdup.32 Qd, Sm. + int vd = instr->VFPDRegValue(kSimd128Precision); + int vm = instr->VFPMRegValue(kDoublePrecision); + int index = instr->Bit(19); + uint32_t s_data = get_s_register(vm * 2 + index); + uint32_t q_data[4]; + for (int i = 0; i < 4; i++) q_data[i] = s_data; + set_q_register(vd, q_data); + } else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0 && + instr->Bits(11, 6) == 0x17 && instr->Bit(4) == 0) { + // vmvn Qd, Qm. + int vd = instr->VFPDRegValue(kSimd128Precision); + int vm = instr->VFPMRegValue(kSimd128Precision); + uint32_t q_data[4]; + get_q_register(vm, q_data); + for (int i = 0; i < 4; i++) q_data[i] = ~q_data[i]; + set_q_register(vd, q_data); + } else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 10) == 0x2 && + instr->Bit(4) == 0x0) { + // vtb[l,x] Dd, , Dm. + int vd = instr->VFPDRegValue(kDoublePrecision); + int vn = instr->VFPNRegValue(kDoublePrecision); + int vm = instr->VFPMRegValue(kDoublePrecision); + int table_len = (instr->Bits(9, 8) + 1) * kDoubleSize; + bool vtbx = instr->Bit(6) != 0; // vtbl / vtbx + uint64_t destination = 0, indices = 0, result = 0; + get_d_register(vd, &destination); + get_d_register(vm, &indices); + for (int i = 0; i < kDoubleSize; i++) { + int shift = i * kBitsPerByte; + int index = (indices >> shift) & 0xFF; + if (index < table_len) { + uint64_t table; + get_d_register(vn + index / kDoubleSize, &table); + result |= ((table >> ((index % kDoubleSize) * kBitsPerByte)) & 0xFF) + << shift; + } else if (vtbx) { + result |= destination & (0xFFull << shift); + } + } + set_d_register(vd, &result); } else { UNIMPLEMENTED(); } diff --git a/src/arm/simulator-arm.h b/src/arm/simulator-arm.h index 7435b77255..e214abdcbd 100644 --- a/src/arm/simulator-arm.h +++ b/src/arm/simulator-arm.h @@ -339,6 +339,8 @@ class Simulator { void DecodeVMOVBetweenCoreAndSinglePrecisionRegisters(Instruction* instr); void DecodeVCMP(Instruction* instr); void DecodeVCVTBetweenDoubleAndSingle(Instruction* instr); + int32_t ConvertDoubleToInt(double val, bool unsigned_integer, + VFPRoundingMode mode); void DecodeVCVTBetweenFloatingPointAndInteger(Instruction* instr); // Executes one instruction. diff --git a/test/cctest/test-assembler-arm.cc b/test/cctest/test-assembler-arm.cc index cb99c4ace5..787371409a 100644 --- a/test/cctest/test-assembler-arm.cc +++ b/test/cctest/test-assembler-arm.cc @@ -1221,6 +1221,10 @@ TEST(14) { CHECK_EQ(kArmNanLower32, bit_cast(t.div_result) & 0xffffffffu); } +#define INT32_TO_FLOAT(val) \ + std::round(static_cast(bit_cast(val))) +#define UINT32_TO_FLOAT(val) \ + std::round(static_cast(bit_cast(val))) TEST(15) { // Test the Neon instructions. 
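  // The T struct below collects one field per new instruction; the
  // host-side CHECKs at the end validate the results lane by lane.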
@@ -1255,8 +1259,20 @@ TEST(15) { uint32_t dstA5; uint32_t dstA6; uint32_t dstA7; - uint32_t vmov_src[4], vmov_dst[4]; - uint32_t veor_src[4], veor_dst[4]; + uint64_t vmov_to_scalar1, vmov_to_scalar2; + uint32_t vmov_from_scalar_s8, vmov_from_scalar_u8; + uint32_t vmov_from_scalar_s16, vmov_from_scalar_u16; + uint32_t vmov_from_scalar_32; + uint32_t vmov_src[4], vmov_dst[4], vmvn[4]; + int32_t vcvt_s32_f32[4]; + uint32_t vcvt_u32_f32[4]; + float vcvt_f32_s32[4], vcvt_f32_u32[4]; + uint32_t vdup1[4], vdup2[4], vdup3[4], vdup4[4]; + uint32_t veor[4]; + uint32_t vadd8[4], vadd16[4], vadd32[4]; + uint32_t vsub8[4], vsub16[4], vsub32[4]; + uint32_t vtst[4], vceq[4], vbsl[4], vtbl[2], vtbx[2]; + float vaddf[4], vsubf[4]; } T; T t; @@ -1268,7 +1284,7 @@ TEST(15) { if (CpuFeatures::IsSupported(NEON)) { CpuFeatureScope scope(&assm, NEON); - __ stm(db_w, sp, r4.bit() | lr.bit()); + __ stm(db_w, sp, r4.bit() | r5.bit() | lr.bit()); // Move 32 bytes with neon. __ add(r4, r0, Operand(static_cast(offsetof(T, src0)))); __ vld1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(r4)); @@ -1289,23 +1305,200 @@ TEST(15) { __ add(r4, r0, Operand(static_cast(offsetof(T, dstA4)))); __ vst1(Neon8, NeonListOperand(d2, 2), NeonMemOperand(r4)); - // Test vmov for q-registers. + // ARM core register to scalar. + __ mov(r4, Operand(0xFFFFFFF8)); + __ vmov(d0, 0); + __ vmov(NeonS8, d0, 1, r4); + __ vmov(NeonS16, d0, 1, r4); + __ vmov(NeonS32, d0, 1, r4); + __ vstr(d0, r0, offsetof(T, vmov_to_scalar1)); + __ vmov(d0, 0); + __ vmov(NeonS8, d0, 3, r4); + __ vmov(NeonS16, d0, 3, r4); + __ vstr(d0, r0, offsetof(T, vmov_to_scalar2)); + + // Scalar to ARM core register. + __ mov(r4, Operand(0xFFFFFF00)); + __ mov(r5, Operand(0xFFFFFFFF)); + __ vmov(d0, r4, r5); + __ vmov(NeonS8, r4, d0, 1); + __ str(r4, MemOperand(r0, offsetof(T, vmov_from_scalar_s8))); + __ vmov(NeonU8, r4, d0, 1); + __ str(r4, MemOperand(r0, offsetof(T, vmov_from_scalar_u8))); + __ vmov(NeonS16, r4, d0, 1); + __ str(r4, MemOperand(r0, offsetof(T, vmov_from_scalar_s16))); + __ vmov(NeonU16, r4, d0, 1); + __ str(r4, MemOperand(r0, offsetof(T, vmov_from_scalar_u16))); + __ vmov(NeonS32, r4, d0, 1); + __ str(r4, MemOperand(r0, offsetof(T, vmov_from_scalar_32))); + + // vmov for q-registers. __ add(r4, r0, Operand(static_cast(offsetof(T, vmov_src)))); - __ vld1(Neon8, NeonListOperand(d0, 2), NeonMemOperand(r4)); + __ vld1(Neon8, NeonListOperand(q0), NeonMemOperand(r4)); __ vmov(q1, q0); __ add(r4, r0, Operand(static_cast(offsetof(T, vmov_dst)))); - __ vst1(Neon8, NeonListOperand(d2, 2), NeonMemOperand(r4)); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); - // Test veor for q-registers. - __ add(r4, r0, Operand(static_cast(offsetof(T, veor_src)))); - __ vld1(Neon8, NeonListOperand(d0, 2), NeonMemOperand(r4)); - __ add(r4, r0, Operand(static_cast(offsetof(T, veor_dst)))); - __ vld1(Neon8, NeonListOperand(d2, 2), NeonMemOperand(r4)); + // vmvn. + __ mov(r4, Operand(0xFF)); + __ vdup(Neon16, q0, r4); + __ vmvn(q1, q0); + __ add(r4, r0, Operand(static_cast(offsetof(T, vmvn)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + + // vcvt for q-registers. 
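+    // q0 aliases s0..s3, so the vmov's below build the input vector
+    // [-1.5, -1, 1, 1.5]; vcvt.s32.f32 truncates toward zero, giving
+    // [-1, -1, 1, 1].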
+ __ vmov(s0, -1.5); + __ vmov(s1, -1); + __ vmov(s2, 1); + __ vmov(s3, 1.5); + __ vcvt_s32_f32(q1, q0); + __ add(r4, r0, Operand(static_cast(offsetof(T, vcvt_s32_f32)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + __ vcvt_u32_f32(q1, q0); + __ add(r4, r0, Operand(static_cast(offsetof(T, vcvt_u32_f32)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + __ mov(r4, Operand(kMinInt)); + __ mov(r5, Operand(kMaxInt)); + __ vmov(d0, r4, r5); + __ mov(r4, Operand(kMaxUInt32)); + __ mov(r5, Operand(kMinInt + 1)); + __ vmov(d1, r4, r5); // q0 = [kMinInt, kMaxInt, kMaxUInt32, kMinInt + 1] + __ vcvt_f32_s32(q1, q0); + __ add(r4, r0, Operand(static_cast(offsetof(T, vcvt_f32_s32)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + __ vcvt_f32_u32(q1, q0); + __ add(r4, r0, Operand(static_cast(offsetof(T, vcvt_f32_u32)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + + // int vdup. + __ mov(r4, Operand(0xa)); + __ vdup(Neon8, q0, r4); + __ vdup(Neon16, q1, r4); + __ vdup(Neon32, q2, r4); + __ add(r4, r0, Operand(static_cast(offsetof(T, vdup1)))); + __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4)); + __ add(r4, r0, Operand(static_cast(offsetof(T, vdup2)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + __ add(r4, r0, Operand(static_cast(offsetof(T, vdup3)))); + __ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4)); + // float vdup. + __ vmov(s0, -1.0); + __ vdup(q0, s0); + __ add(r4, r0, Operand(static_cast(offsetof(T, vdup4)))); + __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4)); + + // veor. + __ mov(r4, Operand(0x00aa)); + __ vdup(Neon16, q0, r4); + __ mov(r4, Operand(0x0055)); + __ vdup(Neon16, q1, r4); __ veor(q1, q1, q0); - __ vst1(Neon8, NeonListOperand(d2, 2), NeonMemOperand(r4)); + __ add(r4, r0, Operand(static_cast(offsetof(T, veor)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + + // vadd(integer). + __ mov(r4, Operand(0x81)); + __ vdup(Neon8, q0, r4); + __ mov(r4, Operand(0x82)); + __ vdup(Neon8, q1, r4); + __ vadd(Neon8, q1, q1, q0); + __ add(r4, r0, Operand(static_cast(offsetof(T, vadd8)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + __ mov(r4, Operand(0x8001)); + __ vdup(Neon16, q0, r4); + __ mov(r4, Operand(0x8002)); + __ vdup(Neon16, q1, r4); + __ vadd(Neon16, q1, q1, q0); + __ add(r4, r0, Operand(static_cast(offsetof(T, vadd16)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + __ mov(r4, Operand(0x80000001)); + __ vdup(Neon32, q0, r4); + __ mov(r4, Operand(0x80000002)); + __ vdup(Neon32, q1, r4); + __ vadd(Neon32, q1, q1, q0); + __ add(r4, r0, Operand(static_cast(offsetof(T, vadd32)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + + // vadd(float). + __ vmov(s4, 1.0); + __ vdup(q0, s4); + __ vdup(q1, s4); + __ vadd(q1, q1, q0); + __ add(r4, r0, Operand(static_cast(offsetof(T, vaddf)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + + // vsub(integer). 
+ __ mov(r4, Operand(0x01)); + __ vdup(Neon8, q0, r4); + __ mov(r4, Operand(0x02)); + __ vdup(Neon8, q1, r4); + __ vsub(Neon8, q1, q0, q1); + __ add(r4, r0, Operand(static_cast(offsetof(T, vsub8)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + __ mov(r4, Operand(0x0001)); + __ vdup(Neon16, q0, r4); + __ mov(r4, Operand(0x0002)); + __ vdup(Neon16, q1, r4); + __ vsub(Neon16, q1, q0, q1); + __ add(r4, r0, Operand(static_cast(offsetof(T, vsub16)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + __ mov(r4, Operand(0x00000001)); + __ vdup(Neon32, q0, r4); + __ mov(r4, Operand(0x00000002)); + __ vdup(Neon32, q1, r4); + __ vsub(Neon32, q1, q0, q1); + __ add(r4, r0, Operand(static_cast(offsetof(T, vsub32)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + + // vsub(float). + __ vmov(s4, 2.0); + __ vdup(q0, s4); + __ vmov(s4, 1.0); + __ vdup(q1, s4); + __ vsub(q1, q1, q0); + __ add(r4, r0, Operand(static_cast(offsetof(T, vsubf)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + + // vceq. + __ mov(r4, Operand(0x03)); + __ vdup(Neon8, q0, r4); + __ mov(r4, Operand(0x03)); + __ vdup(Neon16, q1, r4); + __ vceq(Neon8, q1, q0, q1); + __ add(r4, r0, Operand(static_cast(offsetof(T, vceq)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + + // vtst. + __ mov(r4, Operand(0x03)); + __ vdup(Neon8, q0, r4); + __ mov(r4, Operand(0x02)); + __ vdup(Neon16, q1, r4); + __ vtst(Neon8, q1, q0, q1); + __ add(r4, r0, Operand(static_cast(offsetof(T, vtst)))); + __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); + + // vbsl. + __ mov(r4, Operand(0x00ff)); + __ vdup(Neon16, q0, r4); + __ mov(r4, Operand(0x01)); + __ vdup(Neon8, q1, r4); + __ mov(r4, Operand(0x02)); + __ vdup(Neon8, q2, r4); + __ vbsl(q0, q1, q2); + __ add(r4, r0, Operand(static_cast(offsetof(T, vbsl)))); + __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4)); + + // vtb[l/x]. + __ mov(r4, Operand(0x06040200)); + __ mov(r5, Operand(0xff050301)); + __ vmov(d2, r4, r5); // d2 = ff05030106040200 + __ vtbl(d0, NeonListOperand(d2, 1), d2); + __ vstr(d0, r0, offsetof(T, vtbl)); + __ vtbx(d2, NeonListOperand(d2, 1), d2); + __ vstr(d2, r0, offsetof(T, vtbx)); // Restore and return. 
- __ ldm(ia_w, sp, r4.bit() | pc.bit()); + __ ldm(ia_w, sp, r4.bit() | r5.bit() | pc.bit()); CodeDesc desc; assm.GetCode(&desc); @@ -1344,10 +1537,9 @@ TEST(15) { t.dstA7 = 0; t.vmov_src[0] = t.vmov_src[1] = t.vmov_src[2] = t.vmov_src[3] = 1; t.vmov_dst[0] = t.vmov_dst[1] = t.vmov_dst[2] = t.vmov_dst[3] = 0; - t.veor_src[0] = t.veor_src[1] = t.veor_src[2] = t.veor_src[3] = 0xAA; - t.veor_dst[0] = t.veor_dst[1] = t.veor_dst[2] = t.veor_dst[3] = 0x55; Object* dummy = CALL_GENERATED_CODE(isolate, f, &t, 0, 0, 0, 0); USE(dummy); + CHECK_EQ(0x01020304u, t.dst0); CHECK_EQ(0x11121314u, t.dst1); CHECK_EQ(0x21222324u, t.dst2); @@ -1364,14 +1556,57 @@ TEST(15) { CHECK_EQ(0x00410042u, t.dstA5); CHECK_EQ(0x00830084u, t.dstA6); CHECK_EQ(0x00810082u, t.dstA7); + CHECK_EQ(0xfffffff8fff8f800u, t.vmov_to_scalar1); + CHECK_EQ(0xfff80000f8000000u, t.vmov_to_scalar2); + CHECK_EQ(0xFFFFFFFFu, t.vmov_from_scalar_s8); + CHECK_EQ(0xFFu, t.vmov_from_scalar_u8); + CHECK_EQ(0xFFFFFFFFu, t.vmov_from_scalar_s16); + CHECK_EQ(0xFFFFu, t.vmov_from_scalar_u16); + CHECK_EQ(0xFFFFFFFFu, t.vmov_from_scalar_32); CHECK_EQ(1u, t.vmov_dst[0]); CHECK_EQ(1u, t.vmov_dst[1]); CHECK_EQ(1u, t.vmov_dst[2]); CHECK_EQ(1u, t.vmov_dst[3]); - CHECK_EQ(0xFFu, t.veor_dst[0]); - CHECK_EQ(0xFFu, t.veor_dst[1]); - CHECK_EQ(0xFFu, t.veor_dst[2]); - CHECK_EQ(0xFFu, t.veor_dst[3]); + CHECK_EQ(-1, t.vcvt_s32_f32[0]); + CHECK_EQ(-1, t.vcvt_s32_f32[1]); + CHECK_EQ(1, t.vcvt_s32_f32[2]); + CHECK_EQ(1, t.vcvt_s32_f32[3]); + CHECK_EQ(0u, t.vcvt_u32_f32[0]); + CHECK_EQ(0u, t.vcvt_u32_f32[1]); + CHECK_EQ(1u, t.vcvt_u32_f32[2]); + CHECK_EQ(1u, t.vcvt_u32_f32[3]); + + // src: [kMinInt, kMaxInt, kMaxUInt32, kMinInt + 1] + CHECK_EQ(INT32_TO_FLOAT(kMinInt), t.vcvt_f32_s32[0]); + CHECK_EQ(INT32_TO_FLOAT(kMaxInt), t.vcvt_f32_s32[1]); + CHECK_EQ(INT32_TO_FLOAT(kMaxUInt32), t.vcvt_f32_s32[2]); + CHECK_EQ(INT32_TO_FLOAT(kMinInt + 1), t.vcvt_f32_s32[3]); + CHECK_EQ(UINT32_TO_FLOAT(kMinInt), t.vcvt_f32_u32[0]); + CHECK_EQ(UINT32_TO_FLOAT(kMaxInt), t.vcvt_f32_u32[1]); + CHECK_EQ(UINT32_TO_FLOAT(kMaxUInt32), t.vcvt_f32_u32[2]); + CHECK_EQ(UINT32_TO_FLOAT(kMinInt + 1), t.vcvt_f32_u32[3]); + + for (int i = 0; i < 4; i++) CHECK_EQ(0xFF00FF00, t.vmvn[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(0x0a0a0a0au, t.vdup1[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(0x000a000au, t.vdup2[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(0x0000000au, t.vdup3[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(0xbf800000u, t.vdup4[i]); // -1.0f + for (int i = 0; i < 4; i++) CHECK_EQ(0x00ff00ffu, t.veor[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(2.0, t.vaddf[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(0x03030303u, t.vadd8[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(0x00030003u, t.vadd16[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(0x00000003u, t.vadd32[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(-1.0, t.vsubf[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(0xffffffffu, t.vsub8[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(0xffffffffu, t.vsub16[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(0xffffffffu, t.vsub32[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(0x00ff00ffu, t.vceq[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(0x00ff00ffu, t.vtst[i]); + for (int i = 0; i < 4; i++) CHECK_EQ(0x02010201u, t.vbsl[i]); + CHECK_EQ(0x05010400u, t.vtbl[0]); + CHECK_EQ(0x00030602u, t.vtbl[1]); + CHECK_EQ(0x05010400u, t.vtbx[0]); + CHECK_EQ(0xff030602u, t.vtbx[1]); } } @@ -2963,9 +3198,9 @@ TEST(vswp) { __ vmov(d11, r5, r5); // q5 = [-1.0, -1.0] __ vswp(q4, q5); __ add(r6, r0, Operand(static_cast(offsetof(T, vswp_q4)))); - __ 
@@ -2963,9 +3198,9 @@ TEST(vswp) {
   __ vmov(d11, r5, r5);  // q5 = [-1.0, -1.0]
   __ vswp(q4, q5);
   __ add(r6, r0, Operand(static_cast<int32_t>(offsetof(T, vswp_q4))));
-  __ vst1(Neon8, NeonListOperand(d8, 2), NeonMemOperand(r6));
+  __ vst1(Neon8, NeonListOperand(q4), NeonMemOperand(r6));
   __ add(r6, r0, Operand(static_cast<int32_t>(offsetof(T, vswp_q5))));
-  __ vst1(Neon8, NeonListOperand(d10, 2), NeonMemOperand(r6));
+  __ vst1(Neon8, NeonListOperand(q5), NeonMemOperand(r6));
 
   __ ldm(ia_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | pc.bit());
   __ bx(lr);
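Note: the change above names the Q register directly instead of spelling out its D-register pair; since qN aliases d(2N) and d(2N+1), NeonListOperand(q4) should describe the same {d8, d9} transfer list as the old NeonListOperand(d8, 2). A one-liner capturing the aliasing (LowDRegisterCode is a hypothetical helper, not V8 API):

    // qN overlays the two consecutive D registers d(2N) and d(2N+1).
    constexpr int LowDRegisterCode(int q_code) { return q_code * 2; }

    int main() {
      static_assert(LowDRegisterCode(4) == 8, "q4 aliases {d8, d9}");
      static_assert(LowDRegisterCode(5) == 10, "q5 aliases {d10, d11}");
      return 0;
    }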
d2"); COMPARE(veor(d0, d30, d31), @@ -956,6 +1009,54 @@ TEST(Neon) { "f3020154 veor q0, q1, q2"); COMPARE(veor(q15, q0, q8), "f340e170 veor q15, q0, q8"); + COMPARE(vadd(q15, q0, q8), + "f240ed60 vadd.f32 q15, q0, q8"); + COMPARE(vadd(Neon8, q0, q1, q2), + "f2020844 vadd.i8 q0, q1, q2"); + COMPARE(vadd(Neon16, q1, q2, q8), + "f2142860 vadd.i16 q1, q2, q8"); + COMPARE(vadd(Neon32, q15, q0, q8), + "f260e860 vadd.i32 q15, q0, q8"); + COMPARE(vsub(q15, q0, q8), + "f260ed60 vsub.f32 q15, q0, q8"); + COMPARE(vsub(Neon8, q0, q1, q2), + "f3020844 vsub.i8 q0, q1, q2"); + COMPARE(vsub(Neon16, q1, q2, q8), + "f3142860 vsub.i16 q1, q2, q8"); + COMPARE(vsub(Neon32, q15, q0, q8), + "f360e860 vsub.i32 q15, q0, q8"); + COMPARE(vtst(Neon8, q0, q1, q2), + "f2020854 vtst.i8 q0, q1, q2"); + COMPARE(vtst(Neon16, q1, q2, q8), + "f2142870 vtst.i16 q1, q2, q8"); + COMPARE(vtst(Neon32, q15, q0, q8), + "f260e870 vtst.i32 q15, q0, q8"); + COMPARE(vceq(Neon8, q0, q1, q2), + "f3020854 vceq.i8 q0, q1, q2"); + COMPARE(vceq(Neon16, q1, q2, q8), + "f3142870 vceq.i16 q1, q2, q8"); + COMPARE(vceq(Neon32, q15, q0, q8), + "f360e870 vceq.i32 q15, q0, q8"); + COMPARE(vbsl(q0, q1, q2), + "f3120154 vbsl q0, q1, q2"); + COMPARE(vbsl(q15, q0, q8), + "f350e170 vbsl q15, q0, q8"); + COMPARE(vtbl(d0, NeonListOperand(d1, 1), d2), + "f3b10802 vtbl.8 d0, {d1}, d2"); + COMPARE(vtbl(d31, NeonListOperand(d0, 2), d4), + "f3f0f904 vtbl.8 d31, {d0, d1}, d4"); + COMPARE(vtbl(d15, NeonListOperand(d1, 3), d5), + "f3b1fa05 vtbl.8 d15, {d1, d2, d3}, d5"); + COMPARE(vtbl(d15, NeonListOperand(d1, 4), d5), + "f3b1fb05 vtbl.8 d15, {d1, d2, d3, d4}, d5"); + COMPARE(vtbx(d0, NeonListOperand(d1, 1), d2), + "f3b10842 vtbx.8 d0, {d1}, d2"); + COMPARE(vtbx(d31, NeonListOperand(d0, 2), d4), + "f3f0f944 vtbx.8 d31, {d0, d1}, d4"); + COMPARE(vtbx(d15, NeonListOperand(d1, 3), d5), + "f3b1fa45 vtbx.8 d15, {d1, d2, d3}, d5"); + COMPARE(vtbx(d15, NeonListOperand(d1, 4), d5), + "f3b1fb45 vtbx.8 d15, {d1, d2, d3, d4}, d5"); } VERIFY_RUN(); diff --git a/test/cctest/test-macro-assembler-arm.cc b/test/cctest/test-macro-assembler-arm.cc index 06efc58cfa..6b69296e11 100644 --- a/test/cctest/test-macro-assembler-arm.cc +++ b/test/cctest/test-macro-assembler-arm.cc @@ -42,6 +42,7 @@ typedef void* (*F)(int x, int y, int p2, int p3, int p4); #define __ masm-> +typedef Object* (*F3)(void* p0, int p1, int p2, int p3, int p4); typedef int (*F5)(void*, void*, void*, void*, void*); @@ -134,4 +135,248 @@ TEST(LoadAndStoreWithRepresentation) { CHECK(!CALL_GENERATED_CODE(isolate, f, 0, 0, 0, 0, 0)); } +TEST(ExtractLane) { + if (!CpuFeatures::IsSupported(NEON)) return; + + // Allocate an executable page of memory. + size_t actual_size; + byte* buffer = static_cast(v8::base::OS::Allocate( + Assembler::kMinimalBufferSize, &actual_size, true)); + CHECK(buffer); + Isolate* isolate = CcTest::i_isolate(); + HandleScope handles(isolate); + MacroAssembler assembler(isolate, buffer, static_cast(actual_size), + v8::internal::CodeObjectRequired::kYes); + MacroAssembler* masm = &assembler; // Create a pointer for the __ macro. 
diff --git a/test/cctest/test-macro-assembler-arm.cc b/test/cctest/test-macro-assembler-arm.cc
index 06efc58cfa..6b69296e11 100644
--- a/test/cctest/test-macro-assembler-arm.cc
+++ b/test/cctest/test-macro-assembler-arm.cc
@@ -42,6 +42,7 @@
 typedef void* (*F)(int x, int y, int p2, int p3, int p4);
 
 #define __ masm->
 
+typedef Object* (*F3)(void* p0, int p1, int p2, int p3, int p4);
 typedef int (*F5)(void*, void*, void*, void*, void*);
 
@@ -134,4 +135,248 @@ TEST(LoadAndStoreWithRepresentation) {
   CHECK(!CALL_GENERATED_CODE(isolate, f, 0, 0, 0, 0, 0));
 }
 
+TEST(ExtractLane) {
+  if (!CpuFeatures::IsSupported(NEON)) return;
+
+  // Allocate an executable page of memory.
+  size_t actual_size;
+  byte* buffer = static_cast<byte*>(v8::base::OS::Allocate(
+      Assembler::kMinimalBufferSize, &actual_size, true));
+  CHECK(buffer);
+  Isolate* isolate = CcTest::i_isolate();
+  HandleScope handles(isolate);
+  MacroAssembler assembler(isolate, buffer, static_cast<int>(actual_size),
+                           v8::internal::CodeObjectRequired::kYes);
+  MacroAssembler* masm = &assembler;  // Create a pointer for the __ macro.
+
+  typedef struct {
+    int32_t i32x4_low[4];
+    int32_t i32x4_high[4];
+    int32_t i16x8_low[8];
+    int32_t i16x8_high[8];
+    int32_t i8x16_low[16];
+    int32_t i8x16_high[16];
+    int32_t f32x4_low[4];
+    int32_t f32x4_high[4];
+  } T;
+  T t;
+
+  __ stm(db_w, sp, r4.bit() | r5.bit() | lr.bit());
+
+  for (int i = 0; i < 4; i++) {
+    __ mov(r4, Operand(i));
+    __ vdup(Neon32, q1, r4);
+    __ ExtractLane(r5, q1, NeonS32, i);
+    __ str(r5, MemOperand(r0, offsetof(T, i32x4_low) + 4 * i));
+    SwVfpRegister si = SwVfpRegister::from_code(i);
+    __ ExtractLane(si, q1, r4, i);
+    __ vstr(si, r0, offsetof(T, f32x4_low) + 4 * i);
+  }
+
+  for (int i = 0; i < 8; i++) {
+    __ mov(r4, Operand(i));
+    __ vdup(Neon16, q1, r4);
+    __ ExtractLane(r5, q1, NeonS16, i);
+    __ str(r5, MemOperand(r0, offsetof(T, i16x8_low) + 4 * i));
+  }
+
+  for (int i = 0; i < 16; i++) {
+    __ mov(r4, Operand(i));
+    __ vdup(Neon8, q1, r4);
+    __ ExtractLane(r5, q1, NeonS8, i);
+    __ str(r5, MemOperand(r0, offsetof(T, i8x16_low) + 4 * i));
+  }
+
+  if (CpuFeatures::IsSupported(VFP32DREGS)) {
+    for (int i = 0; i < 4; i++) {
+      __ mov(r4, Operand(-i));
+      __ vdup(Neon32, q15, r4);
+      __ ExtractLane(r5, q15, NeonS32, i);
+      __ str(r5, MemOperand(r0, offsetof(T, i32x4_high) + 4 * i));
+      SwVfpRegister si = SwVfpRegister::from_code(i);
+      __ ExtractLane(si, q15, r4, i);
+      __ vstr(si, r0, offsetof(T, f32x4_high) + 4 * i);
+    }
+
+    for (int i = 0; i < 8; i++) {
+      __ mov(r4, Operand(-i));
+      __ vdup(Neon16, q15, r4);
+      __ ExtractLane(r5, q15, NeonS16, i);
+      __ str(r5, MemOperand(r0, offsetof(T, i16x8_high) + 4 * i));
+    }
+
+    for (int i = 0; i < 16; i++) {
+      __ mov(r4, Operand(-i));
+      __ vdup(Neon8, q15, r4);
+      __ ExtractLane(r5, q15, NeonS8, i);
+      __ str(r5, MemOperand(r0, offsetof(T, i8x16_high) + 4 * i));
+    }
+  }
+
+  __ ldm(ia_w, sp, r4.bit() | r5.bit() | pc.bit());
+
+  CodeDesc desc;
+  masm->GetCode(&desc);
+  Handle<Code> code = isolate->factory()->NewCode(
+      desc, Code::ComputeFlags(Code::STUB), Handle<Code>());
+#ifdef DEBUG
+  OFStream os(stdout);
+  code->Print(os);
+#endif
+  F3 f = FUNCTION_CAST<F3>(code->entry());
+  Object* dummy = CALL_GENERATED_CODE(isolate, f, &t, 0, 0, 0, 0);
+  USE(dummy);
+  for (int i = 0; i < 4; i++) {
+    CHECK_EQ(i, t.i32x4_low[i]);
+    CHECK_EQ(i, t.f32x4_low[i]);
+  }
+  for (int i = 0; i < 8; i++) {
+    CHECK_EQ(i, t.i16x8_low[i]);
+  }
+  for (int i = 0; i < 16; i++) {
+    CHECK_EQ(i, t.i8x16_low[i]);
+  }
+  if (CpuFeatures::IsSupported(VFP32DREGS)) {
+    for (int i = 0; i < 4; i++) {
+      CHECK_EQ(-i, t.i32x4_high[i]);
+      CHECK_EQ(-i, t.f32x4_high[i]);
+    }
+    for (int i = 0; i < 8; i++) {
+      CHECK_EQ(-i, t.i16x8_high[i]);
+    }
+    for (int i = 0; i < 16; i++) {
+      CHECK_EQ(-i, t.i8x16_high[i]);
+    }
+  }
+}
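Note: the q14/q15 halves of ExtractLane above and ReplaceLane below sit behind a VFP32DREGS check because d16-d31, and therefore q8-q15, only exist on cores with the 32-entry D-register file. A tiny model of that guard (NeedsVfp32DRegs is a hypothetical helper, not V8 API):

    // A Q register needs the upper register file when its low D alias is d16+.
    constexpr bool NeedsVfp32DRegs(int q_code) { return q_code * 2 >= 16; }

    int main() {
      static_assert(!NeedsVfp32DRegs(1), "q1 lives in d2/d3");
      static_assert(NeedsVfp32DRegs(15), "q15 lives in d30/d31");
      return 0;
    }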
+TEST(ReplaceLane) {
+  if (!CpuFeatures::IsSupported(NEON)) return;
+
+  // Allocate an executable page of memory.
+  size_t actual_size;
+  byte* buffer = static_cast<byte*>(v8::base::OS::Allocate(
+      Assembler::kMinimalBufferSize, &actual_size, true));
+  CHECK(buffer);
+  Isolate* isolate = CcTest::i_isolate();
+  HandleScope handles(isolate);
+  MacroAssembler assembler(isolate, buffer, static_cast<int>(actual_size),
+                           v8::internal::CodeObjectRequired::kYes);
+  MacroAssembler* masm = &assembler;  // Create a pointer for the __ macro.
+
+  typedef struct {
+    int32_t i32x4_low[4];
+    int32_t i32x4_high[4];
+    int16_t i16x8_low[8];
+    int16_t i16x8_high[8];
+    int8_t i8x16_low[16];
+    int8_t i8x16_high[16];
+    int32_t f32x4_low[4];
+    int32_t f32x4_high[4];
+  } T;
+  T t;
+
+  __ stm(db_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | lr.bit());
+
+  const Register kScratch = r5;
+
+  __ veor(q0, q0, q0);  // Zero
+  __ veor(q1, q1, q1);  // Zero
+  for (int i = 0; i < 4; i++) {
+    __ mov(r4, Operand(i));
+    __ ReplaceLane(q0, q0, r4, NeonS32, i);
+    SwVfpRegister si = SwVfpRegister::from_code(i);
+    __ vmov(si, r4);
+    __ ReplaceLane(q1, q1, si, kScratch, i);
+  }
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i32x4_low))));
+  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, f32x4_low))));
+  __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
+  __ veor(q0, q0, q0);  // Zero
+  for (int i = 0; i < 8; i++) {
+    __ mov(r4, Operand(i));
+    __ ReplaceLane(q0, q0, r4, NeonS16, i);
+  }
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i16x8_low))));
+  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+  __ veor(q0, q0, q0);  // Zero
+  for (int i = 0; i < 16; i++) {
+    __ mov(r4, Operand(i));
+    __ ReplaceLane(q0, q0, r4, NeonS8, i);
+  }
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i8x16_low))));
+  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+
+  if (CpuFeatures::IsSupported(VFP32DREGS)) {
+    __ veor(q14, q14, q14);  // Zero
+    __ veor(q15, q15, q15);  // Zero
+    for (int i = 0; i < 4; i++) {
+      __ mov(r4, Operand(-i));
+      __ ReplaceLane(q14, q14, r4, NeonS32, i);
+      SwVfpRegister si = SwVfpRegister::from_code(i);
+      __ vmov(si, r4);
+      __ ReplaceLane(q15, q15, si, kScratch, i);
+    }
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i32x4_high))));
+    __ vst1(Neon8, NeonListOperand(q14), NeonMemOperand(r4));
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, f32x4_high))));
+    __ vst1(Neon8, NeonListOperand(q15), NeonMemOperand(r4));
+
+    __ veor(q14, q14, q14);  // Zero
+    for (int i = 0; i < 8; i++) {
+      __ mov(r4, Operand(-i));
+      __ ReplaceLane(q14, q14, r4, NeonS16, i);
+    }
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i16x8_high))));
+    __ vst1(Neon8, NeonListOperand(q14), NeonMemOperand(r4));
+
+    __ veor(q14, q14, q14);  // Zero
+    for (int i = 0; i < 16; i++) {
+      __ mov(r4, Operand(-i));
+      __ ReplaceLane(q14, q14, r4, NeonS8, i);
+    }
+    __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i8x16_high))));
+    __ vst1(Neon8, NeonListOperand(q14), NeonMemOperand(r4));
+  }
+
+  __ ldm(ia_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | pc.bit());
+
+  CodeDesc desc;
+  masm->GetCode(&desc);
+  Handle<Code> code = isolate->factory()->NewCode(
+      desc, Code::ComputeFlags(Code::STUB), Handle<Code>());
+#ifdef DEBUG
+  OFStream os(stdout);
+  code->Print(os);
+#endif
+  F3 f = FUNCTION_CAST<F3>(code->entry());
+  Object* dummy = CALL_GENERATED_CODE(isolate, f, &t, 0, 0, 0, 0);
+  USE(dummy);
+  for (int i = 0; i < 4; i++) {
+    CHECK_EQ(i, t.i32x4_low[i]);
+    CHECK_EQ(i, t.f32x4_low[i]);
+  }
+  for (int i = 0; i < 8; i++) {
+    CHECK_EQ(i, t.i16x8_low[i]);
+  }
+  for (int i = 0; i < 16; i++) {
+    CHECK_EQ(i, t.i8x16_low[i]);
+  }
+  if (CpuFeatures::IsSupported(VFP32DREGS)) {
+    for (int i = 0; i < 4; i++) {
+      CHECK_EQ(-i, t.i32x4_high[i]);
+      CHECK_EQ(-i, t.f32x4_high[i]);
+    }
+    for (int i = 0; i < 8; i++) {
+      CHECK_EQ(-i, t.i16x8_high[i]);
+    }
+    for (int i = 0; i < 16; i++) {
+      CHECK_EQ(-i, t.i8x16_high[i]);
+    }
+  }
+}
+
 #undef __