[ARM] Improve VFP register moves.
- Adds vdup.<size> Dd/Qd, Dm[i] instruction.
- Adds vsli, vsri instructions.
- Changes VmovExtended to use these to avoid moves to core registers.

LOG=N
BUG=v8:6020

Review-Url: https://codereview.chromium.org/2868603002
Cr-Commit-Position: refs/heads/master@{#45351}
Parent: 211cc58565
Commit: b3acc27265
@@ -3910,19 +3910,47 @@ void Assembler::vdup(NeonSize size, QwNeonRegister dst, Register src) {
        0xB * B8 | d * B7 | E * B5 | B4);
 }
 
-void Assembler::vdup(QwNeonRegister dst, SwVfpRegister src) {
+enum NeonRegType { NEON_D, NEON_Q };
+
+void NeonSplitCode(NeonRegType type, int code, int* vm, int* m, int* encoding) {
+  if (type == NEON_D) {
+    DwVfpRegister::split_code(code, vm, m);
+  } else {
+    DCHECK_EQ(type, NEON_Q);
+    QwNeonRegister::split_code(code, vm, m);
+    *encoding |= B6;
+  }
+}
+
+static Instr EncodeNeonDupOp(NeonSize size, NeonRegType reg_type, int dst_code,
+                             DwVfpRegister src, int index) {
+  DCHECK_NE(Neon64, size);
+  int sz = static_cast<int>(size);
+  DCHECK_LE(0, index);
+  DCHECK_GT(kSimd128Size / (1 << sz), index);
+  int imm4 = (1 << sz) | ((index << (sz + 1)) & 0xF);
+  int qbit = 0;
+  int vd, d;
+  NeonSplitCode(reg_type, dst_code, &vd, &d, &qbit);
+  int vm, m;
+  src.split_code(&vm, &m);
+
+  return 0x1E7U * B23 | d * B22 | 0x3 * B20 | imm4 * B16 | vd * B12 |
+         0x18 * B7 | qbit | m * B5 | vm;
+}
+
+void Assembler::vdup(NeonSize size, DwVfpRegister dst, DwVfpRegister src,
+                     int index) {
   DCHECK(IsEnabled(NEON));
   // Instruction details available in ARM DDI 0406C.b, A8-884.
-  int index = src.code() & 1;
-  int d_reg = src.code() / 2;
-  int imm4 = 4 | index << 3;  // esize = 32, index in bit 3.
-  int vd, d;
-  dst.split_code(&vd, &d);
-  int vm, m;
-  DwVfpRegister::from_code(d_reg).split_code(&vm, &m);
+  emit(EncodeNeonDupOp(size, NEON_D, dst.code(), src, index));
 }
 
-  emit(0x1E7U * B23 | d * B22 | 0x3 * B20 | imm4 * B16 | vd * B12 | 0x18 * B7 |
-       B6 | m * B5 | vm);
+void Assembler::vdup(NeonSize size, QwNeonRegister dst, DwVfpRegister src,
+                     int index) {
+  // Instruction details available in ARM DDI 0406C.b, A8-884.
+  DCHECK(IsEnabled(NEON));
+  emit(EncodeNeonDupOp(size, NEON_Q, dst.code(), src, index));
 }
 
 // Encode NEON vcvt.src_type.dst_type instruction.
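The imm4 field built by EncodeNeonDupOp packs both the lane size and the lane index: the lowest set bit encodes the element size, and the bits above it carry the index. The following standalone sketch is illustrative only (it is not part of the CL; the helper name is invented):

    // Illustrative sketch of the imm4 packing used by EncodeNeonDupOp above.
    // sz is log2 of the element size in bytes: 0 = 8-bit, 1 = 16-bit, 2 = 32-bit.
    int PackVdupImm4(int sz, int index) {
      return (1 << sz) | ((index << (sz + 1)) & 0xF);
    }
    // PackVdupImm4(0, 3) == 0x7  ->  vdup.8  ..., Dm[3]
    // PackVdupImm4(1, 1) == 0x6  ->  vdup.16 ..., Dm[1]
    // PackVdupImm4(2, 1) == 0xC  ->  vdup.32 ..., Dm[1]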
@@ -3977,18 +4005,6 @@ void Assembler::vcvt_u32_f32(QwNeonRegister dst, QwNeonRegister src) {
   emit(EncodeNeonVCVT(U32, dst, F32, src));
 }
 
-enum NeonRegType { NEON_D, NEON_Q };
-
-void NeonSplitCode(NeonRegType type, int code, int* vm, int* m, int* encoding) {
-  if (type == NEON_D) {
-    DwVfpRegister::split_code(code, vm, m);
-  } else {
-    DCHECK_EQ(type, NEON_Q);
-    QwNeonRegister::split_code(code, vm, m);
-    *encoding |= B6;
-  }
-}
-
 enum UnaryOp { VMVN, VSWP, VABS, VABSF, VNEG, VNEGF };
 
 static Instr EncodeNeonUnaryOp(UnaryOp op, NeonRegType reg_type, NeonSize size,
@@ -4403,30 +4419,55 @@ void Assembler::vmax(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
   emit(EncodeNeonBinOp(VMAX, dt, dst, src1, src2));
 }
 
-enum NeonShiftOp { VSHL, VSHR };
+enum NeonShiftOp { VSHL, VSHR, VSLI, VSRI };
 
-static Instr EncodeNeonShiftOp(NeonShiftOp op, NeonDataType dt,
-                               QwNeonRegister dst, QwNeonRegister src,
+static Instr EncodeNeonShiftOp(NeonShiftOp op, NeonSize size, bool is_unsigned,
+                               NeonRegType reg_type, int dst_code, int src_code,
                                int shift) {
-  int vd, d;
-  dst.split_code(&vd, &d);
-  int vm, m;
-  src.split_code(&vm, &m);
-  int size_in_bits = kBitsPerByte << NeonSz(dt);
-  int op_encoding = 0;
   int imm6 = 0;
-  if (op == VSHL) {
+  int size_in_bits = kBitsPerByte << static_cast<int>(size);
+  int op_encoding = 0;
+  switch (op) {
+    case VSHL: {
       DCHECK(shift >= 0 && size_in_bits > shift);
       imm6 = size_in_bits + shift;
       op_encoding = 0x5 * B8;
-  } else {
-    DCHECK_EQ(VSHR, op);
+      break;
+    }
+    case VSHR: {
       DCHECK(shift > 0 && size_in_bits >= shift);
       imm6 = 2 * size_in_bits - shift;
-      op_encoding = NeonU(dt) * B24;
+      if (is_unsigned) op_encoding |= B24;
+      break;
     }
-  return 0x1E5U * B23 | d * B22 | imm6 * B16 | vd * B12 | B6 | m * B5 | B4 |
-         vm | op_encoding;
+    case VSLI: {
+      DCHECK(shift >= 0 && size_in_bits > shift);
+      imm6 = size_in_bits + shift;
+      int L = imm6 >> 6;
+      imm6 &= 0x3F;
+      op_encoding = B24 | 0x5 * B8 | L * B7;
+      break;
+    }
+    case VSRI: {
+      DCHECK(shift > 0 && size_in_bits >= shift);
+      imm6 = 2 * size_in_bits - shift;
+      int L = imm6 >> 6;
+      imm6 &= 0x3F;
+      op_encoding = B24 | 0x4 * B8 | L * B7;
+      break;
+    }
+    default:
+      UNREACHABLE();
+      break;
+  }
+
+  int vd, d;
+  NeonSplitCode(reg_type, dst_code, &vd, &d, &op_encoding);
+  int vm, m;
+  NeonSplitCode(reg_type, src_code, &vm, &m, &op_encoding);
+
+  return 0x1E5U * B23 | d * B22 | imm6 * B16 | vd * B12 | m * B5 | B4 | vm |
+         op_encoding;
 }
 
 void Assembler::vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
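In this encoder the imm6 field does double duty: left-insert style ops (VSHL, VSLI) store size_in_bits + shift, right style ops (VSHR, VSRI) store 2 * size_in_bits - shift, and for 64-bit lanes the value spills into the L bit (B7). A minimal sketch of that scheme, for illustration only (the struct and function names are invented):

    // Illustrative only: how a (lane size, shift) pair maps onto imm6 and the L bit.
    struct ShiftImm { int imm6; int l; };
    ShiftImm EncodeLeftShiftImm(int size_in_bits, int shift) {   // VSHL / VSLI style
      int value = size_in_bits + shift;       // e.g. 64 + 32 = 96 for vsli.64 #32
      return {value & 0x3F, value >> 6};
    }
    ShiftImm EncodeRightShiftImm(int size_in_bits, int shift) {  // VSHR / VSRI style
      int value = 2 * size_in_bits - shift;   // e.g. 128 - 32 = 96 for vsri.64 #32
      return {value & 0x3F, value >> 6};
    }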
@@ -4434,7 +4475,8 @@ void Assembler::vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
   DCHECK(IsEnabled(NEON));
   // Qd = vshl(Qm, bits) SIMD shift left immediate.
   // Instruction details available in ARM DDI 0406C.b, A8-1046.
-  emit(EncodeNeonShiftOp(VSHL, dt, dst, src, shift));
+  emit(EncodeNeonShiftOp(VSHL, NeonDataTypeToSize(dt), false, NEON_Q,
+                         dst.code(), src.code(), shift));
 }
 
 void Assembler::vshr(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
@@ -4442,7 +4484,26 @@ void Assembler::vshr(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
   DCHECK(IsEnabled(NEON));
   // Qd = vshl(Qm, bits) SIMD shift right immediate.
   // Instruction details available in ARM DDI 0406C.b, A8-1052.
-  emit(EncodeNeonShiftOp(VSHR, dt, dst, src, shift));
+  emit(EncodeNeonShiftOp(VSHR, NeonDataTypeToSize(dt), NeonU(dt), NEON_Q,
+                         dst.code(), src.code(), shift));
 }
 
+void Assembler::vsli(NeonSize size, DwVfpRegister dst, DwVfpRegister src,
+                     int shift) {
+  DCHECK(IsEnabled(NEON));
+  // Dd = vsli(Dm, bits) SIMD shift left and insert.
+  // Instruction details available in ARM DDI 0406C.b, A8-1056.
+  emit(EncodeNeonShiftOp(VSLI, size, false, NEON_D, dst.code(), src.code(),
+                         shift));
+}
+
+void Assembler::vsri(NeonSize size, DwVfpRegister dst, DwVfpRegister src,
+                     int shift) {
+  DCHECK(IsEnabled(NEON));
+  // Dd = vsri(Dm, bits) SIMD shift right and insert.
+  // Instruction details available in ARM DDI 0406C.b, A8-1062.
+  emit(EncodeNeonShiftOp(VSRI, size, false, NEON_D, dst.code(), src.code(),
+                         shift));
+}
+
 static Instr EncodeNeonEstimateOp(bool is_rsqrt, QwNeonRegister dst,
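With these emitters, two 32-bit lanes can be combined without a trip through core registers. A hypothetical usage sketch (register choices arbitrary, assuming NEON is enabled on an Assembler* named masm; not from this CL):

    //   vsli.64 d5, d4, #32  ; d5[63:32] = d4[31:0],  d5[31:0]  unchanged
    //   vsri.64 d6, d4, #32  ; d6[31:0]  = d4[63:32], d6[63:32] unchanged
    masm->vsli(Neon64, d5, d4, 32);
    masm->vsri(Neon64, d6, d4, 32);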
@@ -4539,7 +4600,7 @@ void Assembler::vpadd(NeonSize size, DwVfpRegister dst, DwVfpRegister src1,
   DCHECK(IsEnabled(NEON));
   // Dd = vpadd(Dn, Dm) SIMD integer pairwise ADD.
   // Instruction details available in ARM DDI 0406C.b, A8-980.
-  emit(EncodeNeonPairwiseOp(VPADD, NeonSizeToDatatype(size), dst, src1, src2));
+  emit(EncodeNeonPairwiseOp(VPADD, NeonSizeToDataType(size), dst, src1, src2));
 }
 
 void Assembler::vpmin(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1,
@@ -426,9 +426,10 @@ constexpr LowDwVfpRegister kLastCalleeSavedDoubleReg = d15;
 constexpr LowDwVfpRegister kDoubleRegZero = d13;
 constexpr LowDwVfpRegister kScratchDoubleReg = d14;
 // This scratch q-register aliases d14 (kScratchDoubleReg) and d15, but is only
-// used when NEON is supported. d15 is still allocatable if there are only 16
-// VFP registers.
+// used if NEON is supported, which implies VFP32DREGS. When there are only 16
+// d-registers, d15 is still allocatable.
 constexpr QwNeonRegister kScratchQuadReg = q7;
+constexpr LowDwVfpRegister kScratchDoubleReg2 = d15;
 
 // Coprocessor register
 struct CRegister {
@@ -1331,7 +1332,8 @@ class Assembler : public AssemblerBase {
 
   void vmov(QwNeonRegister dst, QwNeonRegister src);
   void vdup(NeonSize size, QwNeonRegister dst, Register src);
-  void vdup(QwNeonRegister dst, SwVfpRegister src);
+  void vdup(NeonSize size, QwNeonRegister dst, DwVfpRegister src, int index);
+  void vdup(NeonSize size, DwVfpRegister dst, DwVfpRegister src, int index);
 
   void vcvt_f32_s32(QwNeonRegister dst, QwNeonRegister src);
   void vcvt_f32_u32(QwNeonRegister dst, QwNeonRegister src);

@@ -1380,6 +1382,8 @@ class Assembler : public AssemblerBase {
                DwVfpRegister src2);
   void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift);
   void vshr(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift);
+  void vsli(NeonSize size, DwVfpRegister dst, DwVfpRegister src, int shift);
+  void vsri(NeonSize size, DwVfpRegister dst, DwVfpRegister src, int shift);
   // vrecpe and vrsqrte only support floating point lanes.
   void vrecpe(QwNeonRegister dst, QwNeonRegister src);
   void vrsqrte(QwNeonRegister dst, QwNeonRegister src);
@@ -342,10 +342,15 @@ inline int NeonU(NeonDataType dt) { return static_cast<int>(dt) >> 2; }
 inline int NeonSz(NeonDataType dt) { return static_cast<int>(dt) & 0x3; }
 
 // Convert sizes to data types (U bit is clear).
-inline NeonDataType NeonSizeToDatatype(NeonSize size) {
+inline NeonDataType NeonSizeToDataType(NeonSize size) {
   DCHECK_NE(Neon64, size);
   return static_cast<NeonDataType>(size);
 }
 
+inline NeonSize NeonDataTypeToSize(NeonDataType dt) {
+  return static_cast<NeonSize>(NeonSz(dt));
+}
+
 enum NeonListType {
   nlt_1 = 0x7,
   nlt_2 = 0xA,
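Both helpers are plain casts because of the enum layout implied by NeonU() and NeonSz() above: bits 0-1 of a NeonDataType hold the size code (0 = 8, 1 = 16, 2 = 32) and bit 2 holds the U (unsigned) flag, so a NeonSize value is numerically the corresponding signed NeonDataType. A small sketch of that correspondence (helper names invented, for illustration):

    // Illustrative only: the bit layout implied by NeonU()/NeonSz().
    inline int SizeCodeOf(NeonDataType dt) { return static_cast<int>(dt) & 0x3; }        // 0, 1 or 2
    inline bool IsUnsignedType(NeonDataType dt) { return (static_cast<int>(dt) >> 2) != 0; }
    // NeonSizeToDataType(Neon16) yields the signed 16-bit type (U bit clear);
    // NeonDataTypeToSize of an unsigned type simply drops the U bit.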
@@ -2211,11 +2211,30 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
                                     "vmovl.u%d q%d, d%d", imm3 * 8, Vd, Vm);
       } else if (instr->Opc1Value() == 7 && instr->Bit(4) == 0) {
         if (instr->Bits(11, 7) == 0x18) {
-          int Vd = instr->VFPDRegValue(kSimd128Precision);
           int Vm = instr->VFPMRegValue(kDoublePrecision);
-          int index = instr->Bit(19);
-          out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
-                                      "vdup q%d, d%d[%d]", Vd, Vm, index);
+          int imm4 = instr->Bits(19, 16);
+          int size = 0, index = 0;
+          if ((imm4 & 0x1) != 0) {
+            size = 8;
+            index = imm4 >> 1;
+          } else if ((imm4 & 0x2) != 0) {
+            size = 16;
+            index = imm4 >> 2;
+          } else {
+            size = 32;
+            index = imm4 >> 3;
+          }
+          if (instr->Bit(6) == 0) {
+            int Vd = instr->VFPDRegValue(kDoublePrecision);
+            out_buffer_pos_ +=
+                SNPrintF(out_buffer_ + out_buffer_pos_, "vdup.%i d%d, d%d[%d]",
+                         size, Vd, Vm, index);
+          } else {
+            int Vd = instr->VFPDRegValue(kSimd128Precision);
+            out_buffer_pos_ +=
+                SNPrintF(out_buffer_ + out_buffer_pos_, "vdup.%i q%d, d%d[%d]",
+                         size, Vd, Vm, index);
+          }
         } else if (instr->Bits(11, 10) == 0x2) {
           int Vd = instr->VFPDRegValue(kDoublePrecision);
           int Vn = instr->VFPNRegValue(kDoublePrecision);
@@ -2346,6 +2365,27 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
           out_buffer_pos_ +=
               SNPrintF(out_buffer_ + out_buffer_pos_, "vshr.u%d q%d, q%d, #%d",
                        size, Vd, Vm, shift);
+        } else if (instr->Bit(10) == 1 && instr->Bit(6) == 0 &&
+                   instr->Bit(4) == 1) {
+          // vsli.<size> Dd, Dm, shift
+          // vsri.<size> Dd, Dm, shift
+          int imm7 = instr->Bits(21, 16);
+          if (instr->Bit(7) != 0) imm7 += 64;
+          int size = base::bits::RoundDownToPowerOfTwo32(imm7);
+          int shift;
+          char direction;
+          if (instr->Bit(8) == 1) {
+            shift = imm7 - size;
+            direction = 'l';  // vsli
+          } else {
+            shift = 2 * size - imm7;
+            direction = 'r';  // vsri
+          }
+          int Vd = instr->VFPDRegValue(kDoublePrecision);
+          int Vm = instr->VFPMRegValue(kDoublePrecision);
+          out_buffer_pos_ +=
+              SNPrintF(out_buffer_ + out_buffer_pos_, "vs%ci.%d d%d, d%d, #%d",
+                       direction, size, Vd, Vm, shift);
         } else {
           Unknown(instr);
         }
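The decoder recovers the lane size by rounding the combined L:imm6 value down to a power of two and derives the shift from the remainder. For example, "f3b17518 vsli.32 d7, d8, #17" from the tests carries imm7 = 32 + 17 = 49, which rounds down to 32, giving shift 17. A standalone sketch of the same recovery (the rounding helper here is a stand-in for base::bits::RoundDownToPowerOfTwo32):

    #include <cstdint>

    uint32_t RoundDownToPowerOfTwo32(uint32_t v) {  // stand-in for the base helper
      while (v & (v - 1)) v &= v - 1;               // clear low set bits
      return v;
    }

    void DecodeShiftImm(int imm7, bool is_left, int* size, int* shift) {
      *size = static_cast<int>(RoundDownToPowerOfTwo32(imm7));
      *shift = is_left ? imm7 - *size : 2 * *size - imm7;
    }
    // imm7 = 49, left  -> size 32, shift 17  (vsli.32 #17)
    // imm7 = 96, right -> size 64, shift 32  (vsri.64 #32)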
@@ -1079,47 +1079,90 @@ void MacroAssembler::VmovExtended(int dst_code, Register src) {
   }
 }
 
-void MacroAssembler::VmovExtended(int dst_code, int src_code,
-                                  Register scratch) {
+void MacroAssembler::VmovExtended(int dst_code, int src_code) {
+  if (src_code == dst_code) return;
+
   if (src_code < SwVfpRegister::kMaxNumRegisters &&
       dst_code < SwVfpRegister::kMaxNumRegisters) {
     // src and dst are both s-registers.
     vmov(SwVfpRegister::from_code(dst_code),
          SwVfpRegister::from_code(src_code));
-  } else if (src_code < SwVfpRegister::kMaxNumRegisters) {
-    // src is an s-register.
-    vmov(scratch, SwVfpRegister::from_code(src_code));
-    VmovExtended(dst_code, scratch);
+    return;
+  }
+  DwVfpRegister dst_d_reg = DwVfpRegister::from_code(dst_code / 2);
+  DwVfpRegister src_d_reg = DwVfpRegister::from_code(src_code / 2);
+  int dst_offset = dst_code & 1;
+  int src_offset = src_code & 1;
+  if (CpuFeatures::IsSupported(NEON)) {
+    // On Neon we can shift and insert from d-registers.
+    if (src_offset == dst_offset) {
+      // Offsets are the same, use vdup to copy the source to the opposite lane.
+      vdup(Neon32, kScratchDoubleReg, src_d_reg, src_offset);
+      src_d_reg = kScratchDoubleReg;
+      src_offset = dst_offset ^ 1;
+    }
+    if (dst_offset) {
+      if (dst_d_reg.is(src_d_reg)) {
+        vdup(Neon32, dst_d_reg, src_d_reg, 0);
+      } else {
+        vsli(Neon64, dst_d_reg, src_d_reg, 32);
+      }
+    } else {
+      if (dst_d_reg.is(src_d_reg)) {
+        vdup(Neon32, dst_d_reg, src_d_reg, 1);
+      } else {
+        vsri(Neon64, dst_d_reg, src_d_reg, 32);
+      }
+    }
+    return;
+  }
+
+  // Without Neon, use the scratch registers to move src and/or dst into
+  // s-registers.
+  int scratchSCode = kScratchDoubleReg.low().code();
+  int scratchSCode2 = kScratchDoubleReg2.low().code();
+  if (src_code < SwVfpRegister::kMaxNumRegisters) {
+    // src is an s-register, dst is not.
+    vmov(kScratchDoubleReg, dst_d_reg);
+    vmov(SwVfpRegister::from_code(scratchSCode + dst_offset),
+         SwVfpRegister::from_code(src_code));
+    vmov(dst_d_reg, kScratchDoubleReg);
   } else if (dst_code < SwVfpRegister::kMaxNumRegisters) {
-    // dst is an s-register.
-    VmovExtended(scratch, src_code);
-    vmov(SwVfpRegister::from_code(dst_code), scratch);
+    // dst is an s-register, src is not.
+    vmov(kScratchDoubleReg, src_d_reg);
+    vmov(SwVfpRegister::from_code(dst_code),
+         SwVfpRegister::from_code(scratchSCode + src_offset));
   } else {
-    // Neither src or dst are s-registers.
-    DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, src_code);
-    DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, dst_code);
-    VmovExtended(scratch, src_code);
-    VmovExtended(dst_code, scratch);
+    // Neither src or dst are s-registers. Both scratch double registers are
+    // available when there are 32 VFP registers.
+    vmov(kScratchDoubleReg, src_d_reg);
+    vmov(kScratchDoubleReg2, dst_d_reg);
+    vmov(SwVfpRegister::from_code(scratchSCode + dst_offset),
+         SwVfpRegister::from_code(scratchSCode2 + src_offset));
+    vmov(dst_d_reg, kScratchQuadReg.high());
   }
 }
 
-void MacroAssembler::VmovExtended(int dst_code, const MemOperand& src,
-                                  Register scratch) {
-  if (dst_code >= SwVfpRegister::kMaxNumRegisters) {
-    ldr(scratch, src);
-    VmovExtended(dst_code, scratch);
-  } else {
+void MacroAssembler::VmovExtended(int dst_code, const MemOperand& src) {
+  if (dst_code < SwVfpRegister::kMaxNumRegisters) {
     vldr(SwVfpRegister::from_code(dst_code), src);
+  } else {
+    // TODO(bbudge) If Neon supported, use load single lane form of vld1.
+    int dst_s_code = kScratchDoubleReg.low().code() + (dst_code & 1);
+    vmov(kScratchDoubleReg, DwVfpRegister::from_code(dst_code / 2));
+    vldr(SwVfpRegister::from_code(dst_s_code), src);
+    vmov(DwVfpRegister::from_code(dst_code / 2), kScratchDoubleReg);
   }
 }
 
-void MacroAssembler::VmovExtended(const MemOperand& dst, int src_code,
-                                  Register scratch) {
-  if (src_code >= SwVfpRegister::kMaxNumRegisters) {
-    VmovExtended(scratch, src_code);
-    str(scratch, dst);
-  } else {
+void MacroAssembler::VmovExtended(const MemOperand& dst, int src_code) {
+  if (src_code < SwVfpRegister::kMaxNumRegisters) {
     vstr(SwVfpRegister::from_code(src_code), dst);
+  } else {
+    // TODO(bbudge) If Neon supported, use store single lane form of vst1.
+    int src_s_code = kScratchDoubleReg.low().code() + (src_code & 1);
+    vmov(kScratchDoubleReg, DwVfpRegister::from_code(src_code / 2));
+    vstr(SwVfpRegister::from_code(src_s_code), dst);
   }
 }
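For reference, the "imaginary" s-register codes 32..63 name the upper d-registers lane by lane: code n lives in d(n/2), lane n & 1. On NEON, VmovExtended(int, int) therefore reduces a move to a single lane operation: when both codes use the same lane index it first duplicates the source value into both lanes of the scratch d-register, and then one vdup (same d-register) or vsli/vsri (different d-registers) writes only the destination lane. A minimal sketch of that bookkeeping, with an invented helper type, not the CL's code:

    // Illustrative sketch of the lane bookkeeping behind VmovExtended(int, int).
    struct Lane { int d_reg; int lane; };                 // invented helper type
    Lane FromSCode(int s_code) { return {s_code / 2, s_code & 1}; }
    // Different d-registers:
    //   dst lane 1:  vsli.64 dst_d, src_d, #32  -> dst high lane <- src_d low lane
    //   dst lane 0:  vsri.64 dst_d, src_d, #32  -> dst low lane  <- src_d high lane
    // Same d-register (the two codes are the two lanes of one d-register):
    //   vdup.32 dst_d, dst_d[src_lane]          -> both lanes now hold the source value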
@@ -1145,9 +1188,9 @@ void MacroAssembler::ExtractLane(Register dst, DwVfpRegister src,
 }
 
 void MacroAssembler::ExtractLane(SwVfpRegister dst, QwNeonRegister src,
-                                 Register scratch, int lane) {
+                                 int lane) {
   int s_code = src.code() * 4 + lane;
-  VmovExtended(dst.code(), s_code, scratch);
+  VmovExtended(dst.code(), s_code);
 }
 
 void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src,

@@ -1164,11 +1207,10 @@ void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
 }
 
 void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
-                                 SwVfpRegister src_lane, Register scratch,
-                                 int lane) {
+                                 SwVfpRegister src_lane, int lane) {
   Move(dst, src);
   int s_code = dst.code() * 4 + lane;
-  VmovExtended(s_code, src_lane.code(), scratch);
+  VmovExtended(s_code, src_lane.code());
 }
 
 void MacroAssembler::LslPair(Register dst_low, Register dst_high,
@@ -559,18 +559,17 @@ class MacroAssembler: public Assembler {
   void VmovExtended(Register dst, int src_code);
   void VmovExtended(int dst_code, Register src);
   // Move between s-registers and imaginary s-registers.
-  void VmovExtended(int dst_code, int src_code, Register scratch);
-  void VmovExtended(int dst_code, const MemOperand& src, Register scratch);
-  void VmovExtended(const MemOperand& dst, int src_code, Register scratch);
+  void VmovExtended(int dst_code, int src_code);
+  void VmovExtended(int dst_code, const MemOperand& src);
+  void VmovExtended(const MemOperand& dst, int src_code);
 
   void ExtractLane(Register dst, QwNeonRegister src, NeonDataType dt, int lane);
   void ExtractLane(Register dst, DwVfpRegister src, NeonDataType dt, int lane);
-  void ExtractLane(SwVfpRegister dst, QwNeonRegister src, Register scratch,
-                   int lane);
+  void ExtractLane(SwVfpRegister dst, QwNeonRegister src, int lane);
   void ReplaceLane(QwNeonRegister dst, QwNeonRegister src, Register src_lane,
                    NeonDataType dt, int lane);
   void ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
-                   SwVfpRegister src_lane, Register scratch, int lane);
+                   SwVfpRegister src_lane, int lane);
 
   void LslPair(Register dst_low, Register dst_high, Register src_low,
                Register src_high, Register scratch, Register shift);
@@ -4222,6 +4222,34 @@ void ArithmeticShiftRight(Simulator* simulator, int Vd, int Vm, int shift) {
   simulator->set_neon_register<T, SIZE>(Vd, src);
 }
 
+template <typename T, int SIZE>
+void ShiftLeftAndInsert(Simulator* simulator, int Vd, int Vm, int shift) {
+  static const int kElems = SIZE / sizeof(T);
+  T src[kElems];
+  T dst[kElems];
+  simulator->get_neon_register<T, SIZE>(Vm, src);
+  simulator->get_neon_register<T, SIZE>(Vd, dst);
+  uint64_t mask = (1llu << shift) - 1llu;
+  for (int i = 0; i < kElems; i++) {
+    dst[i] = (src[i] << shift) | (dst[i] & mask);
+  }
+  simulator->set_neon_register<T, SIZE>(Vd, dst);
+}
+
+template <typename T, int SIZE>
+void ShiftRightAndInsert(Simulator* simulator, int Vd, int Vm, int shift) {
+  static const int kElems = SIZE / sizeof(T);
+  T src[kElems];
+  T dst[kElems];
+  simulator->get_neon_register<T, SIZE>(Vm, src);
+  simulator->get_neon_register<T, SIZE>(Vd, dst);
+  uint64_t mask = ~((1llu << (kBitsPerByte * SIZE - shift)) - 1llu);
+  for (int i = 0; i < kElems; i++) {
+    dst[i] = (src[i] >> shift) | (dst[i] & mask);
+  }
+  simulator->set_neon_register<T, SIZE>(Vd, dst);
+}
+
 template <typename T, int SIZE>
 void CompareEqual(Simulator* simulator, int Vd, int Vm, int Vn) {
   static const int kElems = SIZE / sizeof(T);
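For the Neon64, shift = 32 case that VmovExtended relies on, the masks select exactly one half of the destination lane. A worked instance of the two helpers above for a single 64-bit lane (illustrative only; T = uint64_t, SIZE = kDoubleSize):

    #include <cstdint>
    uint64_t Vsli64(uint64_t dst, uint64_t src, int shift) {
      uint64_t mask = (1llu << shift) - 1llu;            // shift = 32: keep low half of dst
      return (src << shift) | (dst & mask);              // high half <- low half of src
    }
    uint64_t Vsri64(uint64_t dst, uint64_t src, int shift) {
      uint64_t mask = ~((1llu << (64 - shift)) - 1llu);  // shift = 32: keep high half of dst
      return (src >> shift) | (dst & mask);              // low half <- high half of src
    }
    // Matches the assembler test expectations further below:
    //   Vsli64(0x0000000100000001, 0x00000001ffffffff, 32) == 0xffffffff00000001
    //   Vsri64(0x0000000100000001, 0xffffffff00000001, 32) == 0x00000001ffffffff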
@@ -4995,14 +5023,40 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
           set_neon_register(vd, mval);
         }
       } else if (instr->Bits(11, 7) == 0x18) {
-        // vdup.32 Qd, Sm.
-        int vd = instr->VFPDRegValue(kSimd128Precision);
+        // vdup.<size> Dd, Dm[index].
+        // vdup.<size> Qd, Dm[index].
         int vm = instr->VFPMRegValue(kDoublePrecision);
-        int index = instr->Bit(19);
-        uint32_t s_data = get_s_register(vm * 2 + index);
-        uint32_t q_data[4];
-        for (int i = 0; i < 4; i++) q_data[i] = s_data;
-        set_neon_register(vd, q_data);
+        int imm4 = instr->Bits(19, 16);
+        int size = 0, index = 0, mask = 0;
+        if ((imm4 & 0x1) != 0) {
+          size = 8;
+          index = imm4 >> 1;
+          mask = 0xffu;
+        } else if ((imm4 & 0x2) != 0) {
+          size = 16;
+          index = imm4 >> 2;
+          mask = 0xffffu;
+        } else {
+          size = 32;
+          index = imm4 >> 3;
+          mask = 0xffffffffu;
+        }
+        uint64_t d_data;
+        get_d_register(vm, &d_data);
+        uint32_t scalar = (d_data >> (size * index)) & mask;
+        uint32_t duped = scalar;
+        for (int i = 1; i < 32 / size; i++) {
+          scalar <<= size;
+          duped |= scalar;
+        }
+        uint32_t result[4] = {duped, duped, duped, duped};
+        if (instr->Bit(6) == 0) {
+          int vd = instr->VFPDRegValue(kDoublePrecision);
+          set_d_register(vd, result);
+        } else {
+          int vd = instr->VFPDRegValue(kSimd128Precision);
+          set_neon_register(vd, result);
+        }
       } else if (instr->Bits(19, 16) == 0 && instr->Bits(11, 6) == 0x17) {
         // vmvn Qd, Qm.
         int vd = instr->VFPDRegValue(kSimd128Precision);
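The scalar is replicated within one 32-bit word before being splatted to the destination lanes; for a 16-bit element the loop runs once, for an 8-bit element three times. A small worked instance of that loop (illustrative only):

    #include <cstdint>
    uint32_t ReplicateTo32(uint32_t scalar, int size) {  // size is 8, 16 or 32
      uint32_t duped = scalar;
      for (int i = 1; i < 32 / size; i++) {
        scalar <<= size;
        duped |= scalar;
      }
      return duped;
    }
    // ReplicateTo32(0xbf80, 16) == 0xbf80bf80   (the vdupf_16 expectation below)
    // ReplicateTo32(0xbf, 8)    == 0xbfbfbfbf   (the vdupf_8 expectation below)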
@@ -5379,6 +5433,58 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
             UNREACHABLE();
             break;
         }
+      } else if (instr->Bits(11, 8) == 0x5 && instr->Bit(6) == 0 &&
+                 instr->Bit(4) == 1) {
+        // vsli.<size> Dd, Dm, shift
+        int imm7 = instr->Bits(21, 16);
+        if (instr->Bit(7) != 0) imm7 += 64;
+        int size = base::bits::RoundDownToPowerOfTwo32(imm7);
+        int shift = imm7 - size;
+        int Vd = instr->VFPDRegValue(kDoublePrecision);
+        int Vm = instr->VFPMRegValue(kDoublePrecision);
+        switch (size) {
+          case 8:
+            ShiftLeftAndInsert<uint8_t, kDoubleSize>(this, Vd, Vm, shift);
+            break;
+          case 16:
+            ShiftLeftAndInsert<uint16_t, kDoubleSize>(this, Vd, Vm, shift);
+            break;
+          case 32:
+            ShiftLeftAndInsert<uint32_t, kDoubleSize>(this, Vd, Vm, shift);
+            break;
+          case 64:
+            ShiftLeftAndInsert<uint64_t, kDoubleSize>(this, Vd, Vm, shift);
+            break;
+          default:
+            UNREACHABLE();
+            break;
+        }
+      } else if (instr->Bits(11, 8) == 0x4 && instr->Bit(6) == 0 &&
+                 instr->Bit(4) == 1) {
+        // vsri.<size> Dd, Dm, shift
+        int imm7 = instr->Bits(21, 16);
+        if (instr->Bit(7) != 0) imm7 += 64;
+        int size = base::bits::RoundDownToPowerOfTwo32(imm7);
+        int shift = 2 * size - imm7;
+        int Vd = instr->VFPDRegValue(kDoublePrecision);
+        int Vm = instr->VFPMRegValue(kDoublePrecision);
+        switch (size) {
+          case 8:
+            ShiftRightAndInsert<uint8_t, kDoubleSize>(this, Vd, Vm, shift);
+            break;
+          case 16:
+            ShiftRightAndInsert<uint16_t, kDoubleSize>(this, Vd, Vm, shift);
+            break;
+          case 32:
+            ShiftRightAndInsert<uint32_t, kDoubleSize>(this, Vd, Vm, shift);
+            break;
+          case 64:
+            ShiftRightAndInsert<uint64_t, kDoubleSize>(this, Vd, Vm, shift);
+            break;
+          default:
+            UNREACHABLE();
+            break;
+        }
       } else {
         UNIMPLEMENTED();
       }
@@ -1612,17 +1612,19 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kArmF32x4Splat: {
-      __ vdup(i.OutputSimd128Register(), i.InputFloatRegister(0));
+      int src_code = i.InputFloatRegister(0).code();
+      __ vdup(Neon32, i.OutputSimd128Register(),
+              DwVfpRegister::from_code(src_code / 2), src_code & 0x1);
      break;
    }
    case kArmF32x4ExtractLane: {
      __ ExtractLane(i.OutputFloatRegister(), i.InputSimd128Register(0),
-                     kScratchReg, i.InputInt8(1));
+                     i.InputInt8(1));
      break;
    }
    case kArmF32x4ReplaceLane: {
      __ ReplaceLane(i.OutputSimd128Register(), i.InputSimd128Register(0),
-                    i.InputFloatRegister(2), kScratchReg, i.InputInt8(1));
+                    i.InputFloatRegister(2), i.InputInt8(1));
      break;
    }
    case kArmF32x4SConvertI32x4: {

@@ -2219,7 +2221,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
        src_code = src1_code;
        lane &= 0x3;
      }
-      __ VmovExtended(dst_code + i, src_code + lane, kScratchReg);
+      __ VmovExtended(dst_code + i, src_code + lane);
      shuffle >>= 8;
    }
    break;

@@ -3038,10 +3040,10 @@ void CodeGenerator::AssembleMove(InstructionOperand* source,
    int src_code = LocationOperand::cast(source)->register_code();
    if (destination->IsFloatRegister()) {
      int dst_code = LocationOperand::cast(destination)->register_code();
-      __ VmovExtended(dst_code, src_code, kScratchReg);
+      __ VmovExtended(dst_code, src_code);
    } else {
      DCHECK(destination->IsFloatStackSlot());
-      __ VmovExtended(g.ToMemOperand(destination), src_code, kScratchReg);
+      __ VmovExtended(g.ToMemOperand(destination), src_code);
    }
  } else {
    DCHECK_EQ(MachineRepresentation::kSimd128, rep);

@@ -3068,7 +3070,7 @@ void CodeGenerator::AssembleMove(InstructionOperand* source,
      // GapResolver may give us reg codes that don't map to actual
      // s-registers. Generate code to work around those cases.
      int dst_code = LocationOperand::cast(destination)->register_code();
-      __ VmovExtended(dst_code, src, kScratchReg);
+      __ VmovExtended(dst_code, src);
    } else {
      DCHECK_EQ(MachineRepresentation::kSimd128, rep);
      QwNeonRegister dst = g.ToSimd128Register(destination);

@@ -3152,14 +3154,14 @@ void CodeGenerator::AssembleSwap(InstructionOperand* source,
    int src_code = LocationOperand::cast(source)->register_code();
    if (destination->IsFPRegister()) {
      int dst_code = LocationOperand::cast(destination)->register_code();
-      __ VmovExtended(temp.low().code(), src_code, kScratchReg);
-      __ VmovExtended(src_code, dst_code, kScratchReg);
-      __ VmovExtended(dst_code, temp.low().code(), kScratchReg);
+      __ VmovExtended(temp.low().code(), src_code);
+      __ VmovExtended(src_code, dst_code);
+      __ VmovExtended(dst_code, temp.low().code());
    } else {
      DCHECK(destination->IsFPStackSlot());
      MemOperand dst = g.ToMemOperand(destination);
-      __ VmovExtended(temp.low().code(), src_code, kScratchReg);
-      __ VmovExtended(src_code, dst, kScratchReg);
+      __ VmovExtended(temp.low().code(), src_code);
+      __ VmovExtended(src_code, dst);
      __ vstr(temp.low(), dst);
    }
  } else {
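The splat lowering now addresses the input float by its containing d-register and lane instead of copying through a core register. A hypothetical instance, for illustration only (register choice invented):

    // Hypothetical instance of the kArmF32x4Splat lowering above (input in s5):
    int src_code = 5;
    DwVfpRegister src_d = DwVfpRegister::from_code(src_code / 2);  // d2
    int lane = src_code & 0x1;                                     // 1
    // -> vdup.32 <output q-register>, d2[1]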
@@ -1298,6 +1298,7 @@ TEST(15) {
   uint32_t vneg_s8[4], vneg_s16[4], vneg_s32[4];
   uint32_t veor[4], vand[4], vorr[4];
   float vdupf[4], vaddf[4], vpaddf[2], vsubf[4], vmulf[4];
+  uint32_t vdupf_16[2], vdupf_8[4];
   uint32_t vmin_s8[4], vmin_u16[4], vmin_s32[4];
   uint32_t vmax_s8[4], vmax_u16[4], vmax_s32[4];
   uint32_t vpadd_i8[2], vpadd_i16[2], vpadd_i32[2];

@@ -1310,6 +1311,7 @@ TEST(15) {
   uint32_t vmul8[4], vmul16[4], vmul32[4];
   uint32_t vshl8[4], vshl16[4], vshl32[5];
   uint32_t vshr_s8[4], vshr_u16[4], vshr_s32[5];
+  uint32_t vsli_64[2], vsri_64[2], vsli_32[2], vsri_32[2];
   uint32_t vceq[4], vceqf[4], vcgef[4], vcgtf[4];
   uint32_t vcge_s8[4], vcge_u16[4], vcge_s32[4];
   uint32_t vcgt_s8[4], vcgt_u16[4], vcgt_s32[4];

@@ -1440,7 +1442,7 @@ TEST(15) {
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vcvt_f32_u32))));
   __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
 
-  // vdup (integer).
+  // vdup (from register).
   __ mov(r4, Operand(0xa));
   __ vdup(Neon8, q0, r4);
   __ vdup(Neon16, q1, r4);

@@ -1452,11 +1454,16 @@ TEST(15) {
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup32))));
   __ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
 
-  // vdup (float).
+  // vdup (from scalar).
   __ vmov(s0, -1.0);
-  __ vdup(q0, s0);
+  __ vdup(Neon32, q1, d0, 0);
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdupf))));
-  __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
+  __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+  __ vdup(Neon16, d2, d0, 1);
+  __ vstr(d2, r0, offsetof(T, vdupf_16));
+  __ vdup(Neon8, q1, d0, 3);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdupf_8))));
+  __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
 
   // vabs (float).
   __ vmov(s0, -1.0);
@@ -1525,24 +1532,24 @@ TEST(15) {
 
   // vmin (float).
   __ vmov(s4, 2.0);
-  __ vdup(q0, s4);
+  __ vdup(Neon32, q0, d2, 0);
   __ vmov(s4, 1.0);
-  __ vdup(q1, s4);
+  __ vdup(Neon32, q1, d2, 0);
   __ vmin(q1, q1, q0);
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vminf))));
   __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
   // vmax (float).
   __ vmov(s4, 2.0);
-  __ vdup(q0, s4);
+  __ vdup(Neon32, q0, d2, 0);
   __ vmov(s4, 1.0);
-  __ vdup(q1, s4);
+  __ vdup(Neon32, q1, d2, 0);
   __ vmax(q1, q1, q0);
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmaxf))));
   __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
   // vadd (float).
   __ vmov(s4, 1.0);
-  __ vdup(q0, s4);
-  __ vdup(q1, s4);
+  __ vdup(Neon32, q0, d2, 0);
+  __ vdup(Neon32, q1, d2, 0);
   __ vadd(q1, q1, q0);
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vaddf))));
   __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));

@@ -1555,51 +1562,51 @@ TEST(15) {
   __ vstr(d2, r0, offsetof(T, vpaddf));
   // vsub (float).
   __ vmov(s4, 2.0);
-  __ vdup(q0, s4);
+  __ vdup(Neon32, q0, d2, 0);
   __ vmov(s4, 1.0);
-  __ vdup(q1, s4);
+  __ vdup(Neon32, q1, d2, 0);
   __ vsub(q1, q1, q0);
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsubf))));
   __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
   // vmul (float).
   __ vmov(s4, 2.0);
-  __ vdup(q0, s4);
-  __ vdup(q1, s4);
+  __ vdup(Neon32, q0, d2, 0);
+  __ vdup(Neon32, q1, d2, 0);
   __ vmul(q1, q1, q0);
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmulf))));
   __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
   // vrecpe.
   __ vmov(s4, 2.0);
-  __ vdup(q0, s4);
+  __ vdup(Neon32, q0, d2, 0);
   __ vrecpe(q1, q0);
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrecpe))));
   __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
   // vrecps.
   __ vmov(s4, 2.0);
-  __ vdup(q0, s4);
+  __ vdup(Neon32, q0, d2, 0);
   __ vmov(s4, 1.5);
-  __ vdup(q1, s4);
+  __ vdup(Neon32, q1, d2, 0);
   __ vrecps(q1, q0, q1);
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrecps))));
   __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
   // vrsqrte.
   __ vmov(s4, 4.0);
-  __ vdup(q0, s4);
+  __ vdup(Neon32, q0, d2, 0);
   __ vrsqrte(q1, q0);
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrsqrte))));
   __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
   // vrsqrts.
   __ vmov(s4, 2.0);
-  __ vdup(q0, s4);
+  __ vdup(Neon32, q0, d2, 0);
   __ vmov(s4, 2.5);
-  __ vdup(q1, s4);
+  __ vdup(Neon32, q1, d2, 0);
   __ vrsqrts(q1, q0, q1);
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrsqrts))));
   __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
   // vceq (float).
   __ vmov(s4, 1.0);
-  __ vdup(q0, s4);
-  __ vdup(q1, s4);
+  __ vdup(Neon32, q0, d2, 0);
+  __ vdup(Neon32, q1, d2, 0);
   __ vceq(q1, q1, q0);
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vceqf))));
   __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
@@ -1608,7 +1615,7 @@ TEST(15) {
   __ vmov(s1, -1.0);
   __ vmov(s2, -0.0);
   __ vmov(s3, 0.0);
-  __ vdup(q1, s3);
+  __ vdup(Neon32, q1, d1, 1);
   __ vcge(q2, q1, q0);
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vcgef))));
   __ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));

@@ -1814,6 +1821,26 @@ TEST(15) {
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshr_s32))));
   __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
 
+  // vsli, vsri.
+  __ mov(r4, Operand(0xffffffff));
+  __ mov(r5, Operand(0x1));
+  __ vmov(d0, r4, r5);
+  __ vmov(d1, r5, r5);
+  __ vsli(Neon64, d1, d0, 32);
+  __ vstr(d1, r0, offsetof(T, vsli_64));
+  __ vmov(d0, r5, r4);
+  __ vmov(d1, r5, r5);
+  __ vsri(Neon64, d1, d0, 32);
+  __ vstr(d1, r0, offsetof(T, vsri_64));
+  __ vmov(d0, r4, r5);
+  __ vmov(d1, r5, r5);
+  __ vsli(Neon32, d1, d0, 16);
+  __ vstr(d1, r0, offsetof(T, vsli_32));
+  __ vmov(d0, r5, r4);
+  __ vmov(d1, r5, r5);
+  __ vsri(Neon32, d1, d0, 16);
+  __ vstr(d1, r0, offsetof(T, vsri_32));
+
   // vceq.
   __ mov(r4, Operand(0x03));
   __ vdup(Neon8, q0, r4);
@@ -2107,7 +2134,9 @@ TEST(15) {
     CHECK_EQ_SPLAT(vdup8, 0x0a0a0a0au);
     CHECK_EQ_SPLAT(vdup16, 0x000a000au);
     CHECK_EQ_SPLAT(vdup32, 0x0000000au);
-    CHECK_EQ_SPLAT(vdupf, -1.0);
+    CHECK_EQ_SPLAT(vdupf, -1.0);  // bit pattern is 0xbf800000.
+    CHECK_EQ_32X2(vdupf_16, 0xbf80bf80u, 0xbf80bf80u);
+    CHECK_EQ_SPLAT(vdupf_8, 0xbfbfbfbfu);
 
     // src: [-1, -1, 1, 1]
     CHECK_EQ_32X4(vcvt_s32_f32, -1, -1, 1, 1);

@@ -2189,6 +2218,10 @@ TEST(15) {
     CHECK_EQ_SPLAT(vshr_s8, 0xc0c0c0c0u);
     CHECK_EQ_SPLAT(vshr_u16, 0x00400040u);
     CHECK_EQ_SPLAT(vshr_s32, 0xffffc040u);
+    CHECK_EQ_32X2(vsli_64, 0x01u, 0xffffffffu);
+    CHECK_EQ_32X2(vsri_64, 0xffffffffu, 0x01u);
+    CHECK_EQ_32X2(vsli_32, 0xffff0001u, 0x00010001u);
+    CHECK_EQ_32X2(vsri_32, 0x00000000u, 0x0000ffffu);
     CHECK_EQ_SPLAT(vceq, 0x00ff00ffu);
     // [0, 3, 0, 3, ...] >= [3, 3, 3, 3, ...]
     CHECK_EQ_SPLAT(vcge_s8, 0x00ff00ffu);
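The vsli_32/vsri_32 constants follow from the same per-lane arithmetic at 32-bit width; a worked check with illustrative helper functions (not test code):

    #include <cstdint>
    uint32_t Vsli32(uint32_t dst, uint32_t src, int shift) {
      return (src << shift) | (dst & ((1u << shift) - 1));
    }
    uint32_t Vsri32(uint32_t dst, uint32_t src, int shift) {
      return (src >> shift) | (dst & ~((1u << (32 - shift)) - 1));
    }
    // d0 = {0xffffffff, 0x00000001}, d1 = {1, 1}, vsli.32 #16:
    //   Vsli32(1, 0xffffffff, 16) == 0xffff0001   Vsli32(1, 0x00000001, 16) == 0x00010001
    // d0 = {0x00000001, 0xffffffff}, d1 = {1, 1}, vsri.32 #16:
    //   Vsri32(1, 0x00000001, 16) == 0x00000000   Vsri32(1, 0xffffffff, 16) == 0x0000ffff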
@@ -3840,11 +3873,8 @@ TEST(vswp) {
   const uint32_t test_2 = 0x89abcdef;
   __ mov(r4, Operand(test_1));
   __ mov(r5, Operand(test_2));
-  // TODO(bbudge) replace with vdup when implemented.
-  __ vmov(d8, r4, r4);
-  __ vmov(d9, r4, r4);  // q4 = [1.0, 1.0]
-  __ vmov(d10, r5, r5);
-  __ vmov(d11, r5, r5);  // q5 = [-1.0, -1.0]
+  __ vdup(Neon32, q4, r4);
+  __ vdup(Neon32, q5, r5);
   __ vswp(q4, q5);
   __ add(r6, r0, Operand(static_cast<int32_t>(offsetof(T, vswp_q4))));
   __ vst1(Neon8, NeonListOperand(q4), NeonMemOperand(r6));
@@ -994,10 +994,14 @@ TEST(Neon) {
             "eea24b30       vdup.16 q1, r4");
     COMPARE(vdup(Neon32, q15, r1),
             "eeae1b90       vdup.32 q15, r1");
-    COMPARE(vdup(q0, s3),
-            "f3bc0c41       vdup q0, d1[1]");
-    COMPARE(vdup(q15, s2),
-            "f3f4ec41       vdup q15, d1[0]");
+    COMPARE(vdup(Neon32, q0, d1, 1),
+            "f3bc0c41       vdup.32 q0, d1[1]");
+    COMPARE(vdup(Neon32, q15, d1, 0),
+            "f3f4ec41       vdup.32 q15, d1[0]");
+    COMPARE(vdup(Neon16, q7, d8, 3),
+            "f3beec48       vdup.16 q7, d8[3]");
+    COMPARE(vdup(Neon32, d0, d30, 0),
+            "f3b40c2e       vdup.32 d0, d30[0]");
     COMPARE(vcvt_f32_s32(q15, q1),
             "f3fbe642       vcvt.f32.s32 q15, q1");
     COMPARE(vcvt_f32_u32(q8, q9),

@@ -1106,6 +1110,14 @@ TEST(Neon) {
             "f3d6e050       vshr.u16 q15, q0, #10");
     COMPARE(vshr(NeonS32, q15, q0, 17),
             "f2efe050       vshr.s32 q15, q0, #17");
+    COMPARE(vsli(Neon64, d2, d0, 32),
+            "f3a02590       vsli.64 d2, d0, #32");
+    COMPARE(vsli(Neon32, d7, d8, 17),
+            "f3b17518       vsli.32 d7, d8, #17");
+    COMPARE(vsri(Neon64, d2, d0, 32),
+            "f3a02490       vsri.64 d2, d0, #32");
+    COMPARE(vsri(Neon16, d7, d8, 8),
+            "f3987418       vsri.16 d7, d8, #8");
     COMPARE(vrecpe(q15, q0),
             "f3fbe540       vrecpe.f32 q15, q0");
     COMPARE(vrecps(q15, q0, q8),
@@ -169,7 +169,7 @@ TEST(ExtractLane) {
     __ ExtractLane(r5, q1, NeonS32, i);
     __ str(r5, MemOperand(r0, offsetof(T, i32x4_low) + 4 * i));
     SwVfpRegister si = SwVfpRegister::from_code(i);
-    __ ExtractLane(si, q1, r4, i);
+    __ ExtractLane(si, q1, i);
     __ vstr(si, r0, offsetof(T, f32x4_low) + 4 * i);
   }
 
@@ -203,7 +203,7 @@ TEST(ExtractLane) {
     __ ExtractLane(r5, q15, NeonS32, i);
     __ str(r5, MemOperand(r0, offsetof(T, i32x4_high) + 4 * i));
     SwVfpRegister si = SwVfpRegister::from_code(i);
-    __ ExtractLane(si, q15, r4, i);
+    __ ExtractLane(si, q15, i);
     __ vstr(si, r0, offsetof(T, f32x4_high) + 4 * i);
   }
 
@@ -304,8 +304,6 @@ TEST(ReplaceLane) {
 
   __ stm(db_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | lr.bit());
 
-  const Register kScratch = r5;
-
   __ veor(q0, q0, q0);  // Zero
   __ veor(q1, q1, q1);  // Zero
   for (int i = 0; i < 4; i++) {

@@ -313,7 +311,7 @@ TEST(ReplaceLane) {
     __ ReplaceLane(q0, q0, r4, NeonS32, i);
     SwVfpRegister si = SwVfpRegister::from_code(i);
     __ vmov(si, r4);
-    __ ReplaceLane(q1, q1, si, kScratch, i);
+    __ ReplaceLane(q1, q1, si, i);
   }
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i32x4_low))));
   __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));

@@ -344,7 +342,7 @@ TEST(ReplaceLane) {
     __ ReplaceLane(q14, q14, r4, NeonS32, i);
     SwVfpRegister si = SwVfpRegister::from_code(i);
     __ vmov(si, r4);
-    __ ReplaceLane(q15, q15, si, kScratch, i);
+    __ ReplaceLane(q15, q15, si, i);
   }
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i32x4_high))));
   __ vst1(Neon8, NeonListOperand(q14), NeonMemOperand(r4));