[wasm-simd] Implement i64x2.mul on arm
Bug: v8:9813
Change-Id: I0436c6a90284559a110e99476c12ae39183c961e
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1994382
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65846}
@@ -4267,6 +4267,25 @@ void Assembler::vqsub(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
  emit(EncodeNeonBinOp(VQSUB, dt, dst, src1, src2));
}

void Assembler::vmlal(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src1,
                      DwVfpRegister src2) {
  DCHECK(IsEnabled(NEON));
  // Qd = vmlal(Dn, Dm) Vector Multiply Accumulate Long (integer)
  // Instruction details available in ARM DDI 0406C.b, A8-931.
  int vd, d;
  dst.split_code(&vd, &d);
  int vn, n;
  src1.split_code(&vn, &n);
  int vm, m;
  src2.split_code(&vm, &m);
  int size = NeonSz(dt);
  int u = NeonU(dt);
  if (!u) UNIMPLEMENTED();
  DCHECK_NE(size, 3);  // SEE "Related encodings"
  emit(0xFU * B28 | B25 | u * B24 | B23 | d * B22 | size * B20 | vn * B16 |
       vd * B12 | 0x8 * B8 | n * B7 | m * B5 | vm);
}

void Assembler::vmul(QwNeonRegister dst, QwNeonRegister src1,
                     QwNeonRegister src2) {
  DCHECK(IsEnabled(NEON));

@@ -4283,6 +4302,24 @@ void Assembler::vmul(NeonSize size, QwNeonRegister dst, QwNeonRegister src1,
  emit(EncodeNeonBinOp(VMUL, size, dst, src1, src2));
}

void Assembler::vmull(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src1,
                      DwVfpRegister src2) {
  DCHECK(IsEnabled(NEON));
  // Qd = vmull(Dn, Dm) Vector Multiply Long (integer).
  // Instruction details available in ARM DDI 0406C.b, A8-960.
  int vd, d;
  dst.split_code(&vd, &d);
  int vn, n;
  src1.split_code(&vn, &n);
  int vm, m;
  src2.split_code(&vm, &m);
  int size = NeonSz(dt);
  int u = NeonU(dt);
  if (!u) UNIMPLEMENTED();
  emit(0xFU * B28 | B25 | u * B24 | B23 | d * B22 | size * B20 | vn * B16 |
       vd * B12 | 0xC * B8 | n * B7 | m * B5 | vm);
}

void Assembler::vmin(QwNeonRegister dst, QwNeonRegister src1,
                     QwNeonRegister src2) {
  DCHECK(IsEnabled(NEON));
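As a quick cross-check of the bit layout used by the two emit() calls above, the machine word for vmull.u32 q15, d0, d8 can be recomputed by hand from the field values; the constant 0xF3E0EC08 matches the disassembler test expectation added later in this commit. A standalone C++ sketch (not part of the commit; field values derived from split_code(), NeonSz() and NeonU() as used above):

#include <cassert>
#include <cstdint>

int main() {
  // q15 splits into (vd=14, d=1); d0 into (vn=0, n=0); d8 into (vm=8, m=0).
  // NeonU32 yields size=2 and u=1. Bn in the assembler is (1u << n).
  uint32_t d = 1, vd = 14, n = 0, vn = 0, m = 0, vm = 8, size = 2, u = 1;
  uint32_t insn = 0xFU << 28 | 1u << 25 | u << 24 | 1u << 23 | d << 22 |
                  size << 20 | vn << 16 | vd << 12 | 0xCu << 8 | n << 7 |
                  m << 5 | vm;
  assert(insn == 0xF3E0EC08);  // disassembles as "vmull.u32 q15, d0, d8"
  return 0;
}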
@@ -888,9 +888,13 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
             QwNeonRegister src2);
  void vqsub(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
             QwNeonRegister src2);
  void vmlal(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src1,
             DwVfpRegister src2);
  void vmul(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
  void vmul(NeonSize size, QwNeonRegister dst, QwNeonRegister src1,
            QwNeonRegister src2);
  void vmull(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src1,
             DwVfpRegister src2);
  void vmin(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
  void vmin(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
            QwNeonRegister src2);
@@ -1975,6 +1975,51 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                i.InputSimd128Register(1));
      break;
    }
    case kArmI64x2Mul: {
      QwNeonRegister dst = i.OutputSimd128Register();
      QwNeonRegister left = i.InputSimd128Register(0);
      QwNeonRegister right = i.InputSimd128Register(1);
      QwNeonRegister tmp1 = i.TempSimd128Register(0);
      QwNeonRegister tmp2 = i.TempSimd128Register(1);

      // This algorithm performs 64-bit integer multiplication with vector
      // operations by splitting each 64-bit lane into high and low 32-bit
      // halves. The tricky part is getting the low and high halves into the
      // correct places inside the NEON registers, so that we need as few
      // vmull and vmlal instructions as possible.

      // Move left and right into temporaries; they will be modified by vtrn.
      __ vmov(tmp1, left);
      __ vmov(tmp2, right);

      // This diagram shows how the 64-bit integers fit into NEON registers.
      //
      //             [q.high()| q.low()]
      // left/tmp1:  [ a3, a2 | a1, a0 ]
      // right/tmp2: [ b3, b2 | b1, b0 ]
      //
      // We want to multiply the low 32 bits of left with the high 32 bits of
      // right, for each lane, i.e. a2 * b3, a0 * b1. However, vmull takes two
      // input d registers and multiplies the corresponding 32-bit lanes,
      // giving 64-bit results: a1 * b1, a0 * b0. To make this work we
      // transpose the vectors, so that the low 32 bits of each 64-bit integer
      // end up in the same d register, and similarly for the high 32 bits.
      __ vtrn(Neon32, tmp1.low(), tmp1.high());
      // tmp1: [ a3, a1 | a2, a0 ]
      __ vtrn(Neon32, tmp2.low(), tmp2.high());
      // tmp2: [ b3, b1 | b2, b0 ]

      __ vmull(NeonU32, dst, tmp1.low(), tmp2.high());
      // dst: [ a2*b3 | a0*b1 ]
      __ vmlal(NeonU32, dst, tmp1.high(), tmp2.low());
      // dst: [ a2*b3 + a3*b2 | a0*b1 + a1*b0 ]
      __ vshl(NeonU64, dst, dst, 32);
      // dst: [ (a2*b3 + a3*b2) << 32 | (a0*b1 + a1*b0) << 32 ]

      __ vmlal(NeonU32, dst, tmp1.low(), tmp2.low());
      // dst: [ (a2*b3 + a3*b2)<<32 + (a2*b2) | (a0*b1 + a1*b0)<<32 + (a0*b0) ]
      break;
    }
    case kArmI64x2Neg: {
      Simd128Register dst = i.OutputSimd128Register();
      __ vmov(dst, static_cast<uint64_t>(0));
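Per 64-bit lane, the sequence above is the schoolbook decomposition with the high*high partial product dropped, since that term lies entirely outside the low 64 bits. A scalar C++ sketch of the same identity (illustrative only, not part of the commit):

#include <cassert>
#include <cstdint>

// One 64-bit lane of i64x2.mul, with a = a_hi:a_lo and b = b_hi:b_lo.
uint64_t I64x2MulLane(uint64_t a, uint64_t b) {
  uint64_t a_lo = a & 0xFFFFFFFFULL, a_hi = a >> 32;
  uint64_t b_lo = b & 0xFFFFFFFFULL, b_hi = b >> 32;
  uint64_t cross = a_lo * b_hi + a_hi * b_lo;  // vmull.u32 + vmlal.u32
  return (cross << 32) + a_lo * b_lo;          // vshl #32 + final vmlal.u32
}

int main() {
  uint64_t a = 0xDEADBEEFCAFEBABEULL, b = 0x0123456789ABCDEFULL;
  assert(I64x2MulLane(a, b) == a * b);  // agrees with wraparound multiply
  return 0;
}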
@@ -172,6 +172,7 @@ namespace compiler {
  V(ArmI64x2ShrS) \
  V(ArmI64x2Add) \
  V(ArmI64x2Sub) \
  V(ArmI64x2Mul) \
  V(ArmI64x2ShrU) \
  V(ArmI32x4Splat) \
  V(ArmI32x4ExtractLane) \
@@ -152,6 +152,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kArmI64x2ShrS:
    case kArmI64x2Add:
    case kArmI64x2Sub:
    case kArmI64x2Mul:
    case kArmI64x2ShrU:
    case kArmI32x4Splat:
    case kArmI32x4ExtractLane:
@@ -2656,6 +2656,15 @@ void InstructionSelector::VisitI64x2Neg(Node* node) {
       g.UseUniqueRegister(node->InputAt(0)));
}

void InstructionSelector::VisitI64x2Mul(Node* node) {
  ArmOperandGenerator g(this);
  InstructionOperand temps[] = {g.TempSimd128Register(),
                                g.TempSimd128Register()};
  Emit(kArmI64x2Mul, g.DefineAsRegister(node),
       g.UseUniqueRegister(node->InputAt(0)),
       g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps);
}

void InstructionSelector::VisitF32x4Sqrt(Node* node) {
  ArmOperandGenerator g(this);
  // Use fixed registers in the lower 8 Q-registers so we can directly access
@@ -2635,9 +2635,6 @@ void InstructionSelector::VisitF64x2UConvertI64x2(Node* node) {
#if !V8_TARGET_ARCH_ARM
void InstructionSelector::VisitS128AndNot(Node* node) { UNIMPLEMENTED(); }
#endif  // !V8_TARGET_ARCH_ARM
#if !V8_TARGET_ARCH_IA32
void InstructionSelector::VisitI64x2Mul(Node* node) { UNIMPLEMENTED(); }
#endif  // !V8_TARGET_ARCH_IA32
void InstructionSelector::VisitI64x2Splat(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2ExtractLane(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2ReplaceLane(Node* node) { UNIMPLEMENTED(); }
@@ -2395,6 +2395,26 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
      out_buffer_pos_ +=
          SNPrintF(out_buffer_ + out_buffer_pos_, "vs%ci.%d d%d, d%d, #%d",
                   direction, size, Vd, Vm, shift);
    } else if (instr->Bits(11, 8) == 0x8 && instr->Bit(6) == 0 &&
               instr->Bit(4) == 0) {
      // vmlal.u<size> <Qd>, <Dn>, <Dm>
      int Vd = instr->VFPDRegValue(kSimd128Precision);
      int Vn = instr->VFPNRegValue(kDoublePrecision);
      int Vm = instr->VFPMRegValue(kDoublePrecision);
      int size = 8 << instr->Bits(21, 20);
      out_buffer_pos_ +=
          SNPrintF(out_buffer_ + out_buffer_pos_, "vmlal.u%d q%d, d%d, d%d",
                   size, Vd, Vn, Vm);
    } else if (instr->Bits(11, 8) == 0xC && instr->Bit(6) == 0 &&
               instr->Bit(4) == 0) {
      // vmull.u<size> <Qd>, <Dn>, <Dm>
      int Vd = instr->VFPDRegValue(kSimd128Precision);
      int Vn = instr->VFPNRegValue(kDoublePrecision);
      int Vm = instr->VFPMRegValue(kDoublePrecision);
      int size = 8 << instr->Bits(21, 20);
      out_buffer_pos_ +=
          SNPrintF(out_buffer_ + out_buffer_pos_, "vmull.u%d q%d, d%d, d%d",
                   size, Vd, Vn, Vm);
    } else {
      Unknown(instr);
    }
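The printed element width comes straight from the size field in bits [21:20]; the encoding 0b11 is reserved ("Related encodings", guarded by the DCHECK_NE in the assembler above). A tiny illustrative C++ check of the mapping (not part of the commit):

#include <cstdio>

int main() {
  // size bits 0b00 -> .u8, 0b01 -> .u16, 0b10 -> .u32; 0b11 is reserved.
  for (int bits = 0; bits < 3; ++bits) {
    printf("bits %d -> vmull.u%d / vmlal.u%d\n", bits, 8 << bits, 8 << bits);
  }
  return 0;
}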
@@ -5519,6 +5519,39 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
          UNREACHABLE();
          break;
      }
    } else if (instr->Bits(11, 8) == 0x8 && instr->Bit(6) == 0 &&
               instr->Bit(4) == 0) {
      // vmlal.u<size> Qd, Dn, Dm
      NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
      if (size != Neon32) UNIMPLEMENTED();

      int Vd = instr->VFPDRegValue(kSimd128Precision);
      int Vn = instr->VFPNRegValue(kDoublePrecision);
      int Vm = instr->VFPMRegValue(kDoublePrecision);
      uint64_t src1, src2, dst[2];

      get_neon_register<uint64_t>(Vd, dst);
      get_d_register(Vn, &src1);
      get_d_register(Vm, &src2);
      dst[0] += (src1 & 0xFFFFFFFFULL) * (src2 & 0xFFFFFFFFULL);
      dst[1] += (src1 >> 32) * (src2 >> 32);
      set_neon_register<uint64_t>(Vd, dst);
    } else if (instr->Bits(11, 8) == 0xC && instr->Bit(6) == 0 &&
               instr->Bit(4) == 0) {
      // vmull.u<size> Qd, Dn, Dm
      NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
      if (size != Neon32) UNIMPLEMENTED();

      int Vd = instr->VFPDRegValue(kSimd128Precision);
      int Vn = instr->VFPNRegValue(kDoublePrecision);
      int Vm = instr->VFPMRegValue(kDoublePrecision);
      uint64_t src1, src2, dst[2];

      get_d_register(Vn, &src1);
      get_d_register(Vm, &src2);
      dst[0] = (src1 & 0xFFFFFFFFULL) * (src2 & 0xFFFFFFFFULL);
      dst[1] = (src1 >> 32) * (src2 >> 32);
      set_neon_register<uint64_t>(Vd, dst);
    } else {
      UNIMPLEMENTED();
    }
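Composing these simulator semantics in the order the code generator emits them reproduces wraparound 64-bit multiplication in both lanes. A standalone C++ replay (helper name and test values are illustrative, not from the commit):

#include <cassert>
#include <cstdint>

// One vmull.u32/vmlal.u32 step: a d-register is modeled as a uint64_t
// holding two 32-bit lanes; a q-register as two 64-bit lanes in d[2].
static void MulLanes(uint64_t n, uint64_t m, uint64_t d[2], bool accumulate) {
  uint64_t lo = (n & 0xFFFFFFFFULL) * (m & 0xFFFFFFFFULL);
  uint64_t hi = (n >> 32) * (m >> 32);
  d[0] = accumulate ? d[0] + lo : lo;  // vmlal accumulates, vmull overwrites
  d[1] = accumulate ? d[1] + hi : hi;
}

int main() {
  const uint64_t a[2] = {0x1122334455667788ULL, 0x99AABBCCDDEEFF00ULL};
  const uint64_t b[2] = {0xFEDCBA9876543210ULL, 0x0F1E2D3C4B5A6978ULL};
  // Model of vtrn.32: one d-register gathers the low halves of both lanes,
  // the other the high halves.
  uint64_t a_lo = (a[0] & 0xFFFFFFFFULL) | (a[1] << 32);
  uint64_t a_hi = (a[0] >> 32) | (a[1] & 0xFFFFFFFF00000000ULL);
  uint64_t b_lo = (b[0] & 0xFFFFFFFFULL) | (b[1] << 32);
  uint64_t b_hi = (b[0] >> 32) | (b[1] & 0xFFFFFFFF00000000ULL);

  uint64_t dst[2] = {0, 0};
  MulLanes(a_lo, b_hi, dst, false);  // vmull.u32 dst, tmp1.low, tmp2.high
  MulLanes(a_hi, b_lo, dst, true);   // vmlal.u32 dst, tmp1.high, tmp2.low
  dst[0] <<= 32;                     // vshl.i64 dst, dst, #32
  dst[1] <<= 32;
  MulLanes(a_lo, b_lo, dst, true);   // vmlal.u32 dst, tmp1.low, tmp2.low
  assert(dst[0] == a[0] * b[0]);
  assert(dst[1] == a[1] * b[1]);
  return 0;
}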
@@ -1171,6 +1171,12 @@ TEST(Neon) {
          "f2142970 vmul.i16 q1, q2, q8");
  COMPARE(vmul(Neon32, q15, q0, q8),
          "f260e970 vmul.i32 q15, q0, q8");

  COMPARE(vmull(NeonU32, q15, d0, d8),
          "f3e0ec08 vmull.u32 q15, d0, d8");
  COMPARE(vmlal(NeonU32, q15, d0, d8),
          "f3e0e808 vmlal.u32 q15, d0, d8");

  COMPARE(vshl(NeonS8, q15, q0, 6),
          "f2cee550 vshl.i8 q15, q0, #6");
  COMPARE(vshl(NeonU16, q15, q0, 10),
@@ -1454,7 +1454,6 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2Max) {
  RunF64x2BinOpTest(execution_tier, lower_simd, kExprF64x2Max, JSMax);
}

#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_IA32
WASM_SIMD_TEST_NO_LOWERING(I64x2Mul) {
  RunI64x2BinOpTest(execution_tier, lower_simd, kExprI64x2Mul,
                    base::MulWithWraparound);

@@ -1528,10 +1527,7 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2Qfms) {
    }
  }
}
#endif  // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
#endif  // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_IA32

#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
WASM_SIMD_TEST_NO_LOWERING(F64x2ConvertI64x2) {
  WasmRunner<int32_t, int64_t> r(execution_tier, lower_simd);
  // Create two output vectors to hold signed and unsigned results.