[wasm-simd][arm] Prototype i32x4.dot_i16x8_s

This implements I32x4DotI16x8S for arm.

Bug: v8:10583
Change-Id: I4541f4f5bc7daba03c6ab2040589893c8ef571bc
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2230787
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#68550}
This commit is contained in:
Ng Zhi An 2020-06-24 09:54:23 -07:00 committed by Commit Bot
parent 1ae2636293
commit 91bf68ae70
9 changed files with 85 additions and 13 deletions

View File

@ -4338,7 +4338,6 @@ void Assembler::vmull(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src1,
src2.split_code(&vm, &m);
int size = NeonSz(dt);
int u = NeonU(dt);
if (!u) UNIMPLEMENTED();
emit(0xFU * B28 | B25 | u * B24 | B23 | d * B22 | size * B20 | vn * B16 |
vd * B12 | 0xC * B8 | n * B7 | m * B5 | vm);
}

View File

@ -2434,6 +2434,19 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ VmovLow(dst, tmp2.low());
break;
}
case kArmI32x4DotI16x8S: {
// i32x4.dot_i16x8_s: multiply corresponding signed 16-bit lanes of the two
// inputs into 32-bit products, then add adjacent products together.
Simd128Register dst = i.OutputSimd128Register();
Simd128Register lhs = i.InputSimd128Register(0);
Simd128Register rhs = i.InputSimd128Register(1);
Simd128Register tmp1 = i.TempSimd128Register(0);
// A second Q-sized temporary is taken from the assembler's scratch pool.
UseScratchRegisterScope temps(tasm());
Simd128Register scratch = temps.AcquireQ();
// Widening multiplies: products of the low halves go to tmp1, products of
// the high halves go to scratch.
__ vmull(NeonS16, tmp1, lhs.low(), rhs.low());
__ vmull(NeonS16, scratch, lhs.high(), rhs.high());
// NEON vpadd (pairwise add) over the 32-bit products fills the low and then
// the high half of dst. dst is only written after both multiplies have read
// lhs and rhs.
__ vpadd(Neon32, dst.low(), tmp1.low(), tmp1.high());
__ vpadd(Neon32, dst.high(), scratch.low(), scratch.high());
break;
}
case kArmI16x8Splat: {
__ vdup(Neon16, i.OutputSimd128Register(), i.InputRegister(0));
break;

View File

@ -207,6 +207,7 @@ namespace compiler {
V(ArmI32x4GeU) \
V(ArmI32x4Abs) \
V(ArmI32x4BitMask) \
V(ArmI32x4DotI16x8S) \
V(ArmI16x8Splat) \
V(ArmI16x8ExtractLaneS) \
V(ArmI16x8ReplaceLane) \

View File

@ -187,6 +187,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmI32x4GeU:
case kArmI32x4Abs:
case kArmI32x4BitMask:
case kArmI32x4DotI16x8S:
case kArmI16x8Splat:
case kArmI16x8ExtractLaneS:
case kArmI16x8ReplaceLane:

View File

@ -2633,6 +2633,14 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(S128Xor, kArmS128Xor) \
V(S128AndNot, kArmS128AndNot)
// Emits kArmI32x4DotI16x8S for |node|. The result is defined as a register,
// both inputs are requested as unique registers, and one SIMD128 temporary is
// reserved for the code generator.
void InstructionSelector::VisitI32x4DotI16x8S(Node* node) {
  ArmOperandGenerator gen(this);
  InstructionOperand scratch[] = {gen.TempSimd128Register()};
  Node* const lhs = node->InputAt(0);
  Node* const rhs = node->InputAt(1);
  Emit(kArmI32x4DotI16x8S, gen.DefineAsRegister(node),
       gen.UseUniqueRegister(lhs), gen.UseUniqueRegister(rhs),
       arraysize(scratch), scratch);
}
void InstructionSelector::VisitS128Zero(Node* node) {
ArmOperandGenerator g(this);
Emit(kArmS128Zero, g.DefineAsRegister(node));

View File

@ -2705,10 +2705,12 @@ void InstructionSelector::VisitF64x2NearestInt(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_S390X
// && !V8_TARGET_ARCH_IA32
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64 && \
!V8_TARGET_ARCH_ARM
// TODO(v8:10583) Prototype i32x4.dot_i16x8_s
// Placeholder for targets the enclosing #if does not exclude: selecting this
// node hits UNIMPLEMENTED().
void InstructionSelector::VisitI32x4DotI16x8S(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64
// && !V8_TARGET_ARCH_ARM
// FinishRegion is handled by delegating to EmitIdentity(node).
void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }

View File

@ -2065,6 +2065,16 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vshr.s%d q%d, q%d, #%d",
size, Vd, Vm, shift);
} else if (instr->Bits(11, 8) == 0xC && instr->Bit(6) == 0 &&
instr->Bit(4) == 0) {
// vmull.s<size> Qd, Dn, Dm
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
int size = 8 << instr->Bits(21, 20);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vmull.s%d q%d, d%d, d%d",
size, Vd, Vn, Vm);
} else {
Unknown(instr);
}

View File

@ -4241,6 +4241,28 @@ void RoundingAverageUnsigned(Simulator* simulator, int Vd, int Vm, int Vn) {
simulator->set_neon_register<T, SIZE>(Vd, src1);
}
// Simulates a NEON widening multiply (vmull Qd, Dn, Dm): each NarrowType lane
// of D register Vn is multiplied by the matching lane of D register Vm, and
// the WideType products are written to NEON register Vd.
//
// NarrowType/WideType select the lane types; WideType must be exactly twice as
// wide as NarrowType so that one D register of inputs fills one Q register of
// results.
template <typename NarrowType, typename WideType>
void MultiplyLong(Simulator* simulator, int Vd, int Vn, int Vm) {
  // The size relationship is a compile-time fact, so reject bad instantiations
  // at compile time instead of relying on a runtime check.
  static_assert(sizeof(WideType) == 2 * sizeof(NarrowType),
                "WideType must be exactly twice as wide as NarrowType");
  static const int kElems = kSimd128Size / sizeof(WideType);
  NarrowType src1[kElems], src2[kElems];
  WideType dst[kElems];
  // Get the entire d reg, then memcpy it to an array so we can address the
  // underlying datatype easily.
  uint64_t tmp;
  // The memcpy calls below copy sizeof(tmp) bytes; make sure that is exactly
  // the size of each source array.
  static_assert(sizeof(src1) == sizeof(tmp),
                "source lanes must fill exactly one D register");
  simulator->get_d_register(Vn, &tmp);
  memcpy(src1, &tmp, sizeof(tmp));
  simulator->get_d_register(Vm, &tmp);
  memcpy(src2, &tmp, sizeof(tmp));
  for (int i = 0; i < kElems; i++) {
    // Widen both operands before multiplying so the product is computed at
    // WideType precision.
    dst[i] = WideType{src1[i]} * WideType{src2[i]};
  }
  simulator->set_neon_register<WideType>(Vd, dst);
}
void Simulator::DecodeSpecialCondition(Instruction* instr) {
switch (instr->SpecialValue()) {
case 4: {
@ -4686,6 +4708,21 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
ArithmeticShiftRight<int64_t, kSimd128Size>(this, Vd, Vm, shift);
break;
}
} else if (instr->Bits(11, 8) == 0xC && instr->Bit(6) == 0 &&
instr->Bit(4) == 0) {
// vmull.s<size> Qd, Dn, Dm
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
switch (size) {
case Neon16: {
MultiplyLong<int16_t, int32_t>(this, Vd, Vn, Vm);
break;
}
default:
UNIMPLEMENTED();
}
} else {
UNIMPLEMENTED();
}
@ -5579,18 +5616,17 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
instr->Bit(4) == 0) {
// vmull.u<size> Qd, Dn, Dm
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
if (size != Neon32) UNIMPLEMENTED();
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
uint64_t src1, src2, dst[2];
get_d_register(Vn, &src1);
get_d_register(Vm, &src2);
dst[0] = (src1 & 0xFFFFFFFFULL) * (src2 & 0xFFFFFFFFULL);
dst[1] = (src1 >> 32) * (src2 >> 32);
set_neon_register<uint64_t>(Vd, dst);
switch (size) {
case Neon32: {
MultiplyLong<uint32_t, uint64_t>(this, Vd, Vn, Vm);
break;
}
default:
UNIMPLEMENTED();
}
} else {
UNIMPLEMENTED();
}

View File

@ -2313,7 +2313,8 @@ WASM_SIMD_TEST(I16x8RoundingAverageU) {
}
// TODO(v8:10583) Prototype i32x4.dot_i16x8_s
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64 || \
V8_TARGET_ARCH_ARM
WASM_SIMD_TEST_NO_LOWERING(I32x4DotI16x8S) {
FLAG_SCOPE(wasm_simd_post_mvp);
@ -2340,7 +2341,8 @@ WASM_SIMD_TEST_NO_LOWERING(I32x4DotI16x8S) {
}
}
}
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64 ||
// V8_TARGET_ARCH_ARM
void RunI16x8ShiftOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode opcode, Int16ShiftOp expected_op) {