[wasm-simd][arm] Implement double precision conversion

Prototype these 6 instructions on arm (a scalar sketch of their lane semantics follows the list):

- f64x2.convert_low_i32x4_s
- f64x2.convert_low_i32x4_u
- i32x4.trunc_sat_f64x2_s_zero
- i32x4.trunc_sat_f64x2_u_zero
- f32x4.demote_f64x2_zero
- f64x2.promote_low_f32x4
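
For orientation only, here is a minimal scalar sketch of the lane semantics
these opcodes implement, assumed from the wasm SIMD proposal and not part of
this CL (the helper names are made up): the "low" conversions read lanes 0-1
of the source, and the "_zero" instructions clear lanes 2-3 of the result.
Two of the six ops are shown; the other four follow the same pattern.

#include <cmath>
#include <cstdint>

// f64x2.convert_low_i32x4_s: signed i32 lanes 0-1 widened to f64.
void F64x2ConvertLowI32x4S(const int32_t src[4], double dst[2]) {
  dst[0] = static_cast<double>(src[0]);
  dst[1] = static_cast<double>(src[1]);
}

// i32x4.trunc_sat_f64x2_s_zero: f64 lanes truncated toward zero with
// saturation (NaN -> 0, out-of-range values clamp); lanes 2-3 become zero.
void I32x4TruncSatF64x2SZero(const double src[2], int32_t dst[4]) {
  for (int i = 0; i < 2; ++i) {
    double v = src[i];
    if (std::isnan(v)) {
      dst[i] = 0;
    } else if (v <= static_cast<double>(INT32_MIN)) {
      dst[i] = INT32_MIN;
    } else if (v >= static_cast<double>(INT32_MAX)) {
      dst[i] = INT32_MAX;
    } else {
      dst[i] = static_cast<int32_t>(std::trunc(v));
    }
  }
  dst[2] = 0;
  dst[3] = 0;
}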

All of these instructions rely on Q registers that alias S registers, and
only q0-q7 overlap the single-precision bank, so those are the only Q
registers we can use. We fix the src/dst to q0 arbitrarily; the aliasing is
sketched below.
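
The aliasing in question, sketched from the Arm VFP/NEON register-file
layout (an illustration, not V8 code): q<n> for n in 0-7 overlaps
s<4n>..s<4n+3>, which is exactly the src.code() * 4 arithmetic in the
codegen below; q8-q15 alias no S registers.

#include <cstdio>

// Maps a Q-register code to the code of the lowest S register it aliases,
// or -1 when the Q register has no S-register alias (q8-q15).
int LowAliasedSRegister(int q_code) {
  return q_code <= 7 ? q_code * 4 : -1;
}

int main() {
  for (int q = 0; q <= 8; ++q) {
    int s = LowAliasedSRegister(q);
    if (s >= 0) {
      std::printf("q%d aliases s%d..s%d\n", q, s, s + 3);  // e.g. q1 -> s4..s7
    } else {
      std::printf("q%d has no S-register alias\n", q);
    }
  }
  return 0;
}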

Bug: v8:11265
Change-Id: Ied95f2dde9859a60fc216ed67615f80e9d795bb7
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2679842
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72567}
Author: Ng Zhi An, 2021-02-05 15:55:19 -08:00, committed by Commit Bot
Parent: a723767935, commit: 3b6eb33543
6 changed files with 96 additions and 24 deletions


@@ -2065,6 +2065,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vrintn(dst.high(), src.high());
break;
}
case kArmF64x2ConvertLowI32x4S: {
Simd128Register dst = i.OutputSimd128Register();
Simd128Register src = i.InputSimd128Register(0);
__ vcvt_f64_s32(dst.low(), SwVfpRegister::from_code(src.code() * 4));
__ vcvt_f64_s32(dst.high(), SwVfpRegister::from_code(src.code() * 4 + 1));
break;
}
case kArmF64x2ConvertLowI32x4U: {
Simd128Register dst = i.OutputSimd128Register();
Simd128Register src = i.InputSimd128Register(0);
__ vcvt_f64_u32(dst.low(), SwVfpRegister::from_code(src.code() * 4));
__ vcvt_f64_u32(dst.high(), SwVfpRegister::from_code(src.code() * 4 + 1));
break;
}
case kArmF64x2PromoteLowF32x4: {
Simd128Register dst = i.OutputSimd128Register();
Simd128Register src = i.InputSimd128Register(0);
__ vcvt_f64_f32(dst.low(), SwVfpRegister::from_code(src.code() * 4));
__ vcvt_f64_f32(dst.high(), SwVfpRegister::from_code(src.code() * 4 + 1));
break;
}
case kArmI64x2SplatI32Pair: {
Simd128Register dst = i.OutputSimd128Register();
__ vdup(Neon32, dst, i.InputRegister(0));
@@ -2339,6 +2360,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vbsl(dst, rhs, lhs);
break;
}
case kArmF32x4DemoteF64x2Zero: {
Simd128Register dst = i.OutputSimd128Register();
Simd128Register src = i.InputSimd128Register(0);
__ vcvt_f32_f64(SwVfpRegister::from_code(dst.code() * 4), src.low());
__ vcvt_f32_f64(SwVfpRegister::from_code(dst.code() * 4 + 1), src.high());
__ vmov(dst.high(), 0);
break;
}
case kArmI32x4Splat: {
__ vdup(Neon32, i.OutputSimd128Register(), i.InputRegister(0));
break;
@@ -2506,6 +2535,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vpadd(Neon32, dst.high(), scratch.low(), scratch.high());
break;
}
case kArmI32x4TruncSatF64x2SZero: {
Simd128Register dst = i.OutputSimd128Register();
Simd128Register src = i.InputSimd128Register(0);
__ vcvt_s32_f64(SwVfpRegister::from_code(dst.code() * 4), src.low());
__ vcvt_s32_f64(SwVfpRegister::from_code(dst.code() * 4 + 1), src.high());
__ vmov(dst.high(), 0);
break;
}
case kArmI32x4TruncSatF64x2UZero: {
Simd128Register dst = i.OutputSimd128Register();
Simd128Register src = i.InputSimd128Register(0);
__ vcvt_u32_f64(SwVfpRegister::from_code(dst.code() * 4), src.low());
__ vcvt_u32_f64(SwVfpRegister::from_code(dst.code() * 4 + 1), src.high());
__ vmov(dst.high(), 0);
break;
}
case kArmI16x8Splat: {
__ vdup(Neon16, i.OutputSimd128Register(), i.InputRegister(0));
break;


@@ -154,6 +154,9 @@ namespace compiler {
V(ArmF64x2Floor) \
V(ArmF64x2Trunc) \
V(ArmF64x2NearestInt) \
V(ArmF64x2ConvertLowI32x4S) \
V(ArmF64x2ConvertLowI32x4U) \
V(ArmF64x2PromoteLowF32x4) \
V(ArmF32x4Splat) \
V(ArmF32x4ExtractLane) \
V(ArmF32x4ReplaceLane) \
@@ -177,6 +180,7 @@ namespace compiler {
V(ArmF32x4Le) \
V(ArmF32x4Pmin) \
V(ArmF32x4Pmax) \
V(ArmF32x4DemoteF64x2Zero) \
V(ArmI64x2SplatI32Pair) \
V(ArmI64x2ReplaceLaneI32Pair) \
V(ArmI64x2Neg) \
@@ -222,6 +226,8 @@ namespace compiler {
V(ArmI32x4Abs) \
V(ArmI32x4BitMask) \
V(ArmI32x4DotI16x8S) \
V(ArmI32x4TruncSatF64x2SZero) \
V(ArmI32x4TruncSatF64x2UZero) \
V(ArmI16x8Splat) \
V(ArmI16x8ExtractLaneS) \
V(ArmI16x8ReplaceLane) \


@@ -134,6 +134,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmF64x2Floor:
case kArmF64x2Trunc:
case kArmF64x2NearestInt:
case kArmF64x2ConvertLowI32x4S:
case kArmF64x2ConvertLowI32x4U:
case kArmF64x2PromoteLowF32x4:
case kArmF32x4Splat:
case kArmF32x4ExtractLane:
case kArmF32x4ReplaceLane:
@@ -157,6 +160,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmF32x4Le:
case kArmF32x4Pmin:
case kArmF32x4Pmax:
case kArmF32x4DemoteF64x2Zero:
case kArmI64x2SplatI32Pair:
case kArmI64x2ReplaceLaneI32Pair:
case kArmI64x2Neg:
@@ -202,6 +206,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmI32x4Abs:
case kArmI32x4BitMask:
case kArmI32x4DotI16x8S:
case kArmI32x4TruncSatF64x2SZero:
case kArmI32x4TruncSatF64x2UZero:
case kArmI16x8Splat:
case kArmI16x8ExtractLaneS:
case kArmI16x8ReplaceLane:


@@ -3149,6 +3149,45 @@ void InstructionSelector::VisitTruncateFloat32ToUint32(Node* node) {
Emit(opcode, g.DefineAsRegister(node), g.UseRegister(node->InputAt(0)));
}
// TODO(v8:9780)
// These double precision conversion instructions need a low Q register (q0-q7)
// because the codegen accesses the S registers they overlap with.
void InstructionSelector::VisitF64x2ConvertLowI32x4S(Node* node) {
ArmOperandGenerator g(this);
Emit(kArmF64x2ConvertLowI32x4S, g.DefineAsRegister(node),
g.UseFixed(node->InputAt(0), q0));
}
void InstructionSelector::VisitF64x2ConvertLowI32x4U(Node* node) {
ArmOperandGenerator g(this);
Emit(kArmF64x2ConvertLowI32x4U, g.DefineAsRegister(node),
g.UseFixed(node->InputAt(0), q0));
}
void InstructionSelector::VisitI32x4TruncSatF64x2SZero(Node* node) {
ArmOperandGenerator g(this);
Emit(kArmI32x4TruncSatF64x2SZero, g.DefineAsFixed(node, q0),
g.UseRegister(node->InputAt(0)));
}
void InstructionSelector::VisitI32x4TruncSatF64x2UZero(Node* node) {
ArmOperandGenerator g(this);
Emit(kArmI32x4TruncSatF64x2UZero, g.DefineAsFixed(node, q0),
g.UseRegister(node->InputAt(0)));
}
void InstructionSelector::VisitF32x4DemoteF64x2Zero(Node* node) {
ArmOperandGenerator g(this);
Emit(kArmF32x4DemoteF64x2Zero, g.DefineAsFixed(node, q0),
g.UseRegister(node->InputAt(0)));
}
void InstructionSelector::VisitF64x2PromoteLowF32x4(Node* node) {
ArmOperandGenerator g(this);
Emit(kArmF64x2PromoteLowF32x4, g.DefineAsRegister(node),
g.UseFixed(node->InputAt(0), q0));
}
// static
MachineOperatorBuilder::Flags
InstructionSelector::SupportedMachineOperatorFlags() {


@@ -2787,27 +2787,6 @@ void InstructionSelector::VisitI64x2SignSelect(Node* node) { UNIMPLEMENTED(); }
#endif  // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64
// && !V8_TARGET_ARCH_ARM
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_IA32
void InstructionSelector::VisitF64x2ConvertLowI32x4S(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitF64x2ConvertLowI32x4U(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitF64x2PromoteLowF32x4(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitF32x4DemoteF64x2Zero(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4TruncSatF64x2SZero(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4TruncSatF64x2UZero(Node* node) {
UNIMPLEMENTED();
}
#endif //! V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_IA32
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64
// TODO(v8:11297) Prototype i32x4.widen_i8x16_u
void InstructionSelector::VisitI32x4WidenI8x16S(Node* node) { UNIMPLEMENTED(); }


@@ -1278,8 +1278,6 @@ WASM_SIMD_TEST(F64x2NearestInt) {
true);
}
// TODO(v8:11265): Prototyping double precision conversions.
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_IA32
template <typename SrcType>
void RunF64x2ConvertLowI32x4Test(TestExecutionTier execution_tier,
LowerSimd lower_simd, WasmOpcode opcode) {
@@ -1396,7 +1394,6 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2PromoteLowF32x4) {
}
}
}
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_IA32
void RunF64x2BinOpTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode opcode, DoubleBinOp expected_op) {