[wasm-simd][arm] Prototype f32x4.ceil

Prototype f32x4.ceil on ARM, for both ARM v7 and ARM v8. ARM v8 has
support for vrintp; on ARM v7 we fall back to a runtime (C) function.

Since ARM v8 uses vrintp, which is the same instruction used for scalar
F32 Ceil, the wasm compiler reuses the Float32Round support check rather
than creating new F32x4Round optional operators.

This adds assembler, disassembler, and simulator support for vrintp
(the Advanced SIMD version that takes Q registers). The decoding is
incomplete for now; more will be added as we implement the other
rounding modes.

Bug: v8:10553
Change-Id: I4563608b9501f6f57c3a8325b17de89da7058a43
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2248779
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#68419}
Ng Zhi An authored on 2020-06-17 11:29:32 -07:00; committed by Commit Bot
parent 0d9eb10552
commit d9381fd697
15 changed files with 117 additions and 6 deletions

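For context, f32x4.ceil rounds each of the four f32 lanes of a 128-bit value toward positive infinity. A minimal stand-alone sketch of the semantics both paths (vrintp on ARM v8, the runtime call on ARM v7) implement; plain C++, not V8 code:

#include <cmath>
#include <cstdio>

int main() {
  float lanes[4] = {-1.5f, -0.25f, 0.25f, 2.0f};  // one 128-bit vector
  for (float& f : lanes) f = ceilf(f);  // lane-wise round toward +infinity
  for (float f : lanes) std::printf("%g ", f);  // prints: -1 -0 1 2
  std::printf("\n");
}

Note that ceil(-0.25) is negative zero, one of the edge cases the tests at the end of this CL check by using ceilf as the reference operation.
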
View File

@@ -3596,6 +3596,23 @@ void Assembler::vrintp(const DwVfpRegister dst, const DwVfpRegister src) {
        vd * B12 | 0x5 * B9 | B8 | B6 | m * B5 | vm);
 }
 
+void Assembler::vrintp(NeonDataType dt, const QwNeonRegister dst,
+                       const QwNeonRegister src) {
+  // cond=kSpecialCondition(31-28) | 00111(27-23) | D(22) | 11(21-20) |
+  // size(19-18) | 10(17-16) | Vd(15-12) | 01(11-10) | 7(9-7) | 1(6) | M(5) |
+  // 0(4) | Vm(3-0)
+  DCHECK(IsEnabled(ARMv8));
+  int vd, d;
+  dst.split_code(&vd, &d);
+  int vm, m;
+  src.split_code(&vm, &m);
+  int size = NeonSz(dt);
+  // Only F32 is implemented for now.
+  DCHECK_EQ(0x2, dt);
+  emit(kSpecialCondition | 0x7 * B23 | d * B22 | 0x3 * B20 | size * B18 |
+       0x2 * B16 | vd * B12 | 0x1 * B10 | 0x7 * B7 | B6 | m * B5 | vm);
+}
+
 void Assembler::vrintm(const SwVfpRegister dst, const SwVfpRegister src) {
   // cond=kSpecialCondition(31-28) | 11101(27-23) | D(22) | 11(21-20) |
   // 10(19-18) | RM=11(17-16) | Vd(15-12) | 101(11-9) | sz=0(8) | 01(7-6) |

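As a cross-check of the emit() above, the fields can be assembled by hand for vrintp(NeonS32, q0, q3), whose expected encoding f3ba07c6 appears in the disassembler test later in this CL. Stand-alone sketch; it assumes kSpecialCondition is the 0b1111 condition field (0xF << 28), Bn is 1 << n, and that a Q register's split_code yields d = code >> 4, vd = code & 0xF with code twice the Q number:

#include <cassert>
#include <cstdint>

int main() {
  // q0: code 0 -> d = 0, vd = 0; q3: code 6 -> m = 0, vm = 6; size = 2 (S32).
  uint32_t insn = 0xFu << 28 | 0x7u << 23 | 0u << 22 | 0x3u << 20 |
                  0x2u << 18 | 0x2u << 16 | 0u << 12 | 0x1u << 10 |
                  0x7u << 7 | 1u << 6 | 0u << 5 | 6u;
  assert(insn == 0xf3ba07c6);  // matches the COMPARE() expectation below
}
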
View File

@@ -820,7 +820,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void vsqrt(const SwVfpRegister dst, const SwVfpRegister src,
              const Condition cond = al);
 
-  // ARMv8 rounding instructions.
+  // ARMv8 rounding instructions (Scalar).
   void vrinta(const SwVfpRegister dst, const SwVfpRegister src);
   void vrinta(const DwVfpRegister dst, const DwVfpRegister src);
   void vrintn(const SwVfpRegister dst, const SwVfpRegister src);
@@ -908,6 +908,11 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
              DwVfpRegister src2);
   void vpmax(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1,
              DwVfpRegister src2);
+
+  // ARMv8 rounding instructions (NEON).
+  void vrintp(NeonDataType dt, const QwNeonRegister dst,
+              const QwNeonRegister src);
+
   void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift);
   void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
             QwNeonRegister shift);

View File

@@ -297,6 +297,7 @@ FUNCTION_REFERENCE(wasm_word32_rol, wasm::word32_rol_wrapper)
 FUNCTION_REFERENCE(wasm_word32_ror, wasm::word32_ror_wrapper)
 FUNCTION_REFERENCE(wasm_word64_rol, wasm::word64_rol_wrapper)
 FUNCTION_REFERENCE(wasm_word64_ror, wasm::word64_ror_wrapper)
+FUNCTION_REFERENCE(wasm_f32x4_ceil, wasm::f32x4_ceil_wrapper)
 FUNCTION_REFERENCE(wasm_memory_init, wasm::memory_init_wrapper)
 FUNCTION_REFERENCE(wasm_memory_copy, wasm::memory_copy_wrapper)
 FUNCTION_REFERENCE(wasm_memory_fill, wasm::memory_fill_wrapper)

View File

@@ -206,6 +206,7 @@ class StatsCounter;
   V(wasm_word64_ror, "wasm::word64_ror") \
   V(wasm_word64_ctz, "wasm::word64_ctz") \
   V(wasm_word64_popcnt, "wasm::word64_popcnt") \
+  V(wasm_f32x4_ceil, "wasm::f32x4_ceil_wrapper") \
   V(wasm_memory_init, "wasm::memory_init") \
   V(wasm_memory_copy, "wasm::memory_copy") \
   V(wasm_memory_fill, "wasm::memory_fill") \

View File

@@ -1466,7 +1466,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kArmVrintpF32: {
      CpuFeatureScope scope(tasm(), ARMv8);
-      __ vrintp(i.OutputFloatRegister(), i.InputFloatRegister(0));
+      if (instr->InputAt(0)->IsSimd128Register()) {
+        __ vrintp(NeonS32, i.OutputSimd128Register(),
+                  i.InputSimd128Register(0));
+      } else {
+        __ vrintp(i.OutputFloatRegister(), i.InputFloatRegister(0));
+      }
       break;
     }
     case kArmVrintpF64: {

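The wrinkle above is that one ArchOpcode, kArmVrintpF32, now serves both the scalar and the SIMD form; the code generator picks the encoding from the representation of the instruction's operands. A stand-alone model of that dispatch (the enum and names here are invented for the demo, not V8 code):

#include <cstdio>

enum class Rep { kFloat32, kSimd128 };

// Models the kArmVrintpF32 case: one opcode, two register shapes.
void assemble_vrintp_f32(Rep input_rep) {
  if (input_rep == Rep::kSimd128) {
    std::puts("vrintp.f32.f32 q0, q1  ; four lanes at once");
  } else {
    std::puts("vrintp.f32.f32 s0, s1  ; scalar");
  }
}

int main() {
  assemble_vrintp_f32(Rep::kFloat32);
  assemble_vrintp_f32(Rep::kSimd128);
}
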
View File

@@ -1495,7 +1495,8 @@ void InstructionSelector::VisitUint32Mod(Node* node) {
   V(Float64RoundTruncate, kArmVrintzF64) \
   V(Float64RoundTiesAway, kArmVrintaF64) \
   V(Float32RoundTiesEven, kArmVrintnF32) \
-  V(Float64RoundTiesEven, kArmVrintnF64)
+  V(Float64RoundTiesEven, kArmVrintnF64) \
+  V(F32x4Ceil, kArmVrintpF32)
 
 #define RRR_OP_LIST(V) \
   V(Int32MulHigh, kArmSmmul) \

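RR_OP_LIST is an X-macro: each (machine operator, ArchOpcode) pair expands into a visitor for a one-input, one-output instruction, so adding V(F32x4Ceil, kArmVrintpF32) routes the SIMD ceil through the same selection code as scalar Float32RoundUp. A self-contained illustration of the pattern; the entries and DEMO_* macros are demo names, not V8's actual expansion:

#include <cstdio>

// Each V(Name, insn) entry becomes a Visit##Name function.
#define DEMO_OP_LIST(V)       \
  V(Float32RoundUp, "vrintp") \
  V(F32x4Ceil, "vrintp")

#define DEMO_VISITOR(Name, insn) \
  void Visit##Name() { std::printf("select %s for %s\n", insn, #Name); }
DEMO_OP_LIST(DEMO_VISITOR)
#undef DEMO_VISITOR

int main() {
  VisitFloat32RoundUp();  // scalar operator
  VisitF32x4Ceil();       // SIMD operator, same underlying instruction
}
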
View File

@@ -2690,11 +2690,15 @@ void InstructionSelector::VisitF64x2Pmax(Node* node) { UNIMPLEMENTED(); }
 #if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_S390X && \
     !V8_TARGET_ARCH_IA32
 // TODO(v8:10553) Prototyping floating point rounding instructions.
+// TODO(zhin): Temporary convoluted way to handle opcodes that are still
+// unimplemented on ARM as we are implementing them one at a time.
+#if !V8_TARGET_ARCH_ARM
+void InstructionSelector::VisitF32x4Ceil(Node* node) { UNIMPLEMENTED(); }
+#endif  // !V8_TARGET_ARCH_ARM
 void InstructionSelector::VisitF64x2Ceil(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF64x2Floor(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF64x2Trunc(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF64x2NearestInt(Node* node) { UNIMPLEMENTED(); }
-void InstructionSelector::VisitF32x4Ceil(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF32x4Floor(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF32x4Trunc(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF32x4NearestInt(Node* node) { UNIMPLEMENTED(); }

View File

@@ -4040,6 +4040,12 @@ Node* WasmGraphBuilder::BuildAsmjsStoreMem(MachineType type, Node* index,
   return val;
 }
 
+Node* WasmGraphBuilder::BuildF32x4Ceil(Node* input) {
+  MachineType type = MachineType::Simd128();
+  ExternalReference ref = ExternalReference::wasm_f32x4_ceil();
+  return BuildCFuncInstruction(ref, type, input);
+}
+
 void WasmGraphBuilder::PrintDebugName(Node* node) {
   PrintF("#%d:%s", node->id(), node->op()->mnemonic());
 }
@@ -4281,6 +4287,9 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
       return graph()->NewNode(mcgraph()->machine()->F32x4Pmax(), inputs[0],
                               inputs[1]);
     case wasm::kExprF32x4Ceil:
+      // Architecture support for F32x4Ceil and Float32RoundUp is the same.
+      if (!mcgraph()->machine()->Float32RoundUp().IsSupported())
+        return BuildF32x4Ceil(inputs[0]);
       return graph()->NewNode(mcgraph()->machine()->F32x4Ceil(), inputs[0]);
     case wasm::kExprF32x4Floor:
       return graph()->NewNode(mcgraph()->machine()->F32x4Floor(), inputs[0]);

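The Float32RoundUp().IsSupported() test works because V8 models instructions a target may lack as optional machine operators; on ARM v7 the operator is absent, so lowering takes the C-function path built by BuildF32x4Ceil. A rough stand-in for that pattern (the shape of the class is assumed here, not quoted from machine-operator.h):

#include <cstdio>

// Stand-in for an operator the target may lack (e.g. vrintp on ARM v7).
class OptionalOperator {
 public:
  explicit OptionalOperator(bool supported) : supported_(supported) {}
  bool IsSupported() const { return supported_; }
 private:
  bool supported_;
};

int main() {
  OptionalOperator float32_round_up(false);  // pretend we are on ARM v7
  if (!float32_round_up.IsSupported()) {
    std::puts("lower f32x4.ceil to the wasm_f32x4_ceil C fallback");
  } else {
    std::puts("lower f32x4.ceil to the F32x4Ceil machine operator (vrintp)");
  }
}
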
View File

@@ -553,6 +553,9 @@ class WasmGraphBuilder {
   Node* BuildAsmjsLoadMem(MachineType type, Node* index);
   Node* BuildAsmjsStoreMem(MachineType type, Node* index, Node* val);
 
+  // Wasm SIMD.
+  Node* BuildF32x4Ceil(Node* input);
+
   void BuildEncodeException32BitValue(Node* values_array, uint32_t* index,
                                       Node* value);
   Node* BuildDecodeException32BitValue(Node* values_array, uint32_t* index);

View File

@@ -2264,6 +2264,21 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
       out_buffer_pos_ +=
           SNPrintF(out_buffer_ + out_buffer_pos_, "%s.%c%i d%d, q%d", name,
                    type, size, Vd, Vm);
+    } else if (instr->Bits(17, 16) == 0x2 && instr->Bit(10) == 1) {
+      // vrintp
+      int Vd = instr->VFPDRegValue(kSimd128Precision);
+      int Vm = instr->VFPMRegValue(kSimd128Precision);
+      bool dp_op = instr->Bit(6) == 0;
+      int rounding_mode = instr->Bits(9, 7);
+      if (rounding_mode != 7) {
+        UNIMPLEMENTED();
+      }
+      if (dp_op) {
+        Format(instr, "vrintp.f32.f32 'Dd, 'Dm");
+      } else {
+        out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+                                    "vrintp.f32.f32 q%d, q%d", Vd, Vm);
+      }
     } else {
       int Vd, Vm;
       if (instr->Bit(6) == 0) {

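Running the decode conditions over the test word f3ba07c6 (from the disassembler test below) shows how this branch identifies vrintp and recovers the Q register numbers. Stand-alone sketch; the Bits/Bit helpers are modeled with shifts, and the kSimd128Precision register values are assumed to be (D:Vd)/2 and (M:Vm)/2:

#include <cassert>
#include <cstdint>

int bits(uint32_t w, int hi, int lo) {
  return (w >> lo) & ((1u << (hi - lo + 1)) - 1);
}

int main() {
  uint32_t insn = 0xf3ba07c6;  // vrintp.f32.f32 q0, q3
  assert(bits(insn, 17, 16) == 0x2 && bits(insn, 10, 10) == 1);  // vrint group
  assert(bits(insn, 9, 7) == 7);  // rounding mode 7: toward +infinity ("p")
  assert(bits(insn, 6, 6) == 1);  // Q form (dp_op is Bit(6) == 0)
  int qd = (bits(insn, 22, 22) << 4 | bits(insn, 15, 12)) / 2;
  int qm = (bits(insn, 5, 5) << 4 | bits(insn, 3, 0)) / 2;
  assert(qd == 0 && qm == 3);
}
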
View File

@@ -5442,6 +5442,33 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
             UNIMPLEMENTED();
             break;
         }
+      } else if (instr->Bits(17, 16) == 0x2 && instr->Bit(10) == 1) {
+        // vrint<q>.<dt> <Dd>, <Dm>
+        // vrint<q>.<dt> <Qd>, <Qm>
+        // See F6.1.205
+        int regs = instr->Bit(6) + 1;
+        int rounding_mode = instr->Bits(9, 7);
+        float (*fproundint)(float) = nullptr;
+        switch (rounding_mode) {
+          case 7:
+            fproundint = &ceilf;
+            break;
+          default:
+            UNIMPLEMENTED();
+        }
+        int vm = instr->VFPMRegValue(kDoublePrecision);
+        int vd = instr->VFPDRegValue(kDoublePrecision);
+
+        float floats[2];
+        for (int r = 0; r < regs; r++) {
+          // We cannot simply use GetVFPSingleValue since our Q registers
+          // might not map to any S registers at all.
+          get_neon_register<float, kDoubleSize>(vm + r, floats);
+          for (int e = 0; e < 2; e++) {
+            floats[e] = canonicalizeNaN(fproundint(floats[e]));
+          }
+          set_neon_register<float, kDoubleSize>(vd + r, floats);
+        }
       } else {
         UNIMPLEMENTED();
       }

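The simulator loop above handles the Q form as two D-sized halves of two floats each, because a Q register need not alias any S registers. A stand-alone model of one pass of that loop; canonicalize_nan stands in for the simulator's canonicalizeNaN:

#include <cmath>
#include <cstdio>

float canonicalize_nan(float f) { return std::isnan(f) ? std::nanf("") : f; }

int main() {
  float q_reg[4] = {-1.5f, 0.25f, 2.0f, NAN};  // one Q register, four lanes
  const int regs = 2;  // Bit(6) == 1 selects the Q form: two D halves
  for (int r = 0; r < regs; r++) {
    float* half = &q_reg[2 * r];  // stands in for get_neon_register
    for (int e = 0; e < 2; e++) half[e] = canonicalize_nan(ceilf(half[e]));
  }
  for (float f : q_reg) std::printf("%g ", f);  // -1 1 2 nan
  std::printf("\n");
}
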
View File

@@ -401,6 +401,20 @@ void float64_pow_wrapper(Address data) {
   WriteUnalignedValue<double>(data, base::ieee754::pow(x, y));
 }
 
+template <typename T, T (*float_round_op)(T)>
+void simd_float_round_wrapper(Address data) {
+  constexpr int n = kSimd128Size / sizeof(T);
+  for (int i = 0; i < n; i++) {
+    WriteUnalignedValue<T>(
+        data + (i * sizeof(T)),
+        float_round_op(ReadUnalignedValue<T>(data + (i * sizeof(T)))));
+  }
+}
+
+void f32x4_ceil_wrapper(Address data) {
+  simd_float_round_wrapper<float, &ceilf>(data);
+}
+
 namespace {
 class ThreadNotInWasmScope {
 // Asan on Windows triggers exceptions to allocate shadow memory lazily. When

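A self-contained re-creation of the wrapper above with a tiny harness, which also shows why the other rounding modes can reuse the template by swapping the function pointer (e.g. &floorf). Address is modeled as uintptr_t and the unaligned accessors as memcpy; these are stand-ins, not V8's definitions:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

using Address = uintptr_t;      // stand-in for V8's Address
constexpr int kSimd128Size = 16;

template <typename T>
T ReadUnalignedValue(Address p) {
  T v;
  std::memcpy(&v, reinterpret_cast<const void*>(p), sizeof(T));
  return v;
}
template <typename T>
void WriteUnalignedValue(Address p, T v) {
  std::memcpy(reinterpret_cast<void*>(p), &v, sizeof(T));
}

template <typename T, T (*float_round_op)(T)>
void simd_float_round_wrapper(Address data) {
  constexpr int n = kSimd128Size / sizeof(T);
  for (int i = 0; i < n; i++) {
    WriteUnalignedValue<T>(
        data + (i * sizeof(T)),
        float_round_op(ReadUnalignedValue<T>(data + (i * sizeof(T)))));
  }
}

int main() {
  float lanes[4] = {-1.5f, -0.25f, 0.25f, 2.0f};
  simd_float_round_wrapper<float, &ceilf>(reinterpret_cast<Address>(lanes));
  for (float f : lanes) std::printf("%g ", f);  // -1 -0 1 2
  std::printf("\n");
}
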
View File

@@ -79,6 +79,8 @@ V8_EXPORT_PRIVATE void word64_ror_wrapper(Address data);
 
 V8_EXPORT_PRIVATE void float64_pow_wrapper(Address data);
 
+V8_EXPORT_PRIVATE void f32x4_ceil_wrapper(Address data);
+
 // The return type is {int32_t} instead of {bool} to enforce the compiler to
 // zero-extend the result in the return register.
 int32_t memory_init_wrapper(Address data);

View File

@@ -916,6 +916,9 @@ TEST(ARMv8_vrintX_disasm) {
     COMPARE(vrintz(d0, d0), "eeb60bc0 vrintz.f64.f64 d0, d0");
     COMPARE(vrintz(d2, d3, ne), "1eb62bc3 vrintzne.f64.f64 d2, d3");
+
+    // Advanced SIMD
+    COMPARE(vrintp(NeonS32, q0, q3), "f3ba07c6 vrintp.f32.f32 q0, q3");
   }
 
   VERIFY_RUN();

View File

@@ -692,12 +692,15 @@ WASM_SIMD_TEST(F32x4RecipSqrtApprox) {
 
 // TODO(v8:10553) Prototyping floating-point rounding instructions.
 #if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_S390X || \
-    V8_TARGET_ARCH_IA32
+    V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM
 WASM_SIMD_TEST_NO_LOWERING(F32x4Ceil) {
   FLAG_SCOPE(wasm_simd_post_mvp);
   RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4Ceil, ceilf, true);
 }
 
+// TODO(zhin): Temporary convoluted way to exclude running these tests on ARM as
+// we are implementing each opcode one at a time.
+#if !V8_TARGET_ARCH_ARM
 WASM_SIMD_TEST_NO_LOWERING(F32x4Floor) {
   FLAG_SCOPE(wasm_simd_post_mvp);
   RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4Floor, floorf, true);
@@ -713,8 +716,9 @@ WASM_SIMD_TEST_NO_LOWERING(F32x4NearestInt) {
   RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4NearestInt, nearbyintf,
                    true);
 }
+#endif  // !V8_TARGET_ARCH_ARM
 #endif  // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_S390X ||
-        // V8_TARGET_ARCH_IA32
+        // V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM
 
 void RunF32x4BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                        WasmOpcode opcode, FloatBinOp expected_op) {