[wasm-simd][arm] Prototype f32x4.ceil

Prototype f32x4.ceil on ARM, for both ARM v7 and ARM v8. ARM v8 has
support for vrintp; on ARM v7 we fall back to a runtime (C) function.

Since ARM v8 uses vrintp, which is the same instruction used for scalar
F32 Ceil, the wasm compiler reuses the Float32Round support check rather
than creating new F32x4Round optional operators.

This adds assembler, disassembler, and simulator support for vrintp
(the Advanced SIMD version that takes Q registers). The decoding is
incomplete for now; more will be added as we implement the other
rounding modes.

Bug: v8:10553
Change-Id: I4563608b9501f6f57c3a8325b17de89da7058a43
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2248779
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#68419}
Ng Zhi An authored on 2020-06-17 11:29:32 -07:00; committed by Commit Bot
parent 0d9eb10552
commit d9381fd697
15 changed files with 117 additions and 6 deletions

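For context, f32x4.ceil rounds each of the four f32 lanes of a 128-bit value toward positive infinity. A minimal stand-alone sketch of the semantics both paths (vrintp on ARM v8, the runtime call on ARM v7) implement; plain C++, not V8 code:

#include <cmath>
#include <cstdio>

int main() {
  float lanes[4] = {-1.5f, -0.25f, 0.25f, 2.0f};  // one 128-bit vector
  for (float& f : lanes) f = ceilf(f);  // lane-wise round toward +infinity
  for (float f : lanes) std::printf("%g ", f);  // prints: -1 -0 1 2
  std::printf("\n");
}

Note that ceil(-0.25) is negative zero, one of the edge cases the tests at the end of this CL check by using ceilf as the reference operation.
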
View File

@@ -3596,6 +3596,23 @@ void Assembler::vrintp(const DwVfpRegister dst, const DwVfpRegister src) {
        vd * B12 | 0x5 * B9 | B8 | B6 | m * B5 | vm);
 }
 
+void Assembler::vrintp(NeonDataType dt, const QwNeonRegister dst,
+                       const QwNeonRegister src) {
+  // cond=kSpecialCondition(31-28) | 00111(27-23) | D(22) | 11(21-20) |
+  // size(19-18) | 10(17-16) | Vd(15-12) | 01(11-10) | 7(9-7) | 1(6) | M(5) |
+  // 0(4) | Vm(3-0)
+  DCHECK(IsEnabled(ARMv8));
+  int vd, d;
+  dst.split_code(&vd, &d);
+  int vm, m;
+  src.split_code(&vm, &m);
+  int size = NeonSz(dt);
+  // Only F32 is implemented for now.
+  DCHECK_EQ(0x2, dt);
+  emit(kSpecialCondition | 0x7 * B23 | d * B22 | 0x3 * B20 | size * B18 |
+       0x2 * B16 | vd * B12 | 0x1 * B10 | 0x7 * B7 | B6 | m * B5 | vm);
+}
+
 void Assembler::vrintm(const SwVfpRegister dst, const SwVfpRegister src) {
   // cond=kSpecialCondition(31-28) | 11101(27-23) | D(22) | 11(21-20) |
   // 10(19-18) | RM=11(17-16) | Vd(15-12) | 101(11-9) | sz=0(8) | 01(7-6) |

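As a cross-check of the emit() above, the fields can be assembled by hand for vrintp(NeonS32, q0, q3), whose expected encoding f3ba07c6 appears in the disassembler test later in this CL. Stand-alone sketch; it assumes kSpecialCondition is the 0b1111 condition field (0xF << 28), Bn is 1 << n, and that a Q register's split_code yields d = code >> 4, vd = code & 0xF with code twice the Q number:

#include <cassert>
#include <cstdint>

int main() {
  // q0: code 0 -> d = 0, vd = 0; q3: code 6 -> m = 0, vm = 6; size = 2 (S32).
  uint32_t insn = 0xFu << 28 | 0x7u << 23 | 0u << 22 | 0x3u << 20 |
                  0x2u << 18 | 0x2u << 16 | 0u << 12 | 0x1u << 10 |
                  0x7u << 7 | 1u << 6 | 0u << 5 | 6u;
  assert(insn == 0xf3ba07c6);  // matches the COMPARE() expectation below
}
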
View File

@@ -820,7 +820,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void vsqrt(const SwVfpRegister dst, const SwVfpRegister src,
              const Condition cond = al);
 
-  // ARMv8 rounding instructions.
+  // ARMv8 rounding instructions (Scalar).
   void vrinta(const SwVfpRegister dst, const SwVfpRegister src);
   void vrinta(const DwVfpRegister dst, const DwVfpRegister src);
   void vrintn(const SwVfpRegister dst, const SwVfpRegister src);
@@ -908,6 +908,11 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
              DwVfpRegister src2);
   void vpmax(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1,
              DwVfpRegister src2);
+
+  // ARMv8 rounding instructions (NEON).
+  void vrintp(NeonDataType dt, const QwNeonRegister dst,
+              const QwNeonRegister src);
+
   void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift);
   void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
             QwNeonRegister shift);

View File

@@ -297,6 +297,7 @@ FUNCTION_REFERENCE(wasm_word32_rol, wasm::word32_rol_wrapper)
 FUNCTION_REFERENCE(wasm_word32_ror, wasm::word32_ror_wrapper)
 FUNCTION_REFERENCE(wasm_word64_rol, wasm::word64_rol_wrapper)
 FUNCTION_REFERENCE(wasm_word64_ror, wasm::word64_ror_wrapper)
+FUNCTION_REFERENCE(wasm_f32x4_ceil, wasm::f32x4_ceil_wrapper)
 FUNCTION_REFERENCE(wasm_memory_init, wasm::memory_init_wrapper)
 FUNCTION_REFERENCE(wasm_memory_copy, wasm::memory_copy_wrapper)
 FUNCTION_REFERENCE(wasm_memory_fill, wasm::memory_fill_wrapper)

View File

@@ -206,6 +206,7 @@ class StatsCounter;
   V(wasm_word64_ror, "wasm::word64_ror") \
   V(wasm_word64_ctz, "wasm::word64_ctz") \
   V(wasm_word64_popcnt, "wasm::word64_popcnt") \
+  V(wasm_f32x4_ceil, "wasm::f32x4_ceil_wrapper") \
   V(wasm_memory_init, "wasm::memory_init") \
   V(wasm_memory_copy, "wasm::memory_copy") \
   V(wasm_memory_fill, "wasm::memory_fill") \

View File

@@ -1466,7 +1466,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kArmVrintpF32: {
      CpuFeatureScope scope(tasm(), ARMv8);
-      __ vrintp(i.OutputFloatRegister(), i.InputFloatRegister(0));
+      if (instr->InputAt(0)->IsSimd128Register()) {
+        __ vrintp(NeonS32, i.OutputSimd128Register(),
+                  i.InputSimd128Register(0));
+      } else {
+        __ vrintp(i.OutputFloatRegister(), i.InputFloatRegister(0));
+      }
       break;
     }
     case kArmVrintpF64: {

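The wrinkle above is that one ArchOpcode, kArmVrintpF32, now serves both the scalar and the SIMD form; the code generator picks the encoding from the representation of the instruction's operands. A stand-alone model of that dispatch (the enum and names here are invented for the demo, not V8 code):

#include <cstdio>

enum class Rep { kFloat32, kSimd128 };

// Models the kArmVrintpF32 case: one opcode, two register shapes.
void assemble_vrintp_f32(Rep input_rep) {
  if (input_rep == Rep::kSimd128) {
    std::puts("vrintp.f32.f32 q0, q1  ; four lanes at once");
  } else {
    std::puts("vrintp.f32.f32 s0, s1  ; scalar");
  }
}

int main() {
  assemble_vrintp_f32(Rep::kFloat32);
  assemble_vrintp_f32(Rep::kSimd128);
}
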
View File

@@ -1495,7 +1495,8 @@ void InstructionSelector::VisitUint32Mod(Node* node) {
   V(Float64RoundTruncate, kArmVrintzF64) \
   V(Float64RoundTiesAway, kArmVrintaF64) \
   V(Float32RoundTiesEven, kArmVrintnF32) \
-  V(Float64RoundTiesEven, kArmVrintnF64)
+  V(Float64RoundTiesEven, kArmVrintnF64) \
+  V(F32x4Ceil, kArmVrintpF32)
 
 #define RRR_OP_LIST(V) \
   V(Int32MulHigh, kArmSmmul) \

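RR_OP_LIST is an X-macro: each (machine operator, ArchOpcode) pair expands into a visitor for a one-input, one-output instruction, so adding V(F32x4Ceil, kArmVrintpF32) routes the SIMD ceil through the same selection code as scalar Float32RoundUp. A self-contained illustration of the pattern; the entries and DEMO_* macros are demo names, not V8's actual expansion:

#include <cstdio>

// Each V(Name, insn) entry becomes a Visit##Name function.
#define DEMO_OP_LIST(V)       \
  V(Float32RoundUp, "vrintp") \
  V(F32x4Ceil, "vrintp")

#define DEMO_VISITOR(Name, insn) \
  void Visit##Name() { std::printf("select %s for %s\n", insn, #Name); }
DEMO_OP_LIST(DEMO_VISITOR)
#undef DEMO_VISITOR

int main() {
  VisitFloat32RoundUp();  // scalar operator
  VisitF32x4Ceil();       // SIMD operator, same underlying instruction
}
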
View File

@@ -2690,11 +2690,15 @@ void InstructionSelector::VisitF64x2Pmax(Node* node) { UNIMPLEMENTED(); }
 #if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_S390X && \
     !V8_TARGET_ARCH_IA32
 // TODO(v8:10553) Prototyping floating point rounding instructions.
+// TODO(zhin): Temporary convoluted way to handle opcodes that are still
+// unimplemented on ARM as we are implementing them one at a time.
+#if !V8_TARGET_ARCH_ARM
+void InstructionSelector::VisitF32x4Ceil(Node* node) { UNIMPLEMENTED(); }
+#endif  // !V8_TARGET_ARCH_ARM
 void InstructionSelector::VisitF64x2Ceil(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF64x2Floor(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF64x2Trunc(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF64x2NearestInt(Node* node) { UNIMPLEMENTED(); }
-void InstructionSelector::VisitF32x4Ceil(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF32x4Floor(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF32x4Trunc(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitF32x4NearestInt(Node* node) { UNIMPLEMENTED(); }

View File

@@ -4040,6 +4040,12 @@ Node* WasmGraphBuilder::BuildAsmjsStoreMem(MachineType type, Node* index,
   return val;
 }
 
+Node* WasmGraphBuilder::BuildF32x4Ceil(Node* input) {
+  MachineType type = MachineType::Simd128();
+  ExternalReference ref = ExternalReference::wasm_f32x4_ceil();
+  return BuildCFuncInstruction(ref, type, input);
+}
+
 void WasmGraphBuilder::PrintDebugName(Node* node) {
   PrintF("#%d:%s", node->id(), node->op()->mnemonic());
 }
@@ -4281,6 +4287,9 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
       return graph()->NewNode(mcgraph()->machine()->F32x4Pmax(), inputs[0],
                               inputs[1]);
     case wasm::kExprF32x4Ceil:
+      // Architecture support for F32x4Ceil and Float32RoundUp is the same.
+      if (!mcgraph()->machine()->Float32RoundUp().IsSupported())
+        return BuildF32x4Ceil(inputs[0]);
       return graph()->NewNode(mcgraph()->machine()->F32x4Ceil(), inputs[0]);
     case wasm::kExprF32x4Floor:
       return graph()->NewNode(mcgraph()->machine()->F32x4Floor(), inputs[0]);

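The Float32RoundUp().IsSupported() test works because V8 models instructions a target may lack as optional machine operators; on ARM v7 the operator is absent, so lowering takes the C-function path built by BuildF32x4Ceil. A rough stand-in for that pattern (the shape of the class is assumed here, not quoted from machine-operator.h):

#include <cstdio>

// Stand-in for an operator the target may lack (e.g. vrintp on ARM v7).
class OptionalOperator {
 public:
  explicit OptionalOperator(bool supported) : supported_(supported) {}
  bool IsSupported() const { return supported_; }
 private:
  bool supported_;
};

int main() {
  OptionalOperator float32_round_up(false);  // pretend we are on ARM v7
  if (!float32_round_up.IsSupported()) {
    std::puts("lower f32x4.ceil to the wasm_f32x4_ceil C fallback");
  } else {
    std::puts("lower f32x4.ceil to the F32x4Ceil machine operator (vrintp)");
  }
}
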
View File

@@ -553,6 +553,9 @@ class WasmGraphBuilder {
   Node* BuildAsmjsLoadMem(MachineType type, Node* index);
   Node* BuildAsmjsStoreMem(MachineType type, Node* index, Node* val);
 
+  // Wasm SIMD.
+  Node* BuildF32x4Ceil(Node* input);
+
   void BuildEncodeException32BitValue(Node* values_array, uint32_t* index,
                                       Node* value);
   Node* BuildDecodeException32BitValue(Node* values_array, uint32_t* index);

View File

@@ -2264,6 +2264,21 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
       out_buffer_pos_ +=
           SNPrintF(out_buffer_ + out_buffer_pos_, "%s.%c%i d%d, q%d", name,
                    type, size, Vd, Vm);
+    } else if (instr->Bits(17, 16) == 0x2 && instr->Bit(10) == 1) {
+      // vrintp
+      int Vd = instr->VFPDRegValue(kSimd128Precision);
+      int Vm = instr->VFPMRegValue(kSimd128Precision);
+      bool dp_op = instr->Bit(6) == 0;
+      int rounding_mode = instr->Bits(9, 7);
+      if (rounding_mode != 7) {
+        UNIMPLEMENTED();
+      }
+      if (dp_op) {
+        Format(instr, "vrintp.f32.f32 'Dd, 'Dm");
+      } else {
+        out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
+                                    "vrintp.f32.f32 q%d, q%d", Vd, Vm);
+      }
     } else {
       int Vd, Vm;
       if (instr->Bit(6) == 0) {

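Running the decode conditions over the test word f3ba07c6 (from the disassembler test below) shows how this branch identifies vrintp and recovers the Q register numbers. Stand-alone sketch; the Bits/Bit helpers are modeled with shifts, and the kSimd128Precision register values are assumed to be (D:Vd)/2 and (M:Vm)/2:

#include <cassert>
#include <cstdint>

int bits(uint32_t w, int hi, int lo) {
  return (w >> lo) & ((1u << (hi - lo + 1)) - 1);
}

int main() {
  uint32_t insn = 0xf3ba07c6;  // vrintp.f32.f32 q0, q3
  assert(bits(insn, 17, 16) == 0x2 && bits(insn, 10, 10) == 1);  // vrint group
  assert(bits(insn, 9, 7) == 7);  // rounding mode 7: toward +infinity ("p")
  assert(bits(insn, 6, 6) == 1);  // Q form (dp_op is Bit(6) == 0)
  int qd = (bits(insn, 22, 22) << 4 | bits(insn, 15, 12)) / 2;
  int qm = (bits(insn, 5, 5) << 4 | bits(insn, 3, 0)) / 2;
  assert(qd == 0 && qm == 3);
}
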
View File

@@ -5442,6 +5442,33 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
             UNIMPLEMENTED();
             break;
         }
+      } else if (instr->Bits(17, 16) == 0x2 && instr->Bit(10) == 1) {
+        // vrint<q>.<dt> <Dd>, <Dm>
+        // vrint<q>.<dt> <Qd>, <Qm>
+        // See F6.1.205
+        int regs = instr->Bit(6) + 1;
+        int rounding_mode = instr->Bits(9, 7);
+        float (*fproundint)(float) = nullptr;
+        switch (rounding_mode) {
+          case 7:
+            fproundint = &ceilf;
+            break;
+          default:
+            UNIMPLEMENTED();
+        }
+        int vm = instr->VFPMRegValue(kDoublePrecision);
+        int vd = instr->VFPDRegValue(kDoublePrecision);
+
+        float floats[2];
+        for (int r = 0; r < regs; r++) {
+          // We cannot simply use GetVFPSingleValue since our Q registers
+          // might not map to any S registers at all.
+          get_neon_register<float, kDoubleSize>(vm + r, floats);
+          for (int e = 0; e < 2; e++) {
+            floats[e] = canonicalizeNaN(fproundint(floats[e]));
+          }
+          set_neon_register<float, kDoubleSize>(vd + r, floats);
+        }
       } else {
         UNIMPLEMENTED();
       }

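The simulator loop above handles the Q form as two D-sized halves of two floats each, because a Q register need not alias any S registers. A stand-alone model of one pass of that loop; canonicalize_nan stands in for the simulator's canonicalizeNaN:

#include <cmath>
#include <cstdio>

float canonicalize_nan(float f) { return std::isnan(f) ? std::nanf("") : f; }

int main() {
  float q_reg[4] = {-1.5f, 0.25f, 2.0f, NAN};  // one Q register, four lanes
  const int regs = 2;  // Bit(6) == 1 selects the Q form: two D halves
  for (int r = 0; r < regs; r++) {
    float* half = &q_reg[2 * r];  // stands in for get_neon_register
    for (int e = 0; e < 2; e++) half[e] = canonicalize_nan(ceilf(half[e]));
  }
  for (float f : q_reg) std::printf("%g ", f);  // -1 1 2 nan
  std::printf("\n");
}
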
View File

@@ -401,6 +401,20 @@ void float64_pow_wrapper(Address data) {
   WriteUnalignedValue<double>(data, base::ieee754::pow(x, y));
 }
 
+template <typename T, T (*float_round_op)(T)>
+void simd_float_round_wrapper(Address data) {
+  constexpr int n = kSimd128Size / sizeof(T);
+  for (int i = 0; i < n; i++) {
+    WriteUnalignedValue<T>(
+        data + (i * sizeof(T)),
+        float_round_op(ReadUnalignedValue<T>(data + (i * sizeof(T)))));
+  }
+}
+
+void f32x4_ceil_wrapper(Address data) {
+  simd_float_round_wrapper<float, &ceilf>(data);
+}
+
 namespace {
 class ThreadNotInWasmScope {
 // Asan on Windows triggers exceptions to allocate shadow memory lazily. When

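A self-contained re-creation of the wrapper above with a tiny harness, which also shows why the other rounding modes can reuse the template by swapping the function pointer (e.g. &floorf). Address is modeled as uintptr_t and the unaligned accessors as memcpy; these are stand-ins, not V8's definitions:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

using Address = uintptr_t;      // stand-in for V8's Address
constexpr int kSimd128Size = 16;

template <typename T>
T ReadUnalignedValue(Address p) {
  T v;
  std::memcpy(&v, reinterpret_cast<const void*>(p), sizeof(T));
  return v;
}
template <typename T>
void WriteUnalignedValue(Address p, T v) {
  std::memcpy(reinterpret_cast<void*>(p), &v, sizeof(T));
}

template <typename T, T (*float_round_op)(T)>
void simd_float_round_wrapper(Address data) {
  constexpr int n = kSimd128Size / sizeof(T);
  for (int i = 0; i < n; i++) {
    WriteUnalignedValue<T>(
        data + (i * sizeof(T)),
        float_round_op(ReadUnalignedValue<T>(data + (i * sizeof(T)))));
  }
}

int main() {
  float lanes[4] = {-1.5f, -0.25f, 0.25f, 2.0f};
  simd_float_round_wrapper<float, &ceilf>(reinterpret_cast<Address>(lanes));
  for (float f : lanes) std::printf("%g ", f);  // -1 -0 1 2
  std::printf("\n");
}
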
View File

@@ -79,6 +79,8 @@ V8_EXPORT_PRIVATE void word64_ror_wrapper(Address data);
 
 V8_EXPORT_PRIVATE void float64_pow_wrapper(Address data);
 
+V8_EXPORT_PRIVATE void f32x4_ceil_wrapper(Address data);
+
 // The return type is {int32_t} instead of {bool} to enforce the compiler to
 // zero-extend the result in the return register.
 int32_t memory_init_wrapper(Address data);

View File

@@ -916,6 +916,9 @@ TEST(ARMv8_vrintX_disasm) {
     COMPARE(vrintz(d0, d0), "eeb60bc0 vrintz.f64.f64 d0, d0");
     COMPARE(vrintz(d2, d3, ne), "1eb62bc3 vrintzne.f64.f64 d2, d3");
+
+    // Advanced SIMD
+    COMPARE(vrintp(NeonS32, q0, q3), "f3ba07c6 vrintp.f32.f32 q0, q3");
   }
 
   VERIFY_RUN();

View File

@@ -692,12 +692,15 @@ WASM_SIMD_TEST(F32x4RecipSqrtApprox) {
 
 // TODO(v8:10553) Prototyping floating-point rounding instructions.
 #if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_S390X || \
-    V8_TARGET_ARCH_IA32
+    V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM
 WASM_SIMD_TEST_NO_LOWERING(F32x4Ceil) {
   FLAG_SCOPE(wasm_simd_post_mvp);
   RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4Ceil, ceilf, true);
 }
 
+// TODO(zhin): Temporary convoluted way to exclude running these tests on ARM as
+// we are implementing each opcode one at a time.
+#if !V8_TARGET_ARCH_ARM
 WASM_SIMD_TEST_NO_LOWERING(F32x4Floor) {
   FLAG_SCOPE(wasm_simd_post_mvp);
   RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4Floor, floorf, true);
@@ -713,8 +716,9 @@ WASM_SIMD_TEST_NO_LOWERING(F32x4NearestInt) {
   RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4NearestInt, nearbyintf,
                    true);
 }
+#endif  // !V8_TARGET_ARCH_ARM
 #endif  // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_S390X ||
-        // V8_TARGET_ARCH_IA32
+        // V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM
 
 void RunF32x4BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                        WasmOpcode opcode, FloatBinOp expected_op) {