[wasm-simd][arm64] Bitmask instructions

Implement i8x16.bitmask, i16x8.bitmask, and i32x4.bitmask in the
interpreter and on arm64.

These operations are behind the wasm_simd_post_mvp flag, as we are only
prototyping them to evaluate performance. The codegen is based on the
guidance at https://github.com/WebAssembly/simd/pull/201.
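
For context, each bitmask instruction narrows a 128-bit vector to a scalar
i32 by gathering the sign bit (most significant bit) of every lane into the
low bits of the result, one bit per lane. A minimal scalar sketch of the
i32x4 case, in plain C++ (the function name is illustrative, not V8 API):

#include <cstdint>

// Reference semantics for i32x4.bitmask: bit i of the result is the sign
// bit of lane i; all higher bits of the result are zero.
int32_t I32x4BitMaskRef(const int32_t lanes[4]) {
  int32_t res = 0;
  for (int i = 0; i < 4; ++i) {
    if (lanes[i] < 0) res |= 1 << i;
  }
  return res;
}

The i16x8 and i8x16 variants are analogous and produce 8-bit and 16-bit
masks respectively.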

Bug: v8:10308
Change-Id: I835aa8a23e677a00ee7897c1c31a028850e238a9
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2099451
Reviewed-by: Tobias Tebbi <tebbi@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66793}

@@ -2128,6 +2128,21 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_BINOP_CASE(kArm64I32x4GtU, Cmhi, 4S);
SIMD_BINOP_CASE(kArm64I32x4GeU, Cmhs, 4S);
SIMD_UNOP_CASE(kArm64I32x4Abs, Abs, 4S);
case kArm64I32x4BitMask: {
Register dst = i.OutputRegister32();
VRegister src = i.InputSimd128Register(0);
VRegister tmp = i.TempSimd128Register(0);
VRegister mask = i.TempSimd128Register(1);
__ Sshr(tmp.V4S(), src.V4S(), 31);
// Set the i-th bit in lane i of the mask. After the AND below, lanes
// that are negative have their i-th bit set; non-negative lanes are 0.
__ Movi(mask.V2D(), 0x0000'0008'0000'0004, 0x0000'0002'0000'0001);
__ And(tmp.V16B(), mask.V16B(), tmp.V16B());
__ Addv(tmp.S(), tmp.V4S());
__ Mov(dst.W(), tmp.V4S(), 0);
break;
}
case kArm64I16x8Splat: {
__ Dup(i.OutputSimd128Register().V8H(), i.InputRegister32(0));
break;
@@ -2229,6 +2244,21 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_BINOP_CASE(kArm64I16x8GeU, Cmhs, 8H);
SIMD_BINOP_CASE(kArm64I16x8RoundingAverageU, Urhadd, 8H);
SIMD_UNOP_CASE(kArm64I16x8Abs, Abs, 8H);
case kArm64I16x8BitMask: {
Register dst = i.OutputRegister32();
VRegister src = i.InputSimd128Register(0);
VRegister tmp = i.TempSimd128Register(0);
VRegister mask = i.TempSimd128Register(1);
__ Sshr(tmp.V8H(), src.V8H(), 15);
// Set the i-th bit in lane i of the mask. After the AND below, lanes
// that are negative have their i-th bit set; non-negative lanes are 0.
__ Movi(mask.V2D(), 0x0080'0040'0020'0010, 0x0008'0004'0002'0001);
__ And(tmp.V16B(), mask.V16B(), tmp.V16B());
__ Addv(tmp.H(), tmp.V8H());
__ Mov(dst.W(), tmp.V8H(), 0);
break;
}
case kArm64I8x16Splat: {
__ Dup(i.OutputSimd128Register().V16B(), i.InputRegister32(0));
break;
@@ -2318,6 +2348,23 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_BINOP_CASE(kArm64I8x16GeU, Cmhs, 16B);
SIMD_BINOP_CASE(kArm64I8x16RoundingAverageU, Urhadd, 16B);
SIMD_UNOP_CASE(kArm64I8x16Abs, Abs, 16B);
case kArm64I8x16BitMask: {
Register dst = i.OutputRegister32();
VRegister src = i.InputSimd128Register(0);
VRegister tmp = i.TempSimd128Register(0);
VRegister mask = i.TempSimd128Register(1);
// Set the i-th bit in lane i of the mask. After the AND below, lanes
// that are negative have their i-th bit set; non-negative lanes are 0.
__ Sshr(tmp.V16B(), src.V16B(), 7);
__ Movi(mask.V2D(), 0x8040'2010'0804'0201);
__ And(tmp.V16B(), mask.V16B(), tmp.V16B());
__ Ext(mask.V16B(), tmp.V16B(), tmp.V16B(), 8);
__ Zip1(tmp.V16B(), tmp.V16B(), mask.V16B());
__ Addv(tmp.H(), tmp.V8H());
__ Mov(dst.W(), tmp.V8H(), 0);
break;
}
case kArm64S128Zero: {
__ Movi(i.OutputSimd128Register().V16B(), 0);
break;
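
All three cases above follow the same shape: Sshr by (lane width - 1)
smears each lane's sign bit, turning negative lanes into all-ones and
non-negative lanes into all-zeros; the AND against the Movi constant keeps
only bit i in lane i; and Addv horizontally adds the lanes so the surviving
bits accumulate into a single scalar. The i8x16 case needs the extra
Ext/Zip1 pair because the byte-sized Movi pattern repeats every 8 lanes:
bytes 0-7 and 8-15 each yield an 8-bit half-mask, and Zip1 interleaves the
two halves into 8 halfword lanes so a single Addv over 8H produces
low_half | (high_half << 8). A scalar model of both sequences, in plain
C++ (function names are illustrative, not V8 API):

#include <cstdint>

// Models Sshr(31) + And + Addv for i32x4.bitmask.
int32_t I32x4BitMaskModel(const int32_t src[4]) {
  const int32_t kMask[4] = {1, 2, 4, 8};  // the Movi constant: lane i = 1 << i
  int32_t acc = 0;
  for (int i = 0; i < 4; ++i) {
    acc += (src[i] >> 31) & kMask[i];  // all-ones or zero, masked to bit i
  }
  return acc;
}

// Models Sshr(7) + And + Ext/Zip1 + Addv(8H) for i8x16.bitmask.
int32_t I8x16BitMaskModel(const int8_t src[16]) {
  int32_t acc = 0;
  for (int i = 0; i < 16; ++i) {
    int bit = src[i] < 0 ? 1 << (i % 8) : 0;  // Movi pattern repeats per 8 bytes
    acc += i < 8 ? bit : bit << 8;            // Zip1 pairs byte i with byte i + 8
  }
  return acc;
}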


@@ -253,6 +253,7 @@ namespace compiler {
V(Arm64I32x4GtU) \
V(Arm64I32x4GeU) \
V(Arm64I32x4Abs) \
V(Arm64I32x4BitMask) \
V(Arm64I16x8Splat) \
V(Arm64I16x8ExtractLaneU) \
V(Arm64I16x8ExtractLaneS) \
@@ -287,6 +288,7 @@ namespace compiler {
V(Arm64I16x8GeU) \
V(Arm64I16x8RoundingAverageU) \
V(Arm64I16x8Abs) \
V(Arm64I16x8BitMask) \
V(Arm64I8x16Splat) \
V(Arm64I8x16ExtractLaneU) \
V(Arm64I8x16ExtractLaneS) \
@@ -316,6 +318,7 @@ namespace compiler {
V(Arm64I8x16GeU) \
V(Arm64I8x16RoundingAverageU) \
V(Arm64I8x16Abs) \
V(Arm64I8x16BitMask) \
V(Arm64S128Zero) \
V(Arm64S128Dup) \
V(Arm64S128And) \


@@ -223,6 +223,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64I32x4GtU:
case kArm64I32x4GeU:
case kArm64I32x4Abs:
case kArm64I32x4BitMask:
case kArm64I16x8Splat:
case kArm64I16x8ExtractLaneU:
case kArm64I16x8ExtractLaneS:
@@ -257,6 +258,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64I16x8GeU:
case kArm64I16x8RoundingAverageU:
case kArm64I16x8Abs:
case kArm64I16x8BitMask:
case kArm64I8x16Splat:
case kArm64I8x16ExtractLaneU:
case kArm64I8x16ExtractLaneS:
@@ -286,6 +288,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64I8x16GeU:
case kArm64I8x16RoundingAverageU:
case kArm64I8x16Abs:
case kArm64I8x16BitMask:
case kArm64S128Zero:
case kArm64S128Dup:
case kArm64S128And:


@@ -3367,6 +3367,29 @@ VISIT_SIMD_QFMOP(F32x4Qfma)
VISIT_SIMD_QFMOP(F32x4Qfms)
#undef VISIT_SIMD_QFMOP
namespace {
template <ArchOpcode opcode>
void VisitBitMask(InstructionSelector* selector, Node* node) {
Arm64OperandGenerator g(selector);
InstructionOperand temps[] = {g.TempSimd128Register(),
g.TempSimd128Register()};
selector->Emit(opcode, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), arraysize(temps), temps);
}
} // namespace
void InstructionSelector::VisitI8x16BitMask(Node* node) {
VisitBitMask<kArm64I8x16BitMask>(this, node);
}
void InstructionSelector::VisitI16x8BitMask(Node* node) {
VisitBitMask<kArm64I16x8BitMask>(this, node);
}
void InstructionSelector::VisitI32x4BitMask(Node* node) {
VisitBitMask<kArm64I32x4BitMask>(this, node);
}
namespace {
struct ShuffleEntry {


@@ -2025,6 +2025,8 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitI32x4GeU(node);
case IrOpcode::kI32x4Abs:
return MarkAsSimd128(node), VisitI32x4Abs(node);
case IrOpcode::kI32x4BitMask:
return MarkAsWord32(node), VisitI32x4BitMask(node);
case IrOpcode::kI16x8Splat:
return MarkAsSimd128(node), VisitI16x8Splat(node);
case IrOpcode::kI16x8ExtractLaneU:
@@ -2093,6 +2095,8 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitI16x8RoundingAverageU(node);
case IrOpcode::kI16x8Abs:
return MarkAsSimd128(node), VisitI16x8Abs(node);
case IrOpcode::kI16x8BitMask:
return MarkAsWord32(node), VisitI16x8BitMask(node);
case IrOpcode::kI8x16Splat:
return MarkAsSimd128(node), VisitI8x16Splat(node);
case IrOpcode::kI8x16ExtractLaneU:
@@ -2151,6 +2155,8 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitI8x16RoundingAverageU(node);
case IrOpcode::kI8x16Abs:
return MarkAsSimd128(node), VisitI8x16Abs(node);
case IrOpcode::kI8x16BitMask:
return MarkAsWord32(node), VisitI8x16BitMask(node);
case IrOpcode::kS128Zero:
return MarkAsSimd128(node), VisitS128Zero(node);
case IrOpcode::kS128And:
@@ -2628,6 +2634,12 @@ void InstructionSelector::VisitI64x2MinU(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MaxU(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_S390X
#if !V8_TARGET_ARCH_ARM64
void InstructionSelector::VisitI8x16BitMask(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI16x8BitMask(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI32x4BitMask(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_ARM64
void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }
void InstructionSelector::VisitParameter(Node* node) {


@@ -390,6 +390,7 @@ MachineType AtomicOpType(Operator const* op) {
V(I32x4GtU, Operator::kNoProperties, 2, 0, 1) \
V(I32x4GeU, Operator::kNoProperties, 2, 0, 1) \
V(I32x4Abs, Operator::kNoProperties, 1, 0, 1) \
V(I32x4BitMask, Operator::kNoProperties, 1, 0, 1) \
V(I16x8Splat, Operator::kNoProperties, 1, 0, 1) \
V(I16x8SConvertI8x16Low, Operator::kNoProperties, 1, 0, 1) \
V(I16x8SConvertI8x16High, Operator::kNoProperties, 1, 0, 1) \
@@ -421,6 +422,7 @@ MachineType AtomicOpType(Operator const* op) {
V(I16x8GeU, Operator::kNoProperties, 2, 0, 1) \
V(I16x8RoundingAverageU, Operator::kCommutative, 2, 0, 1) \
V(I16x8Abs, Operator::kNoProperties, 1, 0, 1) \
V(I16x8BitMask, Operator::kNoProperties, 1, 0, 1) \
V(I8x16Splat, Operator::kNoProperties, 1, 0, 1) \
V(I8x16Neg, Operator::kNoProperties, 1, 0, 1) \
V(I8x16Shl, Operator::kNoProperties, 2, 0, 1) \
@@ -447,6 +449,7 @@ MachineType AtomicOpType(Operator const* op) {
V(I8x16GeU, Operator::kNoProperties, 2, 0, 1) \
V(I8x16RoundingAverageU, Operator::kCommutative, 2, 0, 1) \
V(I8x16Abs, Operator::kNoProperties, 1, 0, 1) \
V(I8x16BitMask, Operator::kNoProperties, 1, 0, 1) \
V(S128Load, Operator::kNoProperties, 2, 0, 1) \
V(S128Store, Operator::kNoProperties, 3, 0, 1) \
V(S128Zero, Operator::kNoProperties, 0, 0, 1) \


@@ -630,6 +630,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* I32x4GtU();
const Operator* I32x4GeU();
const Operator* I32x4Abs();
const Operator* I32x4BitMask();
const Operator* I16x8Splat();
const Operator* I16x8ExtractLaneU(int32_t);
@@ -666,6 +667,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* I16x8GeU();
const Operator* I16x8RoundingAverageU();
const Operator* I16x8Abs();
const Operator* I16x8BitMask();
const Operator* I8x16Splat();
const Operator* I8x16ExtractLaneU(int32_t);
@@ -697,6 +699,7 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* I8x16GeU();
const Operator* I8x16RoundingAverageU();
const Operator* I8x16Abs();
const Operator* I8x16BitMask();
const Operator* S128Load();
const Operator* S128Store();


@@ -840,6 +840,7 @@
V(I32x4GtU) \
V(I32x4GeU) \
V(I32x4Abs) \
V(I32x4BitMask) \
V(I16x8Splat) \
V(I16x8ExtractLaneU) \
V(I16x8ExtractLaneS) \
@@ -878,6 +879,7 @@
V(I16x8GeU) \
V(I16x8RoundingAverageU) \
V(I16x8Abs) \
V(I16x8BitMask) \
V(I8x16Splat) \
V(I8x16ExtractLaneU) \
V(I8x16ExtractLaneS) \
@@ -911,6 +913,7 @@
V(I8x16GeU) \
V(I8x16RoundingAverageU) \
V(I8x16Abs) \
V(I8x16BitMask) \
V(S128Load) \
V(S128Store) \
V(S128Zero) \


@@ -4368,6 +4368,8 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
inputs[1]);
case wasm::kExprI32x4Abs:
return graph()->NewNode(mcgraph()->machine()->I32x4Abs(), inputs[0]);
case wasm::kExprI32x4BitMask:
return graph()->NewNode(mcgraph()->machine()->I32x4BitMask(), inputs[0]);
case wasm::kExprI16x8Splat:
return graph()->NewNode(mcgraph()->machine()->I16x8Splat(), inputs[0]);
case wasm::kExprI16x8SConvertI8x16Low:
@@ -4470,6 +4472,8 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
inputs[0], inputs[1]);
case wasm::kExprI16x8Abs:
return graph()->NewNode(mcgraph()->machine()->I16x8Abs(), inputs[0]);
case wasm::kExprI16x8BitMask:
return graph()->NewNode(mcgraph()->machine()->I16x8BitMask(), inputs[0]);
case wasm::kExprI8x16Splat:
return graph()->NewNode(mcgraph()->machine()->I8x16Splat(), inputs[0]);
case wasm::kExprI8x16Neg:
@@ -4557,6 +4561,8 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
inputs[0], inputs[1]);
case wasm::kExprI8x16Abs:
return graph()->NewNode(mcgraph()->machine()->I8x16Abs(), inputs[0]);
case wasm::kExprI8x16BitMask:
return graph()->NewNode(mcgraph()->machine()->I8x16BitMask(), inputs[0]);
case wasm::kExprS128And:
return graph()->NewNode(mcgraph()->machine()->S128And(), inputs[0],
inputs[1]);


@@ -26,6 +26,7 @@
#include "src/wasm/wasm-limits.h"
#include "src/wasm/wasm-module.h"
#include "src/wasm/wasm-objects-inl.h"
#include "src/wasm/wasm-opcodes.h"
#include "src/zone/accounting-allocator.h"
#include "src/zone/zone-containers.h"
@@ -2379,6 +2380,26 @@ class ThreadImpl {
UNOP_CASE(I8x16Neg, i8x16, int16, 16, base::NegateWithWraparound(a))
UNOP_CASE(I8x16Abs, i8x16, int16, 16, std::abs(a))
#undef UNOP_CASE
// The cast to double in the call to signbit works around an MSVC issue;
// see https://github.com/microsoft/STL/issues/519.
#define BITMASK_CASE(op, name, stype, count) \
case kExpr##op: { \
WasmValue v = Pop(); \
stype s = v.to_s128().to_##name(); \
int32_t res = 0; \
for (size_t i = 0; i < count; ++i) { \
bool sign = std::signbit(static_cast<double>(s.val[i])); \
res |= (sign << i); \
} \
Push(WasmValue(res)); \
return true; \
}
BITMASK_CASE(I8x16BitMask, i8x16, int16, 16)
BITMASK_CASE(I16x8BitMask, i16x8, int8, 8)
BITMASK_CASE(I32x4BitMask, i32x4, int4, 4)
#undef BITMASK_CASE
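
Expanded by hand for the i32x4 case (a mechanical expansion of the macro
above, reformatted for readability), the interpreter logic reads:

case kExprI32x4BitMask: {
  WasmValue v = Pop();
  int4 s = v.to_s128().to_i32x4();
  int32_t res = 0;
  for (size_t i = 0; i < 4; ++i) {
    // signbit is true exactly for negative lanes; the double cast is the
    // MSVC workaround noted above.
    bool sign = std::signbit(static_cast<double>(s.val[i]));
    res |= (sign << i);
  }
  Push(WasmValue(res));
  return true;
}
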
#define CMPOP_CASE(op, name, stype, out_stype, count, expr) \
case kExpr##op: { \
WasmValue v2 = Pop(); \


@@ -317,6 +317,10 @@ const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
CASE_I16x8_OP(Abs, "abs")
CASE_I32x4_OP(Abs, "abs")
CASE_I8x16_OP(BitMask, "bitmask")
CASE_I16x8_OP(BitMask, "bitmask")
CASE_I32x4_OP(BitMask, "bitmask")
// Atomic operations.
CASE_OP(AtomicNotify, "atomic.notify")
CASE_INT_OP(AtomicWait, "atomic.wait")


@@ -453,6 +453,9 @@ bool IsJSCompatibleSignature(const FunctionSig* sig, const WasmFeatures&);
V(I16x8AddHoriz, 0xfdbd, s_ss) \
V(I32x4AddHoriz, 0xfdbe, s_ss) \
V(F32x4AddHoriz, 0xfdbf, s_ss) \
V(I8x16BitMask, 0xfde4, i_s) \
V(I16x8BitMask, 0xfde5, i_s) \
V(I32x4BitMask, 0xfde6, i_s) \
V(F32x4RecipApprox, 0xfdee, s_s) \
V(F32x4RecipSqrtApprox, 0xfdef, s_s)


@@ -1658,6 +1658,68 @@ WASM_SIMD_TEST(I16x8ReplaceLane) {
}
}
#if V8_TARGET_ARCH_ARM64
WASM_SIMD_TEST_NO_LOWERING(I8x16BitMask) {
FLAG_SCOPE(wasm_simd_post_mvp);
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
byte value1 = r.AllocateLocal(kWasmS128);
BUILD(r, WASM_SET_LOCAL(value1, WASM_SIMD_I8x16_SPLAT(WASM_GET_LOCAL(0))),
WASM_SET_LOCAL(value1, WASM_SIMD_I8x16_REPLACE_LANE(
0, WASM_GET_LOCAL(value1), WASM_I32V(0))),
WASM_SET_LOCAL(value1, WASM_SIMD_I8x16_REPLACE_LANE(
1, WASM_GET_LOCAL(value1), WASM_I32V(-1))),
WASM_SIMD_UNOP(kExprI8x16BitMask, WASM_GET_LOCAL(value1)));
FOR_INT8_INPUTS(x) {
int32_t actual = r.Call(x);
// Lane 0 is always 0 (sign bit clear), lane 1 is always -1 (sign bit set).
int32_t expected = std::signbit(x) ? 0xFFFE : 0x0002;
CHECK_EQ(actual, expected);
}
}
WASM_SIMD_TEST_NO_LOWERING(I16x8BitMask) {
FLAG_SCOPE(wasm_simd_post_mvp);
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
byte value1 = r.AllocateLocal(kWasmS128);
BUILD(r, WASM_SET_LOCAL(value1, WASM_SIMD_I16x8_SPLAT(WASM_GET_LOCAL(0))),
WASM_SET_LOCAL(value1, WASM_SIMD_I16x8_REPLACE_LANE(
0, WASM_GET_LOCAL(value1), WASM_I32V(0))),
WASM_SET_LOCAL(value1, WASM_SIMD_I16x8_REPLACE_LANE(
1, WASM_GET_LOCAL(value1), WASM_I32V(-1))),
WASM_SIMD_UNOP(kExprI16x8BitMask, WASM_GET_LOCAL(value1)));
FOR_INT16_INPUTS(x) {
int32_t actual = r.Call(x);
// Lane 0 is always 0 (sign bit clear), lane 1 is always -1 (sign bit set).
int32_t expected = std::signbit(x) ? 0xFE : 2;
CHECK_EQ(actual, expected);
}
}
WASM_SIMD_TEST_NO_LOWERING(I32x4BitMask) {
FLAG_SCOPE(wasm_simd_post_mvp);
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
byte value1 = r.AllocateLocal(kWasmS128);
BUILD(r, WASM_SET_LOCAL(value1, WASM_SIMD_I32x4_SPLAT(WASM_GET_LOCAL(0))),
WASM_SET_LOCAL(value1, WASM_SIMD_I32x4_REPLACE_LANE(
0, WASM_GET_LOCAL(value1), WASM_I32V(0))),
WASM_SET_LOCAL(value1, WASM_SIMD_I32x4_REPLACE_LANE(
1, WASM_GET_LOCAL(value1), WASM_I32V(-1))),
WASM_SIMD_UNOP(kExprI32x4BitMask, WASM_GET_LOCAL(value1)));
FOR_INT32_INPUTS(x) {
int32_t actual = r.Call(x);
// Lane 0 is always 0 (sign bit clear), lane 1 is always -1 (sign bit set).
int32_t expected = std::signbit(x) ? 0xE : 2;
CHECK_EQ(actual, expected);
}
}
#endif // V8_TARGET_ARCH_ARM64
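
The expected values in these tests follow directly from the lane setup:
lane 0 is forced to 0 (bit 0 clear), lane 1 to -1 (bit 1 set), and every
remaining lane carries the sign of the splatted input. A small sketch of
that arithmetic in plain C++ (the helper is illustrative, not part of the
test infrastructure):

// Expected mask for a vector with lane 0 = 0, lane 1 = -1, and lanes
// 2..lane_count-1 all equal to the input value.
int32_t ExpectedBitMask(int lane_count, bool input_is_negative) {
  int32_t all_lanes = (1 << lane_count) - 1;          // e.g. 0xFFFF for i8x16
  return input_is_negative ? (all_lanes & ~1) : 0x2;  // 0xFFFE/0xFE/0xE vs 2
}
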
WASM_SIMD_TEST(I8x16Splat) {
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
// Set up a global to hold output vector.