[wasm-simd] Fix unsigned narrow instructions
These instructions should always treat inputs as signed, and saturate to unsigned min/max values. E.g. given -1, it should saturate to 0. The spec text, https://github.com/WebAssembly/simd/blob/master/proposals/simd/SIMD.md#integer-to-integer-narrowing, has been updated to describe this.

The changes here include codegen changes to ia32, x64, arm, and arm64, plus changes to the arm simulator, assembler, and disassembler to handle the case of treating input as signed and narrowing to unsigned. The vqmovn instruction can handle this case; our assembler just wasn't allowing callers to specify it. The interpreter and scalar lowering are also fixed with this change.

Bug: v8:9729
Change-Id: I6f72baa825f59037f7754485df6a2964af59fe31
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1879423
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Reviewed-by: Michael Starzinger <mstarzinger@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65051}
Parent: d30ec8566b
Commit: e927764216
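The corrected semantics can be modeled in a few lines of scalar C++. This is an illustrative sketch of the spec behavior, not code from this change; SaturatingNarrow is a hypothetical helper name:

#include <algorithm>
#include <cstdint>
#include <limits>

// Narrowing always interprets the input lane as signed, then saturates
// to the destination type's range, whether that type is signed or unsigned.
template <typename Dst>
Dst SaturatingNarrow(int64_t lane) {
  int64_t min = std::numeric_limits<Dst>::min();
  int64_t max = std::numeric_limits<Dst>::max();
  return static_cast<Dst>(std::max(min, std::min(max, lane)));
}

// SaturatingNarrow<uint8_t>(-1) == 0 (not 255);
// SaturatingNarrow<uint8_t>(300) == 255.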
@@ -3690,17 +3690,19 @@ void Assembler::vmovl(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src) {
        0xA * B8 | m * B5 | B4 | vm);
 }
 
-void Assembler::vqmovn(NeonDataType dt, DwVfpRegister dst, QwNeonRegister src) {
+void Assembler::vqmovn(NeonDataType dst_dt, NeonDataType src_dt,
+                       DwVfpRegister dst, QwNeonRegister src) {
   // Instruction details available in ARM DDI 0406C.b, A8.8.1004.
   // vqmovn.<type><size> Dd, Qm. ARM vector narrowing move with saturation.
+  // vqmovun.<type><size> Dd, Qm. Same as above, but produces unsigned results.
   DCHECK(IsEnabled(NEON));
+  DCHECK_IMPLIES(NeonU(src_dt), NeonU(dst_dt));
   int vd, d;
   dst.split_code(&vd, &d);
   int vm, m;
   src.split_code(&vm, &m);
-  int size = NeonSz(dt);
-  int u = NeonU(dt);
-  int op = u != 0 ? 3 : 2;
+  int size = NeonSz(dst_dt);
+  int op = NeonU(src_dt) ? 0b11 : NeonU(dst_dt) ? 0b01 : 0b10;
   emit(0x1E7U * B23 | d * B22 | 0x3 * B20 | size * B18 | 0x2 * B16 | vd * B12 |
        0x2 * B8 | op * B6 | m * B5 | vm);
 }
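For reference, the op field computed above selects among the three narrowing variants. This enum is an informal summary of the encoding (consistent with the disassembler change below), not a type from the V8 sources:

// Bits 7:6 of the NEON saturating-narrow encoding.
enum NarrowOpField {
  kVqmovun = 0b01,  // signed input, saturated to unsigned range (vqmovun)
  kVqmovnS = 0b10,  // signed input, saturated to signed range (vqmovn.s)
  kVqmovnU = 0b11,  // unsigned input, saturated to unsigned range (vqmovn.u)
};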
@@ -843,8 +843,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
             const NeonMemOperand& dst);
   // dt represents the narrower type
   void vmovl(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src);
-  // dt represents the narrower type.
-  void vqmovn(NeonDataType dt, DwVfpRegister dst, QwNeonRegister src);
+  // dst_dt represents the narrower type, src_dt represents the src type.
+  void vqmovn(NeonDataType dst_dt, NeonDataType src_dt, DwVfpRegister dst,
+              QwNeonRegister src);
 
   // Only unconditional core <-> scalar moves are currently supported.
   void vmov(NeonDataType dt, DwVfpRegister dst, int index, Register src);
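How the two data types map to emitted instructions, inferred from the disassembler tests later in this diff (register choices here are arbitrary):

__ vqmovn(NeonS8, NeonS8, d0, q1);  // vqmovn.s16  d0, q1
__ vqmovn(NeonU8, NeonU8, d0, q1);  // vqmovn.u16  d0, q1
__ vqmovn(NeonU8, NeonS8, d0, q1);  // vqmovun.s16 d0, q1 (the new case)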
@@ -459,20 +459,20 @@ void ComputePoisonedAddressForLoad(CodeGenerator* codegen,
     DCHECK_EQ(LeaveCC, i.OutputSBit());               \
   } while (0)
 
-#define ASSEMBLE_NEON_NARROWING_OP(dt)                \
+#define ASSEMBLE_NEON_NARROWING_OP(dt, sdt)           \
   do {                                                \
     Simd128Register dst = i.OutputSimd128Register(),  \
                     src0 = i.InputSimd128Register(0), \
                     src1 = i.InputSimd128Register(1); \
     if (dst == src0 && dst == src1) {                 \
-      __ vqmovn(dt, dst.low(), src0);                 \
+      __ vqmovn(dt, sdt, dst.low(), src0);            \
       __ vmov(dst.high(), dst.low());                 \
     } else if (dst == src0) {                         \
-      __ vqmovn(dt, dst.low(), src0);                 \
-      __ vqmovn(dt, dst.high(), src1);                \
+      __ vqmovn(dt, sdt, dst.low(), src0);            \
+      __ vqmovn(dt, sdt, dst.high(), src1);           \
     } else {                                          \
-      __ vqmovn(dt, dst.high(), src1);                \
-      __ vqmovn(dt, dst.low(), src0);                 \
+      __ vqmovn(dt, sdt, dst.high(), src1);           \
+      __ vqmovn(dt, sdt, dst.low(), src0);            \
     }                                                 \
   } while (0)

@@ -2259,7 +2259,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kArmI16x8SConvertI32x4:
-      ASSEMBLE_NEON_NARROWING_OP(NeonS16);
+      ASSEMBLE_NEON_NARROWING_OP(NeonS16, NeonS16);
       break;
     case kArmI16x8Add: {
       __ vadd(Neon16, i.OutputSimd128Register(), i.InputSimd128Register(0),

@@ -2343,7 +2343,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kArmI16x8UConvertI32x4:
-      ASSEMBLE_NEON_NARROWING_OP(NeonU16);
+      ASSEMBLE_NEON_NARROWING_OP(NeonU16, NeonS16);
       break;
     case kArmI16x8AddSaturateU: {
       __ vqadd(NeonU16, i.OutputSimd128Register(), i.InputSimd128Register(0),

@@ -2415,7 +2415,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kArmI8x16SConvertI16x8:
-      ASSEMBLE_NEON_NARROWING_OP(NeonS8);
+      ASSEMBLE_NEON_NARROWING_OP(NeonS8, NeonS8);
      break;
     case kArmI8x16Add: {
       __ vadd(Neon8, i.OutputSimd128Register(), i.InputSimd128Register(0),

@@ -2485,7 +2485,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kArmI8x16UConvertI16x8:
-      ASSEMBLE_NEON_NARROWING_OP(NeonU8);
+      ASSEMBLE_NEON_NARROWING_OP(NeonU8, NeonS8);
       break;
     case kArmI8x16AddSaturateU: {
       __ vqadd(NeonU8, i.OutputSimd128Register(), i.InputSimd128Register(0),
@@ -2245,8 +2245,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
         __ Mov(temp, src1.V4S());
         src1 = temp;
       }
-      __ Uqxtn(dst.V4H(), src0.V4S());
-      __ Uqxtn2(dst.V8H(), src1.V4S());
+      __ Sqxtun(dst.V4H(), src0.V4S());
+      __ Sqxtun2(dst.V8H(), src1.V4S());
       break;
     }
     SIMD_BINOP_CASE(kArm64I16x8AddSaturateU, Uqadd, 8H);

@@ -2347,8 +2347,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
         __ Mov(temp, src1.V8H());
         src1 = temp;
       }
-      __ Uqxtn(dst.V8B(), src0.V8H());
-      __ Uqxtn2(dst.V16B(), src1.V8H());
+      __ Sqxtun(dst.V8B(), src0.V8H());
+      __ Sqxtun2(dst.V16B(), src1.V8H());
       break;
     }
     SIMD_BINOP_CASE(kArm64I8x16AddSaturateU, Uqadd, 16B);
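The arm64 fix swaps Uqxtn (unsigned saturating extract narrow) for Sqxtun (signed saturating extract unsigned narrow). A scalar sketch of the difference for one 32-to-16 lane, with hypothetical helper names:

#include <cstdint>

// Old behavior: the wide lane is read as unsigned, so an int32 -1
// (0xFFFFFFFF) saturates to 0xFFFF.
uint16_t UqxtnLane(uint32_t v) {
  return v > 0xFFFF ? 0xFFFF : static_cast<uint16_t>(v);
}

// New behavior: the wide lane is read as signed and saturated to
// [0, 0xFFFF], so -1 becomes 0 as the spec now requires.
uint16_t SqxtunLane(int32_t v) {
  if (v < 0) return 0;
  if (v > 0xFFFF) return 0xFFFF;
  return static_cast<uint16_t>(v);
}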
@@ -3063,25 +3063,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       case kSSEI16x8UConvertI32x4: {
         DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
         CpuFeatureScope sse_scope(tasm(), SSE4_1);
-        XMMRegister dst = i.OutputSimd128Register();
-        // Change negative lanes to 0x7FFFFFFF
-        __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
-        __ psrld(kScratchDoubleReg, 1);
-        __ pminud(dst, kScratchDoubleReg);
-        __ pminud(kScratchDoubleReg, i.InputOperand(1));
-        __ packusdw(dst, kScratchDoubleReg);
+        __ packusdw(i.OutputSimd128Register(), i.InputOperand(1));
         break;
       }
       case kAVXI16x8UConvertI32x4: {
         DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
         CpuFeatureScope avx_scope(tasm(), AVX);
         XMMRegister dst = i.OutputSimd128Register();
-        // Change negative lanes to 0x7FFFFFFF
-        __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
-        __ vpsrld(kScratchDoubleReg, kScratchDoubleReg, 1);
-        __ vpminud(dst, kScratchDoubleReg, i.InputSimd128Register(0));
-        __ vpminud(kScratchDoubleReg, kScratchDoubleReg, i.InputOperand(1));
-        __ vpackusdw(dst, dst, kScratchDoubleReg);
+        __ vpackusdw(dst, dst, i.InputOperand(1));
         break;
       }
       case kSSEI16x8AddSaturateU: {

@@ -3481,24 +3470,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
         DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
         CpuFeatureScope sse_scope(tasm(), SSE4_1);
         XMMRegister dst = i.OutputSimd128Register();
-        // Change negative lanes to 0x7FFF
-        __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
-        __ psrlw(kScratchDoubleReg, 1);
-        __ pminuw(dst, kScratchDoubleReg);
-        __ pminuw(kScratchDoubleReg, i.InputOperand(1));
-        __ packuswb(dst, kScratchDoubleReg);
+        __ packuswb(dst, i.InputOperand(1));
         break;
       }
       case kAVXI8x16UConvertI16x8: {
         DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
         CpuFeatureScope avx_scope(tasm(), AVX);
         XMMRegister dst = i.OutputSimd128Register();
-        // Change negative lanes to 0x7FFF
-        __ vpcmpeqw(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
-        __ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 1);
-        __ vpminuw(dst, kScratchDoubleReg, i.InputSimd128Register(0));
-        __ vpminuw(kScratchDoubleReg, kScratchDoubleReg, i.InputOperand(1));
-        __ vpackuswb(dst, dst, kScratchDoubleReg);
+        __ vpackuswb(dst, dst, i.InputOperand(1));
         break;
       }
       case kSSEI8x16AddSaturateU: {
@@ -3306,13 +3306,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     case kX64I16x8UConvertI32x4: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
-      XMMRegister dst = i.OutputSimd128Register();
-      // Change negative lanes to 0x7FFFFFFF
-      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
-      __ psrld(kScratchDoubleReg, 1);
-      __ pminud(dst, kScratchDoubleReg);
-      __ pminud(kScratchDoubleReg, i.InputSimd128Register(1));
-      __ packusdw(dst, kScratchDoubleReg);
+      __ packusdw(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
     case kX64I16x8AddSaturateU: {

@@ -3524,13 +3518,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     case kX64I8x16UConvertI16x8: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
-      XMMRegister dst = i.OutputSimd128Register();
-      // Change negative lanes to 0x7FFF
-      __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
-      __ psrlw(kScratchDoubleReg, 1);
-      __ pminuw(dst, kScratchDoubleReg);
-      __ pminuw(kScratchDoubleReg, i.InputSimd128Register(1));
-      __ packuswb(dst, kScratchDoubleReg);
+      __ packuswb(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
     case kX64I8x16ShrU: {
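The ia32 and x64 simplifications rely on packusdw/packuswb already implementing exactly the required semantics: lanes are read as signed and saturated to the unsigned destination range. The deleted pminud/pminuw pre-clamp turned negative lanes into large positive values (per the removed "Change negative lanes to 0x7FFFFFFF" comments), so -1 wrongly narrowed to the unsigned maximum. A sketch with SSE4.1 intrinsics (illustrative, not V8 code):

#include <smmintrin.h>  // SSE4.1

// packusdw: each signed 32-bit lane saturates to [0, 0xFFFF];
// -1 becomes 0 and 0x10000 becomes 0xFFFF with no pre-clamping.
__m128i NarrowI32x4ToU16x8(__m128i lo, __m128i hi) {
  return _mm_packus_epi32(lo, hi);
}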
@@ -779,11 +779,9 @@ void SimdScalarLowering::LowerPack(Node* node, SimdType input_rep_type,
   DCHECK_EQ(2, node->InputCount());
   Node** rep_left = GetReplacementsWithType(node->InputAt(0), input_rep_type);
   Node** rep_right = GetReplacementsWithType(node->InputAt(1), input_rep_type);
-  const Operator* less_op =
-      is_signed ? machine()->Int32LessThan() : machine()->Uint32LessThan();
+  const Operator* less_op = machine()->Int32LessThan();
   Node* min = nullptr;
   Node* max = nullptr;
-  int32_t shift_val = 0;
   MachineRepresentation phi_rep;
   if (output_rep_type == SimdType::kInt16x8) {
     DCHECK(input_rep_type == SimdType::kInt32x4);

@@ -791,8 +789,8 @@ void SimdScalarLowering::LowerPack(Node* node, SimdType input_rep_type,
       min = mcgraph_->Int32Constant(std::numeric_limits<int16_t>::min());
       max = mcgraph_->Int32Constant(std::numeric_limits<int16_t>::max());
     } else {
+      min = mcgraph_->Int32Constant(std::numeric_limits<uint16_t>::min());
       max = mcgraph_->Uint32Constant(std::numeric_limits<uint16_t>::max());
-      shift_val = kShift16;
     }
     phi_rep = MachineRepresentation::kWord16;
   } else {

@@ -802,8 +800,8 @@ void SimdScalarLowering::LowerPack(Node* node, SimdType input_rep_type,
       min = mcgraph_->Int32Constant(std::numeric_limits<int8_t>::min());
       max = mcgraph_->Int32Constant(std::numeric_limits<int8_t>::max());
     } else {
+      min = mcgraph_->Int32Constant(std::numeric_limits<uint8_t>::min());
       max = mcgraph_->Uint32Constant(std::numeric_limits<uint8_t>::max());
-      shift_val = kShift8;
     }
     phi_rep = MachineRepresentation::kWord8;
   }

@@ -815,14 +813,10 @@ void SimdScalarLowering::LowerPack(Node* node, SimdType input_rep_type,
       input = rep_left[i];
     else
       input = rep_right[i - num_lanes / 2];
-    if (is_signed) {
-      Diamond d_min(graph(), common(), graph()->NewNode(less_op, input, min));
-      input = d_min.Phi(phi_rep, min, input);
-    }
+    Diamond d_min(graph(), common(), graph()->NewNode(less_op, input, min));
+    input = d_min.Phi(phi_rep, min, input);
     Diamond d_max(graph(), common(), graph()->NewNode(less_op, max, input));
     rep_node[i] = d_max.Phi(phi_rep, max, input);
-    rep_node[i] =
-        is_signed ? rep_node[i] : FixUpperBits(rep_node[i], shift_val);
   }
   ReplaceNode(node, rep_node, num_lanes);
 }
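After this change the lowered per-lane logic is identical for signed and unsigned narrowing: two signed comparisons against the destination type's bounds. In scalar form, roughly (a sketch of the graph the two Diamonds build, not actual lowering code):

// less_op is always Int32LessThan because inputs are always treated
// as signed now.
int32_t ClampLane(int32_t input, int32_t min, int32_t max) {
  if (input < min) input = min;  // d_min: Phi(min, input)
  if (max < input) input = max;  // d_max: Phi(max, input)
  return input;
}
// For i16x8 unsigned narrowing: min = 0, max = 0xFFFF, so -1 clamps to 0.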
@@ -2235,13 +2235,15 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
       PrintDRegister(Vm);
     } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 8) == 0x2 &&
                instr->Bits(7, 6) != 0) {
-      // vqmovn.<type><size> Dd, Qm.
+      // vqmov{u}n.<type><size> Dd, Qm.
       int Vd = instr->VFPDRegValue(kDoublePrecision);
       int Vm = instr->VFPMRegValue(kSimd128Precision);
-      char type = instr->Bit(6) != 0 ? 'u' : 's';
+      int op = instr->Bits(7, 6);
+      const char* name = op == 0b01 ? "vqmovun" : "vqmovn";
+      char type = op == 0b11 ? 'u' : 's';
       int size = 2 * kBitsPerByte * (1 << instr->Bits(19, 18));
       out_buffer_pos_ +=
-          SNPrintF(out_buffer_ + out_buffer_pos_, "vqmovn.%c%i d%d, q%d",
+          SNPrintF(out_buffer_ + out_buffer_pos_, "%s.%c%i d%d, q%d", name,
                    type, size, Vd, Vm);
     } else {
       int Vd, Vm;
@@ -3912,6 +3912,18 @@ void SaturatingNarrow(Simulator* simulator, int Vd, int Vm) {
   simulator->set_neon_register<U, kDoubleSize>(Vd, dst);
 }
 
+template <typename T, typename U>
+void SaturatingUnsignedNarrow(Simulator* simulator, int Vd, int Vm) {
+  static const int kLanes = 16 / sizeof(T);
+  T src[kLanes];
+  U dst[kLanes];
+  simulator->get_neon_register(Vm, src);
+  for (int i = 0; i < kLanes; i++) {
+    dst[i] = Clamp<U>(src[i]);
+  }
+  simulator->set_neon_register<U, kDoubleSize>(Vd, dst);
+}
+
 template <typename T>
 void AddSaturate(Simulator* simulator, int Vd, int Vm, int Vn) {
   static const int kLanes = 16 / sizeof(T);

@@ -5332,27 +5344,35 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
       int Vd = instr->VFPDRegValue(kDoublePrecision);
       int Vm = instr->VFPMRegValue(kSimd128Precision);
       NeonSize size = static_cast<NeonSize>(instr->Bits(19, 18));
-      bool is_unsigned = instr->Bit(6) != 0;
+      bool dst_unsigned = instr->Bit(6) != 0;
+      bool src_unsigned = instr->Bits(7, 6) == 0b11;
+      DCHECK_IMPLIES(src_unsigned, dst_unsigned);
       switch (size) {
         case Neon8: {
-          if (is_unsigned) {
+          if (src_unsigned) {
             SaturatingNarrow<uint16_t, uint8_t>(this, Vd, Vm);
+          } else if (dst_unsigned) {
+            SaturatingUnsignedNarrow<int16_t, uint8_t>(this, Vd, Vm);
           } else {
             SaturatingNarrow<int16_t, int8_t>(this, Vd, Vm);
           }
           break;
         }
         case Neon16: {
-          if (is_unsigned) {
+          if (src_unsigned) {
             SaturatingNarrow<uint32_t, uint16_t>(this, Vd, Vm);
+          } else if (dst_unsigned) {
+            SaturatingUnsignedNarrow<int32_t, uint16_t>(this, Vd, Vm);
           } else {
             SaturatingNarrow<int32_t, int16_t>(this, Vd, Vm);
           }
           break;
         }
         case Neon32: {
-          if (is_unsigned) {
+          if (src_unsigned) {
             SaturatingNarrow<uint64_t, uint32_t>(this, Vd, Vm);
+          } else if (dst_unsigned) {
+            SaturatingUnsignedNarrow<int64_t, uint32_t>(this, Vd, Vm);
           } else {
             SaturatingNarrow<int64_t, int32_t>(this, Vd, Vm);
           }
@@ -2515,34 +2515,28 @@ class ThreadImpl {
       CONVERT_CASE(I16x8UConvertI8x16Low, int16, i8x16, int8, 8, 0, uint8_t,
                    a)
 #undef CONVERT_CASE
-#define PACK_CASE(op, src_type, name, dst_type, count, ctype, dst_ctype,   \
-                  is_unsigned)                                             \
-  case kExpr##op: {                                                        \
-    WasmValue v2 = Pop();                                                  \
-    WasmValue v1 = Pop();                                                  \
-    src_type s1 = v1.to_s128().to_##name();                                \
-    src_type s2 = v2.to_s128().to_##name();                                \
-    dst_type res;                                                          \
-    int64_t min = std::numeric_limits<ctype>::min();                       \
-    int64_t max = std::numeric_limits<ctype>::max();                       \
-    for (size_t i = 0; i < count; ++i) {                                   \
-      int32_t v = i < count / 2 ? s1.val[LANE(i, s1)]                      \
-                                : s2.val[LANE(i - count / 2, s2)];         \
-      int64_t a = is_unsigned ? static_cast<int64_t>(v & 0xFFFFFFFFu) : v; \
-      res.val[LANE(i, res)] =                                              \
-          static_cast<dst_ctype>(std::max(min, std::min(max, a)));         \
-    }                                                                      \
-    Push(WasmValue(Simd128(res)));                                         \
-    return true;                                                           \
-  }
+#define PACK_CASE(op, src_type, name, dst_type, count, ctype, dst_ctype) \
+  case kExpr##op: {                                                      \
+    WasmValue v2 = Pop();                                                \
+    WasmValue v1 = Pop();                                                \
+    src_type s1 = v1.to_s128().to_##name();                              \
+    src_type s2 = v2.to_s128().to_##name();                              \
+    dst_type res;                                                        \
+    int64_t min = std::numeric_limits<ctype>::min();                     \
+    int64_t max = std::numeric_limits<ctype>::max();                     \
+    for (size_t i = 0; i < count; ++i) {                                 \
+      int64_t v = i < count / 2 ? s1.val[LANE(i, s1)]                    \
+                                : s2.val[LANE(i - count / 2, s2)];       \
+      res.val[LANE(i, res)] =                                            \
+          static_cast<dst_ctype>(std::max(min, std::min(max, v)));       \
+    }                                                                    \
+    Push(WasmValue(Simd128(res)));                                       \
+    return true;                                                         \
+  }
-      PACK_CASE(I16x8SConvertI32x4, int4, i32x4, int8, 8, int16_t, int16_t,
-                false)
-      PACK_CASE(I16x8UConvertI32x4, int4, i32x4, int8, 8, uint16_t, int16_t,
-                true)
-      PACK_CASE(I8x16SConvertI16x8, int8, i16x8, int16, 16, int8_t, int8_t,
-                false)
-      PACK_CASE(I8x16UConvertI16x8, int8, i16x8, int16, 16, uint8_t, int8_t,
-                true)
+      PACK_CASE(I16x8SConvertI32x4, int4, i32x4, int8, 8, int16_t, int16_t)
+      PACK_CASE(I16x8UConvertI32x4, int4, i32x4, int8, 8, uint16_t, int16_t)
+      PACK_CASE(I8x16SConvertI16x8, int8, i16x8, int16, 16, int8_t, int8_t)
+      PACK_CASE(I8x16UConvertI16x8, int8, i16x8, int16, 16, uint8_t, int8_t)
 #undef PACK_CASE
       case kExprS128Select: {
         int4 bool_val = Pop().to_s128().to_i32x4();
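The interpreter macro's key change is widening the lane value to int64_t before clamping, which removes the is_unsigned masking: an int64_t can represent both any signed source lane and every destination bound, so one signed clamp covers all four pack cases. A minimal sketch (hypothetical helper, mirroring the macro body):

#include <algorithm>
#include <cstdint>
#include <limits>

uint16_t PackLaneU16(int32_t lane) {
  int64_t v = lane;  // value-preserving widening of the signed lane
  int64_t min = std::numeric_limits<uint16_t>::min();  // 0
  int64_t max = std::numeric_limits<uint16_t>::max();  // 65535
  return static_cast<uint16_t>(std::max(min, std::min(max, v)));
}
// PackLaneU16(-1) == 0; the old masking path clamped -1 to 65535.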
@@ -1335,12 +1335,12 @@ TEST(15) {
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmovl_s32))));
   __ vst1(Neon8, NeonListOperand(q3), NeonMemOperand(r4));
   // Narrow what we widened.
-  __ vqmovn(NeonU16, d0, q2);
+  __ vqmovn(NeonU16, NeonU16, d0, q2);
   __ vstr(d0, r0, offsetof(T, vqmovn_u16));
   __ vmov(d1, d0);
-  __ vqmovn(NeonS8, d2, q0);
+  __ vqmovn(NeonS8, NeonS8, d2, q0);
   __ vstr(d2, r0, offsetof(T, vqmovn_s8));
-  __ vqmovn(NeonS32, d4, q3);
+  __ vqmovn(NeonS32, NeonS32, d4, q3);
   __ vstr(d4, r0, offsetof(T, vqmovn_s32));
 
   // ARM core register to scalar.

@@ -997,9 +997,14 @@ TEST(Neon) {
   COMPARE(vmovl(NeonS16, q4, d2), "f2908a12       vmovl.s16 q4, d2");
   COMPARE(vmovl(NeonU32, q4, d2), "f3a08a12       vmovl.u32 q4, d2");
 
-  COMPARE(vqmovn(NeonU8, d16, q8), "f3f202e0       vqmovn.u16 d16, q8");
-  COMPARE(vqmovn(NeonS16, d16, q8), "f3f602a0       vqmovn.s32 d16, q8");
-  COMPARE(vqmovn(NeonU32, d2, q4), "f3ba22c8       vqmovn.u64 d2, q4");
+  COMPARE(vqmovn(NeonU8, NeonU8, d16, q8),
+          "f3f202e0       vqmovn.u16 d16, q8");
+  COMPARE(vqmovn(NeonS16, NeonS16, d16, q8),
+          "f3f602a0       vqmovn.s32 d16, q8");
+  COMPARE(vqmovn(NeonU32, NeonU32, d2, q4),
+          "f3ba22c8       vqmovn.u64 d2, q4");
+  COMPARE(vqmovn(NeonU32, NeonS32, d2, q4),
+          "f3ba2248       vqmovun.s64 d2, q4");
 
   COMPARE(vmov(NeonS8, d0, 0, r0), "ee400b10       vmov.8 d0[0], r0");
   COMPARE(vmov(NeonU8, d1, 1, r1), "ee411b30       vmov.8 d1[1], r1");

@@ -227,13 +227,6 @@ T Narrow(int64_t value) {
   return Clamp<T>(value);
 }
 
-template <typename T>
-T UnsignedNarrow(int64_t value) {
-  static_assert(sizeof(int64_t) > sizeof(T), "T must be int32_t or smaller");
-  using UnsignedT = typename std::make_unsigned<T>::type;
-  return static_cast<T>(Clamp<UnsignedT>(value & 0xFFFFFFFFu));
-}
-
 template <typename T>
 T AddSaturate(T a, T b) {
   return Clamp<T>(Widen(a) + Widen(b));

@@ -2034,7 +2027,7 @@ WASM_SIMD_TEST(I16x8ConvertI32x4) {
   FOR_INT32_INPUTS(x) {
     r.Call(x);
     int16_t expected_signed = Narrow<int16_t>(x);
-    int16_t expected_unsigned = UnsignedNarrow<int16_t>(x);
+    int16_t expected_unsigned = Narrow<uint16_t>(x);
     for (int i = 0; i < 8; i++) {
       CHECK_EQ(expected_signed, ReadLittleEndianValue<int16_t>(&g0[i]));
       CHECK_EQ(expected_unsigned, ReadLittleEndianValue<int16_t>(&g1[i]));

@@ -2277,7 +2270,7 @@ WASM_SIMD_TEST(I8x16ConvertI16x8) {
   FOR_INT16_INPUTS(x) {
     r.Call(x);
     int8_t expected_signed = Narrow<int8_t>(x);
-    int8_t expected_unsigned = UnsignedNarrow<int8_t>(x);
+    int8_t expected_unsigned = Narrow<uint8_t>(x);
     for (int i = 0; i < 16; i++) {
       CHECK_EQ(expected_signed, ReadLittleEndianValue<int8_t>(&g0[i]));
       CHECK_EQ(expected_unsigned, ReadLittleEndianValue<int8_t>(&g1[i]));