[wasm-simd] Share i8x16 shr_s shr_u implementation

Move the implementation into shared macro-assembler. TurboFan and
Liftoff for both ia32 and x64 can now share the implementation. No
functionality change expected.

Bug: v8:11589
Change-Id: I8d3567ef6e4a430fe8e007e44d5d55cf8e8a6a7a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3088273
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#76264}
Ng Zhi An 2021-08-12 10:01:33 -07:00 committed by V8 LUCI CQ
parent 682affed8d
commit 4955ecfc68
6 changed files with 151 additions and 187 deletions
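For context on what is being shared: SSE and AVX have packed 16-bit shifts (psraw/psrlw) but no packed 8-bit shifts, so wasm's i8x16.shr_s and i8x16.shr_u must be emulated. A minimal scalar sketch of the lane semantics the shared helpers implement (hypothetical reference code, not part of this CL):

#include <cstdint>

// i8x16.shr_s reference: per-lane arithmetic shift, count taken mod 8.
// Relies on >> of a negative int being an arithmetic shift, which holds
// on the toolchains V8 targets.
void I8x16ShrS_Reference(int8_t lanes[16], uint32_t count) {
  count &= 7;
  for (int i = 0; i < 16; i++) {
    lanes[i] = static_cast<int8_t>(lanes[i] >> count);
  }
}

// i8x16.shr_u reference: per-lane logical shift, count taken mod 8.
void I8x16ShrU_Reference(uint8_t lanes[16], uint32_t count) {
  count &= 7;
  for (int i = 0; i < 16; i++) {
    lanes[i] = static_cast<uint8_t>(lanes[i] >> count);
  }
}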

View File

@@ -6,6 +6,7 @@
#include "src/codegen/assembler.h"
#include "src/codegen/cpu-features.h"
#include "src/codegen/register-arch.h"
#if V8_TARGET_ARCH_IA32
#include "src/codegen/ia32/register-ia32.h"
@@ -18,6 +19,17 @@
namespace v8 {
namespace internal {
void SharedTurboAssembler::Move(Register dst, uint32_t src) {
// Helper to paper over the different assembler function names.
#if V8_TARGET_ARCH_IA32
mov(dst, Immediate(src));
#elif V8_TARGET_ARCH_X64
movl(dst, Immediate(src));
#else
#error Unsupported target architecture.
#endif
}
void SharedTurboAssembler::Move(Register dst, Register src) {
// Helper to paper over the different assembler function names.
if (dst != src) {
@@ -31,6 +43,17 @@ void SharedTurboAssembler::Move(Register dst, Register src) {
}
}
void SharedTurboAssembler::Add(Register dst, Immediate src) {
// Helper to paper over the different assembler function names.
#if V8_TARGET_ARCH_IA32
add(dst, src);
#elif V8_TARGET_ARCH_X64
addq(dst, src);
#else
#error Unsupported target architecture.
#endif
}
void SharedTurboAssembler::And(Register dst, Immediate src) {
// Helper to paper over the different assembler function names.
#if V8_TARGET_ARCH_IA32
@@ -231,6 +254,80 @@ void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
}
}
void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
uint8_t src2, XMMRegister tmp2) {
// Unpack bytes into words, do word (16-bit) shifts, and repack.
DCHECK_NE(dst, tmp2);
uint8_t shift = truncate_to_int3(src2) + 8;
Punpckhbw(tmp2, src1);
Punpcklbw(dst, src1);
Psraw(tmp2, shift);
Psraw(dst, shift);
Packsswb(dst, tmp2);
}
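// Why shifting the widened lanes by shift + 8 is correct: Punpck{l,h}bw
// puts each source byte in the *high* byte of a 16-bit lane, so an
// arithmetic word shift by shift + 8 both applies the byte shift and
// sign-extends the result into the low byte; Packsswb then narrows with
// signed saturation, which is lossless because each result already fits
// in int8. A hypothetical single-lane model (illustrative, not part of
// this CL):
static int8_t I8x16ShrSLaneModel(int8_t lane, uint8_t shift /* 0..7 */,
                                 uint8_t junk) {
  // punpcklbw: source byte into the high half, pre-existing junk below.
  int16_t word =
      static_cast<int16_t>((static_cast<uint8_t>(lane) << 8) | junk);
  // psraw by shift + 8: the junk bits fall off, the sign fills from above.
  int16_t shifted = static_cast<int16_t>(word >> (shift + 8));
  // packsswb: identity here since shifted is in [-128, 127]. For example,
  // lane == -100, shift == 3 yields -13 regardless of the junk byte.
  return static_cast<int8_t>(shifted);
}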
void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
Register src2, Register tmp1,
XMMRegister tmp2, XMMRegister tmp3) {
DCHECK(!AreAliased(dst, tmp2, tmp3));
DCHECK_NE(src1, tmp2);
// Unpack the bytes into words, do arithmetic shifts, and repack.
Punpckhbw(tmp2, src1);
Punpcklbw(dst, src1);
// Prepare shift value
Move(tmp1, src2);
// Take shift value modulo 8.
And(tmp1, Immediate(7));
Add(tmp1, Immediate(8));
Movd(tmp3, tmp1);
Psraw(tmp2, tmp3);
Psraw(dst, tmp3);
Packsswb(dst, tmp2);
}
void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
uint8_t src2, Register tmp1,
XMMRegister tmp2) {
DCHECK_NE(dst, tmp2);
if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
movaps(dst, src1);
src1 = dst;
}
// Perform 16-bit shift, then mask away high bits.
uint8_t shift = truncate_to_int3(src2);
Psrlw(dst, src1, shift);
uint8_t bmask = 0xff >> shift;
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
Move(tmp1, mask);
Movd(tmp2, tmp1);
Pshufd(tmp2, tmp2, byte{0});
Pand(dst, tmp2);
}
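// The unsigned immediate path skips the widen/repack dance: Psrlw shifts
// every byte lane correctly except that the adjacent lane's low bits leak
// into each byte's high bits, and Pand with the broadcast mask 0xff >> shift
// clears exactly those leaked bits. A hypothetical per-byte model
// (illustrative, not part of this CL):
static uint8_t I8x16ShrULaneModel(uint8_t lane, uint8_t neighbor,
                                  uint8_t shift /* 0..7 */) {
  // psrlw: the neighboring lane's low bits land in this lane's high bits.
  uint8_t shifted =
      static_cast<uint8_t>((lane >> shift) | (neighbor << (8 - shift)));
  // pand with the broadcast byte mask keeps only the genuine result bits.
  uint8_t bmask = static_cast<uint8_t>(0xFF >> shift);
  return static_cast<uint8_t>(shifted & bmask);  // == lane >> shift
}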
void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
Register src2, Register tmp1,
XMMRegister tmp2, XMMRegister tmp3) {
DCHECK(!AreAliased(dst, tmp2, tmp3));
DCHECK_NE(src1, tmp2);
// Unpack the bytes into words, do logical shifts, and repack.
Punpckhbw(tmp2, src1);
Punpcklbw(dst, src1);
// Prepare shift value.
Move(tmp1, src2);
// Take shift value modulo 8.
And(tmp1, Immediate(7));
Add(tmp1, Immediate(8));
Movd(tmp3, tmp1);
Psrlw(tmp2, tmp3);
Psrlw(dst, tmp3);
Packuswb(dst, tmp2);
}
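// For a variable count the 0xff >> shift mask would have to be built at run
// time, so both variable-count paths reuse the widen/shift/repack shape and
// differ only in Psraw vs. Psrlw and Packsswb vs. Packuswb. The unsigned
// repack never clamps: after a logical shift by shift + 8 every word lane is
// back in [0, 255]. An exhaustive check (hypothetical, not part of this CL):
static void CheckI8x16ShrURepackIsLossless() {
  for (int b = 0; b < 256; b++) {
    for (int s = 0; s < 8; s++) {
      uint16_t word = static_cast<uint16_t>(b << 8);  // byte in the high half
      uint16_t shifted = word >> (s + 8);             // Psrlw by s + 8
      DCHECK_LE(shifted, 0xFF);  // so Packuswb saturation is a no-op
      DCHECK_EQ(shifted, static_cast<uint16_t>(b >> s));
    }
  }
}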
void SharedTurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch,
bool is_signed) {

View File

@@ -33,8 +33,10 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
public:
using TurboAssemblerBase::TurboAssemblerBase;
void Move(Register dst, uint32_t src);
// Move if registers are not identical.
void Move(Register dst, Register src);
void Add(Register dst, Immediate src);
void And(Register dst, Immediate src);
void Movapd(XMMRegister dst, XMMRegister src);
@@ -298,6 +300,14 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
void F32x4Splat(XMMRegister dst, DoubleRegister src);
void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane);
void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
void I8x16ShrS(XMMRegister dst, XMMRegister src1, uint8_t src2,
XMMRegister tmp2);
void I8x16ShrS(XMMRegister dst, XMMRegister src1, Register src2,
Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
void I8x16ShrU(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1,
XMMRegister tmp2);
void I8x16ShrU(XMMRegister dst, XMMRegister src1, Register src2,
Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool is_signed);
void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2,

View File

@@ -3147,28 +3147,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kIA32I8x16ShrS: {
XMMRegister dst = i.OutputSimd128Register();
// TODO(zhin): remove this restriction from instruction-selector.
DCHECK_EQ(dst, i.InputSimd128Register(0));
if (HasImmediateInput(instr, 1)) {
__ Punpckhbw(kScratchDoubleReg, dst);
__ Punpcklbw(dst, dst);
uint8_t shift = i.InputInt3(1) + 8;
__ Psraw(kScratchDoubleReg, shift);
__ Psraw(dst, shift);
__ Packsswb(dst, kScratchDoubleReg);
__ I8x16ShrS(dst, i.InputSimd128Register(0), i.InputInt3(1),
kScratchDoubleReg);
} else {
Register tmp = i.ToRegister(instr->TempAt(0));
XMMRegister tmp_simd = i.TempSimd128Register(1);
// Unpack the bytes into words, do arithmetic shifts, and repack.
__ Punpckhbw(kScratchDoubleReg, dst);
__ Punpcklbw(dst, dst);
__ mov(tmp, i.InputRegister(1));
// Take shift value modulo 8.
__ and_(tmp, 7);
__ add(tmp, Immediate(8));
__ Movd(tmp_simd, tmp);
__ Psraw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
__ Psraw(dst, dst, tmp_simd);
__ Packsswb(dst, kScratchDoubleReg);
__ I8x16ShrS(dst, i.InputSimd128Register(0), i.InputRegister(1),
i.TempRegister(0), kScratchDoubleReg,
i.TempSimd128Register(1));
}
break;
}
@@ -3271,34 +3258,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kIA32I8x16ShrU: {
XMMRegister dst = i.OutputSimd128Register();
// TODO(zhin): remove this restriction from instruction-selector.
DCHECK_EQ(dst, i.InputSimd128Register(0));
Register tmp = i.ToRegister(instr->TempAt(0));
XMMRegister tmp_simd = i.TempSimd128Register(1);
if (HasImmediateInput(instr, 1)) {
// Perform 16-bit shift, then mask away high bits.
uint8_t shift = i.InputInt3(1);
__ Psrlw(dst, dst, byte{shift});
uint8_t bmask = 0xff >> shift;
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
__ mov(tmp, mask);
__ Movd(tmp_simd, tmp);
__ Pshufd(tmp_simd, tmp_simd, uint8_t{0});
__ Pand(dst, tmp_simd);
__ I8x16ShrU(dst, i.InputSimd128Register(0), i.InputInt3(1), tmp,
kScratchDoubleReg);
} else {
// Unpack the bytes into words, do logical shifts, and repack.
__ Punpckhbw(kScratchDoubleReg, dst);
__ Punpcklbw(dst, dst);
__ mov(tmp, i.InputRegister(1));
// Take shift value modulo 8.
__ and_(tmp, 7);
__ add(tmp, Immediate(8));
__ Movd(tmp_simd, tmp);
__ Psrlw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
__ Psrlw(dst, dst, tmp_simd);
__ Packuswb(dst, kScratchDoubleReg);
__ I8x16ShrU(dst, i.InputSimd128Register(0), i.InputRegister(1), tmp,
kScratchDoubleReg, i.TempSimd128Register(1));
}
break;
}
case kIA32I8x16MinU: {

View File

@@ -3528,30 +3528,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I8x16ShrS: {
XMMRegister dst = i.OutputSimd128Register();
// TODO(zhin): remove this restriction from instruction-selector.
DCHECK_EQ(dst, i.InputSimd128Register(0));
if (HasImmediateInput(instr, 1)) {
__ Punpckhbw(kScratchDoubleReg, dst);
__ Punpcklbw(dst, dst);
uint8_t shift = i.InputInt3(1) + 8;
__ Psraw(kScratchDoubleReg, shift);
__ Psraw(dst, shift);
__ Packsswb(dst, kScratchDoubleReg);
__ I8x16ShrS(dst, i.InputSimd128Register(0), i.InputInt3(1),
kScratchDoubleReg);
} else {
// Temp registers for shift mask and additional moves to XMM registers.
Register tmp = i.ToRegister(instr->TempAt(0));
XMMRegister tmp_simd = i.TempSimd128Register(1);
// Unpack the bytes into words, do arithmetic shifts, and repack.
__ Punpckhbw(kScratchDoubleReg, dst);
__ Punpcklbw(dst, dst);
// Prepare shift value
__ movq(tmp, i.InputRegister(1));
// Take shift value modulo 8.
__ andq(tmp, Immediate(7));
__ addq(tmp, Immediate(8));
__ Movq(tmp_simd, tmp);
__ Psraw(kScratchDoubleReg, tmp_simd);
__ Psraw(dst, tmp_simd);
__ Packsswb(dst, kScratchDoubleReg);
// TODO(zhin): use kScratchRegister instead of TempRegister.
__ I8x16ShrS(dst, i.InputSimd128Register(0), i.InputRegister(1),
i.TempRegister(0), kScratchDoubleReg,
i.TempSimd128Register(1));
}
break;
}
@@ -3607,34 +3593,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I8x16ShrU: {
XMMRegister dst = i.OutputSimd128Register();
// Unpack the bytes into words, do logical shifts, and repack.
// TODO(zhin): remove this restriction from instruction-selector.
DCHECK_EQ(dst, i.InputSimd128Register(0));
// Temp registers for shift mask and additional moves to XMM registers.
Register tmp = i.ToRegister(instr->TempAt(0));
XMMRegister tmp_simd = i.TempSimd128Register(1);
// TODO(zhin): use kScratchRegister instead of tmp.
Register tmp = i.TempRegister(0);
if (HasImmediateInput(instr, 1)) {
// Perform 16-bit shift, then mask away high bits.
uint8_t shift = i.InputInt3(1);
__ Psrlw(dst, byte{shift});
uint8_t bmask = 0xff >> shift;
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
__ movl(tmp, Immediate(mask));
__ Movd(tmp_simd, tmp);
__ Pshufd(tmp_simd, tmp_simd, byte{0});
__ Pand(dst, tmp_simd);
__ I8x16ShrU(dst, i.InputSimd128Register(0), i.InputInt3(1), tmp,
kScratchDoubleReg);
} else {
__ Punpckhbw(kScratchDoubleReg, dst);
__ Punpcklbw(dst, dst);
// Prepare shift value
__ movq(tmp, i.InputRegister(1));
// Take shift value modulo 8.
__ andq(tmp, Immediate(7));
__ addq(tmp, Immediate(8));
__ Movq(tmp_simd, tmp);
__ Psrlw(kScratchDoubleReg, tmp_simd);
__ Psrlw(dst, tmp_simd);
__ Packuswb(dst, kScratchDoubleReg);
__ I8x16ShrU(dst, i.InputSimd128Register(0), i.InputRegister(1), tmp,
kScratchDoubleReg, i.TempSimd128Register(1));
}
break;
}

View File

@@ -2718,40 +2718,6 @@ void EmitSimdShiftOpImm(LiftoffAssembler* assm, LiftoffRegister dst,
}
}
enum class ShiftSignedness { kSigned, kUnsigned };
template <bool is_signed>
void EmitI8x16Shr(LiftoffAssembler* assm, LiftoffRegister dst,
LiftoffRegister lhs, LiftoffRegister rhs) {
// Same algorithm is used for both signed and unsigned shifts, the only
// difference is the actual shift and pack in the end. This is the same
// algorithm as used in code-generator-ia32.cc
Register tmp =
assm->GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(rhs)).gp();
XMMRegister tmp_simd =
assm->GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs)).fp();
// Unpack the bytes into words, do logical shifts, and repack.
assm->Punpckhbw(liftoff::kScratchDoubleReg, lhs.fp());
assm->Punpcklbw(dst.fp(), lhs.fp());
assm->mov(tmp, rhs.gp());
// Take shift value modulo 8.
assm->and_(tmp, 7);
assm->add(tmp, Immediate(8));
assm->Movd(tmp_simd, tmp);
if (is_signed) {
assm->Psraw(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg,
tmp_simd);
assm->Psraw(dst.fp(), dst.fp(), tmp_simd);
assm->Packsswb(dst.fp(), liftoff::kScratchDoubleReg);
} else {
assm->Psrlw(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg,
tmp_simd);
assm->Psrlw(dst.fp(), dst.fp(), tmp_simd);
assm->Packuswb(dst.fp(), liftoff::kScratchDoubleReg);
}
}
inline void EmitAnyTrue(LiftoffAssembler* assm, LiftoffRegister dst,
LiftoffRegister src) {
Register tmp =
@@ -3416,39 +3382,32 @@ void LiftoffAssembler::emit_i8x16_shli(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_shr_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitI8x16Shr</*is_signed=*/true>(this, dst, lhs, rhs);
Register tmp = GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(rhs)).gp();
XMMRegister tmp_simd =
GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs)).fp();
I8x16ShrS(dst.fp(), lhs.fp(), rhs.gp(), tmp, liftoff::kScratchDoubleReg,
tmp_simd);
}
void LiftoffAssembler::emit_i8x16_shri_s(LiftoffRegister dst,
LiftoffRegister lhs, int32_t rhs) {
Punpckhbw(liftoff::kScratchDoubleReg, lhs.fp());
Punpcklbw(dst.fp(), lhs.fp());
uint8_t shift = (rhs & 7) + 8;
Psraw(liftoff::kScratchDoubleReg, shift);
Psraw(dst.fp(), shift);
Packsswb(dst.fp(), liftoff::kScratchDoubleReg);
I8x16ShrS(dst.fp(), lhs.fp(), rhs, liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_i8x16_shr_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitI8x16Shr</*is_signed=*/false>(this, dst, lhs, rhs);
Register tmp = GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(rhs)).gp();
XMMRegister tmp_simd =
GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs)).fp();
I8x16ShrU(dst.fp(), lhs.fp(), rhs.gp(), tmp, liftoff::kScratchDoubleReg,
tmp_simd);
}
void LiftoffAssembler::emit_i8x16_shri_u(LiftoffRegister dst,
LiftoffRegister lhs, int32_t rhs) {
Register tmp = GetUnusedRegister(kGpReg, {}).gp();
// Perform 16-bit shift, then mask away high bits.
uint8_t shift = rhs & 7;
liftoff::EmitSimdShiftOpImm<&Assembler::vpsrlw, &Assembler::psrlw, 3>(
this, dst, lhs, rhs);
uint8_t bmask = 0xff >> shift;
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
mov(tmp, mask);
Movd(liftoff::kScratchDoubleReg, tmp);
Pshufd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, uint8_t{0});
Pand(dst.fp(), liftoff::kScratchDoubleReg);
I8x16ShrU(dst.fp(), lhs.fp(), rhs, tmp, liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_i8x16_add(LiftoffRegister dst, LiftoffRegister lhs,

View File

@@ -2335,29 +2335,6 @@ void EmitSimdShiftOpImm(LiftoffAssembler* assm, LiftoffRegister dst,
}
}
template <bool is_signed>
void EmitI8x16Shr(LiftoffAssembler* assm, LiftoffRegister dst,
LiftoffRegister lhs, LiftoffRegister rhs) {
// Same algorithm as the one in code-generator-x64.cc.
assm->Punpckhbw(kScratchDoubleReg, lhs.fp());
assm->Punpcklbw(dst.fp(), lhs.fp());
// Prepare shift value
assm->movq(kScratchRegister, rhs.gp());
// Take shift value modulo 8.
assm->andq(kScratchRegister, Immediate(7));
assm->addq(kScratchRegister, Immediate(8));
assm->Movq(liftoff::kScratchDoubleReg2, kScratchRegister);
if (is_signed) {
assm->Psraw(kScratchDoubleReg, liftoff::kScratchDoubleReg2);
assm->Psraw(dst.fp(), liftoff::kScratchDoubleReg2);
assm->Packsswb(dst.fp(), kScratchDoubleReg);
} else {
assm->Psrlw(kScratchDoubleReg, liftoff::kScratchDoubleReg2);
assm->Psrlw(dst.fp(), liftoff::kScratchDoubleReg2);
assm->Packuswb(dst.fp(), kScratchDoubleReg);
}
}
inline void EmitAnyTrue(LiftoffAssembler* assm, LiftoffRegister dst,
LiftoffRegister src) {
assm->xorq(dst.gp(), dst.gp());
@@ -2973,43 +2950,25 @@ void LiftoffAssembler::emit_i8x16_shli(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_shr_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitI8x16Shr</*is_signed=*/true>(this, dst, lhs, rhs);
I8x16ShrS(dst.fp(), lhs.fp(), rhs.gp(), kScratchRegister, kScratchDoubleReg,
liftoff::kScratchDoubleReg2);
}
void LiftoffAssembler::emit_i8x16_shri_s(LiftoffRegister dst,
LiftoffRegister lhs, int32_t rhs) {
Punpckhbw(kScratchDoubleReg, lhs.fp());
Punpcklbw(dst.fp(), lhs.fp());
uint8_t shift = (rhs & 7) + 8;
Psraw(kScratchDoubleReg, shift);
Psraw(dst.fp(), shift);
Packsswb(dst.fp(), kScratchDoubleReg);
I8x16ShrS(dst.fp(), lhs.fp(), rhs, kScratchDoubleReg);
}
void LiftoffAssembler::emit_i8x16_shr_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitI8x16Shr</*is_signed=*/false>(this, dst, lhs, rhs);
I8x16ShrU(dst.fp(), lhs.fp(), rhs.gp(), kScratchRegister, kScratchDoubleReg,
liftoff::kScratchDoubleReg2);
}
void LiftoffAssembler::emit_i8x16_shri_u(LiftoffRegister dst,
LiftoffRegister lhs, int32_t rhs) {
// Perform 16-bit shift, then mask away high bits.
uint8_t shift = rhs & 7; // i.InputInt3(1);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpsrlw(dst.fp(), lhs.fp(), byte{shift});
} else {
if (dst != lhs) Movaps(dst.fp(), lhs.fp());
psrlw(dst.fp(), byte{shift});
}
uint8_t bmask = 0xff >> shift;
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
movl(kScratchRegister, Immediate(mask));
Movd(kScratchDoubleReg, kScratchRegister);
Pshufd(kScratchDoubleReg, kScratchDoubleReg, byte{0});
Pand(dst.fp(), kScratchDoubleReg);
I8x16ShrU(dst.fp(), lhs.fp(), rhs, kScratchRegister, kScratchDoubleReg);
}
void LiftoffAssembler::emit_i8x16_add(LiftoffRegister dst, LiftoffRegister lhs,