From 25d4f9374070a70da75c912b98aca1954823698e Mon Sep 17 00:00:00 2001 From: Predrag Rudic Date: Mon, 27 Nov 2017 16:11:08 +0100 Subject: [PATCH] Reland "MIPS[64] Implementation of MSA instructions on builtin simulator" This is reland of 3e0bf580e8424e37ae2ed311616d109e91052292 Original change's description: > This commit is a step toward enabling test-run-wasm-simd tests for MIPS. > 36 of those were failing in V8 builtin simulator because some instructions > were not implemented. Also there are minor fixes to some of the already > implemented instructions. > > This commit has only 32-bit implementation. After review I will add > 64-bit version. > > Bug: > Change-Id: I25b0cac352db3efb56b922ace64ab2aaef82472d > Reviewed-on: https://chromium-review.googlesource.com/744008 > Reviewed-by: Ivica Bogosavljevic > Reviewed-by: Jakob Kummerow > Commit-Queue: Ivica Bogosavljevic > Cr-Commit-Position: refs/heads/master@{#49439} Bug: Change-Id: I3a904caf675d314186c02c1c843d1e6a91a21a14 Reviewed-on: https://chromium-review.googlesource.com/776813 Reviewed-by: Jakob Kummerow Reviewed-by: Ivica Bogosavljevic Reviewed-by: Benedikt Meurer Commit-Queue: Ivica Bogosavljevic Cr-Commit-Position: refs/heads/master@{#49666} --- src/mips/simulator-mips.cc | 386 +++++- src/mips64/simulator-mips64.cc | 385 +++++- src/utils.h | 2 +- test/cctest/test-assembler-mips.cc | 1815 ++++++++++++++++---------- test/cctest/test-assembler-mips64.cc | 1813 +++++++++++++++---------- test/unittests/BUILD.gn | 1 + test/unittests/unittests.gyp | 1 + test/unittests/utils-unittest.cc | 113 ++ 8 files changed, 2972 insertions(+), 1544 deletions(-) create mode 100644 test/unittests/utils-unittest.cc diff --git a/src/mips/simulator-mips.cc b/src/mips/simulator-mips.cc index b420bc59b4..342f27666d 100644 --- a/src/mips/simulator-mips.cc +++ b/src/mips/simulator-mips.cc @@ -4671,9 +4671,12 @@ void Simulator::DecodeTypeMsaELM() { DCHECK_EQ(rd_reg(), kMSACSRRegister); SetResult(sa(), bit_cast(MSACSR_)); break; - case MOVE_V: - UNIMPLEMENTED(); - break; + case MOVE_V: { + msa_reg_t ws; + get_msa_register(ws_reg(), &ws); + set_msa_register(wd_reg(), &ws); + TraceMSARegWr(&ws); + } break; default: opcode &= kMsaELMMask; switch (opcode) { @@ -4742,7 +4745,50 @@ void Simulator::DecodeTypeMsaELM() { UNREACHABLE(); } } break; - case SLDI: + case SLDI: { + uint8_t v[32]; + msa_reg_t ws; + msa_reg_t wd; + get_msa_register(ws_reg(), &ws); + get_msa_register(wd_reg(), &wd); +#define SLDI_DF(s, k) \ + for (unsigned i = 0; i < s; i++) { \ + v[i] = ws.b[s * k + i]; \ + v[i + s] = wd.b[s * k + i]; \ + } \ + for (unsigned i = 0; i < s; i++) { \ + wd.b[s * k + i] = v[i + n]; \ + } + switch (DecodeMsaDataFormat()) { + case MSA_BYTE: + DCHECK(n < kMSALanesByte); + SLDI_DF(kMSARegSize / sizeof(int8_t) / kBitsPerByte, 0) + break; + case MSA_HALF: + DCHECK(n < kMSALanesHalf); + for (int k = 0; k < 2; ++k) { + SLDI_DF(kMSARegSize / sizeof(int16_t) / kBitsPerByte, k) + } + break; + case MSA_WORD: + DCHECK(n < kMSALanesWord); + for (int k = 0; k < 4; ++k) { + SLDI_DF(kMSARegSize / sizeof(int32_t) / kBitsPerByte, k) + } + break; + case MSA_DWORD: + DCHECK(n < kMSALanesDword); + for (int k = 0; k < 8; ++k) { + SLDI_DF(kMSARegSize / sizeof(int64_t) / kBitsPerByte, k) + } + break; + default: + UNREACHABLE(); + } + set_msa_register(wd_reg(), &wd); + TraceMSARegWr(&wd); + } break; +#undef SLDI_DF case SPLATI: case INSVE: UNIMPLEMENTED(); @@ -4879,6 +4925,7 @@ void Simulator::DecodeTypeMsaBIT() { default: UNREACHABLE(); } +#undef MSA_BIT_DF } void 
Simulator::DecodeTypeMsaMI10() { @@ -5161,13 +5208,6 @@ T Simulator::Msa3RInstrHelper(uint32_t opcode, T wd, T ws, T wt) { case DPSUB_U: case SLD: case SPLAT: - case PCKEV: - case PCKOD: - case ILVL: - case ILVR: - case ILVEV: - case ILVOD: - case VSHF: UNIMPLEMENTED(); break; case SRAR: { @@ -5179,51 +5219,210 @@ T Simulator::Msa3RInstrHelper(uint32_t opcode, T wd, T ws, T wt) { int bit = wt_modulo == 0 ? 0 : (wsu >> (wt_modulo - 1)) & 1; res = static_cast((wsu >> wt_modulo) + bit); } break; - case HADD_S: - case HADD_U: - case HSUB_S: - case HSUB_U: - UNIMPLEMENTED(); - break; default: UNREACHABLE(); } return res; } +template +void Msa3RInstrHelper_shuffle(const uint32_t opcode, T_reg ws, T_reg wt, + T_reg wd, const int i, const int num_of_lanes) { + T_int *ws_p, *wt_p, *wd_p; + ws_p = reinterpret_cast(ws); + wt_p = reinterpret_cast(wt); + wd_p = reinterpret_cast(wd); + switch (opcode) { + case PCKEV: + wd_p[i] = wt_p[2 * i]; + wd_p[i + num_of_lanes / 2] = ws_p[2 * i]; + break; + case PCKOD: + wd_p[i] = wt_p[2 * i + 1]; + wd_p[i + num_of_lanes / 2] = ws_p[2 * i + 1]; + break; + case ILVL: + wd_p[2 * i] = wt_p[i + num_of_lanes / 2]; + wd_p[2 * i + 1] = ws_p[i + num_of_lanes / 2]; + break; + case ILVR: + wd_p[2 * i] = wt_p[i]; + wd_p[2 * i + 1] = ws_p[i]; + break; + case ILVEV: + wd_p[2 * i] = wt_p[2 * i]; + wd_p[2 * i + 1] = ws_p[2 * i]; + break; + case ILVOD: + wd_p[2 * i] = wt_p[2 * i + 1]; + wd_p[2 * i + 1] = ws_p[2 * i + 1]; + break; + case VSHF: { + const int mask_not_valid = 0xc0; + const int mask_6_bits = 0x3f; + if ((wd_p[i] & mask_not_valid)) { + wd_p[i] = 0; + } else { + int k = (wd_p[i] & mask_6_bits) % (num_of_lanes * 2); + wd_p[i] = k >= num_of_lanes ? ws_p[k - num_of_lanes] : wt_p[k]; + } + } break; + default: + UNREACHABLE(); + } +} + +template +void Msa3RInstrHelper_horizontal(const uint32_t opcode, T_reg ws, T_reg wt, + T_reg wd, const int i, + const int num_of_lanes) { + typedef typename std::make_unsigned::type T_uint; + typedef typename std::make_unsigned::type T_smaller_uint; + T_int* wd_p; + T_smaller_int *ws_p, *wt_p; + ws_p = reinterpret_cast(ws); + wt_p = reinterpret_cast(wt); + wd_p = reinterpret_cast(wd); + T_uint* wd_pu; + T_smaller_uint *ws_pu, *wt_pu; + ws_pu = reinterpret_cast(ws); + wt_pu = reinterpret_cast(wt); + wd_pu = reinterpret_cast(wd); + switch (opcode) { + case HADD_S: + wd_p[i] = + static_cast(ws_p[2 * i + 1]) + static_cast(wt_p[2 * i]); + break; + case HADD_U: + wd_pu[i] = static_cast(ws_pu[2 * i + 1]) + + static_cast(wt_pu[2 * i]); + break; + case HSUB_S: + wd_p[i] = + static_cast(ws_p[2 * i + 1]) - static_cast(wt_p[2 * i]); + break; + case HSUB_U: + wd_pu[i] = static_cast(ws_pu[2 * i + 1]) - + static_cast(wt_pu[2 * i]); + break; + default: + UNREACHABLE(); + } +} + void Simulator::DecodeTypeMsa3R() { DCHECK(IsMipsArchVariant(kMips32r6)); DCHECK(CpuFeatures::IsSupported(MIPS_SIMD)); uint32_t opcode = instr_.InstructionBits() & kMsa3RMask; msa_reg_t ws, wd, wt; - + get_msa_register(ws_reg(), &ws); + get_msa_register(wt_reg(), &wt); + get_msa_register(wd_reg(), &wd); + switch (opcode) { + case HADD_S: + case HADD_U: + case HSUB_S: + case HSUB_U: +#define HORIZONTAL_ARITHMETIC_DF(num_of_lanes, int_type, lesser_int_type) \ + for (int i = 0; i < num_of_lanes; ++i) { \ + Msa3RInstrHelper_horizontal( \ + opcode, &ws, &wt, &wd, i, num_of_lanes); \ + } + switch (DecodeMsaDataFormat()) { + case MSA_HALF: + HORIZONTAL_ARITHMETIC_DF(kMSALanesHalf, int16_t, int8_t); + break; + case MSA_WORD: + HORIZONTAL_ARITHMETIC_DF(kMSALanesWord, int32_t, int16_t); + 
break; + case MSA_DWORD: + HORIZONTAL_ARITHMETIC_DF(kMSALanesDword, int64_t, int32_t); + break; + default: + UNREACHABLE(); + } + break; +#undef HORIZONTAL_ARITHMETIC_DF + case VSHF: +#define VSHF_DF(num_of_lanes, int_type) \ + for (int i = 0; i < num_of_lanes; ++i) { \ + Msa3RInstrHelper_shuffle(opcode, &ws, &wt, &wd, i, \ + num_of_lanes); \ + } + switch (DecodeMsaDataFormat()) { + case MSA_BYTE: + VSHF_DF(kMSALanesByte, int8_t); + break; + case MSA_HALF: + VSHF_DF(kMSALanesHalf, int16_t); + break; + case MSA_WORD: + VSHF_DF(kMSALanesWord, int32_t); + break; + case MSA_DWORD: + VSHF_DF(kMSALanesDword, int64_t); + break; + default: + UNREACHABLE(); + } +#undef VSHF_DF + break; + case PCKEV: + case PCKOD: + case ILVL: + case ILVR: + case ILVEV: + case ILVOD: +#define INTERLEAVE_PACK_DF(num_of_lanes, int_type) \ + for (int i = 0; i < num_of_lanes / 2; ++i) { \ + Msa3RInstrHelper_shuffle(opcode, &ws, &wt, &wd, i, \ + num_of_lanes); \ + } + switch (DecodeMsaDataFormat()) { + case MSA_BYTE: + INTERLEAVE_PACK_DF(kMSALanesByte, int8_t); + break; + case MSA_HALF: + INTERLEAVE_PACK_DF(kMSALanesHalf, int16_t); + break; + case MSA_WORD: + INTERLEAVE_PACK_DF(kMSALanesWord, int32_t); + break; + case MSA_DWORD: + INTERLEAVE_PACK_DF(kMSALanesDword, int64_t); + break; + default: + UNREACHABLE(); + } + break; +#undef INTERLEAVE_PACK_DF + default: #define MSA_3R_DF(elem, num_of_lanes) \ - get_msa_register(instr_.WdValue(), wd.elem); \ - get_msa_register(instr_.WsValue(), ws.elem); \ - get_msa_register(instr_.WtValue(), wt.elem); \ for (int i = 0; i < num_of_lanes; i++) { \ wd.elem[i] = Msa3RInstrHelper(opcode, wd.elem[i], ws.elem[i], wt.elem[i]); \ - } \ - set_msa_register(instr_.WdValue(), wd.elem); \ - TraceMSARegWr(wd.elem); + } - switch (DecodeMsaDataFormat()) { - case MSA_BYTE: - MSA_3R_DF(b, kMSALanesByte); - break; - case MSA_HALF: - MSA_3R_DF(h, kMSALanesHalf); - break; - case MSA_WORD: - MSA_3R_DF(w, kMSALanesWord); - break; - case MSA_DWORD: - MSA_3R_DF(d, kMSALanesDword); - break; - default: - UNREACHABLE(); - } + switch (DecodeMsaDataFormat()) { + case MSA_BYTE: + MSA_3R_DF(b, kMSALanesByte); + break; + case MSA_HALF: + MSA_3R_DF(h, kMSALanesHalf); + break; + case MSA_WORD: + MSA_3R_DF(w, kMSALanesWord); + break; + case MSA_DWORD: + MSA_3R_DF(d, kMSALanesDword); + break; + default: + UNREACHABLE(); + } #undef MSA_3R_DF + break; + } + set_msa_register(wd_reg(), &wd); + TraceMSARegWr(&wd); } template @@ -5321,7 +5520,7 @@ void Msa3RFInstrHelper(uint32_t opcode, T_reg ws, T_reg wt, T_reg& wd) { break; case FDIV: { if (t_element == 0) { - wd = std::numeric_limits::quiet_NaN(); + wd = bit_cast(std::numeric_limits::quiet_NaN()); } else { wd = bit_cast(s_element / t_element); } @@ -5541,6 +5740,7 @@ void Simulator::DecodeTypeMsa3RF() { UNREACHABLE(); } break; +#undef PACK_FLOAT16 #undef FEXDO_DF case FTQ: #define FTQ_DF(source, dst, fp_type, int_type) \ @@ -5887,8 +6087,8 @@ T_int Msa2RFInstrHelper(uint32_t opcode, T_src src, T_dst& dst, const T_int min_int = std::numeric_limits::min(); if (std::isnan(element)) { dst = 0; - } else if (element > max_int || element < min_int) { - dst = element > max_int ? max_int : min_int; + } else if (element >= max_int || element <= min_int) { + dst = element >= max_int ? 
max_int : min_int; } else { dst = static_cast(std::trunc(element)); } @@ -5899,8 +6099,8 @@ T_int Msa2RFInstrHelper(uint32_t opcode, T_src src, T_dst& dst, const T_uint max_int = std::numeric_limits::max(); if (std::isnan(element)) { dst = 0; - } else if (element > max_int || element < 0) { - dst = element > max_int ? max_int : 0; + } else if (element >= max_int || element <= 0) { + dst = element >= max_int ? max_int : 0; } else { dst = static_cast(std::trunc(element)); } @@ -6009,8 +6209,8 @@ T_int Msa2RFInstrHelper(uint32_t opcode, T_src src, T_dst& dst, return 0; } -template -T_int Msa2RFInstrHelper2(uint32_t opcode, T_reg ws, T_i i) { +template +T_int Msa2RFInstrHelper2(uint32_t opcode, T_reg ws, int i) { switch (opcode) { #define EXTRACT_FLOAT16_SIGN(fp16) (fp16 >> 15) #define EXTRACT_FLOAT16_EXP(fp16) (fp16 >> 10 & 0x1f) @@ -6231,6 +6431,30 @@ void Simulator::DecodeTypeImmediate() { } }; + auto BranchHelper_MSA = [this, &next_pc, imm16, + &execute_branch_delay_instruction](bool do_branch) { + execute_branch_delay_instruction = true; + int32_t current_pc = get_pc(); + const int32_t bitsIn16Int = sizeof(int16_t) * kBitsPerByte; + if (do_branch) { + if (FLAG_debug_code) { + int16_t bits = imm16 & 0xfc; + if (imm16 >= 0) { + CHECK_EQ(bits, 0); + } else { + CHECK_EQ(bits ^ 0xfc, 0); + } + } + // jump range :[pc + kInstrSize - 512 * kInstrSize, + // pc + kInstrSize + 511 * kInstrSize] + int16_t offset = static_cast(imm16 << (bitsIn16Int - 10)) >> + (bitsIn16Int - 12); + next_pc = current_pc + offset + Instruction::kInstrSize; + } else { + next_pc = current_pc + 2 * Instruction::kInstrSize; + } + }; + auto BranchAndLinkCompactHelper = [this, &next_pc](bool do_branch, int bits) { int32_t current_pc = get_pc(); CheckForbiddenSlot(current_pc); @@ -6273,18 +6497,66 @@ void Simulator::DecodeTypeImmediate() { case BC1NEZ: BranchHelper(get_fpu_register(ft_reg) & 0x1); break; - case BZ_V: + case BZ_V: { + msa_reg_t wt; + get_msa_register(wt_reg(), &wt); + BranchHelper_MSA(wt.d[0] == 0 && wt.d[1] == 0); + } break; +#define BZ_DF(witdh, lanes) \ + { \ + msa_reg_t wt; \ + get_msa_register(wt_reg(), &wt); \ + int i; \ + for (i = 0; i < lanes; ++i) { \ + if (wt.witdh[i] == 0) { \ + break; \ + } \ + } \ + BranchHelper_MSA(i != lanes); \ + } case BZ_B: - case BZ_H: - case BZ_W: - case BZ_D: - case BNZ_V: - case BNZ_B: - case BNZ_H: - case BNZ_W: - case BNZ_D: - UNIMPLEMENTED(); + BZ_DF(b, kMSALanesByte) break; + case BZ_H: + BZ_DF(h, kMSALanesHalf) + break; + case BZ_W: + BZ_DF(w, kMSALanesWord) + break; + case BZ_D: + BZ_DF(d, kMSALanesDword) + break; +#undef BZ_DF + case BNZ_V: { + msa_reg_t wt; + get_msa_register(wt_reg(), &wt); + BranchHelper_MSA(wt.d[0] != 0 || wt.d[1] != 0); + } break; +#define BNZ_DF(witdh, lanes) \ + { \ + msa_reg_t wt; \ + get_msa_register(wt_reg(), &wt); \ + int i; \ + for (i = 0; i < lanes; ++i) { \ + if (wt.witdh[i] == 0) { \ + break; \ + } \ + } \ + BranchHelper_MSA(i == lanes); \ + } + case BNZ_B: + BNZ_DF(b, kMSALanesByte) + break; + case BNZ_H: + BNZ_DF(h, kMSALanesHalf) + break; + case BNZ_W: + BNZ_DF(w, kMSALanesWord) + break; + case BNZ_D: + BNZ_DF(d, kMSALanesDword) + break; +#undef BNZ_DF default: UNREACHABLE(); } diff --git a/src/mips64/simulator-mips64.cc b/src/mips64/simulator-mips64.cc index 3c10f89a06..e992efebf5 100644 --- a/src/mips64/simulator-mips64.cc +++ b/src/mips64/simulator-mips64.cc @@ -4883,9 +4883,12 @@ void Simulator::DecodeTypeMsaELM() { DCHECK_EQ(rd_reg(), kMSACSRRegister); SetResult(sa(), static_cast(bit_cast(MSACSR_))); break; - case MOVE_V: - 
UNIMPLEMENTED(); - break; + case MOVE_V: { + msa_reg_t ws; + get_msa_register(ws_reg(), &ws); + set_msa_register(wd_reg(), &ws); + TraceMSARegWr(&ws); + } break; default: opcode &= kMsaELMMask; switch (opcode) { @@ -4967,7 +4970,50 @@ void Simulator::DecodeTypeMsaELM() { UNREACHABLE(); } } break; - case SLDI: + case SLDI: { + uint8_t v[32]; + msa_reg_t ws; + msa_reg_t wd; + get_msa_register(ws_reg(), &ws); + get_msa_register(wd_reg(), &wd); +#define SLDI_DF(s, k) \ + for (unsigned i = 0; i < s; i++) { \ + v[i] = ws.b[s * k + i]; \ + v[i + s] = wd.b[s * k + i]; \ + } \ + for (unsigned i = 0; i < s; i++) { \ + wd.b[s * k + i] = v[i + n]; \ + } + switch (DecodeMsaDataFormat()) { + case MSA_BYTE: + DCHECK(n < kMSALanesByte); + SLDI_DF(kMSARegSize / sizeof(int8_t) / kBitsPerByte, 0) + break; + case MSA_HALF: + DCHECK(n < kMSALanesHalf); + for (int k = 0; k < 2; ++k) { + SLDI_DF(kMSARegSize / sizeof(int16_t) / kBitsPerByte, k) + } + break; + case MSA_WORD: + DCHECK(n < kMSALanesWord); + for (int k = 0; k < 4; ++k) { + SLDI_DF(kMSARegSize / sizeof(int32_t) / kBitsPerByte, k) + } + break; + case MSA_DWORD: + DCHECK(n < kMSALanesDword); + for (int k = 0; k < 8; ++k) { + SLDI_DF(kMSARegSize / sizeof(int64_t) / kBitsPerByte, k) + } + break; + default: + UNREACHABLE(); + } + set_msa_register(wd_reg(), &wd); + TraceMSARegWr(&wd); + } break; +#undef SLDI_DF case SPLATI: case INSVE: UNIMPLEMENTED(); @@ -5104,6 +5150,7 @@ void Simulator::DecodeTypeMsaBIT() { default: UNREACHABLE(); } +#undef MSA_BIT_DF } void Simulator::DecodeTypeMsaMI10() { @@ -5386,13 +5433,6 @@ T Simulator::Msa3RInstrHelper(uint32_t opcode, T wd, T ws, T wt) { case DPSUB_U: case SLD: case SPLAT: - case PCKEV: - case PCKOD: - case ILVL: - case ILVR: - case ILVEV: - case ILVOD: - case VSHF: UNIMPLEMENTED(); break; case SRAR: { @@ -5404,51 +5444,209 @@ T Simulator::Msa3RInstrHelper(uint32_t opcode, T wd, T ws, T wt) { int bit = wt_modulo == 0 ? 0 : (wsu >> (wt_modulo - 1)) & 1; res = static_cast((wsu >> wt_modulo) + bit); } break; - case HADD_S: - case HADD_U: - case HSUB_S: - case HSUB_U: - UNIMPLEMENTED(); - break; default: UNREACHABLE(); } return res; } +template +void Msa3RInstrHelper_shuffle(const uint32_t opcode, T_reg ws, T_reg wt, + T_reg wd, const int i, const int num_of_lanes) { + T_int *ws_p, *wt_p, *wd_p; + ws_p = reinterpret_cast(ws); + wt_p = reinterpret_cast(wt); + wd_p = reinterpret_cast(wd); + switch (opcode) { + case PCKEV: + wd_p[i] = wt_p[2 * i]; + wd_p[i + num_of_lanes / 2] = ws_p[2 * i]; + break; + case PCKOD: + wd_p[i] = wt_p[2 * i + 1]; + wd_p[i + num_of_lanes / 2] = ws_p[2 * i + 1]; + break; + case ILVL: + wd_p[2 * i] = wt_p[i + num_of_lanes / 2]; + wd_p[2 * i + 1] = ws_p[i + num_of_lanes / 2]; + break; + case ILVR: + wd_p[2 * i] = wt_p[i]; + wd_p[2 * i + 1] = ws_p[i]; + break; + case ILVEV: + wd_p[2 * i] = wt_p[2 * i]; + wd_p[2 * i + 1] = ws_p[2 * i]; + break; + case ILVOD: + wd_p[2 * i] = wt_p[2 * i + 1]; + wd_p[2 * i + 1] = ws_p[2 * i + 1]; + break; + case VSHF: { + const int mask_not_valid = 0xc0; + const int mask_6_bits = 0x3f; + if ((wd_p[i] & mask_not_valid)) { + wd_p[i] = 0; + } else { + int k = (wd_p[i] & mask_6_bits) % (num_of_lanes * 2); + wd_p[i] = k >= num_of_lanes ? 
ws_p[k - num_of_lanes] : wt_p[k]; + } + } break; + default: + UNREACHABLE(); + } +} + +template +void Msa3RInstrHelper_horizontal(const uint32_t opcode, T_reg ws, T_reg wt, + T_reg wd, const int i, + const int num_of_lanes) { + typedef typename std::make_unsigned::type T_uint; + typedef typename std::make_unsigned::type T_smaller_uint; + T_int* wd_p; + T_smaller_int *ws_p, *wt_p; + ws_p = reinterpret_cast(ws); + wt_p = reinterpret_cast(wt); + wd_p = reinterpret_cast(wd); + T_uint* wd_pu; + T_smaller_uint *ws_pu, *wt_pu; + ws_pu = reinterpret_cast(ws); + wt_pu = reinterpret_cast(wt); + wd_pu = reinterpret_cast(wd); + switch (opcode) { + case HADD_S: + wd_p[i] = + static_cast(ws_p[2 * i + 1]) + static_cast(wt_p[2 * i]); + break; + case HADD_U: + wd_pu[i] = static_cast(ws_pu[2 * i + 1]) + + static_cast(wt_pu[2 * i]); + break; + case HSUB_S: + wd_p[i] = + static_cast(ws_p[2 * i + 1]) - static_cast(wt_p[2 * i]); + break; + case HSUB_U: + wd_pu[i] = static_cast(ws_pu[2 * i + 1]) - + static_cast(wt_pu[2 * i]); + break; + default: + UNREACHABLE(); + } +} void Simulator::DecodeTypeMsa3R() { DCHECK_EQ(kArchVariant, kMips64r6); DCHECK(CpuFeatures::IsSupported(MIPS_SIMD)); uint32_t opcode = instr_.InstructionBits() & kMsa3RMask; msa_reg_t ws, wd, wt; - + get_msa_register(ws_reg(), &ws); + get_msa_register(wt_reg(), &wt); + get_msa_register(wd_reg(), &wd); + switch (opcode) { + case HADD_S: + case HADD_U: + case HSUB_S: + case HSUB_U: +#define HORIZONTAL_ARITHMETIC_DF(num_of_lanes, int_type, lesser_int_type) \ + for (int i = 0; i < num_of_lanes; ++i) { \ + Msa3RInstrHelper_horizontal( \ + opcode, &ws, &wt, &wd, i, num_of_lanes); \ + } + switch (DecodeMsaDataFormat()) { + case MSA_HALF: + HORIZONTAL_ARITHMETIC_DF(kMSALanesHalf, int16_t, int8_t); + break; + case MSA_WORD: + HORIZONTAL_ARITHMETIC_DF(kMSALanesWord, int32_t, int16_t); + break; + case MSA_DWORD: + HORIZONTAL_ARITHMETIC_DF(kMSALanesDword, int64_t, int32_t); + break; + default: + UNREACHABLE(); + } + break; +#undef HORIZONTAL_ARITHMETIC_DF + case VSHF: +#define VSHF_DF(num_of_lanes, int_type) \ + for (int i = 0; i < num_of_lanes; ++i) { \ + Msa3RInstrHelper_shuffle(opcode, &ws, &wt, &wd, i, \ + num_of_lanes); \ + } + switch (DecodeMsaDataFormat()) { + case MSA_BYTE: + VSHF_DF(kMSALanesByte, int8_t); + break; + case MSA_HALF: + VSHF_DF(kMSALanesHalf, int16_t); + break; + case MSA_WORD: + VSHF_DF(kMSALanesWord, int32_t); + break; + case MSA_DWORD: + VSHF_DF(kMSALanesDword, int64_t); + break; + default: + UNREACHABLE(); + } +#undef VSHF_DF + break; + case PCKEV: + case PCKOD: + case ILVL: + case ILVR: + case ILVEV: + case ILVOD: +#define INTERLEAVE_PACK_DF(num_of_lanes, int_type) \ + for (int i = 0; i < num_of_lanes / 2; ++i) { \ + Msa3RInstrHelper_shuffle(opcode, &ws, &wt, &wd, i, \ + num_of_lanes); \ + } + switch (DecodeMsaDataFormat()) { + case MSA_BYTE: + INTERLEAVE_PACK_DF(kMSALanesByte, int8_t); + break; + case MSA_HALF: + INTERLEAVE_PACK_DF(kMSALanesHalf, int16_t); + break; + case MSA_WORD: + INTERLEAVE_PACK_DF(kMSALanesWord, int32_t); + break; + case MSA_DWORD: + INTERLEAVE_PACK_DF(kMSALanesDword, int64_t); + break; + default: + UNREACHABLE(); + } + break; +#undef INTERLEAVE_PACK_DF + default: #define MSA_3R_DF(elem, num_of_lanes) \ - get_msa_register(instr_.WdValue(), wd.elem); \ - get_msa_register(instr_.WsValue(), ws.elem); \ - get_msa_register(instr_.WtValue(), wt.elem); \ for (int i = 0; i < num_of_lanes; i++) { \ wd.elem[i] = Msa3RInstrHelper(opcode, wd.elem[i], ws.elem[i], wt.elem[i]); \ - } \ - set_msa_register(instr_.WdValue(), 
wd.elem); \ - TraceMSARegWr(wd.elem); - - switch (DecodeMsaDataFormat()) { - case MSA_BYTE: - MSA_3R_DF(b, kMSALanesByte); - break; - case MSA_HALF: - MSA_3R_DF(h, kMSALanesHalf); - break; - case MSA_WORD: - MSA_3R_DF(w, kMSALanesWord); - break; - case MSA_DWORD: - MSA_3R_DF(d, kMSALanesDword); - break; - default: - UNREACHABLE(); } + + switch (DecodeMsaDataFormat()) { + case MSA_BYTE: + MSA_3R_DF(b, kMSALanesByte); + break; + case MSA_HALF: + MSA_3R_DF(h, kMSALanesHalf); + break; + case MSA_WORD: + MSA_3R_DF(w, kMSALanesWord); + break; + case MSA_DWORD: + MSA_3R_DF(d, kMSALanesDword); + break; + default: + UNREACHABLE(); + } #undef MSA_3R_DF + break; + } + set_msa_register(wd_reg(), &wd); + TraceMSARegWr(&wd); } template @@ -5546,7 +5744,7 @@ void Msa3RFInstrHelper(uint32_t opcode, T_reg ws, T_reg wt, T_reg& wd) { break; case FDIV: { if (t_element == 0) { - wd = std::numeric_limits::quiet_NaN(); + wd = bit_cast(std::numeric_limits::quiet_NaN()); } else { wd = bit_cast(s_element / t_element); } @@ -5766,6 +5964,7 @@ void Simulator::DecodeTypeMsa3RF() { UNREACHABLE(); } break; +#undef PACK_FLOAT16 #undef FEXDO_DF case FTQ: #define FTQ_DF(source, dst, fp_type, int_type) \ @@ -6119,8 +6318,8 @@ T_int Msa2RFInstrHelper(uint32_t opcode, T_src src, T_dst& dst, const T_int min_int = std::numeric_limits::min(); if (std::isnan(element)) { dst = 0; - } else if (element > max_int || element < min_int) { - dst = element > max_int ? max_int : min_int; + } else if (element >= max_int || element <= min_int) { + dst = element >= max_int ? max_int : min_int; } else { dst = static_cast(std::trunc(element)); } @@ -6131,8 +6330,8 @@ T_int Msa2RFInstrHelper(uint32_t opcode, T_src src, T_dst& dst, const T_uint max_int = std::numeric_limits::max(); if (std::isnan(element)) { dst = 0; - } else if (element > max_int || element < 0) { - dst = element > max_int ? max_int : 0; + } else if (element >= max_int || element <= 0) { + dst = element >= max_int ? 
max_int : 0; } else { dst = static_cast(std::trunc(element)); } @@ -6241,8 +6440,8 @@ T_int Msa2RFInstrHelper(uint32_t opcode, T_src src, T_dst& dst, return 0; } -template -T_int Msa2RFInstrHelper2(uint32_t opcode, T_reg ws, T_i i) { +template +T_int Msa2RFInstrHelper2(uint32_t opcode, T_reg ws, int i) { switch (opcode) { #define EXTRACT_FLOAT16_SIGN(fp16) (fp16 >> 15) #define EXTRACT_FLOAT16_EXP(fp16) (fp16 >> 10 & 0x1f) @@ -6472,6 +6671,30 @@ void Simulator::DecodeTypeImmediate() { } }; + auto BranchHelper_MSA = [this, &next_pc, imm16, + &execute_branch_delay_instruction](bool do_branch) { + execute_branch_delay_instruction = true; + int64_t current_pc = get_pc(); + const int32_t bitsIn16Int = sizeof(int16_t) * kBitsPerByte; + if (do_branch) { + if (FLAG_debug_code) { + int16_t bits = imm16 & 0xfc; + if (imm16 >= 0) { + CHECK_EQ(bits, 0); + } else { + CHECK_EQ(bits ^ 0xfc, 0); + } + } + // jump range :[pc + kInstrSize - 512 * kInstrSize, + // pc + kInstrSize + 511 * kInstrSize] + int16_t offset = static_cast(imm16 << (bitsIn16Int - 10)) >> + (bitsIn16Int - 12); + next_pc = current_pc + offset + Instruction::kInstrSize; + } else { + next_pc = current_pc + 2 * Instruction::kInstrSize; + } + }; + auto BranchAndLinkCompactHelper = [this, &next_pc](bool do_branch, int bits) { int64_t current_pc = get_pc(); CheckForbiddenSlot(current_pc); @@ -6513,18 +6736,66 @@ void Simulator::DecodeTypeImmediate() { case BC1NEZ: BranchHelper(get_fpu_register(ft_reg) & 0x1); break; - case BZ_V: + case BZ_V: { + msa_reg_t wt; + get_msa_register(wt_reg(), &wt); + BranchHelper_MSA(wt.d[0] == 0 && wt.d[1] == 0); + } break; +#define BZ_DF(witdh, lanes) \ + { \ + msa_reg_t wt; \ + get_msa_register(wt_reg(), &wt); \ + int i; \ + for (i = 0; i < lanes; ++i) { \ + if (wt.witdh[i] == 0) { \ + break; \ + } \ + } \ + BranchHelper_MSA(i != lanes); \ + } case BZ_B: - case BZ_H: - case BZ_W: - case BZ_D: - case BNZ_V: - case BNZ_B: - case BNZ_H: - case BNZ_W: - case BNZ_D: - UNIMPLEMENTED(); + BZ_DF(b, kMSALanesByte) break; + case BZ_H: + BZ_DF(h, kMSALanesHalf) + break; + case BZ_W: + BZ_DF(w, kMSALanesWord) + break; + case BZ_D: + BZ_DF(d, kMSALanesDword) + break; +#undef BZ_DF + case BNZ_V: { + msa_reg_t wt; + get_msa_register(wt_reg(), &wt); + BranchHelper_MSA(wt.d[0] != 0 || wt.d[1] != 0); + } break; +#define BNZ_DF(witdh, lanes) \ + { \ + msa_reg_t wt; \ + get_msa_register(wt_reg(), &wt); \ + int i; \ + for (i = 0; i < lanes; ++i) { \ + if (wt.witdh[i] == 0) { \ + break; \ + } \ + } \ + BranchHelper_MSA(i == lanes); \ + } + case BNZ_B: + BNZ_DF(b, kMSALanesByte) + break; + case BNZ_H: + BNZ_DF(h, kMSALanesHalf) + break; + case BNZ_W: + BNZ_DF(w, kMSALanesWord) + break; + case BNZ_D: + BNZ_DF(d, kMSALanesDword) + break; +#undef BNZ_DF default: UNREACHABLE(); } diff --git a/src/utils.h b/src/utils.h index b585217f7c..e6e98fabba 100644 --- a/src/utils.h +++ b/src/utils.h @@ -288,7 +288,7 @@ T SaturateAdd(T a, T b) { template T SaturateSub(T a, T b) { if (std::is_signed::value) { - if (a > 0 && b < 0) { + if (a >= 0 && b < 0) { if (a > std::numeric_limits::max() + b) { return std::numeric_limits::max(); } diff --git a/test/cctest/test-assembler-mips.cc b/test/cctest/test-assembler-mips.cc index de149a2165..79a80c3a43 100644 --- a/test/cctest/test-assembler-mips.cc +++ b/test/cctest/test-assembler-mips.cc @@ -4968,6 +4968,189 @@ TEST(r6_beqzc) { } } +void load_elements_of_vector(MacroAssembler& assm, const uint64_t elements[], + MSARegister w, Register t0, Register t1) { + __ li(t0, static_cast(elements[0] & 0xffffffff)); 
+ __ li(t1, static_cast((elements[0] >> 32) & 0xffffffff)); + __ insert_w(w, 0, t0); + __ insert_w(w, 1, t1); + __ li(t0, static_cast(elements[1] & 0xffffffff)); + __ li(t1, static_cast((elements[1] >> 32) & 0xffffffff)); + __ insert_w(w, 2, t0); + __ insert_w(w, 3, t1); +} + +inline void store_elements_of_vector(MacroAssembler& assm, MSARegister w, + Register a) { + __ st_d(w, MemOperand(a, 0)); +} + +typedef union { + uint8_t b[16]; + uint16_t h[8]; + uint32_t w[4]; + uint64_t d[2]; +} msa_reg_t; + +struct TestCaseMsaBranch { + uint64_t wt_lo; + uint64_t wt_hi; +}; + +template +void run_bz_bnz(TestCaseMsaBranch* input, Branch GenerateBranch, + bool branched) { + Isolate* isolate = CcTest::i_isolate(); + HandleScope scope(isolate); + + MacroAssembler assm(isolate, NULL, 0, v8::internal::CodeObjectRequired::kYes); + CpuFeatureScope fscope(&assm, MIPS_SIMD); + + typedef struct { + uint64_t ws_lo; + uint64_t ws_hi; + uint64_t wd_lo; + uint64_t wd_hi; + } T; + T t = {0x20b9cc4f1a83e0c5, 0xa27e1b5f2f5bb18a, 0x0000000000000000, + 0x0000000000000000}; + msa_reg_t res; + Label do_not_move_w0_to_w2; + + load_elements_of_vector(assm, &t.ws_lo, w0, t0, t1); + load_elements_of_vector(assm, &t.wd_lo, w2, t0, t1); + load_elements_of_vector(assm, &input->wt_lo, w1, t0, t1); + GenerateBranch(assm, do_not_move_w0_to_w2); + __ nop(); + __ move_v(w2, w0); + + __ bind(&do_not_move_w0_to_w2); + store_elements_of_vector(assm, w2, a0); + __ jr(ra); + __ nop(); + + CodeDesc desc; + assm.GetCode(isolate, &desc); + Handle code = + isolate->factory()->NewCode(desc, Code::STUB, Handle()); +#ifdef OBJECT_PRINT + code->Print(std::cout); +#endif + F3 f = FUNCTION_CAST(code->entry()); + + (CALL_GENERATED_CODE(isolate, f, &res, 0, 0, 0, 0)); + if (branched) { + CHECK_EQ(t.wd_lo, res.d[0]); + CHECK_EQ(t.wd_hi, res.d[1]); + } else { + CHECK_EQ(t.ws_lo, res.d[0]); + CHECK_EQ(t.ws_hi, res.d[1]); + } +} + +TEST(MSA_bz_bnz) { + if (!IsMipsArchVariant(kMips32r6) || !CpuFeatures::IsSupported(MIPS_SIMD)) + return; + + TestCaseMsaBranch tz_v[] = { + {0x0, 0x0}, {0xabc, 0x0}, {0x0, 0xabc}, {0xabc, 0xabc}}; + for (unsigned i = 0; i < arraysize(tz_v); ++i) { + run_bz_bnz( + &tz_v[i], + [](MacroAssembler& assm, Label& br_target) { __ bz_v(w1, &br_target); }, + tz_v[i].wt_lo == 0 && tz_v[i].wt_hi == 0); + } + +#define TEST_BZ_DF(input_array, lanes, instruction, int_type) \ + for (unsigned i = 0; i < arraysize(input_array); ++i) { \ + int j; \ + int_type* element = reinterpret_cast(&input_array[i]); \ + for (j = 0; j < lanes; ++j) { \ + if (element[j] == 0) { \ + break; \ + } \ + } \ + run_bz_bnz(&input_array[i], \ + [](MacroAssembler& assm, Label& br_target) { \ + __ instruction(w1, &br_target); \ + }, \ + j != lanes); \ + } + TestCaseMsaBranch tz_b[] = {{0x0, 0x0}, + {0xbc0000, 0x0}, + {0x0, 0xab000000000000cd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BZ_DF(tz_b, kMSALanesByte, bz_b, int8_t) + + TestCaseMsaBranch tz_h[] = {{0x0, 0x0}, + {0xbcde0000, 0x0}, + {0x0, 0xabcd00000000abcd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BZ_DF(tz_h, kMSALanesHalf, bz_h, int16_t) + + TestCaseMsaBranch tz_w[] = {{0x0, 0x0}, + {0xbcde123400000000, 0x0}, + {0x0, 0x000000001234abcd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BZ_DF(tz_w, kMSALanesWord, bz_w, int32_t) + + TestCaseMsaBranch tz_d[] = {{0x0, 0x0}, + {0xbcde0000, 0x0}, + {0x0, 0xabcd00000000abcd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BZ_DF(tz_d, kMSALanesDword, bz_d, int64_t) +#undef TEST_BZ_DF + + TestCaseMsaBranch tnz_v[] = { + {0x0, 0x0}, 
{0xabc, 0x0}, {0x0, 0xabc}, {0xabc, 0xabc}}; + for (unsigned i = 0; i < arraysize(tnz_v); ++i) { + run_bz_bnz(&tnz_v[i], + [](MacroAssembler& assm, Label& br_target) { + __ bnz_v(w1, &br_target); + }, + tnz_v[i].wt_lo != 0 || tnz_v[i].wt_hi != 0); + } + +#define TEST_BNZ_DF(input_array, lanes, instruction, int_type) \ + for (unsigned i = 0; i < arraysize(input_array); ++i) { \ + int j; \ + int_type* element = reinterpret_cast(&input_array[i]); \ + for (j = 0; j < lanes; ++j) { \ + if (element[j] == 0) { \ + break; \ + } \ + } \ + run_bz_bnz(&input_array[i], \ + [](MacroAssembler& assm, Label& br_target) { \ + __ instruction(w1, &br_target); \ + }, \ + j == lanes); \ + } + TestCaseMsaBranch tnz_b[] = {{0x0, 0x0}, + {0xbc0000, 0x0}, + {0x0, 0xab000000000000cd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BNZ_DF(tnz_b, 16, bnz_b, int8_t) + + TestCaseMsaBranch tnz_h[] = {{0x0, 0x0}, + {0xbcde0000, 0x0}, + {0x0, 0xabcd00000000abcd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BNZ_DF(tnz_h, 8, bnz_h, int16_t) + + TestCaseMsaBranch tnz_w[] = {{0x0, 0x0}, + {0xbcde123400000000, 0x0}, + {0x0, 0x000000001234abcd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BNZ_DF(tnz_w, 4, bnz_w, int32_t) + + TestCaseMsaBranch tnz_d[] = {{0x0, 0x0}, + {0xbcde0000, 0x0}, + {0x0, 0xabcd00000000abcd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BNZ_DF(tnz_d, 2, bnz_d, int64_t) +#undef TEST_BNZ_DF +} uint32_t run_jialc(int16_t offset) { Isolate* isolate = CcTest::i_isolate(); @@ -5626,70 +5809,6 @@ TEST(Subu) { } } -void load_uint64_elements_of_vector(MacroAssembler& assm, - const uint64_t elements[], MSARegister w, - Register t0, Register t1) { - __ li(t0, static_cast(elements[0] & 0xffffffff)); - __ li(t1, static_cast((elements[0] >> 32) & 0xffffffff)); - __ insert_w(w, 0, t0); - __ insert_w(w, 1, t1); - __ li(t0, static_cast(elements[1] & 0xffffffff)); - __ li(t1, static_cast((elements[1] >> 32) & 0xffffffff)); - __ insert_w(w, 2, t0); - __ insert_w(w, 3, t1); -} - -void load_uint32_elements_of_vector(MacroAssembler& assm, - const uint64_t elements[], MSARegister w, - Register t0, Register t1) { - const uint32_t* const element = reinterpret_cast(elements); - __ li(t0, element[0]); - __ li(t1, element[1]); - __ insert_w(w, 0, t0); - __ insert_w(w, 1, t1); - __ li(t0, element[2]); - __ li(t1, element[3]); - __ insert_w(w, 2, t0); - __ insert_w(w, 3, t1); -} - -void load_uint16_elements_of_vector(MacroAssembler& assm, - const uint64_t elements[], MSARegister w, - Register t0, Register t1) { - const uint16_t* const element = reinterpret_cast(elements); - __ li(t0, element[0]); - __ li(t1, element[1]); - __ insert_h(w, 0, t0); - __ insert_h(w, 1, t1); - __ li(t0, element[2]); - __ li(t1, element[3]); - __ insert_h(w, 2, t0); - __ insert_h(w, 3, t1); - __ li(t0, element[4]); - __ li(t1, element[5]); - __ insert_h(w, 4, t0); - __ insert_h(w, 5, t1); - __ li(t0, element[6]); - __ li(t1, element[7]); - __ insert_h(w, 6, t0); - __ insert_h(w, 7, t1); -} - -inline void store_uint64_elements_of_vector(MacroAssembler& assm, MSARegister w, - Register a) { - __ st_d(w, MemOperand(a, 0)); -} - -inline void store_uint32_elements_of_vector(MacroAssembler& assm, MSARegister w, - Register a) { - __ st_w(w, MemOperand(a, 0)); -} - -void store_uint16_elements_of_vector(MacroAssembler& assm, MSARegister w, - Register a) { - __ st_h(w, MemOperand(a, 0)); -} - TEST(MSA_fill_copy) { CcTest::InitializeVM(); Isolate* isolate = CcTest::i_isolate(); @@ -5881,13 +6000,6 @@ TEST(MSA_fill_copy_3) { 
CHECK_EQ(0x5555555555555555, t[1].d0); } -typedef union { - uint8_t b[16]; - uint16_t h[8]; - uint32_t w[4]; - uint64_t d[2]; -} msa_reg_t; - template void run_msa_insert(int32_t rs_value, int n, msa_reg_t* w) { Isolate* isolate = CcTest::i_isolate(); @@ -5914,7 +6026,7 @@ void run_msa_insert(int32_t rs_value, int n, msa_reg_t* w) { UNREACHABLE(); } - store_uint64_elements_of_vector(assm, w0, a0); + store_elements_of_vector(assm, w0, a0); __ jr(ra); __ nop(); @@ -5987,6 +6099,152 @@ TEST(MSA_insert) { } } +TEST(MSA_move_v) { + if (!IsMipsArchVariant(kMips32r6) || !CpuFeatures::IsSupported(MIPS_SIMD)) + return; + CcTest::InitializeVM(); + Isolate* isolate = CcTest::i_isolate(); + HandleScope scope(isolate); + + typedef struct { + uint64_t ws_lo; + uint64_t ws_hi; + uint64_t wd_lo; + uint64_t wd_hi; + } T; + T t[] = {{0x20b9cc4f1a83e0c5, 0xa27e1b5f2f5bb18a, 0x1e86678b52f8e1ff, + 0x706e51290ac76fb9}, + {0x4414aed7883ffd18, 0x047d183a06b67016, 0x4ef258cf8d822870, + 0x2686b73484c2e843}, + {0xd38ff9d048884ffc, 0x6dc63a57c0943ca7, 0x8520ca2f3e97c426, + 0xa9913868fb819c59}}; + + for (unsigned i = 0; i < arraysize(t); ++i) { + MacroAssembler assm(isolate, nullptr, 0, + v8::internal::CodeObjectRequired::kYes); + CpuFeatureScope fscope(&assm, MIPS_SIMD); + + load_elements_of_vector(assm, &t[i].ws_lo, w0, t0, t1); + load_elements_of_vector(assm, &t[i].wd_lo, w2, t0, t1); + __ move_v(w2, w0); + store_elements_of_vector(assm, w2, a0); + + __ jr(ra); + __ nop(); + + CodeDesc desc; + assm.GetCode(isolate, &desc); + Handle code = + isolate->factory()->NewCode(desc, Code::STUB, Handle()); +#ifdef OBJECT_PRINT + code->Print(std::cout); +#endif + F3 f = FUNCTION_CAST(code->entry()); + (CALL_GENERATED_CODE(isolate, f, &t[i].wd_lo, 0, 0, 0, 0)); + CHECK_EQ(t[i].ws_lo, t[i].wd_lo); + CHECK_EQ(t[i].ws_hi, t[i].wd_hi); + } +} + +template +void run_msa_sldi(OperFunc GenerateOperation, + ExpectFunc GenerateExpectedResult) { + Isolate* isolate = CcTest::i_isolate(); + HandleScope scope(isolate); + + typedef struct { + uint64_t ws_lo; + uint64_t ws_hi; + uint64_t wd_lo; + uint64_t wd_hi; + } T; + T t[] = {{0x20b9cc4f1a83e0c5, 0xa27e1b5f2f5bb18a, 0x1e86678b52f8e1ff, + 0x706e51290ac76fb9}, + {0x4414aed7883ffd18, 0x047d183a06b67016, 0x4ef258cf8d822870, + 0x2686b73484c2e843}, + {0xd38ff9d048884ffc, 0x6dc63a57c0943ca7, 0x8520ca2f3e97c426, + 0xa9913868fb819c59}}; + uint64_t res[2]; + + for (unsigned i = 0; i < arraysize(t); ++i) { + MacroAssembler assm(isolate, nullptr, 0, + v8::internal::CodeObjectRequired::kYes); + CpuFeatureScope fscope(&assm, MIPS_SIMD); + load_elements_of_vector(assm, &t[i].ws_lo, w0, t0, t1); + load_elements_of_vector(assm, &t[i].wd_lo, w2, t0, t1); + GenerateOperation(assm); + store_elements_of_vector(assm, w2, a0); + + __ jr(ra); + __ nop(); + + CodeDesc desc; + assm.GetCode(isolate, &desc); + Handle code = + isolate->factory()->NewCode(desc, Code::STUB, Handle()); +#ifdef OBJECT_PRINT + code->Print(std::cout); +#endif + F3 f = FUNCTION_CAST(code->entry()); + (CALL_GENERATED_CODE(isolate, f, &res[0], 0, 0, 0, 0)); + GenerateExpectedResult(reinterpret_cast(&t[i].ws_lo), + reinterpret_cast(&t[i].wd_lo)); + CHECK_EQ(res[0], t[i].wd_lo); + CHECK_EQ(res[1], t[i].wd_hi); + } +} + +TEST(MSA_sldi) { + if (!IsMipsArchVariant(kMips32r6) || !CpuFeatures::IsSupported(MIPS_SIMD)) + return; + CcTest::InitializeVM(); + +#define SLDI_DF(s, k) \ + uint8_t v[32]; \ + for (unsigned i = 0; i < s; i++) { \ + v[i] = ws[s * k + i]; \ + v[i + s] = wd[s * k + i]; \ + } \ + for (unsigned i = 0; i < s; i++) { \ + wd[s * k + 
i] = v[i + n]; \ + } + + for (int n = 0; n < 16; ++n) { + run_msa_sldi([n](MacroAssembler& assm) { __ sldi_b(w2, w0, n); }, + [n](uint8_t* ws, uint8_t* wd) { + SLDI_DF(kMSARegSize / sizeof(int8_t) / kBitsPerByte, 0) + }); + } + + for (int n = 0; n < 8; ++n) { + run_msa_sldi([n](MacroAssembler& assm) { __ sldi_h(w2, w0, n); }, + [n](uint8_t* ws, uint8_t* wd) { + for (int k = 0; k < 2; ++k) { + SLDI_DF(kMSARegSize / sizeof(int16_t) / kBitsPerByte, k) + } + }); + } + + for (int n = 0; n < 4; ++n) { + run_msa_sldi([n](MacroAssembler& assm) { __ sldi_w(w2, w0, n); }, + [n](uint8_t* ws, uint8_t* wd) { + for (int k = 0; k < 4; ++k) { + SLDI_DF(kMSARegSize / sizeof(int32_t) / kBitsPerByte, k) + } + }); + } + + for (int n = 0; n < 2; ++n) { + run_msa_sldi([n](MacroAssembler& assm) { __ sldi_d(w2, w0, n); }, + [n](uint8_t* ws, uint8_t* wd) { + for (int k = 0; k < 8; ++k) { + SLDI_DF(kMSARegSize / sizeof(int64_t) / kBitsPerByte, k) + } + }); + } +#undef SLDI_DF +} + void run_msa_ctc_cfc(uint32_t value) { Isolate* isolate = CcTest::i_isolate(); HandleScope scope(isolate); @@ -6110,7 +6368,7 @@ void run_msa_i8(SecondaryField opcode, uint64_t ws_lo, uint64_t ws_hi, UNREACHABLE(); } - store_uint64_elements_of_vector(assm, w2, a0); + store_elements_of_vector(assm, w2, a0); __ jr(ra); __ nop(); @@ -6403,11 +6661,11 @@ void run_msa_i5(struct TestCaseMsaI5* input, bool i5_sign_ext, int32_t i5 = i5_sign_ext ? static_cast(input->i5 << 27) >> 27 : input->i5; - load_uint64_elements_of_vector(assm, &(input->ws_lo), w0, t0, t1); + load_elements_of_vector(assm, &(input->ws_lo), w0, t0, t1); GenerateI5InstructionFunc(assm, i5); - store_uint64_elements_of_vector(assm, w2, a0); + store_elements_of_vector(assm, w2, a0); __ jr(ra); __ nop(); @@ -6814,11 +7072,9 @@ struct TestCaseMsa2R { uint64_t exp_res_hi; }; -template +template void run_msa_2r(const struct TestCaseMsa2R* input, - Func Generate2RInstructionFunc, - FuncLoad load_elements_of_vector, - FuncStore store_elements_of_vector) { + Func Generate2RInstructionFunc) { Isolate* isolate = CcTest::i_isolate(); HandleScope scope(isolate); @@ -6846,17 +7102,8 @@ void run_msa_2r(const struct TestCaseMsa2R* input, (CALL_GENERATED_CODE(isolate, f, &res, 0, 0, 0, 0)); - if (store_elements_of_vector == store_uint64_elements_of_vector) { - CHECK_EQ(input->exp_res_lo, res.d[0]); - CHECK_EQ(input->exp_res_hi, res.d[1]); - } else if (store_elements_of_vector == store_uint32_elements_of_vector) { - const uint32_t* exp_res = - reinterpret_cast(&input->exp_res_lo); - CHECK_EQ(exp_res[0], res.w[0]); - CHECK_EQ(exp_res[1], res.w[1]); - CHECK_EQ(exp_res[2], res.w[2]); - CHECK_EQ(exp_res[3], res.w[3]); - } + CHECK_EQ(input->exp_res_lo, res.d[0]); + CHECK_EQ(input->exp_res_hi, res.d[1]); } TEST(MSA_pcnt) { @@ -6907,14 +7154,10 @@ TEST(MSA_pcnt) { {0xf35862e13e38f8b0, 0x4f41ffdef2bfe636, 0x20, 0x2a}}; for (size_t i = 0; i < sizeof(tc_b) / sizeof(TestCaseMsa2R); ++i) { - run_msa_2r(&tc_b[i], [](MacroAssembler& assm) { __ pcnt_b(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_h[i], [](MacroAssembler& assm) { __ pcnt_h(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_w[i], [](MacroAssembler& assm) { __ pcnt_w(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_d[i], [](MacroAssembler& assm) { __ pcnt_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + run_msa_2r(&tc_b[i], [](MacroAssembler& assm) { __ pcnt_b(w2, w0); 
}); + run_msa_2r(&tc_h[i], [](MacroAssembler& assm) { __ pcnt_h(w2, w0); }); + run_msa_2r(&tc_w[i], [](MacroAssembler& assm) { __ pcnt_w(w2, w0); }); + run_msa_2r(&tc_d[i], [](MacroAssembler& assm) { __ pcnt_d(w2, w0); }); } } @@ -6966,14 +7209,10 @@ TEST(MSA_nlzc) { {0x00000000e338f8b0, 0x0754534acab32654, 0x20, 0x5}}; for (size_t i = 0; i < sizeof(tc_b) / sizeof(TestCaseMsa2R); ++i) { - run_msa_2r(&tc_b[i], [](MacroAssembler& assm) { __ nlzc_b(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_h[i], [](MacroAssembler& assm) { __ nlzc_h(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_w[i], [](MacroAssembler& assm) { __ nlzc_w(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_d[i], [](MacroAssembler& assm) { __ nlzc_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + run_msa_2r(&tc_b[i], [](MacroAssembler& assm) { __ nlzc_b(w2, w0); }); + run_msa_2r(&tc_h[i], [](MacroAssembler& assm) { __ nlzc_h(w2, w0); }); + run_msa_2r(&tc_w[i], [](MacroAssembler& assm) { __ nlzc_w(w2, w0); }); + run_msa_2r(&tc_d[i], [](MacroAssembler& assm) { __ nlzc_d(w2, w0); }); } } @@ -7025,14 +7264,10 @@ TEST(MSA_nloc) { {0xFFFFFFFF1CC7074F, 0xF8ABACB5354CD9AB, 0x20, 0x5}}; for (size_t i = 0; i < sizeof(tc_b) / sizeof(TestCaseMsa2R); ++i) { - run_msa_2r(&tc_b[i], [](MacroAssembler& assm) { __ nloc_b(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_h[i], [](MacroAssembler& assm) { __ nloc_h(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_w[i], [](MacroAssembler& assm) { __ nloc_w(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_d[i], [](MacroAssembler& assm) { __ nloc_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + run_msa_2r(&tc_b[i], [](MacroAssembler& assm) { __ nloc_b(w2, w0); }); + run_msa_2r(&tc_h[i], [](MacroAssembler& assm) { __ nloc_h(w2, w0); }); + run_msa_2r(&tc_w[i], [](MacroAssembler& assm) { __ nloc_w(w2, w0); }); + run_msa_2r(&tc_d[i], [](MacroAssembler& assm) { __ nloc_d(w2, w0); }); } } @@ -7093,13 +7328,11 @@ TEST(MSA_fclass) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_F_U); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ fclass_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ fclass_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_D_U); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ fclass_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ fclass_d(w2, w0); }); } #undef BIT @@ -7165,13 +7398,11 @@ TEST(MSA_ftrunc_s) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_F_I); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ ftrunc_s_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ ftrunc_s_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_D_I); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ ftrunc_s_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ ftrunc_s_d(w2, w0); }); } } @@ -7204,13 +7435,11 @@ 
TEST(MSA_ftrunc_u) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_F_U); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ ftrunc_u_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ ftrunc_u_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_D_U); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ ftrunc_u_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ ftrunc_u_d(w2, w0); }); } } @@ -7249,13 +7478,11 @@ TEST(MSA_fsqrt) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_F_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ fsqrt_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ fsqrt_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_D_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ fsqrt_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ fsqrt_d(w2, w0); }); } } @@ -7279,13 +7506,11 @@ TEST(MSA_frsqrt) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_F_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ frsqrt_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ frsqrt_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_D_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ frsqrt_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ frsqrt_d(w2, w0); }); } } @@ -7311,13 +7536,11 @@ TEST(MSA_frcp) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_F_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ frcp_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ frcp_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_D_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ frcp_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ frcp_d(w2, w0); }); } } @@ -7332,8 +7555,7 @@ void test_frint_s(size_t data_size, TestCaseMsa2RF_F_F tc_d[], __ ctcmsa(msareg, t0); __ frint_w(w2, w0); __ ctcmsa(msareg, t1); - }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + }); } } @@ -7348,8 +7570,7 @@ void test_frint_d(size_t data_size, TestCaseMsa2RF_D_D tc_d[], __ ctcmsa(msareg, t0); __ frint_d(w2, w0); __ ctcmsa(msareg, t1); - }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + }); } } @@ -7431,14 +7652,12 @@ TEST(MSA_flog2) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_F_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ flog2_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ flog2_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_D_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ flog2_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ flog2_d(w2, w0); }); } } @@ 
-7453,8 +7672,7 @@ void test_ftint_s_s(size_t data_size, TestCaseMsa2RF_F_I tc_d[], __ ctcmsa(msareg, t0); __ ftint_s_w(w2, w0); __ ctcmsa(msareg, t1); - }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + }); } } @@ -7469,8 +7687,7 @@ void test_ftint_s_d(size_t data_size, TestCaseMsa2RF_D_I tc_d[], __ ctcmsa(msareg, t0); __ ftint_s_d(w2, w0); __ ctcmsa(msareg, t1); - }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + }); } } @@ -7567,8 +7784,7 @@ void test_ftint_u_s(size_t data_size, TestCaseMsa2RF_F_U tc_d[], __ ctcmsa(msareg, t0); __ ftint_u_w(w2, w0); __ ctcmsa(msareg, t1); - }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + }); } } @@ -7583,8 +7799,7 @@ void test_ftint_u_d(size_t data_size, TestCaseMsa2RF_D_U tc_d[], __ ctcmsa(msareg, t0); __ ftint_u_d(w2, w0); __ ctcmsa(msareg, t1); - }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + }); } } @@ -7700,13 +7915,11 @@ TEST(MSA_ffint_u) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_U_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ ffint_u_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ ffint_u_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_U_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ ffint_u_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ ffint_u_d(w2, w0); }); } } @@ -7742,13 +7955,11 @@ TEST(MSA_ffint_s) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_I_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ ffint_s_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ ffint_s_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_I_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ ffint_s_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ ffint_s_d(w2, w0); }); } } @@ -7801,13 +8012,11 @@ TEST(MSA_fexupl) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_U16_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ fexupl_w(w2, w0); }, - load_uint16_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ fexupl_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_F_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ fexupl_d(w2, w0); }, - load_uint32_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ fexupl_d(w2, w0); }); } } @@ -7836,13 +8045,11 @@ TEST(MSA_fexupr) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_U16_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ fexupr_w(w2, w0); }, - load_uint16_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ fexupr_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_F_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ fexupr_d(w2, w0); }, - load_uint32_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ fexupr_d(w2, w0); }); } } @@ -7871,13 +8078,11 @@ TEST(MSA_ffql) { for (size_t i = 0; i < sizeof(tc_s) / 
sizeof(TestCaseMsa2RF_U16_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ ffql_w(w2, w0); }, - load_uint16_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ ffql_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_U32_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ ffql_d(w2, w0); }, - load_uint32_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ ffql_d(w2, w0); }); } } @@ -7897,13 +8102,11 @@ TEST(MSA_ffqr) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_U16_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ ffqr_w(w2, w0); }, - load_uint16_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ ffqr_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_U32_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ ffqr_d(w2, w0); }, - load_uint32_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ ffqr_d(w2, w0); }); } } @@ -7928,13 +8131,13 @@ void run_msa_vector(struct TestCaseMsaVector* input, CpuFeatureScope fscope(&assm, MIPS_SIMD); msa_reg_t res; - load_uint64_elements_of_vector(assm, &(input->ws_lo), w0, t0, t1); - load_uint64_elements_of_vector(assm, &(input->wt_lo), w2, t0, t1); - load_uint64_elements_of_vector(assm, &(input->wd_lo), w4, t0, t1); + load_elements_of_vector(assm, &(input->ws_lo), w0, t0, t1); + load_elements_of_vector(assm, &(input->wt_lo), w2, t0, t1); + load_elements_of_vector(assm, &(input->wd_lo), w4, t0, t1); GenerateVectorInstructionFunc(assm); - store_uint64_elements_of_vector(assm, w4, a0); + store_elements_of_vector(assm, w4, a0); __ jr(ra); __ nop(); @@ -8018,12 +8221,12 @@ void run_msa_bit(struct TestCaseMsaBit* input, InstFunc GenerateInstructionFunc, CpuFeatureScope fscope(&assm, MIPS_SIMD); msa_reg_t res; - load_uint64_elements_of_vector(assm, &(input->ws_lo), w0, t0, t1); - load_uint64_elements_of_vector(assm, &(input->wd_lo), w2, t0, t1); + load_elements_of_vector(assm, &(input->ws_lo), w0, t0, t1); + load_elements_of_vector(assm, &(input->wd_lo), w2, t0, t1); GenerateInstructionFunc(assm, input->m); - store_uint64_elements_of_vector(assm, w2, a0); + store_elements_of_vector(assm, w2, a0); __ jr(ra); __ nop(); @@ -8497,7 +8700,7 @@ void run_msa_i10(int32_t input, InstFunc GenerateVectorInstructionFunc, GenerateVectorInstructionFunc(assm, input); - store_uint64_elements_of_vector(assm, w0, a0); + store_elements_of_vector(assm, w0, a0); __ jr(ra); __ nop(); @@ -8626,7 +8829,6 @@ TEST(MSA_load_store_vector) { __ st_d(w0, MemOperand(a1, i)); } }); -#undef LDI_DF } struct TestCaseMsa3R { @@ -8650,15 +8852,14 @@ void run_msa_3r(struct TestCaseMsa3R* input, InstFunc GenerateI5InstructionFunc, v8::internal::CodeObjectRequired::kYes); CpuFeatureScope fscope(&assm, MIPS_SIMD); msa_reg_t res; - uint64_t expected; - load_uint64_elements_of_vector(assm, &(input->wt_lo), w0, t0, t1); - load_uint64_elements_of_vector(assm, &(input->ws_lo), w1, t0, t1); - load_uint64_elements_of_vector(assm, &(input->wd_lo), w2, t0, t1); + load_elements_of_vector(assm, &(input->wt_lo), w0, t0, t1); + load_elements_of_vector(assm, &(input->ws_lo), w1, t0, t1); + load_elements_of_vector(assm, &(input->wd_lo), w2, t0, t1); GenerateI5InstructionFunc(assm); - store_uint64_elements_of_vector(assm, w2, a0); + store_elements_of_vector(assm, w2, a0); __ jr(ra); __ nop(); @@ 
-8674,14 +8875,12 @@ void run_msa_3r(struct TestCaseMsa3R* input, InstFunc GenerateI5InstructionFunc, (CALL_GENERATED_CODE(isolate, f, &res, 0, 0, 0, 0)); - expected = GenerateOperationFunc(input->ws_lo, input->wt_lo, input->wd_lo); - if (expected != Unpredictable) { - CHECK_EQ(expected, res.d[0]); + GenerateOperationFunc(&input->ws_lo, &input->wt_lo, &input->wd_lo); + if (input->wd_lo != Unpredictable) { + CHECK_EQ(input->wd_lo, res.d[0]); } - - expected = GenerateOperationFunc(input->ws_hi, input->wt_hi, input->wd_hi); - if (expected != Unpredictable) { - CHECK_EQ(expected, res.d[1]); + if (input->wd_hi != Unpredictable) { + CHECK_EQ(input->wd_hi, res.d[1]); } } @@ -8719,479 +8918,630 @@ TEST(MSA_3R_instructions) { {0xffff00000000ffff, 0xffff00000000ffff, 0xffff00000000ffff, 0xffff00000000ffff, 0xffff00000000ffff, 0xffff00000000ffff}}; -#define SLL_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T src_op = static_cast((ws >> shift) & mask); \ - T shift_op = static_cast((wt >> shift) & mask) % size_in_bits; \ - res |= (static_cast(src_op << shift_op) & mask) << shift; \ - } \ - return res +#define SLL_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T src_op = static_cast((ws[i] >> shift) & mask); \ + T shift_op = static_cast((wt[i] >> shift) & mask) % size_in_bits; \ + res |= (static_cast(src_op << shift_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define SRA_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T src_op = static_cast((ws >> shift) & mask); \ - T shift_op = ((wt >> shift) & mask) % size_in_bits; \ - res |= \ - (static_cast(ArithmeticShiftRight(src_op, shift_op) & mask)) \ - << shift; \ - } \ - return res - -#define SRL_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T src_op = static_cast((ws >> shift) & mask); \ - T shift_op = static_cast(((wt >> shift) & mask) % size_in_bits); \ - res |= (static_cast(src_op >> shift_op) & mask) << shift; \ - } \ - return res - -#define BCRL_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T src_op = static_cast((ws >> shift) & mask); \ - T shift_op = static_cast(((wt >> shift) & mask) % size_in_bits); \ - T r = (static_cast(~(1ull << shift_op)) & src_op) & mask; \ - res |= static_cast(r) << shift; \ - } \ - return res - -#define BSET_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T src_op = static_cast((ws >> shift) & mask); \ - T shift_op = static_cast(((wt >> shift) & mask) % size_in_bits); \ - T r = (static_cast(1ull << shift_op) | src_op) & mask; \ - res |= static_cast(r) << shift; \ - } \ - return res - -#define BNEG_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T src_op = static_cast((ws >> shift) & mask); \ - T shift_op = static_cast(((wt >> shift) & 
mask) % size_in_bits); \ - T r = (static_cast(1ull << shift_op) ^ src_op) & mask; \ - res |= static_cast(r) << shift; \ - } \ - return res - -#define BINSL_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wd_op = static_cast((wd >> shift) & mask); \ - T shift_op = static_cast(((wt >> shift) & mask) % size_in_bits); \ - int bits = shift_op + 1; \ - T r; \ - if (bits == size_in_bits) { \ - r = static_cast(ws_op); \ - } else { \ - uint64_t mask2 = ((1ull << bits) - 1) << (size_in_bits - bits); \ - r = static_cast((static_cast(mask2) & ws_op) | \ - (static_cast(~mask2) & wd_op)); \ - } \ - res |= static_cast(r) << shift; \ - } \ - return res - -#define BINSR_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wd_op = static_cast((wd >> shift) & mask); \ - T shift_op = static_cast(((wt >> shift) & mask) % size_in_bits); \ - int bits = shift_op + 1; \ - T r; \ - if (bits == size_in_bits) { \ - r = static_cast(ws_op); \ - } else { \ - uint64_t mask2 = (1ull << bits) - 1; \ - r = static_cast((static_cast(mask2) & ws_op) | \ - (static_cast(~mask2) & wd_op)); \ - } \ - res |= static_cast(r) << shift; \ - } \ - return res - -#define ADDV_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(ws_op + wt_op) & mask) << shift; \ - } \ - return res - -#define SUBV_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(ws_op - wt_op) & mask) << shift; \ - } \ - return res - -#define MAX_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(Max(ws_op, wt_op)) & mask) << shift; \ - } \ - return res - -#define MIN_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(Min(ws_op, wt_op)) & mask) << shift; \ - } \ - return res - -#define MAXA_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(Nabs(ws_op) < Nabs(wt_op) ? 
ws_op : wt_op) & \ - mask) \ - << shift; \ - } \ - return res - -#define MINA_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(Nabs(ws_op) > Nabs(wt_op) ? ws_op : wt_op) & \ - mask) \ - << shift; \ - } \ - return res - -#define CEQ_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= \ - (static_cast(!Compare(ws_op, wt_op) ? -1ull : 0ull) & mask) \ - << shift; \ - } \ - return res - -#define CLT_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= \ - (static_cast((Compare(ws_op, wt_op) == -1) ? -1ull : 0ull) & \ - mask) \ - << shift; \ - } \ - return res - -#define CLE_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= \ - (static_cast((Compare(ws_op, wt_op) != 1) ? -1ull : 0ull) & \ - mask) \ - << shift; \ - } \ - return res - -#define ADD_A_DF(T, lanes, mask) \ - uint64_t res = 0; \ +#define SRA_DF(T, lanes, mask) \ int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(Abs(ws_op) + Abs(wt_op)) & mask) << shift; \ - } \ - return res + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T src_op = static_cast((ws[i] >> shift) & mask); \ + T shift_op = ((wt[i] >> shift) & mask) % size_in_bits; \ + res |= (static_cast(ArithmeticShiftRight(src_op, shift_op) & \ + mask)) \ + << shift; \ + } \ + wd[i] = res; \ + } -#define ADDS_A_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = Nabs(static_cast((ws >> shift) & mask)); \ - T wt_op = Nabs(static_cast((wt >> shift) & mask)); \ - T r; \ - if (ws_op < -std::numeric_limits::max() - wt_op) { \ - r = std::numeric_limits::max(); \ - } else { \ - r = -(ws_op + wt_op); \ - } \ - res |= (static_cast(r) & mask) << shift; \ - } \ - return res +#define SRL_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T src_op = static_cast((ws[i] >> shift) & mask); \ + T shift_op = static_cast(((wt[i] >> shift) & mask) % size_in_bits); \ + res |= (static_cast(src_op >> shift_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define ADDS_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws 
>> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(SaturateAdd(ws_op, wt_op)) & mask) << shift; \ - } \ - return res +#define BCRL_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T src_op = static_cast((ws[i] >> shift) & mask); \ + T shift_op = static_cast(((wt[i] >> shift) & mask) % size_in_bits); \ + T r = (static_cast(~(1ull << shift_op)) & src_op) & mask; \ + res |= static_cast(r) << shift; \ + } \ + wd[i] = res; \ + } -#define AVE_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(((wt_op & ws_op) + ((ws_op ^ wt_op) >> 1)) & \ - mask)) \ - << shift; \ - } \ - return res +#define BSET_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T src_op = static_cast((ws[i] >> shift) & mask); \ + T shift_op = static_cast(((wt[i] >> shift) & mask) % size_in_bits); \ + T r = (static_cast(1ull << shift_op) | src_op) & mask; \ + res |= static_cast(r) << shift; \ + } \ + wd[i] = res; \ + } -#define AVER_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(((wt_op | ws_op) - ((ws_op ^ wt_op) >> 1)) & \ - mask)) \ - << shift; \ - } \ - return res +#define BNEG_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T src_op = static_cast((ws[i] >> shift) & mask); \ + T shift_op = static_cast(((wt[i] >> shift) & mask) % size_in_bits); \ + T r = (static_cast(1ull << shift_op) ^ src_op) & mask; \ + res |= static_cast(r) << shift; \ + } \ + wd[i] = res; \ + } -#define SUBS_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(SaturateSub(ws_op, wt_op)) & mask) << shift; \ - } \ - return res +#define BINSL_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wd_op = static_cast((wd[i] >> shift) & mask); \ + T shift_op = static_cast(((wt[i] >> shift) & mask) % size_in_bits); \ + int bits = shift_op + 1; \ + T r; \ + if (bits == size_in_bits) { \ + r = static_cast(ws_op); \ + } else { \ + uint64_t mask2 = ((1ull << bits) - 1) << (size_in_bits - bits); \ + r = static_cast((static_cast(mask2) & ws_op) | \ + (static_cast(~mask2) & wd_op)); \ + } \ + res |= static_cast(r) << shift; \ + } \ + wd[i] = res; \ + } -#define SUBSUS_U_DF(T, lanes, mask) \ - typedef typename std::make_unsigned::type uT; \ - uint64_t res = 0; \ - int 
size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - uT ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - T r; \ - if (wt_op > 0) { \ - uT wtu = static_cast(wt_op); \ - if (wtu > ws_op) { \ - r = 0; \ - } else { \ - r = static_cast(ws_op - wtu); \ - } \ - } else { \ - if (ws_op > std::numeric_limits::max() + wt_op) { \ - r = static_cast(std::numeric_limits::max()); \ - } else { \ - r = static_cast(ws_op - wt_op); \ - } \ - } \ - res |= (static_cast(r) & mask) << shift; \ - } \ - return res +#define BINSR_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wd_op = static_cast((wd[i] >> shift) & mask); \ + T shift_op = static_cast(((wt[i] >> shift) & mask) % size_in_bits); \ + int bits = shift_op + 1; \ + T r; \ + if (bits == size_in_bits) { \ + r = static_cast(ws_op); \ + } else { \ + uint64_t mask2 = (1ull << bits) - 1; \ + r = static_cast((static_cast(mask2) & ws_op) | \ + (static_cast(~mask2) & wd_op)); \ + } \ + res |= static_cast(r) << shift; \ + } \ + wd[i] = res; \ + } -#define SUBSUU_S_DF(T, lanes, mask) \ - typedef typename std::make_unsigned::type uT; \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - uT ws_op = static_cast((ws >> shift) & mask); \ - uT wt_op = static_cast((wt >> shift) & mask); \ - uT wdu; \ - T r; \ - if (ws_op > wt_op) { \ - wdu = ws_op - wt_op; \ - if (wdu > std::numeric_limits::max()) { \ - r = std::numeric_limits::max(); \ - } else { \ - r = static_cast(wdu); \ - } \ - } else { \ - wdu = wt_op - ws_op; \ - CHECK(-std::numeric_limits::max() == \ - std::numeric_limits::min() + 1); \ - if (wdu <= std::numeric_limits::max()) { \ - r = -static_cast(wdu); \ - } else { \ - r = std::numeric_limits::min(); \ - } \ - } \ - res |= (static_cast(r) & mask) << shift; \ - } \ - return res +#define ADDV_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(ws_op + wt_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define ASUB_S_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(Abs(ws_op - wt_op)) & mask) << shift; \ - } \ - return res +#define SUBV_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(ws_op - wt_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define ASUB_U_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & 
mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(ws_op > wt_op ? ws_op - wt_op \ - : wt_op - ws_op) & \ - mask) \ - << shift; \ - } \ - return res +#define MAX_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(Max(ws_op, wt_op)) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define MULV_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(ws_op * wt_op) & mask) << shift; \ - } \ - return res +#define MIN_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(Min(ws_op, wt_op)) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define MADDV_DF(T, lanes, mask) \ - uint64_t res = 0; \ +#define MAXA_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= \ + (static_cast(Nabs(ws_op) < Nabs(wt_op) ? ws_op : wt_op) & \ + mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define MINA_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= \ + (static_cast(Nabs(ws_op) > Nabs(wt_op) ? ws_op : wt_op) & \ + mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define CEQ_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(!Compare(ws_op, wt_op) ? -1ull : 0ull) & \ + mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define CLT_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast((Compare(ws_op, wt_op) == -1) ? 
-1ull \ + : 0ull) & \ + mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define CLE_DF(T, lanes, mask) \ int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - T wd_op = static_cast((wd >> shift) & mask); \ - res |= (static_cast(wd_op + ws_op * wt_op) & mask) << shift; \ - } \ - return res + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast((Compare(ws_op, wt_op) != 1) ? -1ull \ + : 0ull) & \ + mask) \ + << shift; \ + } \ + wd[i] = res; \ + } -#define MSUBV_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - T wd_op = static_cast((wd >> shift) & mask); \ - res |= (static_cast(wd_op - ws_op * wt_op) & mask) << shift; \ - } \ - return res +#define ADD_A_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(Abs(ws_op) + Abs(wt_op)) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define DIV_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - if (wt_op == 0) { \ - res = Unpredictable; \ - break; \ - } \ - res |= (static_cast(ws_op / wt_op) & mask) << shift; \ - } \ - return res +#define ADDS_A_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = Nabs(static_cast((ws[i] >> shift) & mask)); \ + T wt_op = Nabs(static_cast((wt[i] >> shift) & mask)); \ + T r; \ + if (ws_op < -std::numeric_limits::max() - wt_op) { \ + r = std::numeric_limits::max(); \ + } else { \ + r = -(ws_op + wt_op); \ + } \ + res |= (static_cast(r) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define MOD_DF(T, lanes, mask) \ - uint64_t res = 0; \ +#define ADDS_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(SaturateAdd(ws_op, wt_op)) & mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define AVE_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast( \ + ((wt_op & ws_op) + ((ws_op ^ wt_op) >> 1)) & mask)) \ + << shift; \ + } \ + wd[i] = res; \ + } + 
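// Illustrative sketch only (ave_u8 is a hypothetical helper, not part of this
// test file, and assumes <cstdint> is available): the AVE_DF expected-value
// macro above relies on the identity a + b == 2 * (a & b) + (a ^ b), so
// (a & b) + ((a ^ b) >> 1) gives the truncated average (a + b) >> 1 without
// ever overflowing the lane width.
#include <cstdint>

constexpr uint8_t ave_u8(uint8_t a, uint8_t b) {
  // Overflow-free per-lane average, mirroring the expression used in AVE_DF.
  return static_cast<uint8_t>((a & b) + ((a ^ b) >> 1));
}
static_assert(ave_u8(0xff, 0xff) == 0xff, "no overflow even though a + b exceeds UINT8_MAX");
static_assert(ave_u8(0x10, 0x20) == 0x18, "(16 + 32) / 2 == 24");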
+#define AVER_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast( \ + ((wt_op | ws_op) - ((ws_op ^ wt_op) >> 1)) & mask)) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define SUBS_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(SaturateSub(ws_op, wt_op)) & mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define SUBSUS_U_DF(T, lanes, mask) \ + typedef typename std::make_unsigned::type uT; \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + uT ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + T r; \ + if (wt_op > 0) { \ + uT wtu = static_cast(wt_op); \ + if (wtu > ws_op) { \ + r = 0; \ + } else { \ + r = static_cast(ws_op - wtu); \ + } \ + } else { \ + if (ws_op > std::numeric_limits::max() + wt_op) { \ + r = static_cast(std::numeric_limits::max()); \ + } else { \ + r = static_cast(ws_op - wt_op); \ + } \ + } \ + res |= (static_cast(r) & mask) << shift; \ + } \ + wd[i] = res; \ + } + +#define SUBSUU_S_DF(T, lanes, mask) \ + typedef typename std::make_unsigned::type uT; \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + uT ws_op = static_cast((ws[i] >> shift) & mask); \ + uT wt_op = static_cast((wt[i] >> shift) & mask); \ + uT wdu; \ + T r; \ + if (ws_op > wt_op) { \ + wdu = ws_op - wt_op; \ + if (wdu > std::numeric_limits::max()) { \ + r = std::numeric_limits::max(); \ + } else { \ + r = static_cast(wdu); \ + } \ + } else { \ + wdu = wt_op - ws_op; \ + CHECK(-std::numeric_limits::max() == \ + std::numeric_limits::min() + 1); \ + if (wdu <= std::numeric_limits::max()) { \ + r = -static_cast(wdu); \ + } else { \ + r = std::numeric_limits::min(); \ + } \ + } \ + res |= (static_cast(r) & mask) << shift; \ + } \ + wd[i] = res; \ + } + +#define ASUB_S_DF(T, lanes, mask) \ int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - if (wt_op == 0) { \ - res = Unpredictable; \ - break; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(Abs(ws_op - wt_op)) & mask) << shift; \ } \ - res |= (static_cast(wt_op != 0 ? 
ws_op % wt_op : 0) & mask) \ - << shift; \ - } \ - return res + wd[i] = res; \ + } -#define SRAR_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T src_op = static_cast((ws >> shift) & mask); \ - T shift_op = ((wt >> shift) & mask) % size_in_bits; \ - uint32_t bit = shift_op == 0 ? 0 : src_op >> (shift_op - 1) & 1; \ - res |= \ - (static_cast(ArithmeticShiftRight(src_op, shift_op) + bit) & \ - mask) \ - << shift; \ - } \ - return res +#define ASUB_U_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(ws_op > wt_op ? ws_op - wt_op \ + : wt_op - ws_op) & \ + mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define MULV_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(ws_op * wt_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } + +#define MADDV_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + T wd_op = static_cast((wd[i] >> shift) & mask); \ + res |= (static_cast(wd_op + ws_op * wt_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } + +#define MSUBV_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + T wd_op = static_cast((wd[i] >> shift) & mask); \ + res |= (static_cast(wd_op - ws_op * wt_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } + +#define DIV_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + if (wt_op == 0) { \ + res = Unpredictable; \ + break; \ + } \ + res |= (static_cast(ws_op / wt_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } + +#define MOD_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + if (wt_op == 0) { \ + res = Unpredictable; \ + break; \ + } \ + res |= (static_cast(wt_op != 0 ? 
ws_op % wt_op : 0) & mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define SRAR_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T src_op = static_cast((ws[i] >> shift) & mask); \ + T shift_op = ((wt[i] >> shift) & mask) % size_in_bits; \ + uint32_t bit = shift_op == 0 ? 0 : src_op >> (shift_op - 1) & 1; \ + res |= (static_cast(ArithmeticShiftRight(src_op, shift_op) + \ + bit) & \ + mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define PCKEV_DF(T, lanes, mask) \ + T* ws_p = reinterpret_cast(ws); \ + T* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes / 2; ++i) { \ + wd_p[i] = wt_p[2 * i]; \ + wd_p[i + lanes / 2] = ws_p[2 * i]; \ + } + +#define PCKOD_DF(T, lanes, mask) \ + T* ws_p = reinterpret_cast(ws); \ + T* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes / 2; ++i) { \ + wd_p[i] = wt_p[2 * i + 1]; \ + wd_p[i + lanes / 2] = ws_p[2 * i + 1]; \ + } + +#define ILVL_DF(T, lanes, mask) \ + T* ws_p = reinterpret_cast(ws); \ + T* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes / 2; ++i) { \ + wd_p[2 * i] = wt_p[i + lanes / 2]; \ + wd_p[2 * i + 1] = ws_p[i + lanes / 2]; \ + } + +#define ILVR_DF(T, lanes, mask) \ + T* ws_p = reinterpret_cast(ws); \ + T* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes / 2; ++i) { \ + wd_p[2 * i] = wt_p[i]; \ + wd_p[2 * i + 1] = ws_p[i]; \ + } + +#define ILVEV_DF(T, lanes, mask) \ + T* ws_p = reinterpret_cast(ws); \ + T* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes / 2; ++i) { \ + wd_p[2 * i] = wt_p[2 * i]; \ + wd_p[2 * i + 1] = ws_p[2 * i]; \ + } + +#define ILVOD_DF(T, lanes, mask) \ + T* ws_p = reinterpret_cast(ws); \ + T* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes / 2; ++i) { \ + wd_p[2 * i] = wt_p[2 * i + 1]; \ + wd_p[2 * i + 1] = ws_p[2 * i + 1]; \ + } + +#define VSHF_DF(T, lanes, mask) \ + T* ws_p = reinterpret_cast(ws); \ + T* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + const int mask_not_valid = 0xc0; \ + const int mask_6bits = 0x3f; \ + for (int i = 0; i < lanes; ++i) { \ + if ((wd_p[i] & mask_not_valid)) { \ + wd_p[i] = 0; \ + } else { \ + int k = (wd_p[i] & mask_6bits) % (lanes * 2); \ + wd_p[i] = k > lanes ? 
ws_p[k - lanes] : wt_p[k]; \ + } \ + } + +#define HADD_DF(T, T_small, lanes) \ + T_small* ws_p = reinterpret_cast(ws); \ + T_small* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes; ++i) { \ + wd_p[i] = static_cast(ws_p[2 * i + 1]) + static_cast(wt_p[2 * i]); \ + } + +#define HSUB_DF(T, T_small, lanes) \ + T_small* ws_p = reinterpret_cast(ws); \ + T_small* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes; ++i) { \ + wd_p[i] = static_cast(ws_p[2 * i + 1]) - static_cast(wt_p[2 * i]); \ + } #define TEST_CASE(V) \ V(sll_b, SLL_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ V(sll_h, SLL_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ V(sll_w, SLL_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ V(sll_d, SLL_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ - V(sra_b, SRA_DF, int8_t, kMSALanesByte, UINT8_MAX) \ - V(sra_h, SRA_DF, int16_t, kMSALanesHalf, UINT16_MAX) \ - V(sra_w, SRA_DF, int32_t, kMSALanesWord, UINT32_MAX) \ - V(sra_d, SRA_DF, int64_t, kMSALanesDword, UINT64_MAX) \ V(srl_b, SRL_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ V(srl_h, SRL_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ V(srl_w, SRL_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ @@ -9352,18 +9702,54 @@ TEST(MSA_3R_instructions) { V(mod_u_h, MOD_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ V(mod_u_w, MOD_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ V(mod_u_d, MOD_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ - V(srar_b, SRAR_DF, int8_t, kMSALanesByte, UINT8_MAX) \ - V(srar_h, SRAR_DF, int16_t, kMSALanesHalf, UINT16_MAX) \ - V(srar_w, SRAR_DF, int32_t, kMSALanesWord, UINT32_MAX) \ - V(srar_d, SRAR_DF, int64_t, kMSALanesDword, UINT64_MAX) \ V(srlr_b, SRAR_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ V(srlr_h, SRAR_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ V(srlr_w, SRAR_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ - V(srlr_d, SRAR_DF, uint64_t, kMSALanesDword, UINT64_MAX) + V(srlr_d, SRAR_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(pckev_b, PCKEV_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ + V(pckev_h, PCKEV_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ + V(pckev_w, PCKEV_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ + V(pckev_d, PCKEV_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(pckod_b, PCKOD_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ + V(pckod_h, PCKOD_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ + V(pckod_w, PCKOD_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ + V(pckod_d, PCKOD_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(ilvl_b, ILVL_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ + V(ilvl_h, ILVL_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ + V(ilvl_w, ILVL_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ + V(ilvl_d, ILVL_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(ilvr_b, ILVR_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ + V(ilvr_h, ILVR_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ + V(ilvr_w, ILVR_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ + V(ilvr_d, ILVR_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(ilvev_b, ILVEV_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ + V(ilvev_h, ILVEV_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ + V(ilvev_w, ILVEV_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ + V(ilvev_d, ILVEV_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(ilvod_b, ILVOD_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ + V(ilvod_h, ILVOD_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ + V(ilvod_w, ILVOD_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ + V(ilvod_d, ILVOD_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(vshf_b, VSHF_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ + V(vshf_h, VSHF_DF, uint16_t, kMSALanesHalf, 
UINT16_MAX) \ + V(vshf_w, VSHF_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ + V(vshf_d, VSHF_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(hadd_s_h, HADD_DF, int16_t, int8_t, kMSALanesHalf) \ + V(hadd_s_w, HADD_DF, int32_t, int16_t, kMSALanesWord) \ + V(hadd_s_d, HADD_DF, int64_t, int32_t, kMSALanesDword) \ + V(hadd_u_h, HADD_DF, uint16_t, uint8_t, kMSALanesHalf) \ + V(hadd_u_w, HADD_DF, uint32_t, uint16_t, kMSALanesWord) \ + V(hadd_u_d, HADD_DF, uint64_t, uint32_t, kMSALanesDword) \ + V(hsub_s_h, HSUB_DF, int16_t, int8_t, kMSALanesHalf) \ + V(hsub_s_w, HSUB_DF, int32_t, int16_t, kMSALanesWord) \ + V(hsub_s_d, HSUB_DF, int64_t, int32_t, kMSALanesDword) \ + V(hsub_u_h, HSUB_DF, uint16_t, uint8_t, kMSALanesHalf) \ + V(hsub_u_w, HSUB_DF, uint32_t, uint16_t, kMSALanesWord) \ + V(hsub_u_d, HSUB_DF, uint64_t, uint32_t, kMSALanesDword) #define RUN_TEST(instr, verify, type, lanes, mask) \ run_msa_3r(&tc[i], [](MacroAssembler& assm) { __ instr(w2, w1, w0); }, \ - [](uint64_t ws, uint64_t wt, uint64_t wd) { \ + [](uint64_t* ws, uint64_t* wt, uint64_t* wd) { \ verify(type, lanes, mask); \ }); @@ -9371,9 +9757,41 @@ TEST(MSA_3R_instructions) { TEST_CASE(RUN_TEST) } +#define RUN_TEST2(instr, verify, type, lanes, mask) \ + for (unsigned i = 0; i < arraysize(tc); i++) { \ + for (unsigned j = 0; j < 3; j++) { \ + for (unsigned k = 0; k < lanes; k++) { \ + type* element = reinterpret_cast(&tc[i]); \ + element[k + j * lanes] &= std::numeric_limits::max(); \ + } \ + } \ + } \ + run_msa_3r(&tc[i], [](MacroAssembler& assm) { __ instr(w2, w1, w0); }, \ + [](uint64_t* ws, uint64_t* wt, uint64_t* wd) { \ + verify(type, lanes, mask); \ + }); + +#define TEST_CASE2(V) \ + V(sra_b, SRA_DF, int8_t, kMSALanesByte, UINT8_MAX) \ + V(sra_h, SRA_DF, int16_t, kMSALanesHalf, UINT16_MAX) \ + V(sra_w, SRA_DF, int32_t, kMSALanesWord, UINT32_MAX) \ + V(sra_d, SRA_DF, int64_t, kMSALanesDword, UINT64_MAX) \ + V(srar_b, SRAR_DF, int8_t, kMSALanesByte, UINT8_MAX) \ + V(srar_h, SRAR_DF, int16_t, kMSALanesHalf, UINT16_MAX) \ + V(srar_w, SRAR_DF, int32_t, kMSALanesWord, UINT32_MAX) \ + V(srar_d, SRAR_DF, int64_t, kMSALanesDword, UINT64_MAX) + + for (size_t i = 0; i < arraysize(tc); ++i) { + TEST_CASE2(RUN_TEST2) + } + +#undef TEST_CASE +#undef TEST_CASE2 #undef RUN_TEST +#undef RUN_TEST2 #undef SLL_DF #undef SRL_DF +#undef SRA_DF #undef BCRL_DF #undef BSET_DF #undef BNEG_DF @@ -9404,6 +9822,15 @@ TEST(MSA_3R_instructions) { #undef DIV_DF #undef MOD_DF #undef SRAR_DF +#undef PCKEV_DF +#undef PCKOD_DF +#undef ILVL_DF +#undef ILVR_DF +#undef ILVEV_DF +#undef ILVOD_DF +#undef VSHF_DF +#undef HADD_DF +#undef HSUB_DF } // namespace internal struct TestCaseMsa3RF { @@ -9420,12 +9847,10 @@ struct ExpectedResult_MSA3RF { uint64_t exp_res_hi; }; -template +template void run_msa_3rf(const struct TestCaseMsa3RF* input, const struct ExpectedResult_MSA3RF* output, - Func Generate2RInstructionFunc, - FuncLoad load_elements_of_vector, - FuncStore store_elements_of_vector) { + Func Generate2RInstructionFunc) { Isolate* isolate = CcTest::i_isolate(); HandleScope scope(isolate); @@ -9456,28 +9881,8 @@ void run_msa_3rf(const struct TestCaseMsa3RF* input, (CALL_GENERATED_CODE(isolate, f, &res, 0, 0, 0, 0)); - if (store_elements_of_vector == store_uint64_elements_of_vector) { - CHECK_EQ(output->exp_res_lo, res.d[0]); - CHECK_EQ(output->exp_res_hi, res.d[1]); - } else if (store_elements_of_vector == store_uint32_elements_of_vector) { - const uint32_t* exp_res = - reinterpret_cast(&output->exp_res_lo); - CHECK_EQ(exp_res[0], res.w[0]); - CHECK_EQ(exp_res[1], 
res.w[1]); - CHECK_EQ(exp_res[2], res.w[2]); - CHECK_EQ(exp_res[3], res.w[3]); - } else { - const uint16_t* exp_res = - reinterpret_cast(&output->exp_res_lo); - CHECK_EQ(exp_res[0], res.h[0]); - CHECK_EQ(exp_res[1], res.h[1]); - CHECK_EQ(exp_res[2], res.h[2]); - CHECK_EQ(exp_res[3], res.h[3]); - CHECK_EQ(exp_res[4], res.h[4]); - CHECK_EQ(exp_res[5], res.h[5]); - CHECK_EQ(exp_res[6], res.h[6]); - CHECK_EQ(exp_res[7], res.h[7]); - } + CHECK_EQ(output->exp_res_lo, res.d[0]); + CHECK_EQ(output->exp_res_hi, res.d[1]); } struct TestCaseMsa3RF_F { @@ -9579,15 +9984,11 @@ TEST(MSA_floating_point_quiet_compare) { #define TEST_FP_QUIET_COMPARE_W(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint32_elements_of_vector, \ - store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); #define TEST_FP_QUIET_COMPARE_D(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint64_elements_of_vector, \ - store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); for (uint64_t i = 0; i < arraysize(tc_w); i++) { TEST_FP_QUIET_COMPARE_W(fcaf_w, &tc_w[i], &exp_res_fcaf) @@ -9702,16 +10103,14 @@ TEST(MSA_floating_point_arithmetic) { reinterpret_cast(src1), \ reinterpret_cast(function( \ src1, src2, src3, reinterpret_cast(&dst_container))), \ - [](MacroAssembler& assm) { __ instr(w2, w0, w1); }, \ - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ instr(w2, w0, w1); }); #define FP_ARITHMETIC_DF_D(instr, function, src1, src2, src3) \ run_msa_3rf( \ reinterpret_cast(src1), \ reinterpret_cast(function( \ src1, src2, src3, reinterpret_cast(&dst_container))), \ - [](MacroAssembler& assm) { __ instr(w2, w0, w1); }, \ - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ instr(w2, w0, w1); }); for (uint64_t i = 0; i < arraysize(tc_w); i++) { FP_ARITHMETIC_DF_W(fadd_w, fadd_function, &tc_w[i].ws_1, &tc_w[i].wt_1, @@ -9808,16 +10207,12 @@ TEST(MSA_fmin_fmin_a_fmax_fmax_a) { #define TEST_FP_MIN_MAX_W(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint32_elements_of_vector, \ - store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); #define TEST_FP_MIN_MAX_D(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint64_elements_of_vector, \ - store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); for (uint64_t i = 0; i < arraysize(tc_w); i++) { TEST_FP_MIN_MAX_W(fmax_w, &tc_w[i], &exp_res_fmax_w[i]) @@ -9930,17 +10325,13 @@ TEST(MSA_fixed_point_arithmetic) { {0x800000007fffffff, 0x800000007c33f15e}, {0xb5deb625939d884d, 0xe40dcbfe728756b5}}; -#define TEST_FIXED_POINT_DF_H(instruction, src, exp_res) \ - run_msa_3rf((src), (exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint16_elements_of_vector, \ - store_uint16_elements_of_vector); +#define TEST_FIXED_POINT_DF_H(instruction, src, exp_res) \ + run_msa_3rf((src), (exp_res), \ + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); -#define 
TEST_FIXED_POINT_DF_W(instruction, src, exp_res) \ - run_msa_3rf((src), (exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint32_elements_of_vector, \ - store_uint32_elements_of_vector); +#define TEST_FIXED_POINT_DF_W(instruction, src, exp_res) \ + run_msa_3rf((src), (exp_res), \ + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); for (uint64_t i = 0; i < arraysize(tc_h); i++) { TEST_FIXED_POINT_DF_H(mul_q_h, &tc_h[i], &exp_res_mul_q_h[i]) @@ -10021,16 +10412,12 @@ TEST(MSA_fexdo) { #define TEST_FEXDO_H(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint32_elements_of_vector, \ - store_uint16_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); #define TEST_FEXDO_W(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint64_elements_of_vector, \ - store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); for (uint64_t i = 0; i < arraysize(tc_w); i++) { TEST_FEXDO_H(fexdo_h, &tc_w[i], &exp_res_fexdo_w[i]) @@ -10100,16 +10487,12 @@ TEST(MSA_ftq) { #define TEST_FTQ_H(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint32_elements_of_vector, \ - store_uint16_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); #define TEST_FTQ_W(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint64_elements_of_vector, \ - store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); for (uint64_t i = 0; i < arraysize(tc_w); i++) { TEST_FTQ_H(ftq_h, &tc_w[i], &exp_res_ftq_w[i]) diff --git a/test/cctest/test-assembler-mips64.cc b/test/cctest/test-assembler-mips64.cc index 394ea882ce..f809ea8f39 100644 --- a/test/cctest/test-assembler-mips64.cc +++ b/test/cctest/test-assembler-mips64.cc @@ -5597,6 +5597,189 @@ TEST(r6_beqzc) { } } +void load_elements_of_vector(MacroAssembler& assm, const uint64_t elements[], + MSARegister w, Register t0, Register t1) { + __ li(t0, static_cast(elements[0] & 0xffffffff)); + __ li(t1, static_cast((elements[0] >> 32) & 0xffffffff)); + __ insert_w(w, 0, t0); + __ insert_w(w, 1, t1); + __ li(t0, static_cast(elements[1] & 0xffffffff)); + __ li(t1, static_cast((elements[1] >> 32) & 0xffffffff)); + __ insert_w(w, 2, t0); + __ insert_w(w, 3, t1); +} + +inline void store_elements_of_vector(MacroAssembler& assm, MSARegister w, + Register a) { + __ st_d(w, MemOperand(a, 0)); +} + +typedef union { + uint8_t b[16]; + uint16_t h[8]; + uint32_t w[4]; + uint64_t d[2]; +} msa_reg_t; + +struct TestCaseMsaBranch { + uint64_t wt_lo; + uint64_t wt_hi; +}; + +template +void run_bz_bnz(TestCaseMsaBranch* input, Branch GenerateBranch, + bool branched) { + Isolate* isolate = CcTest::i_isolate(); + HandleScope scope(isolate); + + MacroAssembler assm(isolate, NULL, 0, v8::internal::CodeObjectRequired::kYes); + CpuFeatureScope fscope(&assm, MIPS_SIMD); + + typedef struct { + uint64_t ws_lo; + uint64_t ws_hi; + uint64_t wd_lo; + uint64_t wd_hi; + } T; + T t = {0x20b9cc4f1a83e0c5, 0xa27e1b5f2f5bb18a, 0x0000000000000000, + 0x0000000000000000}; + msa_reg_t res; + Label do_not_move_w0_to_w2; + + 
load_elements_of_vector(assm, &t.ws_lo, w0, t0, t1); + load_elements_of_vector(assm, &t.wd_lo, w2, t0, t1); + load_elements_of_vector(assm, &input->wt_lo, w1, t0, t1); + GenerateBranch(assm, do_not_move_w0_to_w2); + __ nop(); + __ move_v(w2, w0); + + __ bind(&do_not_move_w0_to_w2); + store_elements_of_vector(assm, w2, a0); + __ jr(ra); + __ nop(); + + CodeDesc desc; + assm.GetCode(isolate, &desc); + Handle code = + isolate->factory()->NewCode(desc, Code::STUB, Handle()); +#ifdef OBJECT_PRINT + code->Print(std::cout); +#endif + F3 f = FUNCTION_CAST(code->entry()); + + (CALL_GENERATED_CODE(isolate, f, &res, 0, 0, 0, 0)); + if (branched) { + CHECK_EQ(t.wd_lo, res.d[0]); + CHECK_EQ(t.wd_hi, res.d[1]); + } else { + CHECK_EQ(t.ws_lo, res.d[0]); + CHECK_EQ(t.ws_hi, res.d[1]); + } +} + +TEST(MSA_bz_bnz) { + if ((kArchVariant != kMips64r6) || !CpuFeatures::IsSupported(MIPS_SIMD)) + return; + + TestCaseMsaBranch tz_v[] = { + {0x0, 0x0}, {0xabc, 0x0}, {0x0, 0xabc}, {0xabc, 0xabc}}; + for (unsigned i = 0; i < arraysize(tz_v); ++i) { + run_bz_bnz( + &tz_v[i], + [](MacroAssembler& assm, Label& br_target) { __ bz_v(w1, &br_target); }, + tz_v[i].wt_lo == 0 && tz_v[i].wt_hi == 0); + } + +#define TEST_BZ_DF(input_array, lanes, instruction, int_type) \ + for (unsigned i = 0; i < arraysize(input_array); ++i) { \ + int j; \ + int_type* element = reinterpret_cast(&input_array[i]); \ + for (j = 0; j < lanes; ++j) { \ + if (element[j] == 0) { \ + break; \ + } \ + } \ + run_bz_bnz(&input_array[i], \ + [](MacroAssembler& assm, Label& br_target) { \ + __ instruction(w1, &br_target); \ + }, \ + j != lanes); \ + } + TestCaseMsaBranch tz_b[] = {{0x0, 0x0}, + {0xbc0000, 0x0}, + {0x0, 0xab000000000000cd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BZ_DF(tz_b, kMSALanesByte, bz_b, int8_t) + + TestCaseMsaBranch tz_h[] = {{0x0, 0x0}, + {0xbcde0000, 0x0}, + {0x0, 0xabcd00000000abcd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BZ_DF(tz_h, kMSALanesHalf, bz_h, int16_t) + + TestCaseMsaBranch tz_w[] = {{0x0, 0x0}, + {0xbcde123400000000, 0x0}, + {0x0, 0x000000001234abcd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BZ_DF(tz_w, kMSALanesWord, bz_w, int32_t) + + TestCaseMsaBranch tz_d[] = {{0x0, 0x0}, + {0xbcde0000, 0x0}, + {0x0, 0xabcd00000000abcd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BZ_DF(tz_d, kMSALanesDword, bz_d, int64_t) +#undef TEST_BZ_DF + + TestCaseMsaBranch tnz_v[] = { + {0x0, 0x0}, {0xabc, 0x0}, {0x0, 0xabc}, {0xabc, 0xabc}}; + for (unsigned i = 0; i < arraysize(tnz_v); ++i) { + run_bz_bnz(&tnz_v[i], + [](MacroAssembler& assm, Label& br_target) { + __ bnz_v(w1, &br_target); + }, + tnz_v[i].wt_lo != 0 || tnz_v[i].wt_hi != 0); + } + +#define TEST_BNZ_DF(input_array, lanes, instruction, int_type) \ + for (unsigned i = 0; i < arraysize(input_array); ++i) { \ + int j; \ + int_type* element = reinterpret_cast(&input_array[i]); \ + for (j = 0; j < lanes; ++j) { \ + if (element[j] == 0) { \ + break; \ + } \ + } \ + run_bz_bnz(&input_array[i], \ + [](MacroAssembler& assm, Label& br_target) { \ + __ instruction(w1, &br_target); \ + }, \ + j == lanes); \ + } + TestCaseMsaBranch tnz_b[] = {{0x0, 0x0}, + {0xbc0000, 0x0}, + {0x0, 0xab000000000000cd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BNZ_DF(tnz_b, 16, bnz_b, int8_t) + + TestCaseMsaBranch tnz_h[] = {{0x0, 0x0}, + {0xbcde0000, 0x0}, + {0x0, 0xabcd00000000abcd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BNZ_DF(tnz_h, 8, bnz_h, int16_t) + + TestCaseMsaBranch tnz_w[] = {{0x0, 0x0}, + {0xbcde123400000000, 0x0}, + {0x0, 
0x000000001234abcd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BNZ_DF(tnz_w, 4, bnz_w, int32_t) + + TestCaseMsaBranch tnz_d[] = {{0x0, 0x0}, + {0xbcde0000, 0x0}, + {0x0, 0xabcd00000000abcd}, + {0x123456789abcdef0, 0xaaaaaaaaaaaaaaaa}}; + TEST_BNZ_DF(tnz_d, 2, bnz_d, int64_t) +#undef TEST_BNZ_DF +} uint64_t run_jialc(int16_t offset) { Isolate* isolate = CcTest::i_isolate(); @@ -6612,68 +6795,6 @@ TEST(Ext) { CHECK_EQ(run_Ext(0x0000000040000000, 31, 1), 0x0000000000000000); } -// Load elements in w0 MSA vector register -void load_uint64_elements_of_vector(MacroAssembler& assm, - const uint64_t elements[], MSARegister w, - Register t0, Register t1) { - __ li(t0, elements[0]); - __ li(t1, elements[1]); - __ insert_d(w, 0, t0); - __ insert_d(w, 1, t1); -} - -void load_uint32_elements_of_vector(MacroAssembler& assm, - const uint64_t elements[], MSARegister w, - Register t0, Register t1) { - const uint32_t* const element = reinterpret_cast(elements); - __ li(t0, element[0]); - __ li(t1, element[1]); - __ insert_w(w, 0, t0); - __ insert_w(w, 1, t1); - __ li(t0, element[2]); - __ li(t1, element[3]); - __ insert_w(w, 2, t0); - __ insert_w(w, 3, t1); -} - -void load_uint16_elements_of_vector(MacroAssembler& assm, - const uint64_t elements[], MSARegister w, - Register t0, Register t1) { - const uint16_t* const element = reinterpret_cast(elements); - __ li(t0, element[0]); - __ li(t1, element[1]); - __ insert_h(w, 0, t0); - __ insert_h(w, 1, t1); - __ li(t0, element[2]); - __ li(t1, element[3]); - __ insert_h(w, 2, t0); - __ insert_h(w, 3, t1); - __ li(t0, element[4]); - __ li(t1, element[5]); - __ insert_h(w, 4, t0); - __ insert_h(w, 5, t1); - __ li(t0, element[6]); - __ li(t1, element[7]); - __ insert_h(w, 6, t0); - __ insert_h(w, 7, t1); -} - -// Store vector elements from w2 to the memory pointed by a0 -void store_uint64_elements_of_vector(MacroAssembler& assm, MSARegister w, - Register a) { - __ st_d(w, MemOperand(a, 0)); -} - -void store_uint32_elements_of_vector(MacroAssembler& assm, MSARegister w, - Register a) { - __ st_w(w, MemOperand(a, 0)); -} - -void store_uint16_elements_of_vector(MacroAssembler& assm, MSARegister w, - Register a) { - __ st_h(w, MemOperand(a, 0)); -} - TEST(MSA_fill_copy) { CcTest::InitializeVM(); Isolate* isolate = CcTest::i_isolate(); @@ -6860,12 +6981,6 @@ TEST(MSA_fill_copy_3) { CHECK_EQ(0x5555555555555555, t[1].d0); } -typedef union { - uint8_t b[16]; - uint16_t h[8]; - uint32_t w[4]; - uint64_t d[2]; -} msa_reg_t; template void run_msa_insert(int64_t rs_value, int n, msa_reg_t* w) { @@ -6896,7 +7011,7 @@ void run_msa_insert(int64_t rs_value, int n, msa_reg_t* w) { UNREACHABLE(); } - store_uint64_elements_of_vector(assm, w0, a0); + store_elements_of_vector(assm, w0, a0); __ jr(ra); __ nop(); @@ -7017,6 +7132,152 @@ void run_msa_ctc_cfc(uint64_t value) { res); } +TEST(MSA_move_v) { + if ((kArchVariant != kMips64r6) || !CpuFeatures::IsSupported(MIPS_SIMD)) + return; + CcTest::InitializeVM(); + Isolate* isolate = CcTest::i_isolate(); + HandleScope scope(isolate); + + typedef struct { + uint64_t ws_lo; + uint64_t ws_hi; + uint64_t wd_lo; + uint64_t wd_hi; + } T; + T t[] = {{0x20b9cc4f1a83e0c5, 0xa27e1b5f2f5bb18a, 0x1e86678b52f8e1ff, + 0x706e51290ac76fb9}, + {0x4414aed7883ffd18, 0x047d183a06b67016, 0x4ef258cf8d822870, + 0x2686b73484c2e843}, + {0xd38ff9d048884ffc, 0x6dc63a57c0943ca7, 0x8520ca2f3e97c426, + 0xa9913868fb819c59}}; + + for (unsigned i = 0; i < arraysize(t); ++i) { + MacroAssembler assm(isolate, nullptr, 0, + v8::internal::CodeObjectRequired::kYes); + 
CpuFeatureScope fscope(&assm, MIPS_SIMD); + + load_elements_of_vector(assm, &t[i].ws_lo, w0, t0, t1); + load_elements_of_vector(assm, &t[i].wd_lo, w2, t0, t1); + __ move_v(w2, w0); + store_elements_of_vector(assm, w2, a0); + + __ jr(ra); + __ nop(); + + CodeDesc desc; + assm.GetCode(isolate, &desc); + Handle code = + isolate->factory()->NewCode(desc, Code::STUB, Handle()); +#ifdef OBJECT_PRINT + code->Print(std::cout); +#endif + F3 f = FUNCTION_CAST(code->entry()); + (CALL_GENERATED_CODE(isolate, f, &t[i].wd_lo, 0, 0, 0, 0)); + CHECK_EQ(t[i].ws_lo, t[i].wd_lo); + CHECK_EQ(t[i].ws_hi, t[i].wd_hi); + } +} + +template +void run_msa_sldi(OperFunc GenerateOperation, + ExpectFunc GenerateExpectedResult) { + Isolate* isolate = CcTest::i_isolate(); + HandleScope scope(isolate); + + typedef struct { + uint64_t ws_lo; + uint64_t ws_hi; + uint64_t wd_lo; + uint64_t wd_hi; + } T; + T t[] = {{0x20b9cc4f1a83e0c5, 0xa27e1b5f2f5bb18a, 0x1e86678b52f8e1ff, + 0x706e51290ac76fb9}, + {0x4414aed7883ffd18, 0x047d183a06b67016, 0x4ef258cf8d822870, + 0x2686b73484c2e843}, + {0xd38ff9d048884ffc, 0x6dc63a57c0943ca7, 0x8520ca2f3e97c426, + 0xa9913868fb819c59}}; + uint64_t res[2]; + + for (unsigned i = 0; i < arraysize(t); ++i) { + MacroAssembler assm(isolate, nullptr, 0, + v8::internal::CodeObjectRequired::kYes); + CpuFeatureScope fscope(&assm, MIPS_SIMD); + load_elements_of_vector(assm, &t[i].ws_lo, w0, t0, t1); + load_elements_of_vector(assm, &t[i].wd_lo, w2, t0, t1); + GenerateOperation(assm); + store_elements_of_vector(assm, w2, a0); + + __ jr(ra); + __ nop(); + + CodeDesc desc; + assm.GetCode(isolate, &desc); + Handle code = + isolate->factory()->NewCode(desc, Code::STUB, Handle()); +#ifdef OBJECT_PRINT + code->Print(std::cout); +#endif + F3 f = FUNCTION_CAST(code->entry()); + (CALL_GENERATED_CODE(isolate, f, &res[0], 0, 0, 0, 0)); + GenerateExpectedResult(reinterpret_cast(&t[i].ws_lo), + reinterpret_cast(&t[i].wd_lo)); + CHECK_EQ(res[0], t[i].wd_lo); + CHECK_EQ(res[1], t[i].wd_hi); + } +} + +TEST(MSA_sldi) { + if ((kArchVariant != kMips64r6) || !CpuFeatures::IsSupported(MIPS_SIMD)) + return; + CcTest::InitializeVM(); + +#define SLDI_DF(s, k) \ + uint8_t v[32]; \ + for (unsigned i = 0; i < s; i++) { \ + v[i] = ws[s * k + i]; \ + v[i + s] = wd[s * k + i]; \ + } \ + for (unsigned i = 0; i < s; i++) { \ + wd[s * k + i] = v[i + n]; \ + } + + for (int n = 0; n < 16; ++n) { + run_msa_sldi([n](MacroAssembler& assm) { __ sldi_b(w2, w0, n); }, + [n](uint8_t* ws, uint8_t* wd) { + SLDI_DF(kMSARegSize / sizeof(int8_t) / kBitsPerByte, 0) + }); + } + + for (int n = 0; n < 8; ++n) { + run_msa_sldi([n](MacroAssembler& assm) { __ sldi_h(w2, w0, n); }, + [n](uint8_t* ws, uint8_t* wd) { + for (int k = 0; k < 2; ++k) { + SLDI_DF(kMSARegSize / sizeof(int16_t) / kBitsPerByte, k) + } + }); + } + + for (int n = 0; n < 4; ++n) { + run_msa_sldi([n](MacroAssembler& assm) { __ sldi_w(w2, w0, n); }, + [n](uint8_t* ws, uint8_t* wd) { + for (int k = 0; k < 4; ++k) { + SLDI_DF(kMSARegSize / sizeof(int32_t) / kBitsPerByte, k) + } + }); + } + + for (int n = 0; n < 2; ++n) { + run_msa_sldi([n](MacroAssembler& assm) { __ sldi_d(w2, w0, n); }, + [n](uint8_t* ws, uint8_t* wd) { + for (int k = 0; k < 8; ++k) { + SLDI_DF(kMSARegSize / sizeof(int64_t) / kBitsPerByte, k) + } + }); + } +#undef SLDI_DF +} + TEST(MSA_cfc_ctc) { if ((kArchVariant != kMips64r6) || !CpuFeatures::IsSupported(MIPS_SIMD)) return; @@ -7104,7 +7365,7 @@ void run_msa_i8(SecondaryField opcode, uint64_t ws_lo, uint64_t ws_hi, UNREACHABLE(); } - store_uint64_elements_of_vector(assm, w2, 
a0); + store_elements_of_vector(assm, w2, a0); __ jr(ra); __ nop(); @@ -7297,11 +7558,11 @@ void run_msa_i5(struct TestCaseMsaI5* input, bool i5_sign_ext, int32_t i5 = i5_sign_ext ? static_cast(input->i5 << 27) >> 27 : input->i5; - load_uint64_elements_of_vector(assm, &(input->ws_lo), w0, t0, t1); + load_elements_of_vector(assm, &(input->ws_lo), w0, t0, t1); GenerateI5InstructionFunc(assm, i5); - store_uint64_elements_of_vector(assm, w2, a0); + store_elements_of_vector(assm, w2, a0); __ jr(ra); __ nop(); @@ -7708,11 +7969,9 @@ struct TestCaseMsa2R { uint64_t exp_res_hi; }; -template +template void run_msa_2r(const struct TestCaseMsa2R* input, - Func Generate2RInstructionFunc, - FuncLoad load_elements_of_vector, - FuncStore store_elements_of_vector) { + Func Generate2RInstructionFunc) { Isolate* isolate = CcTest::i_isolate(); HandleScope scope(isolate); @@ -7740,17 +7999,8 @@ void run_msa_2r(const struct TestCaseMsa2R* input, (CALL_GENERATED_CODE(isolate, f, &res, 0, 0, 0, 0)); - if (store_elements_of_vector == store_uint64_elements_of_vector) { - CHECK_EQ(input->exp_res_lo, res.d[0]); - CHECK_EQ(input->exp_res_hi, res.d[1]); - } else if (store_elements_of_vector == store_uint32_elements_of_vector) { - const uint32_t* exp_res = - reinterpret_cast(&input->exp_res_lo); - CHECK_EQ(exp_res[0], res.w[0]); - CHECK_EQ(exp_res[1], res.w[1]); - CHECK_EQ(exp_res[2], res.w[2]); - CHECK_EQ(exp_res[3], res.w[3]); - } + CHECK_EQ(input->exp_res_lo, res.d[0]); + CHECK_EQ(input->exp_res_hi, res.d[1]); } TEST(MSA_pcnt) { @@ -7801,14 +8051,10 @@ TEST(MSA_pcnt) { {0xf35862e13e38f8b0, 0x4f41ffdef2bfe636, 0x20, 0x2a}}; for (size_t i = 0; i < sizeof(tc_b) / sizeof(TestCaseMsa2R); ++i) { - run_msa_2r(&tc_b[i], [](MacroAssembler& assm) { __ pcnt_b(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_h[i], [](MacroAssembler& assm) { __ pcnt_h(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_w[i], [](MacroAssembler& assm) { __ pcnt_w(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_d[i], [](MacroAssembler& assm) { __ pcnt_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + run_msa_2r(&tc_b[i], [](MacroAssembler& assm) { __ pcnt_b(w2, w0); }); + run_msa_2r(&tc_h[i], [](MacroAssembler& assm) { __ pcnt_h(w2, w0); }); + run_msa_2r(&tc_w[i], [](MacroAssembler& assm) { __ pcnt_w(w2, w0); }); + run_msa_2r(&tc_d[i], [](MacroAssembler& assm) { __ pcnt_d(w2, w0); }); } } @@ -7860,14 +8106,10 @@ TEST(MSA_nlzc) { {0x00000000e338f8b0, 0x0754534acab32654, 0x20, 0x5}}; for (size_t i = 0; i < sizeof(tc_b) / sizeof(TestCaseMsa2R); ++i) { - run_msa_2r(&tc_b[i], [](MacroAssembler& assm) { __ nlzc_b(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_h[i], [](MacroAssembler& assm) { __ nlzc_h(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_w[i], [](MacroAssembler& assm) { __ nlzc_w(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_d[i], [](MacroAssembler& assm) { __ nlzc_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + run_msa_2r(&tc_b[i], [](MacroAssembler& assm) { __ nlzc_b(w2, w0); }); + run_msa_2r(&tc_h[i], [](MacroAssembler& assm) { __ nlzc_h(w2, w0); }); + run_msa_2r(&tc_w[i], [](MacroAssembler& assm) { __ nlzc_w(w2, w0); }); + run_msa_2r(&tc_d[i], [](MacroAssembler& assm) { __ nlzc_d(w2, 
w0); }); } } @@ -7919,14 +8161,10 @@ TEST(MSA_nloc) { {0xFFFFFFFF1CC7074F, 0xF8ABACB5354CD9AB, 0x20, 0x5}}; for (size_t i = 0; i < sizeof(tc_b) / sizeof(TestCaseMsa2R); ++i) { - run_msa_2r(&tc_b[i], [](MacroAssembler& assm) { __ nloc_b(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_h[i], [](MacroAssembler& assm) { __ nloc_h(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_w[i], [](MacroAssembler& assm) { __ nloc_w(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); - run_msa_2r(&tc_d[i], [](MacroAssembler& assm) { __ nloc_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + run_msa_2r(&tc_b[i], [](MacroAssembler& assm) { __ nloc_b(w2, w0); }); + run_msa_2r(&tc_h[i], [](MacroAssembler& assm) { __ nloc_h(w2, w0); }); + run_msa_2r(&tc_w[i], [](MacroAssembler& assm) { __ nloc_w(w2, w0); }); + run_msa_2r(&tc_d[i], [](MacroAssembler& assm) { __ nloc_d(w2, w0); }); } } @@ -7987,13 +8225,11 @@ TEST(MSA_fclass) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_F_U); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ fclass_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ fclass_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_D_U); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ fclass_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ fclass_d(w2, w0); }); } #undef BIT @@ -8059,13 +8295,11 @@ TEST(MSA_ftrunc_s) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_F_I); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ ftrunc_s_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ ftrunc_s_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_D_I); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ ftrunc_s_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ ftrunc_s_d(w2, w0); }); } } @@ -8098,13 +8332,11 @@ TEST(MSA_ftrunc_u) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_F_U); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ ftrunc_u_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ ftrunc_u_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_D_U); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ ftrunc_u_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ ftrunc_u_d(w2, w0); }); } } @@ -8143,13 +8375,11 @@ TEST(MSA_fsqrt) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_F_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ fsqrt_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ fsqrt_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_D_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ fsqrt_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ fsqrt_d(w2, w0); }); } } @@ 
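// Aside (not part of the patch): the 2R test vectors above encode per-lane
// bit-count results. A minimal standalone sketch of the pcnt/nlzc lane
// semantics they assume, over one 64-bit half of an MSA register; the
// helper names below are illustrative only, not simulator code.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Applies `op` to every `bits`-wide lane packed into a 64-bit chunk.
uint64_t lanewise(uint64_t src, int bits, uint64_t (*op)(uint64_t, int)) {
  assert(bits == 8 || bits == 16 || bits == 32 || bits == 64);
  uint64_t mask = bits == 64 ? ~0ull : (1ull << bits) - 1;
  uint64_t res = 0;
  for (int shift = 0; shift < 64; shift += bits) {
    uint64_t lane = (src >> shift) & mask;
    res |= (op(lane, bits) & mask) << shift;
  }
  return res;
}

uint64_t pcnt(uint64_t lane, int) {  // population count of one lane
  uint64_t n = 0;
  for (; lane != 0; lane &= lane - 1) ++n;
  return n;
}

uint64_t nlzc(uint64_t lane, int bits) {  // leading-zero count of one lane
  uint64_t n = 0;
  for (int i = bits - 1; i >= 0 && ((lane >> i) & 1) == 0; --i) ++n;
  return n;
}

int main() {  // demo value only, not one of the tc_* vectors above
  std::printf("%016llx\n", static_cast<unsigned long long>(
                               lanewise(0x2b665362c4e812dfull, 8, pcnt)));
  std::printf("%016llx\n", static_cast<unsigned long long>(
                               lanewise(0x2b665362c4e812dfull, 16, nlzc)));
}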
-8173,13 +8403,11 @@ TEST(MSA_frsqrt) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_F_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ frsqrt_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ frsqrt_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_D_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ frsqrt_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ frsqrt_d(w2, w0); }); } } @@ -8205,13 +8433,11 @@ TEST(MSA_frcp) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_F_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ frcp_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ frcp_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_D_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ frcp_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ frcp_d(w2, w0); }); } } @@ -8226,8 +8452,7 @@ void test_frint_s(size_t data_size, TestCaseMsa2RF_F_F tc_d[], __ ctcmsa(msareg, t0); __ frint_w(w2, w0); __ ctcmsa(msareg, t1); - }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + }); } } @@ -8242,8 +8467,7 @@ void test_frint_d(size_t data_size, TestCaseMsa2RF_D_D tc_d[], __ ctcmsa(msareg, t0); __ frint_d(w2, w0); __ ctcmsa(msareg, t1); - }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + }); } } @@ -8325,14 +8549,12 @@ TEST(MSA_flog2) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_F_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ flog2_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ flog2_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_D_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ flog2_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ flog2_d(w2, w0); }); } } @@ -8347,8 +8569,7 @@ void test_ftint_s_s(size_t data_size, TestCaseMsa2RF_F_I tc_d[], __ ctcmsa(msareg, t0); __ ftint_s_w(w2, w0); __ ctcmsa(msareg, t1); - }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + }); } } @@ -8363,8 +8584,7 @@ void test_ftint_s_d(size_t data_size, TestCaseMsa2RF_D_I tc_d[], __ ctcmsa(msareg, t0); __ ftint_s_d(w2, w0); __ ctcmsa(msareg, t1); - }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + }); } } @@ -8461,8 +8681,7 @@ void test_ftint_u_s(size_t data_size, TestCaseMsa2RF_F_U tc_d[], __ ctcmsa(msareg, t0); __ ftint_u_w(w2, w0); __ ctcmsa(msareg, t1); - }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + }); } } @@ -8477,8 +8696,7 @@ void test_ftint_u_d(size_t data_size, TestCaseMsa2RF_D_U tc_d[], __ ctcmsa(msareg, t0); __ ftint_u_d(w2, w0); __ ctcmsa(msareg, t1); - }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + }); } } @@ -8594,13 +8812,11 @@ TEST(MSA_ffint_u) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_U_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ ffint_u_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + 
[](MacroAssembler& assm) { __ ffint_u_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_U_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ ffint_u_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ ffint_u_d(w2, w0); }); } } @@ -8636,13 +8852,11 @@ TEST(MSA_ffint_s) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_I_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ ffint_s_w(w2, w0); }, - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ ffint_s_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_I_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ ffint_s_d(w2, w0); }, - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ ffint_s_d(w2, w0); }); } } @@ -8695,13 +8909,11 @@ TEST(MSA_fexupl) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_U16_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ fexupl_w(w2, w0); }, - load_uint16_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ fexupl_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_F_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ fexupl_d(w2, w0); }, - load_uint32_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ fexupl_d(w2, w0); }); } } @@ -8730,13 +8942,11 @@ TEST(MSA_fexupr) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_U16_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ fexupr_w(w2, w0); }, - load_uint16_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ fexupr_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_F_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ fexupr_d(w2, w0); }, - load_uint32_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ fexupr_d(w2, w0); }); } } @@ -8765,13 +8975,11 @@ TEST(MSA_ffql) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_U16_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ ffql_w(w2, w0); }, - load_uint16_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ ffql_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_U32_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ ffql_d(w2, w0); }, - load_uint32_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ ffql_d(w2, w0); }); } } @@ -8791,13 +8999,11 @@ TEST(MSA_ffqr) { for (size_t i = 0; i < sizeof(tc_s) / sizeof(TestCaseMsa2RF_U16_F); ++i) { run_msa_2r(reinterpret_cast(&tc_s[i]), - [](MacroAssembler& assm) { __ ffqr_w(w2, w0); }, - load_uint16_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ ffqr_w(w2, w0); }); } for (size_t i = 0; i < sizeof(tc_d) / sizeof(TestCaseMsa2RF_U32_D); ++i) { run_msa_2r(reinterpret_cast(&tc_d[i]), - [](MacroAssembler& assm) { __ ffqr_d(w2, w0); }, - load_uint32_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ ffqr_d(w2, w0); }); } } @@ -8822,13 +9028,13 @@ void run_msa_vector(struct TestCaseMsaVector* input, CpuFeatureScope 
fscope(&assm, MIPS_SIMD); msa_reg_t res; - load_uint64_elements_of_vector(assm, &(input->ws_lo), w0, t0, t1); - load_uint64_elements_of_vector(assm, &(input->wt_lo), w2, t0, t1); - load_uint64_elements_of_vector(assm, &(input->wd_lo), w4, t0, t1); + load_elements_of_vector(assm, &(input->ws_lo), w0, t0, t1); + load_elements_of_vector(assm, &(input->wt_lo), w2, t0, t1); + load_elements_of_vector(assm, &(input->wd_lo), w4, t0, t1); GenerateVectorInstructionFunc(assm); - store_uint64_elements_of_vector(assm, w4, a0); + store_elements_of_vector(assm, w4, a0); __ jr(ra); __ nop(); @@ -8912,12 +9118,12 @@ void run_msa_bit(struct TestCaseMsaBit* input, InstFunc GenerateInstructionFunc, CpuFeatureScope fscope(&assm, MIPS_SIMD); msa_reg_t res; - load_uint64_elements_of_vector(assm, &(input->ws_lo), w0, t0, t1); - load_uint64_elements_of_vector(assm, &(input->wd_lo), w2, t0, t1); + load_elements_of_vector(assm, &(input->ws_lo), w0, t0, t1); + load_elements_of_vector(assm, &(input->wd_lo), w2, t0, t1); GenerateInstructionFunc(assm, input->m); - store_uint64_elements_of_vector(assm, w2, a0); + store_elements_of_vector(assm, w2, a0); __ jr(ra); __ nop(); @@ -9391,7 +9597,7 @@ void run_msa_i10(int32_t input, InstFunc GenerateVectorInstructionFunc, GenerateVectorInstructionFunc(assm, input); - store_uint64_elements_of_vector(assm, w0, a0); + store_elements_of_vector(assm, w0, a0); __ jr(ra); __ nop(); @@ -9520,7 +9726,6 @@ TEST(MSA_load_store_vector) { __ st_d(w0, MemOperand(a1, i)); } }); -#undef LDI_DF } struct TestCaseMsa3R { @@ -9544,15 +9749,14 @@ void run_msa_3r(struct TestCaseMsa3R* input, InstFunc GenerateI5InstructionFunc, v8::internal::CodeObjectRequired::kYes); CpuFeatureScope fscope(&assm, MIPS_SIMD); msa_reg_t res; - uint64_t expected; - load_uint64_elements_of_vector(assm, &(input->wt_lo), w0, t0, t1); - load_uint64_elements_of_vector(assm, &(input->ws_lo), w1, t0, t1); - load_uint64_elements_of_vector(assm, &(input->wd_lo), w2, t0, t1); + load_elements_of_vector(assm, &(input->wt_lo), w0, t0, t1); + load_elements_of_vector(assm, &(input->ws_lo), w1, t0, t1); + load_elements_of_vector(assm, &(input->wd_lo), w2, t0, t1); GenerateI5InstructionFunc(assm); - store_uint64_elements_of_vector(assm, w2, a0); + store_elements_of_vector(assm, w2, a0); __ jr(ra); __ nop(); @@ -9568,14 +9772,12 @@ void run_msa_3r(struct TestCaseMsa3R* input, InstFunc GenerateI5InstructionFunc, (CALL_GENERATED_CODE(isolate, f, &res, 0, 0, 0, 0)); - expected = GenerateOperationFunc(input->ws_lo, input->wt_lo, input->wd_lo); - if (expected != Unpredictable) { - CHECK_EQ(expected, res.d[0]); + GenerateOperationFunc(&input->ws_lo, &input->wt_lo, &input->wd_lo); + if (input->wd_lo != Unpredictable) { + CHECK_EQ(input->wd_lo, res.d[0]); } - - expected = GenerateOperationFunc(input->ws_hi, input->wt_hi, input->wd_hi); - if (expected != Unpredictable) { - CHECK_EQ(expected, res.d[1]); + if (input->wd_hi != Unpredictable) { + CHECK_EQ(input->wd_hi, res.d[1]); } } @@ -9612,479 +9814,630 @@ TEST(MSA_3R_instructions) { {0xffff00000000ffff, 0xffff00000000ffff, 0xffff00000000ffff, 0xffff00000000ffff, 0xffff00000000ffff, 0xffff00000000ffff}}; -#define SLL_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T src_op = static_cast((ws >> shift) & mask); \ - T shift_op = static_cast((wt >> shift) & mask) % size_in_bits; \ - res |= (static_cast(src_op << shift_op) & mask) << shift; \ - } \ - return res +#define SLL_DF(T, 
lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T src_op = static_cast((ws[i] >> shift) & mask); \ + T shift_op = static_cast((wt[i] >> shift) & mask) % size_in_bits; \ + res |= (static_cast(src_op << shift_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define SRA_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T src_op = static_cast((ws >> shift) & mask); \ - int shift_op = ((wt >> shift) & mask) % size_in_bits; \ - res |= \ - (static_cast(ArithmeticShiftRight(src_op, shift_op) & mask)) \ - << shift; \ - } \ - return res - -#define SRL_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T src_op = static_cast((ws >> shift) & mask); \ - T shift_op = static_cast(((wt >> shift) & mask) % size_in_bits); \ - res |= (static_cast(src_op >> shift_op) & mask) << shift; \ - } \ - return res - -#define BCRL_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T src_op = static_cast((ws >> shift) & mask); \ - T shift_op = static_cast(((wt >> shift) & mask) % size_in_bits); \ - T r = (static_cast(~(1ull << shift_op)) & src_op) & mask; \ - res |= static_cast(r) << shift; \ - } \ - return res - -#define BSET_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T src_op = static_cast((ws >> shift) & mask); \ - T shift_op = static_cast(((wt >> shift) & mask) % size_in_bits); \ - T r = (static_cast(1ull << shift_op) | src_op) & mask; \ - res |= static_cast(r) << shift; \ - } \ - return res - -#define BNEG_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T src_op = static_cast((ws >> shift) & mask); \ - T shift_op = static_cast(((wt >> shift) & mask) % size_in_bits); \ - T r = (static_cast(1ull << shift_op) ^ src_op) & mask; \ - res |= static_cast(r) << shift; \ - } \ - return res - -#define BINSL_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wd_op = static_cast((wd >> shift) & mask); \ - int shift_op = static_cast(((wt >> shift) & mask) % size_in_bits); \ - int bits = shift_op + 1; \ - T r; \ - if (bits == size_in_bits) { \ - r = static_cast(ws_op); \ - } else { \ - uint64_t mask2 = ((1ull << bits) - 1) << (size_in_bits - bits); \ - r = static_cast((static_cast(mask2) & ws_op) | \ - (static_cast(~mask2) & wd_op)); \ - } \ - res |= static_cast(r) << shift; \ - } \ - return res - -#define BINSR_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wd_op = static_cast((wd >> shift) & mask); \ - int shift_op = static_cast(((wt >> shift) & mask) % size_in_bits); \ - int bits = shift_op + 1; \ - T r; \ - if (bits == size_in_bits) { \ - r = 
static_cast(ws_op); \ - } else { \ - uint64_t mask2 = (1ull << bits) - 1; \ - r = static_cast((static_cast(mask2) & ws_op) | \ - (static_cast(~mask2) & wd_op)); \ - } \ - res |= static_cast(r) << shift; \ - } \ - return res - -#define ADDV_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(ws_op + wt_op) & mask) << shift; \ - } \ - return res - -#define SUBV_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(ws_op - wt_op) & mask) << shift; \ - } \ - return res - -#define MAX_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(Max(ws_op, wt_op)) & mask) << shift; \ - } \ - return res - -#define MIN_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(Min(ws_op, wt_op)) & mask) << shift; \ - } \ - return res - -#define MAXA_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(Nabs(ws_op) < Nabs(wt_op) ? ws_op : wt_op) & \ - mask) \ - << shift; \ - } \ - return res - -#define MINA_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(Nabs(ws_op) > Nabs(wt_op) ? ws_op : wt_op) & \ - mask) \ - << shift; \ - } \ - return res - -#define CEQ_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= \ - (static_cast(!Compare(ws_op, wt_op) ? -1ull : 0ull) & mask) \ - << shift; \ - } \ - return res - -#define CLT_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= \ - (static_cast((Compare(ws_op, wt_op) == -1) ? 
-1ull : 0ull) & \ - mask) \ - << shift; \ - } \ - return res - -#define CLE_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= \ - (static_cast((Compare(ws_op, wt_op) != 1) ? -1ull : 0ull) & \ - mask) \ - << shift; \ - } \ - return res - -#define ADD_A_DF(T, lanes, mask) \ - uint64_t res = 0; \ +#define SRA_DF(T, lanes, mask) \ int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(Abs(ws_op) + Abs(wt_op)) & mask) << shift; \ - } \ - return res + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T src_op = static_cast((ws[i] >> shift) & mask); \ + int shift_op = ((wt[i] >> shift) & mask) % size_in_bits; \ + res |= (static_cast(ArithmeticShiftRight(src_op, shift_op) & \ + mask)) \ + << shift; \ + } \ + wd[i] = res; \ + } -#define ADDS_A_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = Nabs(static_cast((ws >> shift) & mask)); \ - T wt_op = Nabs(static_cast((wt >> shift) & mask)); \ - T r; \ - if (ws_op < -std::numeric_limits::max() - wt_op) { \ - r = std::numeric_limits::max(); \ - } else { \ - r = -(ws_op + wt_op); \ - } \ - res |= (static_cast(r) & mask) << shift; \ - } \ - return res +#define SRL_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T src_op = static_cast((ws[i] >> shift) & mask); \ + T shift_op = static_cast(((wt[i] >> shift) & mask) % size_in_bits); \ + res |= (static_cast(src_op >> shift_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define ADDS_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(SaturateAdd(ws_op, wt_op)) & mask) << shift; \ - } \ - return res +#define BCRL_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T src_op = static_cast((ws[i] >> shift) & mask); \ + T shift_op = static_cast(((wt[i] >> shift) & mask) % size_in_bits); \ + T r = (static_cast(~(1ull << shift_op)) & src_op) & mask; \ + res |= static_cast(r) << shift; \ + } \ + wd[i] = res; \ + } -#define AVE_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(((wt_op & ws_op) + ((ws_op ^ wt_op) >> 1)) & \ - mask)) \ - << shift; \ - } \ - return res +#define BSET_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; 
++j) { \ + uint64_t shift = size_in_bits * j; \ + T src_op = static_cast((ws[i] >> shift) & mask); \ + T shift_op = static_cast(((wt[i] >> shift) & mask) % size_in_bits); \ + T r = (static_cast(1ull << shift_op) | src_op) & mask; \ + res |= static_cast(r) << shift; \ + } \ + wd[i] = res; \ + } -#define AVER_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(((wt_op | ws_op) - ((ws_op ^ wt_op) >> 1)) & \ - mask)) \ - << shift; \ - } \ - return res +#define BNEG_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T src_op = static_cast((ws[i] >> shift) & mask); \ + T shift_op = static_cast(((wt[i] >> shift) & mask) % size_in_bits); \ + T r = (static_cast(1ull << shift_op) ^ src_op) & mask; \ + res |= static_cast(r) << shift; \ + } \ + wd[i] = res; \ + } -#define SUBS_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(SaturateSub(ws_op, wt_op)) & mask) << shift; \ - } \ - return res +#define BINSL_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wd_op = static_cast((wd[i] >> shift) & mask); \ + T shift_op = static_cast(((wt[i] >> shift) & mask) % size_in_bits); \ + int64_t bits = shift_op + 1; \ + T r; \ + if (bits == size_in_bits) { \ + r = static_cast(ws_op); \ + } else { \ + uint64_t mask2 = ((1ull << bits) - 1) << (size_in_bits - bits); \ + r = static_cast((static_cast(mask2) & ws_op) | \ + (static_cast(~mask2) & wd_op)); \ + } \ + res |= static_cast(r) << shift; \ + } \ + wd[i] = res; \ + } -#define SUBSUS_U_DF(T, lanes, mask) \ - typedef typename std::make_unsigned::type uT; \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - uT ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - T r; \ - if (wt_op > 0) { \ - uT wtu = static_cast(wt_op); \ - if (wtu > ws_op) { \ - r = 0; \ - } else { \ - r = static_cast(ws_op - wtu); \ - } \ - } else { \ - if (ws_op > std::numeric_limits::max() + wt_op) { \ - r = static_cast(std::numeric_limits::max()); \ - } else { \ - r = static_cast(ws_op - wt_op); \ - } \ - } \ - res |= (static_cast(r) & mask) << shift; \ - } \ - return res +#define BINSR_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wd_op = static_cast((wd[i] >> shift) & mask); \ + T shift_op = static_cast(((wt[i] >> shift) & mask) % size_in_bits); \ + int64_t bits = shift_op + 1; \ + T r; \ + if (bits == size_in_bits) { \ + r = static_cast(ws_op); \ + } else { \ + uint64_t mask2 = (1ull << bits) - 1; \ + r = 
static_cast((static_cast(mask2) & ws_op) | \ + (static_cast(~mask2) & wd_op)); \ + } \ + res |= static_cast(r) << shift; \ + } \ + wd[i] = res; \ + } -#define SUBSUU_S_DF(T, lanes, mask) \ - typedef typename std::make_unsigned::type uT; \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - uT ws_op = static_cast((ws >> shift) & mask); \ - uT wt_op = static_cast((wt >> shift) & mask); \ - uT wdu; \ - T r; \ - if (ws_op > wt_op) { \ - wdu = ws_op - wt_op; \ - if (wdu > std::numeric_limits::max()) { \ - r = std::numeric_limits::max(); \ - } else { \ - r = static_cast(wdu); \ - } \ - } else { \ - wdu = wt_op - ws_op; \ - CHECK(-std::numeric_limits::max() == \ - std::numeric_limits::min() + 1); \ - if (wdu <= std::numeric_limits::max()) { \ - r = -static_cast(wdu); \ - } else { \ - r = std::numeric_limits::min(); \ - } \ - } \ - res |= (static_cast(r) & mask) << shift; \ - } \ - return res +#define ADDV_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(ws_op + wt_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define ASUB_S_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(Abs(ws_op - wt_op)) & mask) << shift; \ - } \ - return res +#define SUBV_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(ws_op - wt_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define ASUB_U_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(ws_op > wt_op ? 
ws_op - wt_op \ - : wt_op - ws_op) & \ - mask) \ - << shift; \ - } \ - return res +#define MAX_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(Max(ws_op, wt_op)) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define MULV_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - res |= (static_cast(ws_op * wt_op) & mask) << shift; \ - } \ - return res +#define MIN_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(Min(ws_op, wt_op)) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define MADDV_DF(T, lanes, mask) \ - uint64_t res = 0; \ +#define MAXA_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= \ + (static_cast(Nabs(ws_op) < Nabs(wt_op) ? ws_op : wt_op) & \ + mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define MINA_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= \ + (static_cast(Nabs(ws_op) > Nabs(wt_op) ? ws_op : wt_op) & \ + mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define CEQ_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(!Compare(ws_op, wt_op) ? -1ull : 0ull) & \ + mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define CLT_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast((Compare(ws_op, wt_op) == -1) ? 
-1ull \ + : 0ull) & \ + mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define CLE_DF(T, lanes, mask) \ int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - T wd_op = static_cast((wd >> shift) & mask); \ - res |= (static_cast(wd_op + ws_op * wt_op) & mask) << shift; \ - } \ - return res + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast((Compare(ws_op, wt_op) != 1) ? -1ull \ + : 0ull) & \ + mask) \ + << shift; \ + } \ + wd[i] = res; \ + } -#define MSUBV_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - T wd_op = static_cast((wd >> shift) & mask); \ - res |= (static_cast(wd_op - ws_op * wt_op) & mask) << shift; \ - } \ - return res +#define ADD_A_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(Abs(ws_op) + Abs(wt_op)) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define DIV_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - if (wt_op == 0) { \ - res = Unpredictable; \ - break; \ - } \ - res |= (static_cast(ws_op / wt_op) & mask) << shift; \ - } \ - return res +#define ADDS_A_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = Nabs(static_cast((ws[i] >> shift) & mask)); \ + T wt_op = Nabs(static_cast((wt[i] >> shift) & mask)); \ + T r; \ + if (ws_op < -std::numeric_limits::max() - wt_op) { \ + r = std::numeric_limits::max(); \ + } else { \ + r = -(ws_op + wt_op); \ + } \ + res |= (static_cast(r) & mask) << shift; \ + } \ + wd[i] = res; \ + } -#define MOD_DF(T, lanes, mask) \ - uint64_t res = 0; \ +#define ADDS_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(SaturateAdd(ws_op, wt_op)) & mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define AVE_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast( \ + ((wt_op & ws_op) + ((ws_op ^ wt_op) >> 1)) & mask)) \ + << shift; \ + } \ + wd[i] = res; \ + } + 
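// Aside (not part of the patch): AVE_DF above and the AVER_DF macro that
// follows rely on the usual overflow-free average identities. A minimal
// standalone check for unsigned 8-bit lanes; the helper names here are
// illustrative only.
#include <cassert>
#include <cstdint>

uint8_t ave_u8(uint8_t a, uint8_t b) {   // floor((a + b) / 2) without overflow
  return (a & b) + ((a ^ b) >> 1);
}

uint8_t aver_u8(uint8_t a, uint8_t b) {  // ceil((a + b) / 2) without overflow
  return (a | b) - ((a ^ b) >> 1);
}

int main() {  // exhaustive check over all 8-bit operand pairs
  for (int a = 0; a < 256; ++a) {
    for (int b = 0; b < 256; ++b) {
      assert(ave_u8(a, b) == (a + b) / 2);
      assert(aver_u8(a, b) == (a + b + 1) / 2);
    }
  }
}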
+#define AVER_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast( \ + ((wt_op | ws_op) - ((ws_op ^ wt_op) >> 1)) & mask)) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define SUBS_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(SaturateSub(ws_op, wt_op)) & mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define SUBSUS_U_DF(T, lanes, mask) \ + typedef typename std::make_unsigned::type uT; \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + uT ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + T r; \ + if (wt_op > 0) { \ + uT wtu = static_cast(wt_op); \ + if (wtu > ws_op) { \ + r = 0; \ + } else { \ + r = static_cast(ws_op - wtu); \ + } \ + } else { \ + if (ws_op > std::numeric_limits::max() + wt_op) { \ + r = static_cast(std::numeric_limits::max()); \ + } else { \ + r = static_cast(ws_op - wt_op); \ + } \ + } \ + res |= (static_cast(r) & mask) << shift; \ + } \ + wd[i] = res; \ + } + +#define SUBSUU_S_DF(T, lanes, mask) \ + typedef typename std::make_unsigned::type uT; \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + uT ws_op = static_cast((ws[i] >> shift) & mask); \ + uT wt_op = static_cast((wt[i] >> shift) & mask); \ + uT wdu; \ + T r; \ + if (ws_op > wt_op) { \ + wdu = ws_op - wt_op; \ + if (wdu > std::numeric_limits::max()) { \ + r = std::numeric_limits::max(); \ + } else { \ + r = static_cast(wdu); \ + } \ + } else { \ + wdu = wt_op - ws_op; \ + CHECK(-std::numeric_limits::max() == \ + std::numeric_limits::min() + 1); \ + if (wdu <= std::numeric_limits::max()) { \ + r = -static_cast(wdu); \ + } else { \ + r = std::numeric_limits::min(); \ + } \ + } \ + res |= (static_cast(r) & mask) << shift; \ + } \ + wd[i] = res; \ + } + +#define ASUB_S_DF(T, lanes, mask) \ int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T ws_op = static_cast((ws >> shift) & mask); \ - T wt_op = static_cast((wt >> shift) & mask); \ - if (wt_op == 0) { \ - res = Unpredictable; \ - break; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(Abs(ws_op - wt_op)) & mask) << shift; \ } \ - res |= (static_cast(wt_op != 0 ? 
ws_op % wt_op : 0) & mask) \ - << shift; \ - } \ - return res + wd[i] = res; \ + } -#define SRAR_DF(T, lanes, mask) \ - uint64_t res = 0; \ - int size_in_bits = kMSARegSize / lanes; \ - for (int i = 0; i < lanes / 2; ++i) { \ - uint64_t shift = size_in_bits * i; \ - T src_op = static_cast((ws >> shift) & mask); \ - int shift_op = ((wt >> shift) & mask) % size_in_bits; \ - uint32_t bit = shift_op == 0 ? 0 : src_op >> (shift_op - 1) & 1; \ - res |= \ - (static_cast(ArithmeticShiftRight(src_op, shift_op) + bit) & \ - mask) \ - << shift; \ - } \ - return res +#define ASUB_U_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(ws_op > wt_op ? ws_op - wt_op \ + : wt_op - ws_op) & \ + mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define MULV_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + res |= (static_cast(ws_op * wt_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } + +#define MADDV_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + T wd_op = static_cast((wd[i] >> shift) & mask); \ + res |= (static_cast(wd_op + ws_op * wt_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } + +#define MSUBV_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + T wd_op = static_cast((wd[i] >> shift) & mask); \ + res |= (static_cast(wd_op - ws_op * wt_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } + +#define DIV_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + if (wt_op == 0) { \ + res = Unpredictable; \ + break; \ + } \ + res |= (static_cast(ws_op / wt_op) & mask) << shift; \ + } \ + wd[i] = res; \ + } + +#define MOD_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T ws_op = static_cast((ws[i] >> shift) & mask); \ + T wt_op = static_cast((wt[i] >> shift) & mask); \ + if (wt_op == 0) { \ + res = Unpredictable; \ + break; \ + } \ + res |= (static_cast(wt_op != 0 ? 
ws_op % wt_op : 0) & mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define SRAR_DF(T, lanes, mask) \ + int size_in_bits = kMSARegSize / lanes; \ + for (int i = 0; i < 2; i++) { \ + uint64_t res = 0; \ + for (int j = 0; j < lanes / 2; ++j) { \ + uint64_t shift = size_in_bits * j; \ + T src_op = static_cast((ws[i] >> shift) & mask); \ + int shift_op = ((wt[i] >> shift) & mask) % size_in_bits; \ + uint32_t bit = shift_op == 0 ? 0 : src_op >> (shift_op - 1) & 1; \ + res |= (static_cast(ArithmeticShiftRight(src_op, shift_op) + \ + bit) & \ + mask) \ + << shift; \ + } \ + wd[i] = res; \ + } + +#define PCKEV_DF(T, lanes, mask) \ + T* ws_p = reinterpret_cast(ws); \ + T* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes / 2; ++i) { \ + wd_p[i] = wt_p[2 * i]; \ + wd_p[i + lanes / 2] = ws_p[2 * i]; \ + } + +#define PCKOD_DF(T, lanes, mask) \ + T* ws_p = reinterpret_cast(ws); \ + T* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes / 2; ++i) { \ + wd_p[i] = wt_p[2 * i + 1]; \ + wd_p[i + lanes / 2] = ws_p[2 * i + 1]; \ + } + +#define ILVL_DF(T, lanes, mask) \ + T* ws_p = reinterpret_cast(ws); \ + T* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes / 2; ++i) { \ + wd_p[2 * i] = wt_p[i + lanes / 2]; \ + wd_p[2 * i + 1] = ws_p[i + lanes / 2]; \ + } + +#define ILVR_DF(T, lanes, mask) \ + T* ws_p = reinterpret_cast(ws); \ + T* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes / 2; ++i) { \ + wd_p[2 * i] = wt_p[i]; \ + wd_p[2 * i + 1] = ws_p[i]; \ + } + +#define ILVEV_DF(T, lanes, mask) \ + T* ws_p = reinterpret_cast(ws); \ + T* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes / 2; ++i) { \ + wd_p[2 * i] = wt_p[2 * i]; \ + wd_p[2 * i + 1] = ws_p[2 * i]; \ + } + +#define ILVOD_DF(T, lanes, mask) \ + T* ws_p = reinterpret_cast(ws); \ + T* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes / 2; ++i) { \ + wd_p[2 * i] = wt_p[2 * i + 1]; \ + wd_p[2 * i + 1] = ws_p[2 * i + 1]; \ + } + +#define VSHF_DF(T, lanes, mask) \ + T* ws_p = reinterpret_cast(ws); \ + T* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + const int mask_not_valid = 0xc0; \ + const int mask_6bits = 0x3f; \ + for (int i = 0; i < lanes; ++i) { \ + if ((wd_p[i] & mask_not_valid)) { \ + wd_p[i] = 0; \ + } else { \ + int k = (wd_p[i] & mask_6bits) % (lanes * 2); \ + wd_p[i] = k > lanes ? 
ws_p[k - lanes] : wt_p[k]; \ + } \ + } + +#define HADD_DF(T, T_small, lanes) \ + T_small* ws_p = reinterpret_cast(ws); \ + T_small* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes; ++i) { \ + wd_p[i] = static_cast(ws_p[2 * i + 1]) + static_cast(wt_p[2 * i]); \ + } + +#define HSUB_DF(T, T_small, lanes) \ + T_small* ws_p = reinterpret_cast(ws); \ + T_small* wt_p = reinterpret_cast(wt); \ + T* wd_p = reinterpret_cast(wd); \ + for (int i = 0; i < lanes; ++i) { \ + wd_p[i] = static_cast(ws_p[2 * i + 1]) - static_cast(wt_p[2 * i]); \ + } #define TEST_CASE(V) \ V(sll_b, SLL_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ V(sll_h, SLL_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ V(sll_w, SLL_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ V(sll_d, SLL_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ - V(sra_b, SRA_DF, int8_t, kMSALanesByte, UINT8_MAX) \ - V(sra_h, SRA_DF, int16_t, kMSALanesHalf, UINT16_MAX) \ - V(sra_w, SRA_DF, int32_t, kMSALanesWord, UINT32_MAX) \ - V(sra_d, SRA_DF, int64_t, kMSALanesDword, UINT64_MAX) \ V(srl_b, SRL_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ V(srl_h, SRL_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ V(srl_w, SRL_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ @@ -10245,18 +10598,54 @@ TEST(MSA_3R_instructions) { V(mod_u_h, MOD_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ V(mod_u_w, MOD_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ V(mod_u_d, MOD_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ - V(srar_b, SRAR_DF, int8_t, kMSALanesByte, UINT8_MAX) \ - V(srar_h, SRAR_DF, int16_t, kMSALanesHalf, UINT16_MAX) \ - V(srar_w, SRAR_DF, int32_t, kMSALanesWord, UINT32_MAX) \ - V(srar_d, SRAR_DF, int64_t, kMSALanesDword, UINT64_MAX) \ V(srlr_b, SRAR_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ V(srlr_h, SRAR_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ V(srlr_w, SRAR_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ - V(srlr_d, SRAR_DF, uint64_t, kMSALanesDword, UINT64_MAX) + V(srlr_d, SRAR_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(pckev_b, PCKEV_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ + V(pckev_h, PCKEV_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ + V(pckev_w, PCKEV_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ + V(pckev_d, PCKEV_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(pckod_b, PCKOD_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ + V(pckod_h, PCKOD_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ + V(pckod_w, PCKOD_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ + V(pckod_d, PCKOD_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(ilvl_b, ILVL_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ + V(ilvl_h, ILVL_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ + V(ilvl_w, ILVL_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ + V(ilvl_d, ILVL_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(ilvr_b, ILVR_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ + V(ilvr_h, ILVR_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ + V(ilvr_w, ILVR_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ + V(ilvr_d, ILVR_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(ilvev_b, ILVEV_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ + V(ilvev_h, ILVEV_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ + V(ilvev_w, ILVEV_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ + V(ilvev_d, ILVEV_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(ilvod_b, ILVOD_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ + V(ilvod_h, ILVOD_DF, uint16_t, kMSALanesHalf, UINT16_MAX) \ + V(ilvod_w, ILVOD_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ + V(ilvod_d, ILVOD_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(vshf_b, VSHF_DF, uint8_t, kMSALanesByte, UINT8_MAX) \ + V(vshf_h, VSHF_DF, uint16_t, kMSALanesHalf, 
UINT16_MAX) \ + V(vshf_w, VSHF_DF, uint32_t, kMSALanesWord, UINT32_MAX) \ + V(vshf_d, VSHF_DF, uint64_t, kMSALanesDword, UINT64_MAX) \ + V(hadd_s_h, HADD_DF, int16_t, int8_t, kMSALanesHalf) \ + V(hadd_s_w, HADD_DF, int32_t, int16_t, kMSALanesWord) \ + V(hadd_s_d, HADD_DF, int64_t, int32_t, kMSALanesDword) \ + V(hadd_u_h, HADD_DF, uint16_t, uint8_t, kMSALanesHalf) \ + V(hadd_u_w, HADD_DF, uint32_t, uint16_t, kMSALanesWord) \ + V(hadd_u_d, HADD_DF, uint64_t, uint32_t, kMSALanesDword) \ + V(hsub_s_h, HSUB_DF, int16_t, int8_t, kMSALanesHalf) \ + V(hsub_s_w, HSUB_DF, int32_t, int16_t, kMSALanesWord) \ + V(hsub_s_d, HSUB_DF, int64_t, int32_t, kMSALanesDword) \ + V(hsub_u_h, HSUB_DF, uint16_t, uint8_t, kMSALanesHalf) \ + V(hsub_u_w, HSUB_DF, uint32_t, uint16_t, kMSALanesWord) \ + V(hsub_u_d, HSUB_DF, uint64_t, uint32_t, kMSALanesDword) #define RUN_TEST(instr, verify, type, lanes, mask) \ run_msa_3r(&tc[i], [](MacroAssembler& assm) { __ instr(w2, w1, w0); }, \ - [](uint64_t ws, uint64_t wt, uint64_t wd) { \ + [](uint64_t* ws, uint64_t* wt, uint64_t* wd) { \ verify(type, lanes, mask); \ }); @@ -10264,9 +10653,41 @@ TEST(MSA_3R_instructions) { TEST_CASE(RUN_TEST) } +#define RUN_TEST2(instr, verify, type, lanes, mask) \ + for (unsigned i = 0; i < arraysize(tc); i++) { \ + for (unsigned j = 0; j < 3; j++) { \ + for (unsigned k = 0; k < lanes; k++) { \ + type* element = reinterpret_cast(&tc[i]); \ + element[k + j * lanes] &= std::numeric_limits::max(); \ + } \ + } \ + } \ + run_msa_3r(&tc[i], [](MacroAssembler& assm) { __ instr(w2, w1, w0); }, \ + [](uint64_t* ws, uint64_t* wt, uint64_t* wd) { \ + verify(type, lanes, mask); \ + }); + +#define TEST_CASE2(V) \ + V(sra_b, SRA_DF, int8_t, kMSALanesByte, UINT8_MAX) \ + V(sra_h, SRA_DF, int16_t, kMSALanesHalf, UINT16_MAX) \ + V(sra_w, SRA_DF, int32_t, kMSALanesWord, UINT32_MAX) \ + V(sra_d, SRA_DF, int64_t, kMSALanesDword, UINT64_MAX) \ + V(srar_b, SRAR_DF, int8_t, kMSALanesByte, UINT8_MAX) \ + V(srar_h, SRAR_DF, int16_t, kMSALanesHalf, UINT16_MAX) \ + V(srar_w, SRAR_DF, int32_t, kMSALanesWord, UINT32_MAX) \ + V(srar_d, SRAR_DF, int64_t, kMSALanesDword, UINT64_MAX) + + for (size_t i = 0; i < arraysize(tc); ++i) { + TEST_CASE2(RUN_TEST2) + } + +#undef TEST_CASE +#undef TEST_CASE2 #undef RUN_TEST +#undef RUN_TEST2 #undef SLL_DF #undef SRL_DF +#undef SRA_DF #undef BCRL_DF #undef BSET_DF #undef BNEG_DF @@ -10297,6 +10718,15 @@ TEST(MSA_3R_instructions) { #undef DIV_DF #undef MOD_DF #undef SRAR_DF +#undef PCKEV_DF +#undef PCKOD_DF +#undef ILVL_DF +#undef ILVR_DF +#undef ILVEV_DF +#undef ILVOD_DF +#undef VSHF_DF +#undef HADD_DF +#undef HSUB_DF } struct TestCaseMsa3RF { @@ -10313,12 +10743,10 @@ struct ExpectedResult_MSA3RF { uint64_t exp_res_hi; }; -template +template void run_msa_3rf(const struct TestCaseMsa3RF* input, const struct ExpectedResult_MSA3RF* output, - Func Generate2RInstructionFunc, - FuncLoad load_elements_of_vector, - FuncStore store_elements_of_vector) { + Func Generate2RInstructionFunc) { Isolate* isolate = CcTest::i_isolate(); HandleScope scope(isolate); @@ -10349,28 +10777,8 @@ void run_msa_3rf(const struct TestCaseMsa3RF* input, (CALL_GENERATED_CODE(isolate, f, &res, 0, 0, 0, 0)); - if (store_elements_of_vector == store_uint64_elements_of_vector) { - CHECK_EQ(output->exp_res_lo, res.d[0]); - CHECK_EQ(output->exp_res_hi, res.d[1]); - } else if (store_elements_of_vector == store_uint32_elements_of_vector) { - const uint32_t* exp_res = - reinterpret_cast(&output->exp_res_lo); - CHECK_EQ(exp_res[0], res.w[0]); - CHECK_EQ(exp_res[1], res.w[1]); - 
CHECK_EQ(exp_res[2], res.w[2]); - CHECK_EQ(exp_res[3], res.w[3]); - } else { - const uint16_t* exp_res = - reinterpret_cast(&output->exp_res_lo); - CHECK_EQ(exp_res[0], res.h[0]); - CHECK_EQ(exp_res[1], res.h[1]); - CHECK_EQ(exp_res[2], res.h[2]); - CHECK_EQ(exp_res[3], res.h[3]); - CHECK_EQ(exp_res[4], res.h[4]); - CHECK_EQ(exp_res[5], res.h[5]); - CHECK_EQ(exp_res[6], res.h[6]); - CHECK_EQ(exp_res[7], res.h[7]); - } + CHECK_EQ(output->exp_res_lo, res.d[0]); + CHECK_EQ(output->exp_res_hi, res.d[1]); } struct TestCaseMsa3RF_F { @@ -10472,15 +10880,12 @@ TEST(MSA_floating_point_quiet_compare) { #define TEST_FP_QUIET_COMPARE_W(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint32_elements_of_vector, \ - store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); + #define TEST_FP_QUIET_COMPARE_D(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint64_elements_of_vector, \ - store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); for (uint64_t i = 0; i < arraysize(tc_w); i++) { TEST_FP_QUIET_COMPARE_W(fcaf_w, &tc_w[i], &exp_res_fcaf) @@ -10595,16 +11000,14 @@ TEST(MSA_floating_point_arithmetic) { reinterpret_cast(src1), \ reinterpret_cast(function( \ src1, src2, src3, reinterpret_cast(&dst_container))), \ - [](MacroAssembler& assm) { __ instr(w2, w0, w1); }, \ - load_uint32_elements_of_vector, store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ instr(w2, w0, w1); }); #define FP_ARITHMETIC_DF_D(instr, function, src1, src2, src3) \ run_msa_3rf( \ reinterpret_cast(src1), \ reinterpret_cast(function( \ src1, src2, src3, reinterpret_cast(&dst_container))), \ - [](MacroAssembler& assm) { __ instr(w2, w0, w1); }, \ - load_uint64_elements_of_vector, store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ instr(w2, w0, w1); }); for (uint64_t i = 0; i < arraysize(tc_w); i++) { FP_ARITHMETIC_DF_W(fadd_w, fadd_function, &tc_w[i].ws_1, &tc_w[i].wt_1, @@ -10701,16 +11104,12 @@ TEST(MSA_fmin_fmin_a_fmax_fmax_a) { #define TEST_FP_MIN_MAX_W(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint32_elements_of_vector, \ - store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); #define TEST_FP_MIN_MAX_D(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint64_elements_of_vector, \ - store_uint64_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); for (uint64_t i = 0; i < arraysize(tc_w); i++) { TEST_FP_MIN_MAX_W(fmax_w, &tc_w[i], &exp_res_fmax_w[i]) @@ -10823,17 +11222,13 @@ TEST(MSA_fixed_point_arithmetic) { {0x800000007fffffff, 0x800000007c33f15e}, {0xb5deb625939d884d, 0xe40dcbfe728756b5}}; -#define TEST_FIXED_POINT_DF_H(instruction, src, exp_res) \ - run_msa_3rf((src), (exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint16_elements_of_vector, \ - store_uint16_elements_of_vector); +#define TEST_FIXED_POINT_DF_H(instruction, src, exp_res) \ + run_msa_3rf((src), (exp_res), \ + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); -#define 
TEST_FIXED_POINT_DF_W(instruction, src, exp_res) \ - run_msa_3rf((src), (exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint32_elements_of_vector, \ - store_uint32_elements_of_vector); +#define TEST_FIXED_POINT_DF_W(instruction, src, exp_res) \ + run_msa_3rf((src), (exp_res), \ + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); for (uint64_t i = 0; i < arraysize(tc_h); i++) { TEST_FIXED_POINT_DF_H(mul_q_h, &tc_h[i], &exp_res_mul_q_h[i]) @@ -10914,16 +11309,12 @@ TEST(MSA_fexdo) { #define TEST_FEXDO_H(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint32_elements_of_vector, \ - store_uint16_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); #define TEST_FEXDO_W(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint64_elements_of_vector, \ - store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); for (uint64_t i = 0; i < arraysize(tc_w); i++) { TEST_FEXDO_H(fexdo_h, &tc_w[i], &exp_res_fexdo_w[i]) @@ -10993,16 +11384,12 @@ TEST(MSA_ftq) { #define TEST_FTQ_H(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint32_elements_of_vector, \ - store_uint16_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); #define TEST_FTQ_W(instruction, src, exp_res) \ run_msa_3rf(reinterpret_cast(src), \ reinterpret_cast(exp_res), \ - [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }, \ - load_uint64_elements_of_vector, \ - store_uint32_elements_of_vector); + [](MacroAssembler& assm) { __ instruction(w2, w0, w1); }); for (uint64_t i = 0; i < arraysize(tc_w); i++) { TEST_FTQ_H(ftq_h, &tc_w[i], &exp_res_ftq_w[i]) diff --git a/test/unittests/BUILD.gn b/test/unittests/BUILD.gn index 10e5afc17f..d6d0a1067f 100644 --- a/test/unittests/BUILD.gn +++ b/test/unittests/BUILD.gn @@ -176,6 +176,7 @@ v8_source_set("unittests_sources") { "test-utils.cc", "test-utils.h", "unicode-unittest.cc", + "utils-unittest.cc", "value-serializer-unittest.cc", "wasm/control-transfer-unittest.cc", "wasm/decoder-unittest.cc", diff --git a/test/unittests/unittests.gyp b/test/unittests/unittests.gyp index 1528d9fd4a..575f550871 100644 --- a/test/unittests/unittests.gyp +++ b/test/unittests/unittests.gyp @@ -149,6 +149,7 @@ 'test-utils.h', 'test-utils.cc', 'unicode-unittest.cc', + 'utils-unittest.cc', 'value-serializer-unittest.cc', 'zone/segmentpool-unittest.cc', 'zone/zone-allocator-unittest.cc', diff --git a/test/unittests/utils-unittest.cc b/test/unittests/utils-unittest.cc new file mode 100644 index 0000000000..65088d873b --- /dev/null +++ b/test/unittests/utils-unittest.cc @@ -0,0 +1,113 @@ +// Copyright 2014 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include + +#include "src/utils.h" +#include "testing/gtest-support.h" + +namespace v8 { +namespace internal { + +template +class UtilsTest : public ::testing::Test {}; + +typedef ::testing::Types + IntegerTypes; + +TYPED_TEST_CASE(UtilsTest, IntegerTypes); + +TYPED_TEST(UtilsTest, SaturateSub) { + TypeParam min = std::numeric_limits::min(); + TypeParam max = std::numeric_limits::max(); + EXPECT_EQ(SaturateSub(min, 0), min); + EXPECT_EQ(SaturateSub(max, 0), max); + EXPECT_EQ(SaturateSub(max, min), max); + EXPECT_EQ(SaturateSub(min, max), min); + EXPECT_EQ(SaturateSub(min, max / 3), min); + EXPECT_EQ(SaturateSub(min + 1, 2), min); + if (std::numeric_limits::is_signed) { + EXPECT_EQ(SaturateSub(min, min), static_cast(0)); + EXPECT_EQ(SaturateSub(0, min), max); + EXPECT_EQ(SaturateSub(max / 3, min), max); + EXPECT_EQ(SaturateSub(max / 5, min), max); + EXPECT_EQ(SaturateSub(min / 3, max), min); + EXPECT_EQ(SaturateSub(min / 9, max), min); + EXPECT_EQ(SaturateSub(max, min / 3), max); + EXPECT_EQ(SaturateSub(min, max / 3), min); + EXPECT_EQ(SaturateSub(max / 3 * 2, min / 2), max); + EXPECT_EQ(SaturateSub(min / 3 * 2, max / 2), min); + } else { + EXPECT_EQ(SaturateSub(min, min), min); + EXPECT_EQ(SaturateSub(0, min), min); + EXPECT_EQ(SaturateSub(0, max), min); + EXPECT_EQ(SaturateSub(max / 3, max), min); + EXPECT_EQ(SaturateSub(max - 3, max), min); + } + TypeParam test_cases[] = {static_cast(min / 23), + static_cast(max / 3), + 63, + static_cast(min / 6), + static_cast(max / 55), + static_cast(min / 2), + static_cast(max / 2), + 0, + 1, + 2, + 3, + 4, + 42}; + TRACED_FOREACH(TypeParam, x, test_cases) { + TRACED_FOREACH(TypeParam, y, test_cases) { + if (std::numeric_limits::is_signed) { + EXPECT_EQ(SaturateSub(x, y), x - y); + } else { + EXPECT_EQ(SaturateSub(x, y), y > x ? min : x - y); + } + } + } +} + +TYPED_TEST(UtilsTest, SaturateAdd) { + TypeParam min = std::numeric_limits::min(); + TypeParam max = std::numeric_limits::max(); + EXPECT_EQ(SaturateAdd(min, min), min); + EXPECT_EQ(SaturateAdd(max, max), max); + EXPECT_EQ(SaturateAdd(min, min / 3), min); + EXPECT_EQ(SaturateAdd(max / 8 * 7, max / 3 * 2), max); + EXPECT_EQ(SaturateAdd(min / 3 * 2, min / 8 * 7), min); + EXPECT_EQ(SaturateAdd(max / 20 * 18, max / 25 * 18), max); + EXPECT_EQ(SaturateAdd(min / 3 * 2, min / 3 * 2), min); + EXPECT_EQ(SaturateAdd(max - 1, 2), max); + EXPECT_EQ(SaturateAdd(max - 100, 101), max); + TypeParam test_cases[] = {static_cast(min / 23), + static_cast(max / 3), + 63, + static_cast(min / 6), + static_cast(max / 55), + static_cast(min / 2), + static_cast(max / 2), + 0, + 1, + 2, + 3, + 4, + 42}; + TRACED_FOREACH(TypeParam, x, test_cases) { + TRACED_FOREACH(TypeParam, y, test_cases) { + EXPECT_EQ(SaturateAdd(x, y), x + y); + } + } +} + +} // namespace internal +} // namespace v8
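Aside, not part of the patch: the new utils-unittest.cc above checks that SaturateAdd and SaturateSub (from src/utils.h) clamp to the limits of the operand type instead of wrapping. A minimal standalone sketch of those semantics, assuming ordinary two's-complement integer types; the *Sketch names are illustrative and are not the real helpers.

// Reference sketch of saturating add/sub, clamping to numeric_limits<T>.
#include <cassert>
#include <cstdint>
#include <limits>
#include <type_traits>

template <typename T>
T SaturateAddSketch(T a, T b) {
  static_assert(std::is_integral<T>::value, "integral types only");
  if (std::is_signed<T>::value) {
    if (a > 0 && b > std::numeric_limits<T>::max() - a)
      return std::numeric_limits<T>::max();
    if (a < 0 && b < std::numeric_limits<T>::min() - a)
      return std::numeric_limits<T>::min();
  } else {
    if (b > std::numeric_limits<T>::max() - a)
      return std::numeric_limits<T>::max();
  }
  return static_cast<T>(a + b);
}

template <typename T>
T SaturateSubSketch(T a, T b) {
  static_assert(std::is_integral<T>::value, "integral types only");
  if (std::is_signed<T>::value) {
    if (b < 0 && a > std::numeric_limits<T>::max() + b)
      return std::numeric_limits<T>::max();
    if (b > 0 && a < std::numeric_limits<T>::min() + b)
      return std::numeric_limits<T>::min();
  } else {
    if (a < b) return std::numeric_limits<T>::min();  // 0 for unsigned types
  }
  return static_cast<T>(a - b);
}

int main() {
  assert(SaturateAddSketch<int8_t>(120, 20) == 127);    // clamps at max
  assert(SaturateSubSketch<uint8_t>(3, 7) == 0);        // clamps at 0
  assert(SaturateSubSketch<int8_t>(-120, 20) == -128);  // clamps at min
}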