PPC [simd]: Implement vector extend multiply low/high

Also added multiply low/high and vector merge instructions to
the simulator.

Change-Id: I889004b5572ee7df75be706c424ac2e83e53e8b3
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2769058
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Reviewed-by: Junliang Yan <junyan@redhat.com>
Cr-Commit-Position: refs/heads/master@{#73492}
Milad Fa 2021-03-17 15:22:59 -04:00 committed by Commit Bot
parent f32b18bc62
commit 63661ce7c6
6 changed files with 205 additions and 71 deletions
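
Background note: WebAssembly's extended-multiply (extmul) operations take either the low or the high half of the lanes of two vectors, sign- or zero-extend those lanes to twice their width, and multiply pairwise. A minimal scalar model of one variant, as a sketch of the intended semantics (names are illustrative, not V8 code):

    #include <cstdint>

    // Scalar sketch of i32x4.extmul_low_i16x8_s: sign-extend the four
    // low i16 lanes of each operand to i32, then multiply pairwise.
    void ExtMulLowI16x8S(const int16_t a[8], const int16_t b[8],
                         int32_t out[4]) {
      for (int i = 0; i < 4; ++i) {
        out[i] = static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
      }
    }

The "high" variants use lanes 4..7 instead, and the unsigned variants zero-extend.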

src/codegen/ppc/constants-ppc.h

@@ -2289,6 +2289,14 @@ using Instr = uint32_t;
V(vmulouh, VMULOUH, 0x10000048) \
/* Vector Multiply Odd Signed Halfword */ \
V(vmulosh, VMULOSH, 0x10000148) \
/* Vector Multiply Even Signed Word */ \
V(vmulesw, VMULESW, 0x10000388) \
/* Vector Multiply Even Unsigned Word */ \
V(vmuleuw, VMULEUW, 0x10000288) \
/* Vector Multiply Odd Signed Word */ \
V(vmulosw, VMULOSW, 0x10000188) \
/* Vector Multiply Odd Unsigned Word */ \
V(vmulouw, VMULOUW, 0x10000088) \
/* Vector Sum across Quarter Signed Halfword Saturate */ \
V(vsum4shs, VSUM4SHS, 0x10000648) \
/* Vector Pack Unsigned Word Unsigned Saturate */ \
@@ -2390,7 +2398,19 @@ using Instr = uint32_t;
/* Vector Maximum Single-Precision */ \
V(vmaxfp, VMAXFP, 0x1000040A) \
/* Vector Bit Permute Quadword */ \
V(vbpermq, VBPERMQ, 0x1000054C)
V(vbpermq, VBPERMQ, 0x1000054C) \
/* Vector Merge High Byte */ \
V(vmrghb, VMRGHB, 0x1000000C) \
/* Vector Merge High Halfword */ \
V(vmrghh, VMRGHH, 0x1000004C) \
/* Vector Merge High Word */ \
V(vmrghw, VMRGHW, 0x1000008C) \
/* Vector Merge Low Byte */ \
V(vmrglb, VMRGLB, 0x1000010C) \
/* Vector Merge Low Halfword */ \
V(vmrglh, VMRGLH, 0x1000014C) \
/* Vector Merge Low Word */ \
V(vmrglw, VMRGLW, 0x1000018C)
#define PPC_VX_OPCODE_C_FORM_LIST(V) \
/* Vector Unpack Low Signed Word */ \
@@ -2459,26 +2479,6 @@ using Instr = uint32_t;
V(vgbbd, VGBBD, 0x1000050C) \
/* Vector Log Base 2 Estimate Single-Precision */ \
V(vlogefp, VLOGEFP, 0x100001CA) \
/* Vector Merge High Byte */ \
V(vmrghb, VMRGHB, 0x1000000C) \
/* Vector Merge High Halfword */ \
V(vmrghh, VMRGHH, 0x1000004C) \
/* Vector Merge High Word */ \
V(vmrghw, VMRGHW, 0x1000008C) \
/* Vector Merge Low Byte */ \
V(vmrglb, VMRGLB, 0x1000010C) \
/* Vector Merge Low Halfword */ \
V(vmrglh, VMRGLH, 0x1000014C) \
/* Vector Merge Low Word */ \
V(vmrglw, VMRGLW, 0x1000018C) \
/* Vector Multiply Even Signed Word */ \
V(vmulesw, VMULESW, 0x10000388) \
/* Vector Multiply Even Unsigned Word */ \
V(vmuleuw, VMULEUW, 0x10000288) \
/* Vector Multiply Odd Signed Word */ \
V(vmulosw, VMULOSW, 0x10000188) \
/* Vector Multiply Odd Unsigned Word */ \
V(vmulouw, VMULOUW, 0x10000088) \
/* Vector NAND */ \
V(vnand, VNAND, 0x10000584) \
/* Vector OR with Complement */ \
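
A note on the constants above: all VX-form instructions share primary opcode 4 (bits 0-5 of the instruction word), and these hex values differ only in the 11-bit extended opcode in the low bits. A small sanity check, with helper names made up for illustration:

    #include <cstdint>

    constexpr uint32_t PrimaryOpcode(uint32_t instr) { return instr >> 26; }
    constexpr uint32_t ExtendedOpcode(uint32_t instr) { return instr & 0x7FF; }

    static_assert(PrimaryOpcode(0x1000000C) == 4, "VX form uses OPCD 4");
    static_assert(ExtendedOpcode(0x1000000C) == 0x00C, "vmrghb");
    static_assert(ExtendedOpcode(0x10000088) == 0x088, "vmulouw");

Moving the vmrg* and vmul*w entries between these macro lists therefore changes only which decoder table picks them up, not their encodings.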

src/compiler/backend/ppc/code-generator-ppc.cc

@@ -3682,6 +3682,83 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1), kScratchSimd128Reg);
break;
}
#define EXT_MUL(mul_even, mul_odd) \
Simd128Register dst = i.OutputSimd128Register(), \
src0 = i.InputSimd128Register(0), \
src1 = i.InputSimd128Register(1); \
__ mul_even(dst, src0, src1); \
__ mul_odd(kScratchSimd128Reg, src0, src1);
case kPPC_I64x2ExtMulLowI32x4S: {
constexpr int lane_width_in_bytes = 8;
EXT_MUL(vmulesw, vmulosw)
__ vextractd(dst, dst, Operand(1 * lane_width_in_bytes));
__ vextractd(kScratchSimd128Reg, kScratchSimd128Reg,
Operand(1 * lane_width_in_bytes));
__ vinsertd(dst, kScratchSimd128Reg, Operand(1 * lane_width_in_bytes));
break;
}
case kPPC_I64x2ExtMulHighI32x4S: {
constexpr int lane_width_in_bytes = 8;
EXT_MUL(vmulesw, vmulosw)
__ vinsertd(dst, kScratchSimd128Reg, Operand(1 * lane_width_in_bytes));
break;
}
case kPPC_I64x2ExtMulLowI32x4U: {
constexpr int lane_width_in_bytes = 8;
EXT_MUL(vmuleuw, vmulouw)
__ vextractd(dst, dst, Operand(1 * lane_width_in_bytes));
__ vextractd(kScratchSimd128Reg, kScratchSimd128Reg,
Operand(1 * lane_width_in_bytes));
__ vinsertd(dst, kScratchSimd128Reg, Operand(1 * lane_width_in_bytes));
break;
}
case kPPC_I64x2ExtMulHighI32x4U: {
constexpr int lane_width_in_bytes = 8;
EXT_MUL(vmuleuw, vmulouw)
__ vinsertd(dst, kScratchSimd128Reg, Operand(1 * lane_width_in_bytes));
break;
}
case kPPC_I32x4ExtMulLowI16x8S: {
EXT_MUL(vmulesh, vmulosh)
__ vmrglw(dst, dst, kScratchSimd128Reg);
break;
}
case kPPC_I32x4ExtMulHighI16x8S: {
EXT_MUL(vmulesh, vmulosh)
__ vmrghw(dst, dst, kScratchSimd128Reg);
break;
}
case kPPC_I32x4ExtMulLowI16x8U: {
EXT_MUL(vmuleuh, vmulouh)
__ vmrglw(dst, dst, kScratchSimd128Reg);
break;
}
case kPPC_I32x4ExtMulHighI16x8U: {
EXT_MUL(vmuleuh, vmulouh)
__ vmrghw(dst, dst, kScratchSimd128Reg);
break;
}
case kPPC_I16x8ExtMulLowI8x16S: {
EXT_MUL(vmulesb, vmulosb)
__ vmrglh(dst, dst, kScratchSimd128Reg);
break;
}
case kPPC_I16x8ExtMulHighI8x16S: {
EXT_MUL(vmulesb, vmulosb)
__ vmrghh(dst, dst, kScratchSimd128Reg);
break;
}
case kPPC_I16x8ExtMulLowI8x16U: {
EXT_MUL(vmuleub, vmuloub)
__ vmrglh(dst, dst, kScratchSimd128Reg);
break;
}
case kPPC_I16x8ExtMulHighI8x16U: {
EXT_MUL(vmuleub, vmuloub)
__ vmrghh(dst, dst, kScratchSimd128Reg);
break;
}
#undef EXT_MUL
case kPPC_StoreCompressTagged: {
ASSEMBLE_STORE_INTEGER(StoreTaggedField, StoreTaggedFieldX);
break;
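
How the EXT_MUL sequences work: vmule* multiplies the even-numbered lanes and vmulo* the odd-numbered ones, each producing double-width products, so the even and odd results end up in two separate registers. A merge (vmrgh*/vmrgl*) then interleaves those registers so that products of adjacent source lanes become adjacent again, which is exactly the extmul lane order. For the I64x2 cases there is no doubleword merge at this ISA level, so the doublewords are shuffled with vextractd/vinsertd instead. A rough scalar model of one case (lane numbering simplified, endianness ignored):

    #include <cstdint>

    // Conceptual model of kPPC_I32x4ExtMulLowI16x8S: even/odd widening
    // multiplies, then interleave the halves holding the products of
    // source lanes 0..3. Illustrative only, not the V8 code.
    void ExtMulLowViaEvenOdd(const int16_t a[8], const int16_t b[8],
                             int32_t out[4]) {
      int32_t even[4], odd[4];
      for (int i = 0; i < 4; ++i) {
        even[i] = static_cast<int32_t>(a[2 * i]) * b[2 * i];         // vmulesh
        odd[i] = static_cast<int32_t>(a[2 * i + 1]) * b[2 * i + 1];  // vmulosh
      }
      for (int i = 0; i < 2; ++i) {  // vmrglw-style interleave
        out[2 * i] = even[i];
        out[2 * i + 1] = odd[i];
      }
    }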

src/compiler/backend/ppc/instruction-codes-ppc.h

@@ -242,6 +242,8 @@ namespace compiler {
V(PPC_F32x4NearestInt) \
V(PPC_F32x4Pmin) \
V(PPC_F32x4Pmax) \
V(PPC_F32x4Qfma) \
V(PPC_F32x4Qfms) \
V(PPC_I64x2Splat) \
V(PPC_I64x2ExtractLane) \
V(PPC_I64x2ReplaceLane) \
@@ -267,6 +269,10 @@ namespace compiler {
V(PPC_I64x2SConvertI32x4High) \
V(PPC_I64x2UConvertI32x4Low) \
V(PPC_I64x2UConvertI32x4High) \
V(PPC_I64x2ExtMulLowI32x4S) \
V(PPC_I64x2ExtMulHighI32x4S) \
V(PPC_I64x2ExtMulLowI32x4U) \
V(PPC_I64x2ExtMulHighI32x4U) \
V(PPC_I32x4Splat) \
V(PPC_I32x4ExtractLane) \
V(PPC_I32x4ReplaceLane) \
@@ -298,8 +304,10 @@ namespace compiler {
V(PPC_I32x4DotI16x8S) \
V(PPC_I32x4ExtAddPairwiseI16x8S) \
V(PPC_I32x4ExtAddPairwiseI16x8U) \
V(PPC_F32x4Qfma) \
V(PPC_F32x4Qfms) \
V(PPC_I32x4ExtMulLowI16x8S) \
V(PPC_I32x4ExtMulHighI16x8S) \
V(PPC_I32x4ExtMulLowI16x8U) \
V(PPC_I32x4ExtMulHighI16x8U) \
V(PPC_I16x8Splat) \
V(PPC_I16x8ExtractLaneU) \
V(PPC_I16x8ExtractLaneS) \
@@ -337,6 +345,10 @@ namespace compiler {
V(PPC_I16x8ExtAddPairwiseI8x16S) \
V(PPC_I16x8ExtAddPairwiseI8x16U) \
V(PPC_I16x8Q15MulRSatS) \
V(PPC_I16x8ExtMulLowI8x16S) \
V(PPC_I16x8ExtMulHighI8x16S) \
V(PPC_I16x8ExtMulLowI8x16U) \
V(PPC_I16x8ExtMulHighI8x16U) \
V(PPC_I8x16Splat) \
V(PPC_I8x16ExtractLaneU) \
V(PPC_I8x16ExtractLaneS) \

src/compiler/backend/ppc/instruction-scheduler-ppc.cc

@@ -192,6 +192,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kPPC_I64x2SConvertI32x4High:
case kPPC_I64x2UConvertI32x4Low:
case kPPC_I64x2UConvertI32x4High:
case kPPC_I64x2ExtMulLowI32x4S:
case kPPC_I64x2ExtMulHighI32x4S:
case kPPC_I64x2ExtMulLowI32x4U:
case kPPC_I64x2ExtMulHighI32x4U:
case kPPC_I32x4Splat:
case kPPC_I32x4ExtractLane:
case kPPC_I32x4ReplaceLane:
@@ -223,6 +227,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kPPC_I32x4DotI16x8S:
case kPPC_I32x4ExtAddPairwiseI16x8S:
case kPPC_I32x4ExtAddPairwiseI16x8U:
case kPPC_I32x4ExtMulLowI16x8S:
case kPPC_I32x4ExtMulHighI16x8S:
case kPPC_I32x4ExtMulLowI16x8U:
case kPPC_I32x4ExtMulHighI16x8U:
case kPPC_I16x8Splat:
case kPPC_I16x8ExtractLaneU:
case kPPC_I16x8ExtractLaneS:
@@ -260,6 +268,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kPPC_I16x8ExtAddPairwiseI8x16S:
case kPPC_I16x8ExtAddPairwiseI8x16U:
case kPPC_I16x8Q15MulRSatS:
case kPPC_I16x8ExtMulLowI8x16S:
case kPPC_I16x8ExtMulHighI8x16S:
case kPPC_I16x8ExtMulLowI8x16U:
case kPPC_I16x8ExtMulHighI8x16U:
case kPPC_I8x16Splat:
case kPPC_I8x16ExtractLaneU:
case kPPC_I8x16ExtractLaneS:

src/compiler/backend/ppc/instruction-selector-ppc.cc

@@ -2177,6 +2177,10 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(I64x2Mul) \
V(I64x2Eq) \
V(I64x2Ne) \
V(I64x2ExtMulLowI32x4S) \
V(I64x2ExtMulHighI32x4S) \
V(I64x2ExtMulLowI32x4U) \
V(I64x2ExtMulHighI32x4U) \
V(I32x4Add) \
V(I32x4Sub) \
V(I32x4Mul) \
@@ -2191,6 +2195,10 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(I32x4GtU) \
V(I32x4GeU) \
V(I32x4DotI16x8S) \
V(I32x4ExtMulLowI16x8S) \
V(I32x4ExtMulHighI16x8S) \
V(I32x4ExtMulLowI16x8U) \
V(I32x4ExtMulHighI16x8U) \
V(I16x8Add) \
V(I16x8Sub) \
V(I16x8Mul) \
@@ -2212,6 +2220,10 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(I16x8SubSatU) \
V(I16x8RoundingAverageU) \
V(I16x8Q15MulRSatS) \
V(I16x8ExtMulLowI8x16S) \
V(I16x8ExtMulHighI8x16S) \
V(I16x8ExtMulLowI8x16U) \
V(I16x8ExtMulHighI8x16U) \
V(I8x16Add) \
V(I8x16Sub) \
V(I8x16MinS) \
@@ -2471,42 +2483,6 @@ void InstructionSelector::VisitS128Const(Node* node) {
}
}
void InstructionSelector::VisitI64x2ExtMulLowI32x4S(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI64x2ExtMulHighI32x4S(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI64x2ExtMulLowI32x4U(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI64x2ExtMulHighI32x4U(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4ExtMulLowI16x8S(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4ExtMulHighI16x8S(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4ExtMulLowI16x8U(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4ExtMulHighI16x8U(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8ExtMulLowI8x16S(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8ExtMulHighI8x16S(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8ExtMulLowI8x16U(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8ExtMulHighI8x16U(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI8x16Popcnt(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2ConvertLowI32x4S(Node* node) {
UNIMPLEMENTED();

src/execution/ppc/simulator-ppc.cc

@@ -4149,18 +4149,20 @@ void Simulator::ExecuteGeneric(Instruction* instr) {
VECTOR_ARITHMETIC_OP(int8_t, -)
break;
}
#define VECTOR_MULTIPLY_EVEN_ODD(input_type, result_type, is_odd) \
DECODE_VX_INSTRUCTION(t, a, b, T) \
size_t i = 0, j = 0, k = 0; \
size_t lane_size = sizeof(input_type); \
if (is_odd) { \
i = 1; \
j = lane_size; \
} \
for (; j < kSimd128Size; i += 2, j += lane_size * 2, k++) { \
input_type src0 = get_simd_register_by_lane<input_type>(a, i); \
input_type src1 = get_simd_register_by_lane<input_type>(b, i); \
set_simd_register_by_lane<result_type>(t, k, src0 * src1); \
#define VECTOR_MULTIPLY_EVEN_ODD(input_type, result_type, is_odd) \
DECODE_VX_INSTRUCTION(t, a, b, T) \
size_t i = 0, j = 0, k = 0; \
size_t lane_size = sizeof(input_type); \
if (is_odd) { \
i = 1; \
j = lane_size; \
} \
for (; j < kSimd128Size; i += 2, j += lane_size * 2, k++) { \
result_type src0 = \
static_cast<result_type>(get_simd_register_by_lane<input_type>(a, i)); \
result_type src1 = \
static_cast<result_type>(get_simd_register_by_lane<input_type>(b, i)); \
set_simd_register_by_lane<result_type>(t, k, src0 * src1); \
}
case VMULEUB: {
VECTOR_MULTIPLY_EVEN_ODD(uint8_t, uint16_t, false)
@@ -4170,6 +4172,14 @@ void Simulator::ExecuteGeneric(Instruction* instr) {
VECTOR_MULTIPLY_EVEN_ODD(int8_t, int16_t, false)
break;
}
case VMULOUB: {
VECTOR_MULTIPLY_EVEN_ODD(uint8_t, uint16_t, true)
break;
}
case VMULOSB: {
VECTOR_MULTIPLY_EVEN_ODD(int8_t, int16_t, true)
break;
}
case VMULEUH: {
VECTOR_MULTIPLY_EVEN_ODD(uint16_t, uint32_t, false)
break;
@@ -4179,10 +4189,57 @@ void Simulator::ExecuteGeneric(Instruction* instr) {
break;
}
case VMULOUH: {
VECTOR_MULTIPLY_EVEN_ODD(uint16_t, uint32_t, true)
break;
}
case VMULOSH: {
VECTOR_MULTIPLY_EVEN_ODD(int16_t, int32_t, true)
break;
}
case VMULEUW: {
VECTOR_MULTIPLY_EVEN_ODD(uint32_t, uint64_t, false)
break;
}
case VMULESW: {
VECTOR_MULTIPLY_EVEN_ODD(int32_t, int64_t, false)
break;
}
case VMULOUW: {
VECTOR_MULTIPLY_EVEN_ODD(uint32_t, uint64_t, true)
break;
}
case VMULOSW: {
VECTOR_MULTIPLY_EVEN_ODD(int32_t, int64_t, true)
break;
}
#undef VECTOR_MULTIPLY_EVEN_ODD
#define VECTOR_MERGE(type, is_low_side) \
DECODE_VX_INSTRUCTION(t, a, b, T) \
constexpr size_t index_limit = (kSimd128Size / sizeof(type)) / 2; \
for (size_t i = 0, source_index = is_low_side ? i + index_limit : i; \
i < index_limit; i++, source_index++) { \
set_simd_register_by_lane<type>( \
t, 2 * i, get_simd_register_by_lane<type>(a, source_index)); \
set_simd_register_by_lane<type>( \
t, (2 * i) + 1, get_simd_register_by_lane<type>(b, source_index)); \
}
case VMRGLW: {
VECTOR_MERGE(int32_t, true)
break;
}
case VMRGHW: {
VECTOR_MERGE(int32_t, false)
break;
}
case VMRGLH: {
VECTOR_MERGE(int16_t, true)
break;
}
case VMRGHH: {
VECTOR_MERGE(int16_t, false)
break;
}
#undef VECTOR_MERGE
#undef VECTOR_ARITHMETIC_OP
#define VECTOR_MIN_MAX_OP(type, op) \
DECODE_VX_INSTRUCTION(t, a, b, T) \
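
A note on the simulator change to VECTOR_MULTIPLY_EVEN_ODD: for byte and halfword lanes the old product src0 * src1 was already evaluated in int thanks to integer promotion, but for the newly added word-sized cases a uint32_t product would wrap in 32 bits (and the int32_t case could overflow, which is undefined behavior). Casting both operands to result_type before multiplying preserves the full double-width product. A standalone demonstration of the failure mode:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t a = 0xFFFFFFFFu, b = 2u;
      uint64_t truncated = a * b;                       // wraps: 0xFFFFFFFE
      uint64_t widened = static_cast<uint64_t>(a) * b;  // 0x1FFFFFFFE
      std::printf("%llx vs %llx\n",
                  static_cast<unsigned long long>(truncated),
                  static_cast<unsigned long long>(widened));
      return 0;
    }

VECTOR_MERGE, for its part, models vmrgh/vmrgl directly: it walks one half of each source register (for the "low" merges, the half at the higher lane indices, i.e. the low-order end in the ISA's big-endian lane numbering) and writes alternating lanes from the two sources into the target.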