PPC [simd]: Implement vector extend multiply low/high

Also added multiply low/high and vector merge instructions to
the simulator.

Change-Id: I889004b5572ee7df75be706c424ac2e83e53e8b3
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2769058
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Reviewed-by: Junliang Yan <junyan@redhat.com>
Cr-Commit-Position: refs/heads/master@{#73492}
Milad Fa 2021-03-17 15:22:59 -04:00 committed by Commit Bot
parent f32b18bc62
commit 63661ce7c6
6 changed files with 205 additions and 71 deletions
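
Background note: WebAssembly's extended-multiply (extmul) operations take either the low or the high half of the lanes of two vectors, sign- or zero-extend those lanes to twice their width, and multiply pairwise. A minimal scalar model of one variant, as a sketch of the intended semantics (names are illustrative, not V8 code):

    #include <cstdint>

    // Scalar sketch of i32x4.extmul_low_i16x8_s: sign-extend the four
    // low i16 lanes of each operand to i32, then multiply pairwise.
    void ExtMulLowI16x8S(const int16_t a[8], const int16_t b[8],
                         int32_t out[4]) {
      for (int i = 0; i < 4; ++i) {
        out[i] = static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
      }
    }

The "high" variants use lanes 4..7 instead, and the unsigned variants zero-extend.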

src/codegen/ppc/constants-ppc.h

@@ -2289,6 +2289,14 @@ using Instr = uint32_t;
V(vmulouh, VMULOUH, 0x10000048) \
/* Vector Multiply Odd Signed Halfword */ \
V(vmulosh, VMULOSH, 0x10000148) \
/* Vector Multiply Even Signed Word */ \
V(vmulesw, VMULESW, 0x10000388) \
/* Vector Multiply Even Unsigned Word */ \
V(vmuleuw, VMULEUW, 0x10000288) \
/* Vector Multiply Odd Signed Word */ \
V(vmulosw, VMULOSW, 0x10000188) \
/* Vector Multiply Odd Unsigned Word */ \
V(vmulouw, VMULOUW, 0x10000088) \
/* Vector Sum across Quarter Signed Halfword Saturate */ \
V(vsum4shs, VSUM4SHS, 0x10000648) \
/* Vector Pack Unsigned Word Unsigned Saturate */ \
@@ -2390,7 +2398,19 @@ using Instr = uint32_t;
/* Vector Maximum Single-Precision */ \
V(vmaxfp, VMAXFP, 0x1000040A) \
/* Vector Bit Permute Quadword */ \
V(vbpermq, VBPERMQ, 0x1000054C)
V(vbpermq, VBPERMQ, 0x1000054C) \
/* Vector Merge High Byte */ \
V(vmrghb, VMRGHB, 0x1000000C) \
/* Vector Merge High Halfword */ \
V(vmrghh, VMRGHH, 0x1000004C) \
/* Vector Merge High Word */ \
V(vmrghw, VMRGHW, 0x1000008C) \
/* Vector Merge Low Byte */ \
V(vmrglb, VMRGLB, 0x1000010C) \
/* Vector Merge Low Halfword */ \
V(vmrglh, VMRGLH, 0x1000014C) \
/* Vector Merge Low Word */ \
V(vmrglw, VMRGLW, 0x1000018C)
#define PPC_VX_OPCODE_C_FORM_LIST(V) \
/* Vector Unpack Low Signed Word */ \
@@ -2459,26 +2479,6 @@ using Instr = uint32_t;
V(vgbbd, VGBBD, 0x1000050C) \
/* Vector Log Base 2 Estimate Single-Precision */ \
V(vlogefp, VLOGEFP, 0x100001CA) \
/* Vector Merge High Byte */ \
V(vmrghb, VMRGHB, 0x1000000C) \
/* Vector Merge High Halfword */ \
V(vmrghh, VMRGHH, 0x1000004C) \
/* Vector Merge High Word */ \
V(vmrghw, VMRGHW, 0x1000008C) \
/* Vector Merge Low Byte */ \
V(vmrglb, VMRGLB, 0x1000010C) \
/* Vector Merge Low Halfword */ \
V(vmrglh, VMRGLH, 0x1000014C) \
/* Vector Merge Low Word */ \
V(vmrglw, VMRGLW, 0x1000018C) \
/* Vector Multiply Even Signed Word */ \
V(vmulesw, VMULESW, 0x10000388) \
/* Vector Multiply Even Unsigned Word */ \
V(vmuleuw, VMULEUW, 0x10000288) \
/* Vector Multiply Odd Signed Word */ \
V(vmulosw, VMULOSW, 0x10000188) \
/* Vector Multiply Odd Unsigned Word */ \
V(vmulouw, VMULOUW, 0x10000088) \
/* Vector NAND */ \
V(vnand, VNAND, 0x10000584) \
/* Vector OR with Complement */ \
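
A note on the constants above: all VX-form instructions share primary opcode 4 (bits 0-5 of the instruction word), and these hex values differ only in the 11-bit extended opcode in the low bits. A small sanity check, with helper names made up for illustration:

    #include <cstdint>

    constexpr uint32_t PrimaryOpcode(uint32_t instr) { return instr >> 26; }
    constexpr uint32_t ExtendedOpcode(uint32_t instr) { return instr & 0x7FF; }

    static_assert(PrimaryOpcode(0x1000000C) == 4, "VX form uses OPCD 4");
    static_assert(ExtendedOpcode(0x1000000C) == 0x00C, "vmrghb");
    static_assert(ExtendedOpcode(0x10000088) == 0x088, "vmulouw");

Moving the vmrg* and vmul*w entries between these macro lists therefore changes only which decoder table picks them up, not their encodings.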

src/compiler/backend/ppc/code-generator-ppc.cc

@@ -3682,6 +3682,83 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1), kScratchSimd128Reg);
break;
}
#define EXT_MUL(mul_even, mul_odd) \
Simd128Register dst = i.OutputSimd128Register(), \
src0 = i.InputSimd128Register(0), \
src1 = i.InputSimd128Register(1); \
__ mul_even(dst, src0, src1); \
__ mul_odd(kScratchSimd128Reg, src0, src1);
case kPPC_I64x2ExtMulLowI32x4S: {
constexpr int lane_width_in_bytes = 8;
EXT_MUL(vmulesw, vmulosw)
__ vextractd(dst, dst, Operand(1 * lane_width_in_bytes));
__ vextractd(kScratchSimd128Reg, kScratchSimd128Reg,
Operand(1 * lane_width_in_bytes));
__ vinsertd(dst, kScratchSimd128Reg, Operand(1 * lane_width_in_bytes));
break;
}
case kPPC_I64x2ExtMulHighI32x4S: {
constexpr int lane_width_in_bytes = 8;
EXT_MUL(vmulesw, vmulosw)
__ vinsertd(dst, kScratchSimd128Reg, Operand(1 * lane_width_in_bytes));
break;
}
case kPPC_I64x2ExtMulLowI32x4U: {
constexpr int lane_width_in_bytes = 8;
EXT_MUL(vmuleuw, vmulouw)
__ vextractd(dst, dst, Operand(1 * lane_width_in_bytes));
__ vextractd(kScratchSimd128Reg, kScratchSimd128Reg,
Operand(1 * lane_width_in_bytes));
__ vinsertd(dst, kScratchSimd128Reg, Operand(1 * lane_width_in_bytes));
break;
}
case kPPC_I64x2ExtMulHighI32x4U: {
constexpr int lane_width_in_bytes = 8;
EXT_MUL(vmuleuw, vmulouw)
__ vinsertd(dst, kScratchSimd128Reg, Operand(1 * lane_width_in_bytes));
break;
}
case kPPC_I32x4ExtMulLowI16x8S: {
EXT_MUL(vmulesh, vmulosh)
__ vmrglw(dst, dst, kScratchSimd128Reg);
break;
}
case kPPC_I32x4ExtMulHighI16x8S: {
EXT_MUL(vmulesh, vmulosh)
__ vmrghw(dst, dst, kScratchSimd128Reg);
break;
}
case kPPC_I32x4ExtMulLowI16x8U: {
EXT_MUL(vmuleuh, vmulouh)
__ vmrglw(dst, dst, kScratchSimd128Reg);
break;
}
case kPPC_I32x4ExtMulHighI16x8U: {
EXT_MUL(vmuleuh, vmulouh)
__ vmrghw(dst, dst, kScratchSimd128Reg);
break;
}
case kPPC_I16x8ExtMulLowI8x16S: {
EXT_MUL(vmulesb, vmulosb)
__ vmrglh(dst, dst, kScratchSimd128Reg);
break;
}
case kPPC_I16x8ExtMulHighI8x16S: {
EXT_MUL(vmulesb, vmulosb)
__ vmrghh(dst, dst, kScratchSimd128Reg);
break;
}
case kPPC_I16x8ExtMulLowI8x16U: {
EXT_MUL(vmuleub, vmuloub)
__ vmrglh(dst, dst, kScratchSimd128Reg);
break;
}
case kPPC_I16x8ExtMulHighI8x16U: {
EXT_MUL(vmuleub, vmuloub)
__ vmrghh(dst, dst, kScratchSimd128Reg);
break;
}
#undef EXT_MUL
case kPPC_StoreCompressTagged: {
ASSEMBLE_STORE_INTEGER(StoreTaggedField, StoreTaggedFieldX);
break;
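
How the EXT_MUL sequences work: vmule* multiplies the even-numbered lanes and vmulo* the odd-numbered ones, each producing double-width products, so the even and odd results end up in two separate registers. A merge (vmrgh*/vmrgl*) then interleaves those registers so that products of adjacent source lanes become adjacent again, which is exactly the extmul lane order. For the I64x2 cases there is no doubleword merge at this ISA level, so the doublewords are shuffled with vextractd/vinsertd instead. A rough scalar model of one case (lane numbering simplified, endianness ignored):

    #include <cstdint>

    // Conceptual model of kPPC_I32x4ExtMulLowI16x8S: even/odd widening
    // multiplies, then interleave the halves holding the products of
    // source lanes 0..3. Illustrative only, not the V8 code.
    void ExtMulLowViaEvenOdd(const int16_t a[8], const int16_t b[8],
                             int32_t out[4]) {
      int32_t even[4], odd[4];
      for (int i = 0; i < 4; ++i) {
        even[i] = static_cast<int32_t>(a[2 * i]) * b[2 * i];         // vmulesh
        odd[i] = static_cast<int32_t>(a[2 * i + 1]) * b[2 * i + 1];  // vmulosh
      }
      for (int i = 0; i < 2; ++i) {  // vmrglw-style interleave
        out[2 * i] = even[i];
        out[2 * i + 1] = odd[i];
      }
    }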

src/compiler/backend/ppc/instruction-codes-ppc.h

@@ -242,6 +242,8 @@ namespace compiler {
V(PPC_F32x4NearestInt) \
V(PPC_F32x4Pmin) \
V(PPC_F32x4Pmax) \
V(PPC_F32x4Qfma) \
V(PPC_F32x4Qfms) \
V(PPC_I64x2Splat) \
V(PPC_I64x2ExtractLane) \
V(PPC_I64x2ReplaceLane) \
@@ -267,6 +269,10 @@ namespace compiler {
V(PPC_I64x2SConvertI32x4High) \
V(PPC_I64x2UConvertI32x4Low) \
V(PPC_I64x2UConvertI32x4High) \
V(PPC_I64x2ExtMulLowI32x4S) \
V(PPC_I64x2ExtMulHighI32x4S) \
V(PPC_I64x2ExtMulLowI32x4U) \
V(PPC_I64x2ExtMulHighI32x4U) \
V(PPC_I32x4Splat) \
V(PPC_I32x4ExtractLane) \
V(PPC_I32x4ReplaceLane) \
@@ -298,8 +304,10 @@ namespace compiler {
V(PPC_I32x4DotI16x8S) \
V(PPC_I32x4ExtAddPairwiseI16x8S) \
V(PPC_I32x4ExtAddPairwiseI16x8U) \
V(PPC_F32x4Qfma) \
V(PPC_F32x4Qfms) \
V(PPC_I32x4ExtMulLowI16x8S) \
V(PPC_I32x4ExtMulHighI16x8S) \
V(PPC_I32x4ExtMulLowI16x8U) \
V(PPC_I32x4ExtMulHighI16x8U) \
V(PPC_I16x8Splat) \
V(PPC_I16x8ExtractLaneU) \
V(PPC_I16x8ExtractLaneS) \
@@ -337,6 +345,10 @@ namespace compiler {
V(PPC_I16x8ExtAddPairwiseI8x16S) \
V(PPC_I16x8ExtAddPairwiseI8x16U) \
V(PPC_I16x8Q15MulRSatS) \
V(PPC_I16x8ExtMulLowI8x16S) \
V(PPC_I16x8ExtMulHighI8x16S) \
V(PPC_I16x8ExtMulLowI8x16U) \
V(PPC_I16x8ExtMulHighI8x16U) \
V(PPC_I8x16Splat) \
V(PPC_I8x16ExtractLaneU) \
V(PPC_I8x16ExtractLaneS) \

src/compiler/backend/ppc/instruction-scheduler-ppc.cc

@@ -192,6 +192,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kPPC_I64x2SConvertI32x4High:
case kPPC_I64x2UConvertI32x4Low:
case kPPC_I64x2UConvertI32x4High:
case kPPC_I64x2ExtMulLowI32x4S:
case kPPC_I64x2ExtMulHighI32x4S:
case kPPC_I64x2ExtMulLowI32x4U:
case kPPC_I64x2ExtMulHighI32x4U:
case kPPC_I32x4Splat:
case kPPC_I32x4ExtractLane:
case kPPC_I32x4ReplaceLane:
@@ -223,6 +227,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kPPC_I32x4DotI16x8S:
case kPPC_I32x4ExtAddPairwiseI16x8S:
case kPPC_I32x4ExtAddPairwiseI16x8U:
case kPPC_I32x4ExtMulLowI16x8S:
case kPPC_I32x4ExtMulHighI16x8S:
case kPPC_I32x4ExtMulLowI16x8U:
case kPPC_I32x4ExtMulHighI16x8U:
case kPPC_I16x8Splat:
case kPPC_I16x8ExtractLaneU:
case kPPC_I16x8ExtractLaneS:
@@ -260,6 +268,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kPPC_I16x8ExtAddPairwiseI8x16S:
case kPPC_I16x8ExtAddPairwiseI8x16U:
case kPPC_I16x8Q15MulRSatS:
case kPPC_I16x8ExtMulLowI8x16S:
case kPPC_I16x8ExtMulHighI8x16S:
case kPPC_I16x8ExtMulLowI8x16U:
case kPPC_I16x8ExtMulHighI8x16U:
case kPPC_I8x16Splat:
case kPPC_I8x16ExtractLaneU:
case kPPC_I8x16ExtractLaneS:

src/compiler/backend/ppc/instruction-selector-ppc.cc

@@ -2177,6 +2177,10 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(I64x2Mul) \
V(I64x2Eq) \
V(I64x2Ne) \
V(I64x2ExtMulLowI32x4S) \
V(I64x2ExtMulHighI32x4S) \
V(I64x2ExtMulLowI32x4U) \
V(I64x2ExtMulHighI32x4U) \
V(I32x4Add) \
V(I32x4Sub) \
V(I32x4Mul) \
@@ -2191,6 +2195,10 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(I32x4GtU) \
V(I32x4GeU) \
V(I32x4DotI16x8S) \
V(I32x4ExtMulLowI16x8S) \
V(I32x4ExtMulHighI16x8S) \
V(I32x4ExtMulLowI16x8U) \
V(I32x4ExtMulHighI16x8U) \
V(I16x8Add) \
V(I16x8Sub) \
V(I16x8Mul) \
@@ -2212,6 +2220,10 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(I16x8SubSatU) \
V(I16x8RoundingAverageU) \
V(I16x8Q15MulRSatS) \
V(I16x8ExtMulLowI8x16S) \
V(I16x8ExtMulHighI8x16S) \
V(I16x8ExtMulLowI8x16U) \
V(I16x8ExtMulHighI8x16U) \
V(I8x16Add) \
V(I8x16Sub) \
V(I8x16MinS) \
@@ -2471,42 +2483,6 @@ void InstructionSelector::VisitS128Const(Node* node) {
}
}
void InstructionSelector::VisitI64x2ExtMulLowI32x4S(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI64x2ExtMulHighI32x4S(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI64x2ExtMulLowI32x4U(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI64x2ExtMulHighI32x4U(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4ExtMulLowI16x8S(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4ExtMulHighI16x8S(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4ExtMulLowI16x8U(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4ExtMulHighI16x8U(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8ExtMulLowI8x16S(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8ExtMulHighI8x16S(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8ExtMulLowI8x16U(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8ExtMulHighI8x16U(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI8x16Popcnt(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2ConvertLowI32x4S(Node* node) {
UNIMPLEMENTED();

src/execution/ppc/simulator-ppc.cc

@@ -4149,18 +4149,20 @@ void Simulator::ExecuteGeneric(Instruction* instr) {
VECTOR_ARITHMETIC_OP(int8_t, -)
break;
}
#define VECTOR_MULTIPLY_EVEN_ODD(input_type, result_type, is_odd) \
DECODE_VX_INSTRUCTION(t, a, b, T) \
size_t i = 0, j = 0, k = 0; \
size_t lane_size = sizeof(input_type); \
if (is_odd) { \
i = 1; \
j = lane_size; \
} \
for (; j < kSimd128Size; i += 2, j += lane_size * 2, k++) { \
input_type src0 = get_simd_register_by_lane<input_type>(a, i); \
input_type src1 = get_simd_register_by_lane<input_type>(b, i); \
set_simd_register_by_lane<result_type>(t, k, src0 * src1); \
#define VECTOR_MULTIPLY_EVEN_ODD(input_type, result_type, is_odd) \
DECODE_VX_INSTRUCTION(t, a, b, T) \
size_t i = 0, j = 0, k = 0; \
size_t lane_size = sizeof(input_type); \
if (is_odd) { \
i = 1; \
j = lane_size; \
} \
for (; j < kSimd128Size; i += 2, j += lane_size * 2, k++) { \
result_type src0 = \
static_cast<result_type>(get_simd_register_by_lane<input_type>(a, i)); \
result_type src1 = \
static_cast<result_type>(get_simd_register_by_lane<input_type>(b, i)); \
set_simd_register_by_lane<result_type>(t, k, src0 * src1); \
}
case VMULEUB: {
VECTOR_MULTIPLY_EVEN_ODD(uint8_t, uint16_t, false)
@@ -4170,6 +4172,14 @@ void Simulator::ExecuteGeneric(Instruction* instr) {
VECTOR_MULTIPLY_EVEN_ODD(int8_t, int16_t, false)
break;
}
case VMULOUB: {
VECTOR_MULTIPLY_EVEN_ODD(uint8_t, uint16_t, true)
break;
}
case VMULOSB: {
VECTOR_MULTIPLY_EVEN_ODD(int8_t, int16_t, true)
break;
}
case VMULEUH: {
VECTOR_MULTIPLY_EVEN_ODD(uint16_t, uint32_t, false)
break;
@@ -4179,10 +4189,57 @@ void Simulator::ExecuteGeneric(Instruction* instr) {
break;
}
case VMULOUH: {
VECTOR_MULTIPLY_EVEN_ODD(uint16_t, uint32_t, true)
break;
}
case VMULOSH: {
VECTOR_MULTIPLY_EVEN_ODD(int16_t, int32_t, true)
break;
}
case VMULEUW: {
VECTOR_MULTIPLY_EVEN_ODD(uint32_t, uint64_t, false)
break;
}
case VMULESW: {
VECTOR_MULTIPLY_EVEN_ODD(int32_t, int64_t, false)
break;
}
case VMULOUW: {
VECTOR_MULTIPLY_EVEN_ODD(uint32_t, uint64_t, true)
break;
}
case VMULOSW: {
VECTOR_MULTIPLY_EVEN_ODD(int32_t, int64_t, true)
break;
}
#undef VECTOR_MULTIPLY_EVEN_ODD
#define VECTOR_MERGE(type, is_low_side) \
DECODE_VX_INSTRUCTION(t, a, b, T) \
constexpr size_t index_limit = (kSimd128Size / sizeof(type)) / 2; \
for (size_t i = 0, source_index = is_low_side ? i + index_limit : i; \
i < index_limit; i++, source_index++) { \
set_simd_register_by_lane<type>( \
t, 2 * i, get_simd_register_by_lane<type>(a, source_index)); \
set_simd_register_by_lane<type>( \
t, (2 * i) + 1, get_simd_register_by_lane<type>(b, source_index)); \
}
case VMRGLW: {
VECTOR_MERGE(int32_t, true)
break;
}
case VMRGHW: {
VECTOR_MERGE(int32_t, false)
break;
}
case VMRGLH: {
VECTOR_MERGE(int16_t, true)
break;
}
case VMRGHH: {
VECTOR_MERGE(int16_t, false)
break;
}
#undef VECTOR_MERGE
#undef VECTOR_ARITHMETIC_OP
#define VECTOR_MIN_MAX_OP(type, op) \
DECODE_VX_INSTRUCTION(t, a, b, T) \
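
A note on the simulator change to VECTOR_MULTIPLY_EVEN_ODD: for byte and halfword lanes the old product src0 * src1 was already evaluated in int thanks to integer promotion, but for the newly added word-sized cases a uint32_t product would wrap in 32 bits (and the int32_t case could overflow, which is undefined behavior). Casting both operands to result_type before multiplying preserves the full double-width product. A standalone demonstration of the failure mode:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t a = 0xFFFFFFFFu, b = 2u;
      uint64_t truncated = a * b;                       // wraps: 0xFFFFFFFE
      uint64_t widened = static_cast<uint64_t>(a) * b;  // 0x1FFFFFFFE
      std::printf("%llx vs %llx\n",
                  static_cast<unsigned long long>(truncated),
                  static_cast<unsigned long long>(widened));
      return 0;
    }

VECTOR_MERGE, for its part, models vmrgh/vmrgl directly: it walks one half of each source register (for the "low" merges, the half at the higher lane indices, i.e. the low-order end in the ISA's big-endian lane numbering) and writes alternating lanes from the two sources into the target.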