PPC [simd]: Implement vector extend multiply low/high
Also added multiply low/high and vector merge instructions to the simulator.

Change-Id: I889004b5572ee7df75be706c424ac2e83e53e8b3
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2769058
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Reviewed-by: Junliang Yan <junyan@redhat.com>
Cr-Commit-Position: refs/heads/master@{#73492}

parent f32b18bc62
commit 63661ce7c6
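Editor's note, for orientation (not part of the commit): a wasm extended multiply widens each input lane before multiplying, so the full product is kept, and the "low"/"high" suffix selects which half of the input lanes feeds the output lanes. A minimal scalar model of i64x2.extmul_low/high_i32x4_s, assuming standard wasm lane order:

#include <cstdint>

// Scalar model of wasm i64x2.extmul_{low,high}_i32x4_s; reference only.
// Lanes 0-1 of the i32x4 inputs are the "low" half, lanes 2-3 the "high" half.
void I64x2ExtMulI32x4S(const int32_t a[4], const int32_t b[4],
                       int64_t low[2], int64_t high[2]) {
  for (int i = 0; i < 2; i++) {
    // Widen before multiplying so no product bits are lost.
    low[i] = static_cast<int64_t>(a[i]) * static_cast<int64_t>(b[i]);
    high[i] = static_cast<int64_t>(a[i + 2]) * static_cast<int64_t>(b[i + 2]);
  }
}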
@@ -2289,6 +2289,14 @@ using Instr = uint32_t;
  V(vmulouh, VMULOUH, 0x10000048) \
  /* Vector Multiply Odd Signed Halfword */ \
  V(vmulosh, VMULOSH, 0x10000148) \
  /* Vector Multiply Even Signed Word */ \
  V(vmulesw, VMULESW, 0x10000388) \
  /* Vector Multiply Even Unsigned Word */ \
  V(vmuleuw, VMULEUW, 0x10000288) \
  /* Vector Multiply Odd Signed Word */ \
  V(vmulosw, VMULOSW, 0x10000188) \
  /* Vector Multiply Odd Unsigned Word */ \
  V(vmulouw, VMULOUW, 0x10000088) \
  /* Vector Sum across Quarter Signed Halfword Saturate */ \
  V(vsum4shs, VSUM4SHS, 0x10000648) \
  /* Vector Pack Unsigned Word Unsigned Saturate */ \
@@ -2390,7 +2398,19 @@ using Instr = uint32_t;
  /* Vector Maximum Single-Precision */ \
  V(vmaxfp, VMAXFP, 0x1000040A) \
  /* Vector Bit Permute Quadword */ \
  V(vbpermq, VBPERMQ, 0x1000054C)
  V(vbpermq, VBPERMQ, 0x1000054C) \
  /* Vector Merge High Byte */ \
  V(vmrghb, VMRGHB, 0x1000000C) \
  /* Vector Merge High Halfword */ \
  V(vmrghh, VMRGHH, 0x1000004C) \
  /* Vector Merge High Word */ \
  V(vmrghw, VMRGHW, 0x1000008C) \
  /* Vector Merge Low Byte */ \
  V(vmrglb, VMRGLB, 0x1000010C) \
  /* Vector Merge Low Halfword */ \
  V(vmrglh, VMRGLH, 0x1000014C) \
  /* Vector Merge Low Word */ \
  V(vmrglw, VMRGLW, 0x1000018C)

#define PPC_VX_OPCODE_C_FORM_LIST(V) \
  /* Vector Unpack Low Signed Word */ \
@@ -2459,26 +2479,6 @@ using Instr = uint32_t;
  V(vgbbd, VGBBD, 0x1000050C) \
  /* Vector Log Base 2 Estimate Single-Precision */ \
  V(vlogefp, VLOGEFP, 0x100001CA) \
  /* Vector Merge High Byte */ \
  V(vmrghb, VMRGHB, 0x1000000C) \
  /* Vector Merge High Halfword */ \
  V(vmrghh, VMRGHH, 0x1000004C) \
  /* Vector Merge High Word */ \
  V(vmrghw, VMRGHW, 0x1000008C) \
  /* Vector Merge Low Byte */ \
  V(vmrglb, VMRGLB, 0x1000010C) \
  /* Vector Merge Low Halfword */ \
  V(vmrglh, VMRGLH, 0x1000014C) \
  /* Vector Merge Low Word */ \
  V(vmrglw, VMRGLW, 0x1000018C) \
  /* Vector Multiply Even Signed Word */ \
  V(vmulesw, VMULESW, 0x10000388) \
  /* Vector Multiply Even Unsigned Word */ \
  V(vmuleuw, VMULEUW, 0x10000288) \
  /* Vector Multiply Odd Signed Word */ \
  V(vmulosw, VMULOSW, 0x10000188) \
  /* Vector Multiply Odd Unsigned Word */ \
  V(vmulouw, VMULOUW, 0x10000088) \
  /* Vector NAND */ \
  V(vnand, VNAND, 0x10000584) \
  /* Vector OR with Complement */ \
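Editor's note on the constants above (an observation, not commit text): these appear to be Power ISA VX-form encodings, with primary opcode 4 in the top six bits and the extended opcode in the low eleven bits; for example 0x10000148 for vmulosh would be (4 << 26) | 328. A small sanity-check sketch under that assumption:

#include <cassert>
#include <cstdint>

// Hypothetical helper, assuming the VX-form layout described above.
constexpr uint32_t VXForm(uint32_t xo) { return (4u << 26) | (xo & 0x7FFu); }

int main() {
  assert(VXForm(328) == 0x10000148u);  // vmulosh (0x148 == 328)
  assert(VXForm(904) == 0x10000388u);  // vmulesw (0x388 == 904)
  assert(VXForm(12) == 0x1000000Cu);   // vmrghb  (0x00C == 12)
  return 0;
}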
@@ -3682,6 +3682,83 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                        i.InputSimd128Register(1), kScratchSimd128Reg);
      break;
    }
#define EXT_MUL(mul_even, mul_odd)                   \
  Simd128Register dst = i.OutputSimd128Register(),   \
                  src0 = i.InputSimd128Register(0),  \
                  src1 = i.InputSimd128Register(1);  \
  __ mul_even(dst, src0, src1);                      \
  __ mul_odd(kScratchSimd128Reg, src0, src1);
    case kPPC_I64x2ExtMulLowI32x4S: {
      constexpr int lane_width_in_bytes = 8;
      EXT_MUL(vmulesw, vmulosw)
      __ vextractd(dst, dst, Operand(1 * lane_width_in_bytes));
      __ vextractd(kScratchSimd128Reg, kScratchSimd128Reg,
                   Operand(1 * lane_width_in_bytes));
      __ vinsertd(dst, kScratchSimd128Reg, Operand(1 * lane_width_in_bytes));
      break;
    }
    case kPPC_I64x2ExtMulHighI32x4S: {
      constexpr int lane_width_in_bytes = 8;
      EXT_MUL(vmulesw, vmulosw)
      __ vinsertd(dst, kScratchSimd128Reg, Operand(1 * lane_width_in_bytes));
      break;
    }
    case kPPC_I64x2ExtMulLowI32x4U: {
      constexpr int lane_width_in_bytes = 8;
      EXT_MUL(vmuleuw, vmulouw)
      __ vextractd(dst, dst, Operand(1 * lane_width_in_bytes));
      __ vextractd(kScratchSimd128Reg, kScratchSimd128Reg,
                   Operand(1 * lane_width_in_bytes));
      __ vinsertd(dst, kScratchSimd128Reg, Operand(1 * lane_width_in_bytes));
      break;
    }
    case kPPC_I64x2ExtMulHighI32x4U: {
      constexpr int lane_width_in_bytes = 8;
      EXT_MUL(vmuleuw, vmulouw)
      __ vinsertd(dst, kScratchSimd128Reg, Operand(1 * lane_width_in_bytes));
      break;
    }
    case kPPC_I32x4ExtMulLowI16x8S: {
      EXT_MUL(vmulesh, vmulosh)
      __ vmrglw(dst, dst, kScratchSimd128Reg);
      break;
    }
    case kPPC_I32x4ExtMulHighI16x8S: {
      EXT_MUL(vmulesh, vmulosh)
      __ vmrghw(dst, dst, kScratchSimd128Reg);
      break;
    }
    case kPPC_I32x4ExtMulLowI16x8U: {
      EXT_MUL(vmuleuh, vmulouh)
      __ vmrglw(dst, dst, kScratchSimd128Reg);
      break;
    }
    case kPPC_I32x4ExtMulHighI16x8U: {
      EXT_MUL(vmuleuh, vmulouh)
      __ vmrghw(dst, dst, kScratchSimd128Reg);
      break;
    }
    case kPPC_I16x8ExtMulLowI8x16S: {
      EXT_MUL(vmulesb, vmulosb)
      __ vmrglh(dst, dst, kScratchSimd128Reg);
      break;
    }
    case kPPC_I16x8ExtMulHighI8x16S: {
      EXT_MUL(vmulesb, vmulosb)
      __ vmrghh(dst, dst, kScratchSimd128Reg);
      break;
    }
    case kPPC_I16x8ExtMulLowI8x16U: {
      EXT_MUL(vmuleub, vmuloub)
      __ vmrglh(dst, dst, kScratchSimd128Reg);
      break;
    }
    case kPPC_I16x8ExtMulHighI8x16U: {
      EXT_MUL(vmuleub, vmuloub)
      __ vmrghh(dst, dst, kScratchSimd128Reg);
      break;
    }
#undef EXT_MUL
    case kPPC_StoreCompressTagged: {
      ASSEMBLE_STORE_INTEGER(StoreTaggedField, StoreTaggedFieldX);
      break;
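Editor's note on the EXT_MUL pattern above (not commit text): mul_even leaves the widened products of the even-numbered source lanes in dst and mul_odd leaves the odd-lane products in the scratch register; a merge (vmrglw/vmrghw, vmrglh/vmrghh), or vextractd/vinsertd for the 64-bit-lane cases, then interleaves the two back into lane order. A scalar sketch of that identity for the I32x4ExtMulLowI16x8S case, ignoring the machine's element ordering:

#include <cstdint>

// Sketch only: even/odd widening multiplies plus interleaving recover the
// in-order widened products of one contiguous half of the inputs.
void I32x4ExtMulLowI16x8S(const int16_t a[8], const int16_t b[8],
                          int32_t out[4]) {
  int32_t even[4], odd[4];
  for (int k = 0; k < 4; k++) {
    even[k] = int32_t{a[2 * k]} * b[2 * k];         // vmulesh-like
    odd[k] = int32_t{a[2 * k + 1]} * b[2 * k + 1];  // vmulosh-like
  }
  for (int k = 0; k < 2; k++) {  // interleave, as the 32-bit merge does
    out[2 * k] = even[k];        // out[0] = a[0]*b[0], out[2] = a[2]*b[2]
    out[2 * k + 1] = odd[k];     // out[1] = a[1]*b[1], out[3] = a[3]*b[3]
  }
}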
@@ -242,6 +242,8 @@ namespace compiler {
  V(PPC_F32x4NearestInt) \
  V(PPC_F32x4Pmin) \
  V(PPC_F32x4Pmax) \
  V(PPC_F32x4Qfma) \
  V(PPC_F32x4Qfms) \
  V(PPC_I64x2Splat) \
  V(PPC_I64x2ExtractLane) \
  V(PPC_I64x2ReplaceLane) \
@@ -267,6 +269,10 @@ namespace compiler {
  V(PPC_I64x2SConvertI32x4High) \
  V(PPC_I64x2UConvertI32x4Low) \
  V(PPC_I64x2UConvertI32x4High) \
  V(PPC_I64x2ExtMulLowI32x4S) \
  V(PPC_I64x2ExtMulHighI32x4S) \
  V(PPC_I64x2ExtMulLowI32x4U) \
  V(PPC_I64x2ExtMulHighI32x4U) \
  V(PPC_I32x4Splat) \
  V(PPC_I32x4ExtractLane) \
  V(PPC_I32x4ReplaceLane) \
@@ -298,8 +304,10 @@ namespace compiler {
  V(PPC_I32x4DotI16x8S) \
  V(PPC_I32x4ExtAddPairwiseI16x8S) \
  V(PPC_I32x4ExtAddPairwiseI16x8U) \
  V(PPC_F32x4Qfma) \
  V(PPC_F32x4Qfms) \
  V(PPC_I32x4ExtMulLowI16x8S) \
  V(PPC_I32x4ExtMulHighI16x8S) \
  V(PPC_I32x4ExtMulLowI16x8U) \
  V(PPC_I32x4ExtMulHighI16x8U) \
  V(PPC_I16x8Splat) \
  V(PPC_I16x8ExtractLaneU) \
  V(PPC_I16x8ExtractLaneS) \
@@ -337,6 +345,10 @@ namespace compiler {
  V(PPC_I16x8ExtAddPairwiseI8x16S) \
  V(PPC_I16x8ExtAddPairwiseI8x16U) \
  V(PPC_I16x8Q15MulRSatS) \
  V(PPC_I16x8ExtMulLowI8x16S) \
  V(PPC_I16x8ExtMulHighI8x16S) \
  V(PPC_I16x8ExtMulLowI8x16U) \
  V(PPC_I16x8ExtMulHighI8x16U) \
  V(PPC_I8x16Splat) \
  V(PPC_I8x16ExtractLaneU) \
  V(PPC_I8x16ExtractLaneS) \
@@ -192,6 +192,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kPPC_I64x2SConvertI32x4High:
    case kPPC_I64x2UConvertI32x4Low:
    case kPPC_I64x2UConvertI32x4High:
    case kPPC_I64x2ExtMulLowI32x4S:
    case kPPC_I64x2ExtMulHighI32x4S:
    case kPPC_I64x2ExtMulLowI32x4U:
    case kPPC_I64x2ExtMulHighI32x4U:
    case kPPC_I32x4Splat:
    case kPPC_I32x4ExtractLane:
    case kPPC_I32x4ReplaceLane:
@@ -223,6 +227,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kPPC_I32x4DotI16x8S:
    case kPPC_I32x4ExtAddPairwiseI16x8S:
    case kPPC_I32x4ExtAddPairwiseI16x8U:
    case kPPC_I32x4ExtMulLowI16x8S:
    case kPPC_I32x4ExtMulHighI16x8S:
    case kPPC_I32x4ExtMulLowI16x8U:
    case kPPC_I32x4ExtMulHighI16x8U:
    case kPPC_I16x8Splat:
    case kPPC_I16x8ExtractLaneU:
    case kPPC_I16x8ExtractLaneS:
@@ -260,6 +268,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kPPC_I16x8ExtAddPairwiseI8x16S:
    case kPPC_I16x8ExtAddPairwiseI8x16U:
    case kPPC_I16x8Q15MulRSatS:
    case kPPC_I16x8ExtMulLowI8x16S:
    case kPPC_I16x8ExtMulHighI8x16S:
    case kPPC_I16x8ExtMulLowI8x16U:
    case kPPC_I16x8ExtMulHighI8x16U:
    case kPPC_I8x16Splat:
    case kPPC_I8x16ExtractLaneU:
    case kPPC_I8x16ExtractLaneS:
@@ -2177,6 +2177,10 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
  V(I64x2Mul) \
  V(I64x2Eq) \
  V(I64x2Ne) \
  V(I64x2ExtMulLowI32x4S) \
  V(I64x2ExtMulHighI32x4S) \
  V(I64x2ExtMulLowI32x4U) \
  V(I64x2ExtMulHighI32x4U) \
  V(I32x4Add) \
  V(I32x4Sub) \
  V(I32x4Mul) \
@@ -2191,6 +2195,10 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
  V(I32x4GtU) \
  V(I32x4GeU) \
  V(I32x4DotI16x8S) \
  V(I32x4ExtMulLowI16x8S) \
  V(I32x4ExtMulHighI16x8S) \
  V(I32x4ExtMulLowI16x8U) \
  V(I32x4ExtMulHighI16x8U) \
  V(I16x8Add) \
  V(I16x8Sub) \
  V(I16x8Mul) \
@@ -2212,6 +2220,10 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
  V(I16x8SubSatU) \
  V(I16x8RoundingAverageU) \
  V(I16x8Q15MulRSatS) \
  V(I16x8ExtMulLowI8x16S) \
  V(I16x8ExtMulHighI8x16S) \
  V(I16x8ExtMulLowI8x16U) \
  V(I16x8ExtMulHighI8x16U) \
  V(I8x16Add) \
  V(I8x16Sub) \
  V(I8x16MinS) \
@@ -2471,42 +2483,6 @@ void InstructionSelector::VisitS128Const(Node* node) {
  }
}

void InstructionSelector::VisitI64x2ExtMulLowI32x4S(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI64x2ExtMulHighI32x4S(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI64x2ExtMulLowI32x4U(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI64x2ExtMulHighI32x4U(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4ExtMulLowI16x8S(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4ExtMulHighI16x8S(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4ExtMulLowI16x8U(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4ExtMulHighI16x8U(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8ExtMulLowI8x16S(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8ExtMulHighI8x16S(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8ExtMulLowI8x16U(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI16x8ExtMulHighI8x16U(Node* node) {
  UNIMPLEMENTED();
}
void InstructionSelector::VisitI8x16Popcnt(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2ConvertLowI32x4S(Node* node) {
  UNIMPLEMENTED();
@@ -4149,18 +4149,20 @@ void Simulator::ExecuteGeneric(Instruction* instr) {
      VECTOR_ARITHMETIC_OP(int8_t, -)
      break;
    }
#define VECTOR_MULTIPLY_EVEN_ODD(input_type, result_type, is_odd)  \
  DECODE_VX_INSTRUCTION(t, a, b, T)                                \
  size_t i = 0, j = 0, k = 0;                                      \
  size_t lane_size = sizeof(input_type);                           \
  if (is_odd) {                                                    \
    i = 1;                                                         \
    j = lane_size;                                                 \
  }                                                                \
  for (; j < kSimd128Size; i += 2, j += lane_size * 2, k++) {      \
    input_type src0 = get_simd_register_by_lane<input_type>(a, i); \
    input_type src1 = get_simd_register_by_lane<input_type>(b, i); \
    set_simd_register_by_lane<result_type>(t, k, src0 * src1);     \
#define VECTOR_MULTIPLY_EVEN_ODD(input_type, result_type, is_odd)              \
  DECODE_VX_INSTRUCTION(t, a, b, T)                                            \
  size_t i = 0, j = 0, k = 0;                                                  \
  size_t lane_size = sizeof(input_type);                                       \
  if (is_odd) {                                                                \
    i = 1;                                                                     \
    j = lane_size;                                                             \
  }                                                                            \
  for (; j < kSimd128Size; i += 2, j += lane_size * 2, k++) {                  \
    result_type src0 =                                                         \
        static_cast<result_type>(get_simd_register_by_lane<input_type>(a, i)); \
    result_type src1 =                                                         \
        static_cast<result_type>(get_simd_register_by_lane<input_type>(b, i)); \
    set_simd_register_by_lane<result_type>(t, k, src0 * src1);                 \
  }
    case VMULEUB: {
      VECTOR_MULTIPLY_EVEN_ODD(uint8_t, uint16_t, false)
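Editor's note (not commit text): the macro rewrite above is a behavior fix as well as a cleanup. The old body multiplied two input_type values, which is harmless for byte and halfword lanes (integer promotion does the arithmetic in int) but would truncate for the newly added word variants, where a 32x32-bit multiply must yield a 64-bit product. Casting each operand to result_type first keeps the full product. A minimal illustration:

#include <cstdint>

int main() {
  uint32_t x = 0x80000000u, y = 2;
  uint64_t truncated = x * y;                       // 32-bit wrap: 0
  uint64_t widened = static_cast<uint64_t>(x) * y;  // 0x100000000
  return truncated == widened;  // returns 0: the two differ, as described
}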
@@ -4170,6 +4172,14 @@ void Simulator::ExecuteGeneric(Instruction* instr) {
      VECTOR_MULTIPLY_EVEN_ODD(int8_t, int16_t, false)
      break;
    }
    case VMULOUB: {
      VECTOR_MULTIPLY_EVEN_ODD(uint8_t, uint16_t, true)
      break;
    }
    case VMULOSB: {
      VECTOR_MULTIPLY_EVEN_ODD(int8_t, int16_t, true)
      break;
    }
    case VMULEUH: {
      VECTOR_MULTIPLY_EVEN_ODD(uint16_t, uint32_t, false)
      break;
@@ -4179,10 +4189,57 @@ void Simulator::ExecuteGeneric(Instruction* instr) {
      break;
    }
    case VMULOUH: {
      VECTOR_MULTIPLY_EVEN_ODD(uint16_t, uint32_t, true)
      break;
    }
    case VMULOSH: {
      VECTOR_MULTIPLY_EVEN_ODD(int16_t, int32_t, true)
      break;
    }
    case VMULEUW: {
      VECTOR_MULTIPLY_EVEN_ODD(uint32_t, uint64_t, false)
      break;
    }
    case VMULESW: {
      VECTOR_MULTIPLY_EVEN_ODD(int32_t, int64_t, false)
      break;
    }
    case VMULOUW: {
      VECTOR_MULTIPLY_EVEN_ODD(uint32_t, uint64_t, true)
      break;
    }
    case VMULOSW: {
      VECTOR_MULTIPLY_EVEN_ODD(int32_t, int64_t, true)
      break;
    }
#undef VECTOR_MULTIPLY_EVEN_ODD
#define VECTOR_MERGE(type, is_low_side)                                \
  DECODE_VX_INSTRUCTION(t, a, b, T)                                    \
  constexpr size_t index_limit = (kSimd128Size / sizeof(type)) / 2;    \
  for (size_t i = 0, source_index = is_low_side ? i + index_limit : i; \
       i < index_limit; i++, source_index++) {                         \
    set_simd_register_by_lane<type>(                                   \
        t, 2 * i, get_simd_register_by_lane<type>(a, source_index));   \
    set_simd_register_by_lane<type>(                                   \
        t, (2 * i) + 1, get_simd_register_by_lane<type>(b, source_index)); \
  }
    case VMRGLW: {
      VECTOR_MERGE(int32_t, true)
      break;
    }
    case VMRGHW: {
      VECTOR_MERGE(int32_t, false)
      break;
    }
    case VMRGLH: {
      VECTOR_MERGE(int16_t, true)
      break;
    }
    case VMRGHH: {
      VECTOR_MERGE(int16_t, false)
      break;
    }
#undef VECTOR_MERGE
#undef VECTOR_ARITHMETIC_OP
#define VECTOR_MIN_MAX_OP(type, op) \
  DECODE_VX_INSTRUCTION(t, a, b, T) \
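Editor's note (not commit text): VECTOR_MERGE above interleaves one half of the first operand's lanes with the same half of the second operand's. A scalar restatement, derived directly from the macro body:

#include <cstddef>

// Reference-only restatement of the simulator's VECTOR_MERGE.
template <typename T, size_t kLanes>
void VectorMerge(const T (&a)[kLanes], const T (&b)[kLanes], bool is_low_side,
                 T (&out)[kLanes]) {
  constexpr size_t index_limit = kLanes / 2;
  size_t source_index = is_low_side ? index_limit : 0;
  for (size_t i = 0; i < index_limit; i++, source_index++) {
    out[2 * i] = a[source_index];      // lane from the first operand
    out[2 * i + 1] = b[source_index];  // paired lane from the second operand
  }
}
// With int32_t lanes {a0..a3}/{b0..b3}: the high-side merge (VMRGHW) yields
// {a0, b0, a1, b1}; the low-side merge (VMRGLW) yields {a2, b2, a3, b3}.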