[ia32] Merge some SSE/AVX i32x4, i16x8, i8x16 ops

These instructions are all single instruction lowering, so it's a matter
of changing the code-gen to call macro-assembler functions (that will do
the AVX check).

Bug: v8:11217
Change-Id: I472eacf74933f4b504299fc85f63fd07062db320
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3114602
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76476}
This commit is contained in:
Ng Zhi An 2021-08-24 15:53:16 -07:00 committed by V8 LUCI CQ
parent 5e80730fb6
commit c282e8ef45
5 changed files with 98 additions and 213 deletions

View File

@ -223,6 +223,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Paddd, paddd)
AVX_OP(Paddq, paddq)
AVX_OP(Paddsb, paddsb)
AVX_OP(Paddsw, paddsw)
AVX_OP(Paddusb, paddusb)
AVX_OP(Paddusw, paddusw)
AVX_OP(Paddw, paddw)
@ -234,7 +235,9 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Pcmpeqd, pcmpeqd)
AVX_OP(Pcmpeqw, pcmpeqw)
AVX_OP(Pinsrw, pinsrw)
AVX_OP(Pmaxsw, pmaxsw)
AVX_OP(Pmaxub, pmaxub)
AVX_OP(Pminsw, pminsw)
AVX_OP(Pminub, pminub)
AVX_OP(Pmovmskb, pmovmskb)
AVX_OP(Pmullw, pmullw)
@ -256,7 +259,9 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Psubd, psubd)
AVX_OP(Psubq, psubq)
AVX_OP(Psubsb, psubsb)
AVX_OP(Psubsw, psubsw)
AVX_OP(Psubusb, psubusb)
AVX_OP(Psubusw, psubusw)
AVX_OP(Psubw, psubw)
AVX_OP(Punpckhbw, punpckhbw)
AVX_OP(Punpckhdq, punpckhdq)
@ -301,8 +306,12 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP_SSE4_1(Pinsrb, pinsrb)
AVX_OP_SSE4_1(Pmaxsb, pmaxsb)
AVX_OP_SSE4_1(Pmaxsd, pmaxsd)
AVX_OP_SSE4_1(Pmaxud, pmaxud)
AVX_OP_SSE4_1(Pmaxuw, pmaxuw)
AVX_OP_SSE4_1(Pminsb, pminsb)
AVX_OP_SSE4_1(Pminsd, pminsd)
AVX_OP_SSE4_1(Pminud, pminud)
AVX_OP_SSE4_1(Pminuw, pminuw)
AVX_OP_SSE4_1(Pmovsxbw, pmovsxbw)
AVX_OP_SSE4_1(Pmovsxdq, pmovsxdq)
AVX_OP_SSE4_1(Pmovsxwd, pmovsxwd)

View File

@ -2499,27 +2499,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
ASSEMBLE_SIMD_SHIFT(Psrld, 5);
break;
}
case kSSEI32x4MinU: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pminud(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI32x4MinU: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpminud(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I32x4MinU: {
__ Pminud(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI32x4MaxU: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pmaxud(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI32x4MaxU: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpmaxud(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I32x4MaxU: {
__ Pmaxud(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
@ -2618,91 +2604,43 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
ASSEMBLE_SIMD_SHIFT(Psraw, 4);
break;
}
case kSSEI16x8SConvertI32x4: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ packssdw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kAVXI16x8SConvertI32x4: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpackssdw(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I16x8SConvertI32x4: {
__ Packssdw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI16x8Add: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ paddw(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI16x8Add: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpaddw(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I16x8Add: {
__ Paddw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI16x8AddSatS: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ paddsw(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI16x8AddSatS: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpaddsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I16x8AddSatS: {
__ Paddsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI16x8Sub: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ psubw(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI16x8Sub: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpsubw(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I16x8Sub: {
__ Psubw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI16x8SubSatS: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ psubsw(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI16x8SubSatS: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpsubsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I16x8SubSatS: {
__ Psubsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI16x8Mul: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pmullw(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI16x8Mul: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpmullw(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I16x8Mul: {
__ Pmullw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI16x8MinS: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pminsw(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI16x8MinS: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpminsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I16x8MinS: {
__ Pminsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI16x8MaxS: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pmaxsw(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI16x8MaxS: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpmaxsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I16x8MaxS: {
__ Pmaxsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
@ -2786,49 +2724,23 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vpackusdw(dst, dst, i.InputSimd128Register(1));
break;
}
case kSSEI16x8AddSatU: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ paddusw(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI16x8AddSatU: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpaddusw(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I16x8AddSatU: {
__ Paddusw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI16x8SubSatU: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ psubusw(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI16x8SubSatU: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpsubusw(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I16x8SubSatU: {
__ Psubusw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI16x8MinU: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pminuw(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI16x8MinU: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpminuw(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I16x8MinU: {
__ Pminuw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI16x8MaxU: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pmaxuw(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI16x8MaxU: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpmaxuw(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I16x8MaxU: {
__ Pmaxuw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
@ -2969,14 +2881,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ S128Store32Lane(operand, i.InputSimd128Register(index), laneidx);
break;
}
case kSSEI8x16SConvertI16x8: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ packsswb(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI8x16SConvertI16x8: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpacksswb(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I8x16SConvertI16x8: {
__ Packsswb(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}

View File

@ -195,10 +195,8 @@ namespace compiler {
V(IA32I32x4UConvertI16x8Low) \
V(IA32I32x4UConvertI16x8High) \
V(IA32I32x4ShrU) \
V(SSEI32x4MinU) \
V(AVXI32x4MinU) \
V(SSEI32x4MaxU) \
V(AVXI32x4MaxU) \
V(IA32I32x4MinU) \
V(IA32I32x4MaxU) \
V(SSEI32x4GtU) \
V(AVXI32x4GtU) \
V(SSEI32x4GeU) \
@ -221,22 +219,14 @@ namespace compiler {
V(IA32I16x8Neg) \
V(IA32I16x8Shl) \
V(IA32I16x8ShrS) \
V(SSEI16x8SConvertI32x4) \
V(AVXI16x8SConvertI32x4) \
V(SSEI16x8Add) \
V(AVXI16x8Add) \
V(SSEI16x8AddSatS) \
V(AVXI16x8AddSatS) \
V(SSEI16x8Sub) \
V(AVXI16x8Sub) \
V(SSEI16x8SubSatS) \
V(AVXI16x8SubSatS) \
V(SSEI16x8Mul) \
V(AVXI16x8Mul) \
V(SSEI16x8MinS) \
V(AVXI16x8MinS) \
V(SSEI16x8MaxS) \
V(AVXI16x8MaxS) \
V(IA32I16x8SConvertI32x4) \
V(IA32I16x8Add) \
V(IA32I16x8AddSatS) \
V(IA32I16x8Sub) \
V(IA32I16x8SubSatS) \
V(IA32I16x8Mul) \
V(IA32I16x8MinS) \
V(IA32I16x8MaxS) \
V(SSEI16x8Eq) \
V(AVXI16x8Eq) \
V(SSEI16x8Ne) \
@ -250,14 +240,10 @@ namespace compiler {
V(IA32I16x8ShrU) \
V(SSEI16x8UConvertI32x4) \
V(AVXI16x8UConvertI32x4) \
V(SSEI16x8AddSatU) \
V(AVXI16x8AddSatU) \
V(SSEI16x8SubSatU) \
V(AVXI16x8SubSatU) \
V(SSEI16x8MinU) \
V(AVXI16x8MinU) \
V(SSEI16x8MaxU) \
V(AVXI16x8MaxU) \
V(IA32I16x8AddSatU) \
V(IA32I16x8SubSatU) \
V(IA32I16x8MinU) \
V(IA32I16x8MaxU) \
V(SSEI16x8GtU) \
V(AVXI16x8GtU) \
V(SSEI16x8GeU) \
@ -280,8 +266,7 @@ namespace compiler {
V(IA32Pextrb) \
V(IA32Pextrw) \
V(IA32S128Store32Lane) \
V(SSEI8x16SConvertI16x8) \
V(AVXI8x16SConvertI16x8) \
V(IA32I8x16SConvertI16x8) \
V(IA32I8x16Neg) \
V(IA32I8x16Shl) \
V(IA32I8x16ShrS) \

View File

@ -180,10 +180,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I32x4UConvertI16x8Low:
case kIA32I32x4UConvertI16x8High:
case kIA32I32x4ShrU:
case kSSEI32x4MinU:
case kAVXI32x4MinU:
case kSSEI32x4MaxU:
case kAVXI32x4MaxU:
case kIA32I32x4MinU:
case kIA32I32x4MaxU:
case kSSEI32x4GtU:
case kAVXI32x4GtU:
case kSSEI32x4GeU:
@ -206,22 +204,14 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I16x8Neg:
case kIA32I16x8Shl:
case kIA32I16x8ShrS:
case kSSEI16x8SConvertI32x4:
case kAVXI16x8SConvertI32x4:
case kSSEI16x8Add:
case kAVXI16x8Add:
case kSSEI16x8AddSatS:
case kAVXI16x8AddSatS:
case kSSEI16x8Sub:
case kAVXI16x8Sub:
case kSSEI16x8SubSatS:
case kAVXI16x8SubSatS:
case kSSEI16x8Mul:
case kAVXI16x8Mul:
case kSSEI16x8MinS:
case kAVXI16x8MinS:
case kSSEI16x8MaxS:
case kAVXI16x8MaxS:
case kIA32I16x8SConvertI32x4:
case kIA32I16x8Add:
case kIA32I16x8AddSatS:
case kIA32I16x8Sub:
case kIA32I16x8SubSatS:
case kIA32I16x8Mul:
case kIA32I16x8MinS:
case kIA32I16x8MaxS:
case kSSEI16x8Eq:
case kAVXI16x8Eq:
case kSSEI16x8Ne:
@ -235,14 +225,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I16x8ShrU:
case kSSEI16x8UConvertI32x4:
case kAVXI16x8UConvertI32x4:
case kSSEI16x8AddSatU:
case kAVXI16x8AddSatU:
case kSSEI16x8SubSatU:
case kAVXI16x8SubSatU:
case kSSEI16x8MinU:
case kAVXI16x8MinU:
case kSSEI16x8MaxU:
case kAVXI16x8MaxU:
case kIA32I16x8AddSatU:
case kIA32I16x8SubSatU:
case kIA32I16x8MinU:
case kIA32I16x8MaxU:
case kSSEI16x8GtU:
case kAVXI16x8GtU:
case kSSEI16x8GeU:
@ -265,8 +251,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32Pextrb:
case kIA32Pextrw:
case kIA32S128Store32Lane:
case kSSEI8x16SConvertI16x8:
case kAVXI8x16SConvertI16x8:
case kIA32I8x16SConvertI16x8:
case kIA32I8x16Neg:
case kIA32I8x16Shl:
case kIA32I8x16ShrS:

View File

@ -2249,29 +2249,14 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
#define SIMD_BINOP_LIST(V) \
V(F32x4Min) \
V(F32x4Max) \
V(I32x4MinU) \
V(I32x4MaxU) \
V(I32x4GtU) \
V(I32x4GeU) \
V(I16x8SConvertI32x4) \
V(I16x8Add) \
V(I16x8AddSatS) \
V(I16x8Sub) \
V(I16x8SubSatS) \
V(I16x8Mul) \
V(I16x8MinS) \
V(I16x8MaxS) \
V(I16x8Eq) \
V(I16x8Ne) \
V(I16x8GtS) \
V(I16x8GeS) \
V(I16x8AddSatU) \
V(I16x8SubSatU) \
V(I16x8MinU) \
V(I16x8MaxU) \
V(I16x8GtU) \
V(I16x8GeU) \
V(I8x16SConvertI16x8) \
V(I8x16Ne) \
V(I8x16GeS) \
V(I8x16GtU) \
@ -2302,7 +2287,21 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(I32x4Ne) \
V(I32x4GtS) \
V(I32x4GeS) \
V(I32x4MinU) \
V(I32x4MaxU) \
V(I32x4DotI16x8S) \
V(I16x8Add) \
V(I16x8AddSatS) \
V(I16x8Sub) \
V(I16x8SubSatS) \
V(I16x8Mul) \
V(I16x8MinS) \
V(I16x8MaxS) \
V(I16x8AddSatU) \
V(I16x8SubSatU) \
V(I16x8MinU) \
V(I16x8MaxU) \
V(I16x8SConvertI32x4) \
V(I16x8RoundingAverageU) \
V(I8x16Add) \
V(I8x16AddSatS) \
@ -2316,6 +2315,7 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(I8x16SubSatU) \
V(I8x16MinU) \
V(I8x16MaxU) \
V(I8x16SConvertI16x8) \
V(I8x16RoundingAverageU)
// These opcodes require all inputs to be registers because the codegen is