[wasm-simd][x64] Add AVX codegen

Mostly i16x8 ops; this should complete AVX codegen for all i16x8 ops.

Bug: v8:9561
Change-Id: I73515c3c38a5a055ac00f97ba5c363df1bf3c771
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2111711
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66897}
This commit is contained in:
Ng Zhi An 2020-03-25 13:39:34 -07:00 committed by Commit Bot
parent 21746d6d3a
commit 0c423a7abf
2 changed files with 54 additions and 54 deletions

View File

@ -148,7 +148,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Pcmpeqw, pcmpeqw)
AVX_OP(Pcmpeqd, pcmpeqd)
AVX_OP(Pcmpgtb, pcmpgtb)
AVX_OP(Pcmpgtw, pcmpgtw)
AVX_OP(Pmaxsw, pmaxsw)
AVX_OP(Pmaxub, pmaxub)
AVX_OP(Pminsw, pminsw)
AVX_OP(Pminub, pminub)
AVX_OP(Addss, addss)
AVX_OP(Addsd, addsd)
@ -179,24 +182,32 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Cvttps2dq, cvttps2dq)
AVX_OP(Ucomiss, ucomiss)
AVX_OP(Ucomisd, ucomisd)
AVX_OP(Paddusb, paddusb)
AVX_OP(Pand, pand)
AVX_OP(Por, por)
AVX_OP(Pxor, pxor)
AVX_OP(Psubw, psubw)
AVX_OP(Psubd, psubd)
AVX_OP(Psubq, psubq)
AVX_OP(Psubsw, psubsw)
AVX_OP(Psubusw, psubusw)
AVX_OP(Pslld, pslld)
AVX_OP(Pavgb, pavgb)
AVX_OP(Pavgw, pavgw)
AVX_OP(Psraw, psraw)
AVX_OP(Psrad, psrad)
AVX_OP(Psllw, psllw)
AVX_OP(Psllq, psllq)
AVX_OP(Psrlw, psrlw)
AVX_OP(Psrld, psrld)
AVX_OP(Psrlq, psrlq)
AVX_OP(Paddw, paddw)
AVX_OP(Paddd, paddd)
AVX_OP(Paddq, paddq)
AVX_OP(Paddsw, paddsw)
AVX_OP(Paddusb, paddusb)
AVX_OP(Paddusw, paddusw)
AVX_OP(Pcmpgtd, pcmpgtd)
AVX_OP(Pmullw, pmullw)
AVX_OP(Pmuludq, pmuludq)
AVX_OP(Addpd, addpd)
AVX_OP(Subpd, subpd)
@ -234,7 +245,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Movlhps, movlhps)
AVX_OP_SSE3(Movddup, movddup)
AVX_OP_SSSE3(Phaddd, phaddd)
AVX_OP_SSSE3(Phaddw, phaddw)
AVX_OP_SSSE3(Pshufb, pshufb)
AVX_OP_SSSE3(Psignw, psignw)
AVX_OP_SSSE3(Psignd, psignd)
AVX_OP_SSSE3(Palignr, palignr)
AVX_OP_SSSE3(Pabsb, pabsb)
@ -244,9 +257,11 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP_SSE4_1(Packusdw, packusdw)
AVX_OP_SSE4_1(Pminsb, pminsb)
AVX_OP_SSE4_1(Pminsd, pminsd)
AVX_OP_SSE4_1(Pminuw, pminuw)
AVX_OP_SSE4_1(Pminud, pminud)
AVX_OP_SSE4_1(Pmaxsb, pmaxsb)
AVX_OP_SSE4_1(Pmaxsd, pmaxsd)
AVX_OP_SSE4_1(Pmaxuw, pmaxuw)
AVX_OP_SSE4_1(Pmaxud, pmaxud)
AVX_OP_SSE4_1(Pmulld, pmulld)
AVX_OP_SSE4_1(Extractps, extractps)

View File

@ -3087,35 +3087,32 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64S128Zero: {
XMMRegister dst = i.OutputSimd128Register();
__ xorps(dst, dst);
__ Xorps(dst, dst);
break;
}
case kX64I16x8Splat: {
XMMRegister dst = i.OutputSimd128Register();
if (HasRegisterInput(instr, 0)) {
__ movd(dst, i.InputRegister(0));
__ Movd(dst, i.InputRegister(0));
} else {
__ movd(dst, i.InputOperand(0));
__ Movd(dst, i.InputOperand(0));
}
__ pshuflw(dst, dst, 0x0);
__ pshufd(dst, dst, 0x0);
__ Pshuflw(dst, dst, static_cast<uint8_t>(0x0));
__ Pshufd(dst, dst, static_cast<uint8_t>(0x0));
break;
}
case kX64I16x8ExtractLaneU: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
Register dst = i.OutputRegister();
__ Pextrw(dst, i.InputSimd128Register(0), i.InputInt8(1));
break;
}
case kX64I16x8ExtractLaneS: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
Register dst = i.OutputRegister();
__ Pextrw(dst, i.InputSimd128Register(0), i.InputInt8(1));
__ movsxwl(dst, dst);
break;
}
case kX64I16x8ReplaceLane: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
if (HasRegisterInput(instr, 2)) {
__ Pinsrw(i.OutputSimd128Register(), i.InputRegister(2),
i.InputInt8(1));
@ -3135,26 +3132,25 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I16x8Neg: {
CpuFeatureScope sse_scope(tasm(), SSSE3);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
if (dst == src) {
__ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ psignw(dst, kScratchDoubleReg);
__ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ Psignw(dst, kScratchDoubleReg);
} else {
__ pxor(dst, dst);
__ psubw(dst, src);
__ Pxor(dst, dst);
__ Psubw(dst, src);
}
break;
}
case kX64I16x8Shl: {
// Take shift value modulo 2^4.
ASSEMBLE_SIMD_SHIFT(psllw, 4);
ASSEMBLE_SIMD_SHIFT(Psllw, 4);
break;
}
case kX64I16x8ShrS: {
// Take shift value modulo 2^4.
ASSEMBLE_SIMD_SHIFT(psraw, 4);
ASSEMBLE_SIMD_SHIFT(Psraw, 4);
break;
}
case kX64I16x8SConvertI32x4: {
@ -3163,67 +3159,61 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I16x8Add: {
__ paddw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Paddw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8AddSaturateS: {
__ paddsw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Paddsw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8AddHoriz: {
CpuFeatureScope sse_scope(tasm(), SSSE3);
__ phaddw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Phaddw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8Sub: {
__ psubw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Psubw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8SubSaturateS: {
__ psubsw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Psubsw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8Mul: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pmullw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Pmullw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8MinS: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pminsw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Pminsw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8MaxS: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pmaxsw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Pmaxsw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8Eq: {
__ pcmpeqw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Pcmpeqw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8Ne: {
XMMRegister tmp = i.TempSimd128Register(0);
__ pcmpeqw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ pcmpeqw(tmp, tmp);
__ pxor(i.OutputSimd128Register(), tmp);
__ Pcmpeqw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Pcmpeqw(tmp, tmp);
__ Pxor(i.OutputSimd128Register(), tmp);
break;
}
case kX64I16x8GtS: {
__ pcmpgtw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Pcmpgtw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8GeS: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(1);
__ pminsw(dst, src);
__ pcmpeqw(dst, src);
__ Pminsw(dst, src);
__ Pcmpeqw(dst, src);
break;
}
case kX64I16x8UConvertI8x16Low: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pmovzxbw(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ Pmovzxbw(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kX64I16x8UConvertI8x16High: {
@ -3234,50 +3224,45 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I16x8ShrU: {
// Take shift value modulo 2^4.
ASSEMBLE_SIMD_SHIFT(psrlw, 4);
ASSEMBLE_SIMD_SHIFT(Psrlw, 4);
break;
}
case kX64I16x8UConvertI32x4: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ Packusdw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8AddSaturateU: {
__ paddusw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Paddusw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8SubSaturateU: {
__ psubusw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Psubusw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8MinU: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pminuw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Pminuw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8MaxU: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pmaxuw(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Pmaxuw(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I16x8GtU: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(1);
XMMRegister tmp = i.TempSimd128Register(0);
__ pmaxuw(dst, src);
__ pcmpeqw(dst, src);
__ pcmpeqw(tmp, tmp);
__ pxor(dst, tmp);
__ Pmaxuw(dst, src);
__ Pcmpeqw(dst, src);
__ Pcmpeqw(tmp, tmp);
__ Pxor(dst, tmp);
break;
}
case kX64I16x8GeU: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(1);
__ pminuw(dst, src);
__ pcmpeqw(dst, src);
__ Pminuw(dst, src);
__ Pcmpeqw(dst, src);
break;
}
case kX64I16x8RoundingAverageU: {