[wasm-simd][x64][liftoff] Implement i8x16.popcnt
Extract i8x16.popcnt implementation into a macro-assembler function, and
reuse it in Liftoff.

Bug: v8:11002
Change-Id: I86b2f5322c799d44f584cac28c70e0e393bf114f
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2676280
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72565}
This commit is contained in:
parent
2071cfd7b7
commit
00babf0718
@ -2297,6 +2297,64 @@ void TurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
|
||||
}
|
||||
}
|
||||
|
||||
// Byte-wise population count: for each of the 16 byte lanes of |src|, writes
// the number of set bits into the corresponding lane of |dst|.
// Clobbers |tmp| and kScratchDoubleReg. |dst| may alias |src|, but neither
// may alias |tmp| (checked below).
void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
                                 XMMRegister tmp) {
  DCHECK_NE(dst, tmp);
  DCHECK_NE(src, tmp);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Nibble lookup: split each byte into its low and high nibble, use each
    // nibble to index a 16-entry table of per-nibble popcounts (PSHUFB on
    // the popcnt_mask constant), then add the two partial counts.
    vmovdqa(tmp, ExternalReferenceAsOperand(
                     ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
    vpandn(kScratchDoubleReg, tmp, src);  // High nibbles (still in place).
    vpand(dst, tmp, src);                 // Low nibbles.
    vmovdqa(tmp, ExternalReferenceAsOperand(
                     ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
    // Move the high nibbles into the low half of each byte so they can be
    // used as PSHUFB indices.
    vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
    vpshufb(dst, tmp, dst);                              // Low-nibble counts.
    vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);  // High-nibble counts.
    vpaddb(dst, dst, kScratchDoubleReg);
  } else if (CpuFeatures::IsSupported(ATOM)) {
    // Pre-Goldmont low-power Intel microarchitectures have very slow
    // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
    // algorithm on these processors. ATOM CPU feature captures exactly
    // the right set of processors.
    //
    // Step 1 of the classic bit-twiddling popcount, applied to all 16
    // bytes at once: x -= (x >> 1) & 0x55.
    // NOTE: PAVGB with zero must NOT be used for the shift here: it
    // computes (x + 1) >> 1, and the rounding carry out of bit 0 corrupts
    // bit 1 (e.g. x == 1 would produce a popcount of 0). Use a word shift
    // instead; the 0x55 mask below already discards bit 7, where the
    // neighboring byte's low bit leaks in.
    movaps(tmp, src);
    psrlw(tmp, 1);
    if (dst != src) {
      movaps(dst, src);
    }
    andps(tmp, ExternalReferenceAsOperand(
                   ExternalReference::address_of_wasm_i8x16_splat_0x55()));
    psubb(dst, tmp);
    // Step 2: x = (x & 0x33) + ((x >> 2) & 0x33).
    // The masks also discard any bits shifted in from the neighboring byte
    // by the word-granularity shift.
    Operand splat_0x33 = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_splat_0x33());
    movaps(tmp, dst);
    andps(dst, splat_0x33);
    psrlw(tmp, 2);
    andps(tmp, splat_0x33);
    paddb(dst, tmp);
    // Step 3: x = (x + (x >> 4)) & 0x0f.
    movaps(tmp, dst);
    psrlw(dst, 4);
    paddb(dst, tmp);
    andps(dst, ExternalReferenceAsOperand(
                   ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
  } else {
    // SSE fallback: same nibble-lookup algorithm as the AVX path, written
    // with destructive two-operand instructions.
    movaps(tmp, ExternalReferenceAsOperand(
                    ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
    Operand mask = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_popcnt_mask());
    Move(kScratchDoubleReg, tmp);
    andps(tmp, src);                 // tmp = low nibbles.
    andnps(kScratchDoubleReg, src);  // scratch = high nibbles (in place).
    psrlw(kScratchDoubleReg, 4);     // High nibbles down to the low half.
    movaps(dst, mask);
    pshufb(dst, tmp);                // Popcounts of the low nibbles.
    movaps(tmp, mask);
    pshufb(tmp, kScratchDoubleReg);  // Popcounts of the high nibbles.
    paddb(dst, tmp);                 // Per-byte total.
  }
}
|
||||
|
||||
void TurboAssembler::Abspd(XMMRegister dst) {
|
||||
Andps(dst, ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_double_abs_constant()));
|
||||
|
@ -608,6 +608,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
|
||||
void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
|
||||
void S128Store64Lane(Operand dst, XMMRegister src, uint8_t laneidx);
|
||||
|
||||
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
|
||||
|
||||
void Abspd(XMMRegister dst);
|
||||
void Negpd(XMMRegister dst);
|
||||
|
||||
|
@ -3931,67 +3931,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
break;
|
||||
}
|
||||
case kX64I8x16Popcnt: {
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
XMMRegister src = i.InputSimd128Register(0);
|
||||
XMMRegister tmp = i.TempSimd128Register(0);
|
||||
|
||||
if (CpuFeatures::IsSupported(AVX)) {
|
||||
CpuFeatureScope avx_scope(tasm(), AVX);
|
||||
__ vmovdqa(tmp,
|
||||
__ ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
|
||||
__ vpandn(kScratchDoubleReg, tmp, src);
|
||||
__ vpand(dst, tmp, src);
|
||||
__ vmovdqa(tmp,
|
||||
__ ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
|
||||
__ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
|
||||
__ vpshufb(dst, tmp, dst);
|
||||
__ vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
|
||||
__ vpaddb(dst, dst, kScratchDoubleReg);
|
||||
} else if (CpuFeatures::IsSupported(ATOM)) {
|
||||
// Pre-Goldmont low-power Intel microarchitectures have very slow
|
||||
// PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
|
||||
// algorithm on these processors. ATOM CPU feature captures exactly
|
||||
// the right set of processors.
|
||||
__ xorps(tmp, tmp);
|
||||
__ pavgb(tmp, src);
|
||||
if (dst != src) {
|
||||
__ movaps(dst, src);
|
||||
}
|
||||
__ andps(tmp,
|
||||
__ ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x55()));
|
||||
__ psubb(dst, tmp);
|
||||
Operand splat_0x33 = __ ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x33());
|
||||
__ movaps(tmp, dst);
|
||||
__ andps(dst, splat_0x33);
|
||||
__ psrlw(tmp, 2);
|
||||
__ andps(tmp, splat_0x33);
|
||||
__ paddb(dst, tmp);
|
||||
__ movaps(tmp, dst);
|
||||
__ psrlw(dst, 4);
|
||||
__ paddb(dst, tmp);
|
||||
__ andps(dst,
|
||||
__ ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
|
||||
} else {
|
||||
__ movaps(tmp,
|
||||
__ ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
|
||||
Operand mask = __ ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_popcnt_mask());
|
||||
__ Move(kScratchDoubleReg, tmp);
|
||||
__ andps(tmp, src);
|
||||
__ andnps(kScratchDoubleReg, src);
|
||||
__ psrlw(kScratchDoubleReg, 4);
|
||||
__ movaps(dst, mask);
|
||||
__ pshufb(dst, tmp);
|
||||
__ movaps(tmp, mask);
|
||||
__ pshufb(tmp, kScratchDoubleReg);
|
||||
__ paddb(dst, tmp);
|
||||
}
|
||||
__ I8x16Popcnt(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.TempSimd128Register(0));
|
||||
break;
|
||||
}
|
||||
case kX64S128Load8Splat: {
|
||||
|
@ -3402,6 +3402,11 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
|
||||
}
|
||||
}
|
||||
|
||||
// i8x16.popcnt is not yet implemented for this target: record an
// unsupported-SIMD bailout (reason string "i8x16.popcnt") instead of
// emitting code.
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
                                         LiftoffRegister src) {
  bailout(kSimd, "i8x16.popcnt");
}
|
||||
|
||||
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
|
||||
LiftoffRegister src) {
|
||||
vdup(Neon8, liftoff::GetSimd128Register(dst), src.gp());
|
||||
|
@ -2452,6 +2452,11 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
|
||||
}
|
||||
}
|
||||
|
||||
// i8x16.popcnt is not yet implemented for this target: record an
// unsupported-SIMD bailout (reason string "i8x16.popcnt") instead of
// emitting code.
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
                                         LiftoffRegister src) {
  bailout(kSimd, "i8x16.popcnt");
}
|
||||
|
||||
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
|
||||
LiftoffRegister src) {
|
||||
Dup(dst.fp().V16B(), src.gp().W());
|
||||
|
@ -2874,6 +2874,11 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
|
||||
Pshufb(dst.fp(), lhs.fp(), mask);
|
||||
}
|
||||
|
||||
// i8x16.popcnt is not yet implemented for this target: record an
// unsupported-SIMD bailout (reason string "i8x16.popcnt") instead of
// emitting code.
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
                                         LiftoffRegister src) {
  bailout(kSimd, "i8x16.popcnt");
}
|
||||
|
||||
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
|
||||
LiftoffRegister src) {
|
||||
Movd(dst.fp(), src.gp());
|
||||
|
@ -887,6 +887,7 @@ class LiftoffAssembler : public TurboAssembler {
|
||||
bool is_swizzle);
|
||||
inline void emit_i8x16_swizzle(LiftoffRegister dst, LiftoffRegister lhs,
|
||||
LiftoffRegister rhs);
|
||||
inline void emit_i8x16_popcnt(LiftoffRegister dst, LiftoffRegister src);
|
||||
inline void emit_i8x16_splat(LiftoffRegister dst, LiftoffRegister src);
|
||||
inline void emit_i16x8_splat(LiftoffRegister dst, LiftoffRegister src);
|
||||
inline void emit_i32x4_splat(LiftoffRegister dst, LiftoffRegister src);
|
||||
|
@ -2946,6 +2946,8 @@ class LiftoffCompiler {
|
||||
switch (opcode) {
|
||||
case wasm::kExprI8x16Swizzle:
|
||||
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_swizzle);
|
||||
case wasm::kExprI8x16Popcnt:
|
||||
return EmitUnOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_popcnt);
|
||||
case wasm::kExprI8x16Splat:
|
||||
return EmitUnOp<kI32, kS128>(&LiftoffAssembler::emit_i8x16_splat);
|
||||
case wasm::kExprI16x8Splat:
|
||||
|
@ -2472,6 +2472,11 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
|
||||
Pshufb(dst.fp(), lhs.fp(), mask);
|
||||
}
|
||||
|
||||
// i8x16.popcnt: delegate to the shared macro-assembler helper. Liftoff's
// second reserved scratch XMM register is passed as the extra temporary the
// helper requires (the helper also clobbers kScratchDoubleReg internally,
// so it needs a temp distinct from it, dst, and src).
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
                                         LiftoffRegister src) {
  I8x16Popcnt(dst.fp(), src.fp(), liftoff::kScratchDoubleReg2);
}
|
||||
|
||||
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
|
||||
LiftoffRegister src) {
|
||||
Movd(dst.fp(), src.gp());
|
||||
|
Loading…
Reference in New Issue
Block a user