[wasm-simd][x64][liftoff] Implement i8x16.popcnt

Extract i8x16.popcnt implementation into a macro-assembler function, and
reuse it in Liftoff.

Bug: v8:11002
Change-Id: I86b2f5322c799d44f584cac28c70e0e393bf114f
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2676280
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72565}
This commit is contained in:
Ng Zhi An 2021-02-04 16:01:23 -08:00 committed by Commit Bot
parent 2071cfd7b7
commit 00babf0718
9 changed files with 85 additions and 61 deletions

View File

@ -2297,6 +2297,64 @@ void TurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
}
}
// Emits code that computes the population count of each of the 16 bytes of
// |src| and writes the per-byte results to |dst| (wasm i8x16.popcnt).
//
// |tmp| is clobbered and must differ from both |dst| and |src|; |dst| may
// alias |src|. The AVX and plain SSE paths additionally clobber
// kScratchDoubleReg.
void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
                                 XMMRegister tmp) {
  DCHECK_NE(dst, tmp);
  DCHECK_NE(src, tmp);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Split every byte into its low nibble (dst) and high nibble (scratch).
    vmovdqa(tmp, ExternalReferenceAsOperand(
                     ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
    vpandn(kScratchDoubleReg, tmp, src);
    vpand(dst, tmp, src);
    // Use each nibble as an index into the popcnt table and add both halves.
    vmovdqa(tmp, ExternalReferenceAsOperand(
                     ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
    vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
    vpshufb(dst, tmp, dst);
    vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
    vpaddb(dst, dst, kScratchDoubleReg);
  } else if (CpuFeatures::IsSupported(ATOM)) {
    // Pre-Goldmont low-power Intel microarchitectures have very slow
    // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
    // algorithm on these processors. ATOM CPU feature captures exactly
    // the right set of processors.
    //
    // Step 1: dst = src - ((src >> 1) & 0x55), i.e. 2-bit partial counts.
    // The shift must truncate: PAVGB against zero would compute
    // (src + 1) >> 1, which rounds up and miscounts bytes with bit 0 set.
    // PSRLW shifts 16-bit lanes, but the bit that leaks in from the
    // neighbouring byte lands in bit 7, which the 0x55 mask discards.
    movaps(tmp, src);
    psrlw(tmp, 1);
    if (dst != src) {
      movaps(dst, src);
    }
    andps(tmp, ExternalReferenceAsOperand(
                   ExternalReference::address_of_wasm_i8x16_splat_0x55()));
    psubb(dst, tmp);
    // Step 2: add adjacent 2-bit counts into 4-bit counts.
    Operand splat_0x33 = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_splat_0x33());
    movaps(tmp, dst);
    andps(dst, splat_0x33);
    psrlw(tmp, 2);
    andps(tmp, splat_0x33);
    paddb(dst, tmp);
    // Step 3: add the two nibble counts and keep only the low nibble.
    movaps(tmp, dst);
    psrlw(dst, 4);
    paddb(dst, tmp);
    andps(dst, ExternalReferenceAsOperand(
                   ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
  } else {
    // Same nibble table lookup as the AVX path, using destructive SSE forms.
    movaps(tmp, ExternalReferenceAsOperand(
                    ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
    Operand mask = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_popcnt_mask());
    Move(kScratchDoubleReg, tmp);
    andps(tmp, src);                 // tmp = low nibbles.
    andnps(kScratchDoubleReg, src);  // scratch = high nibbles (unshifted).
    psrlw(kScratchDoubleReg, 4);
    movaps(dst, mask);
    pshufb(dst, tmp);
    movaps(tmp, mask);
    pshufb(tmp, kScratchDoubleReg);
    paddb(dst, tmp);
  }
}
void TurboAssembler::Abspd(XMMRegister dst) {
Andps(dst, ExternalReferenceAsOperand(
ExternalReference::address_of_double_abs_constant()));

View File

@ -608,6 +608,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
void S128Store64Lane(Operand dst, XMMRegister src, uint8_t laneidx);
// Emits code computing the per-byte population count of |src| into |dst|.
// |tmp| is clobbered and must differ from both |dst| and |src|; some code
// paths also clobber kScratchDoubleReg.
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
void Abspd(XMMRegister dst);
void Negpd(XMMRegister dst);

View File

@ -3931,67 +3931,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I8x16Popcnt: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
XMMRegister tmp = i.TempSimd128Register(0);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vmovdqa(tmp,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
__ vpandn(kScratchDoubleReg, tmp, src);
__ vpand(dst, tmp, src);
__ vmovdqa(tmp,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
__ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
__ vpshufb(dst, tmp, dst);
__ vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
__ vpaddb(dst, dst, kScratchDoubleReg);
} else if (CpuFeatures::IsSupported(ATOM)) {
// Pre-Goldmont low-power Intel microarchitectures have very slow
// PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
// algorithm on these processors. ATOM CPU feature captures exactly
// the right set of processors.
__ xorps(tmp, tmp);
__ pavgb(tmp, src);
if (dst != src) {
__ movaps(dst, src);
}
__ andps(tmp,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x55()));
__ psubb(dst, tmp);
Operand splat_0x33 = __ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x33());
__ movaps(tmp, dst);
__ andps(dst, splat_0x33);
__ psrlw(tmp, 2);
__ andps(tmp, splat_0x33);
__ paddb(dst, tmp);
__ movaps(tmp, dst);
__ psrlw(dst, 4);
__ paddb(dst, tmp);
__ andps(dst,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
} else {
__ movaps(tmp,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
Operand mask = __ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask());
__ Move(kScratchDoubleReg, tmp);
__ andps(tmp, src);
__ andnps(kScratchDoubleReg, src);
__ psrlw(kScratchDoubleReg, 4);
__ movaps(dst, mask);
__ pshufb(dst, tmp);
__ movaps(tmp, mask);
__ pshufb(tmp, kScratchDoubleReg);
__ paddb(dst, tmp);
}
__ I8x16Popcnt(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.TempSimd128Register(0));
break;
}
case kX64S128Load8Splat: {

View File

@ -3402,6 +3402,11 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
}
}
// i8x16.popcnt has no Liftoff implementation in this file yet; record a SIMD
// bailout instead of emitting code. |dst| and |src| are unused.
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
                                         LiftoffRegister src) {
  const char* const unsupported_op = "i8x16.popcnt";
  bailout(kSimd, unsupported_op);
}
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
LiftoffRegister src) {
vdup(Neon8, liftoff::GetSimd128Register(dst), src.gp());

View File

@ -2452,6 +2452,11 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
}
}
// i8x16.popcnt has no Liftoff implementation in this file yet; record a SIMD
// bailout instead of emitting code. |dst| and |src| are unused.
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
                                         LiftoffRegister src) {
  const char* const unsupported_op = "i8x16.popcnt";
  bailout(kSimd, unsupported_op);
}
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
LiftoffRegister src) {
Dup(dst.fp().V16B(), src.gp().W());

View File

@ -2874,6 +2874,11 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
Pshufb(dst.fp(), lhs.fp(), mask);
}
// i8x16.popcnt has no Liftoff implementation in this file yet; record a SIMD
// bailout instead of emitting code. |dst| and |src| are unused.
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
                                         LiftoffRegister src) {
  const char* const unsupported_op = "i8x16.popcnt";
  bailout(kSimd, unsupported_op);
}
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());

View File

@ -887,6 +887,7 @@ class LiftoffAssembler : public TurboAssembler {
bool is_swizzle);
inline void emit_i8x16_swizzle(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
// Emits i8x16.popcnt with |src| as input and |dst| as output. Defined once
// per architecture-specific Liftoff assembler.
inline void emit_i8x16_popcnt(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i8x16_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i16x8_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i32x4_splat(LiftoffRegister dst, LiftoffRegister src);

View File

@ -2946,6 +2946,8 @@ class LiftoffCompiler {
switch (opcode) {
case wasm::kExprI8x16Swizzle:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_swizzle);
case wasm::kExprI8x16Popcnt:
return EmitUnOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_popcnt);
case wasm::kExprI8x16Splat:
return EmitUnOp<kI32, kS128>(&LiftoffAssembler::emit_i8x16_splat);
case wasm::kExprI16x8Splat:

View File

@ -2472,6 +2472,11 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
Pshufb(dst.fp(), lhs.fp(), mask);
}
// Liftoff lowering of i8x16.popcnt: delegates to the I8x16Popcnt
// macro-assembler helper, lending it liftoff::kScratchDoubleReg2 as the
// temporary register.
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
                                         LiftoffRegister src) {
  const auto result_reg = dst.fp();
  const auto input_reg = src.fp();
  I8x16Popcnt(result_reg, input_reg, liftoff::kScratchDoubleReg2);
}
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());