[wasm-simd] Share i8x16.popcnt implementation

No functionality change; this moves the i8x16.popcnt algorithm into the
shared (ia32/x64) macro-assembler so both ports use a single implementation.

Bug: v8:11589
Change-Id: I3dd9d01589bf0176df1e33433f4c3c0c717c253d
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3122572
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76516}
Author: Ng Zhi An, 2021-08-26 12:03:28 -07:00, committed by V8 LUCI CQ
Commit: ba25a52e88 (parent f70cfb8840)
7 changed files with 73 additions and 142 deletions
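For context, i8x16.popcnt replaces each of the 16 byte lanes of a 128-bit vector with the number of set bits in that byte. A minimal scalar sketch of that semantics (the function name is illustrative, not from the V8 sources):

#include <array>
#include <cstdint>

// Reference semantics of Wasm i8x16.popcnt: count the set bits in every
// byte lane. The assembler helper below computes the same result for all
// 16 lanes in parallel with SSE/AVX instructions.
std::array<uint8_t, 16> I8x16PopcntReference(const std::array<uint8_t, 16>& src) {
  std::array<uint8_t, 16> dst;
  for (int lane = 0; lane < 16; ++lane) {
    uint8_t count = 0;
    for (int bit = 0; bit < 8; ++bit) count += (src[lane] >> bit) & 1;
    dst[lane] = count;
  }
  return dst;
}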


@@ -631,76 +631,6 @@ void TurboAssembler::Cvttsd2ui(Register dst, Operand src, XMMRegister tmp) {
add(dst, Immediate(0x80000000));
}
void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
XMMRegister tmp1, XMMRegister tmp2,
Register scratch) {
ASM_CODE_COMMENT(this);
DCHECK_NE(dst, tmp1);
DCHECK_NE(src, tmp1);
DCHECK_NE(dst, tmp2);
DCHECK_NE(src, tmp2);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vmovdqa(tmp1, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
scratch));
vpandn(tmp2, tmp1, src);
vpand(dst, tmp1, src);
vmovdqa(tmp1, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
scratch));
vpsrlw(tmp2, tmp2, 4);
vpshufb(dst, tmp1, dst);
vpshufb(tmp2, tmp1, tmp2);
vpaddb(dst, dst, tmp2);
} else if (CpuFeatures::IsSupported(ATOM)) {
// Pre-Goldmont low-power Intel microarchitectures have very slow
// PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
// algorithm on these processors. ATOM CPU feature captures exactly
// the right set of processors.
movaps(tmp1, src);
psrlw(tmp1, 1);
if (dst != src) {
movaps(dst, src);
}
andps(tmp1,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x55(), scratch));
psubb(dst, tmp1);
Operand splat_0x33 = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
movaps(tmp1, dst);
andps(dst, splat_0x33);
psrlw(tmp1, 2);
andps(tmp1, splat_0x33);
paddb(dst, tmp1);
movaps(tmp1, dst);
psrlw(dst, 4);
paddb(dst, tmp1);
andps(dst,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(), scratch));
} else {
CpuFeatureScope sse_scope(this, SSSE3);
movaps(tmp1,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(), scratch));
Operand mask = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
if (tmp2 != tmp1) {
movaps(tmp2, tmp1);
}
andps(tmp1, src);
andnps(tmp2, src);
psrlw(tmp2, 4);
movaps(dst, mask);
pshufb(dst, tmp1);
movaps(tmp1, mask);
pshufb(tmp1, tmp2);
paddb(dst, tmp1);
}
}
void TurboAssembler::ShlPair(Register high, Register low, uint8_t shift) {
DCHECK_GE(63, shift);
if (shift >= 32) {
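The AVX and SSSE3 branches of the removed function implement the standard nibble lookup-table popcount: the 0x0f splat masks out the low nibble of every byte, the andn/shift-by-4 pair extracts the high nibble, and PSHUFB uses each nibble to index a 16-entry table of nibble popcounts before the two partial counts are added with paddb. A scalar sketch of the same idea, assuming the popcnt-mask external reference holds the table shown here:

#include <array>
#include <cstdint>

// Scalar sketch of the PSHUFB lookup-table popcount. Assumption: the
// wasm_i8x16_popcnt_mask constant is the 16-entry nibble table below and
// wasm_i8x16_splat_0x0f is a vector of 0x0F bytes; the SIMD code performs
// these per-byte steps on all 16 lanes at once.
std::array<uint8_t, 16> I8x16PopcntLut(const std::array<uint8_t, 16>& src) {
  static constexpr uint8_t kNibblePopcnt[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                                1, 2, 2, 3, 2, 3, 3, 4};
  std::array<uint8_t, 16> dst;
  for (int lane = 0; lane < 16; ++lane) {
    uint8_t low = src[lane] & 0x0F;  // pand/vpand with the 0x0F splat
    uint8_t high = src[lane] >> 4;   // pandn/andnps plus psrlw by 4
    dst[lane] = kNibblePopcnt[low] + kNibblePopcnt[high];  // pshufb + paddb
  }
  return dst;
}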


@@ -383,12 +383,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
}
void Cvttsd2ui(Register dst, Operand src, XMMRegister tmp);
// These Wasm SIMD ops do not have direct lowerings on IA32. These
// helpers are optimized to produce the fastest and smallest codegen.
// Defined here to allow usage on both TurboFan and Liftoff.
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
XMMRegister tmp2, Register scratch);
void Push(Register src) { push(src); }
void Push(Operand src) { push(src); }
void Push(Immediate value);


@@ -626,6 +626,75 @@ class V8_EXPORT_PRIVATE SharedTurboAssemblerBase : public SharedTurboAssembler {
}
}
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
XMMRegister tmp2, Register scratch) {
ASM_CODE_COMMENT(this);
DCHECK_NE(dst, tmp1);
DCHECK_NE(src, tmp1);
DCHECK_NE(dst, tmp2);
DCHECK_NE(src, tmp2);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vmovdqa(tmp1, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
scratch));
vpandn(tmp2, tmp1, src);
vpand(dst, tmp1, src);
vmovdqa(tmp1, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
scratch));
vpsrlw(tmp2, tmp2, 4);
vpshufb(dst, tmp1, dst);
vpshufb(tmp2, tmp1, tmp2);
vpaddb(dst, dst, tmp2);
} else if (CpuFeatures::IsSupported(ATOM)) {
// Pre-Goldmont low-power Intel microarchitectures have very slow
// PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
// algorithm on these processors. ATOM CPU feature captures exactly
// the right set of processors.
movaps(tmp1, src);
psrlw(tmp1, 1);
if (dst != src) {
movaps(dst, src);
}
andps(tmp1, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x55(),
scratch));
psubb(dst, tmp1);
Operand splat_0x33 = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
movaps(tmp1, dst);
andps(dst, splat_0x33);
psrlw(tmp1, 2);
andps(tmp1, splat_0x33);
paddb(dst, tmp1);
movaps(tmp1, dst);
psrlw(dst, 4);
paddb(dst, tmp1);
andps(dst, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
scratch));
} else {
CpuFeatureScope sse_scope(this, SSSE3);
movaps(tmp1, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
scratch));
Operand mask = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
if (tmp2 != tmp1) {
movaps(tmp2, tmp1);
}
andps(tmp1, src);
andnps(tmp2, src);
psrlw(tmp2, 4);
movaps(dst, mask);
pshufb(dst, tmp1);
movaps(tmp1, mask);
pshufb(tmp1, tmp2);
paddb(dst, tmp1);
}
}
private:
// All implementation-specific methods must be called through this.
Impl* impl() { return static_cast<Impl*>(this); }
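The ATOM branch keeps the PSHUFB-free divide-and-conquer algorithm described in the comment: set bits are summed pairwise, then per nibble, then per byte, using psrlw for the shifts and the 0x55/0x33/0x0f splats to confine each partial sum to its own byte. A scalar sketch of the per-byte arithmetic, assuming the splat constants hold those repeated byte patterns:

#include <cstdint>

// Scalar sketch of the divide-and-conquer popcount used on ATOM. Each step
// halves the number of partial sums: single bits -> 2-bit counts -> 4-bit
// counts -> the final byte count. The SIMD code applies the same arithmetic
// to all 16 bytes with psrlw/andps/psubb/paddb.
uint8_t BytePopcntSwar(uint8_t x) {
  x = x - ((x >> 1) & 0x55);           // 2-bit partial counts
  x = (x & 0x33) + ((x >> 2) & 0x33);  // 4-bit partial counts
  x = (x + (x >> 4)) & 0x0F;           // popcount of the whole byte
  return x;
}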


@@ -2202,65 +2202,6 @@ void TurboAssembler::Blendvpd(XMMRegister dst, XMMRegister src1,
}
}
void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
XMMRegister tmp) {
DCHECK_NE(dst, tmp);
DCHECK_NE(src, tmp);
DCHECK_NE(kScratchDoubleReg, tmp);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vmovdqa(tmp, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
vpandn(kScratchDoubleReg, tmp, src);
vpand(dst, tmp, src);
vmovdqa(tmp, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
vpshufb(dst, tmp, dst);
vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
vpaddb(dst, dst, kScratchDoubleReg);
} else if (CpuFeatures::IsSupported(ATOM)) {
// Pre-Goldmont low-power Intel microarchitectures have very slow
// PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
// algorithm on these processors. ATOM CPU feature captures exactly
// the right set of processors.
movaps(tmp, src);
psrlw(tmp, 1);
if (dst != src) {
movaps(dst, src);
}
andps(tmp, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x55()));
psubb(dst, tmp);
Operand splat_0x33 = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x33());
movaps(tmp, dst);
andps(dst, splat_0x33);
psrlw(tmp, 2);
andps(tmp, splat_0x33);
paddb(dst, tmp);
movaps(tmp, dst);
psrlw(dst, 4);
paddb(dst, tmp);
andps(dst, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
} else {
movaps(tmp, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
Operand mask = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask());
Move(kScratchDoubleReg, tmp);
andps(tmp, src);
andnps(kScratchDoubleReg, src);
psrlw(kScratchDoubleReg, 4);
movaps(dst, mask);
pshufb(dst, tmp);
movaps(tmp, mask);
pshufb(tmp, kScratchDoubleReg);
paddb(dst, tmp);
}
}
void TurboAssembler::Abspd(XMMRegister dst) {
Andps(dst, ExternalReferenceAsOperand(
ExternalReference::address_of_double_abs_constant()));


@@ -470,11 +470,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
void Blendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister mask);
// These Wasm SIMD ops do not have direct lowerings on x64. These
// helpers are optimized to produce the fastest and smallest codegen.
// Defined here to allow usage on both TurboFan and Liftoff.
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
void Abspd(XMMRegister dst);
void Negpd(XMMRegister dst);


@@ -3718,7 +3718,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I8x16Popcnt: {
__ I8x16Popcnt(i.OutputSimd128Register(), i.InputSimd128Register(0),
- i.TempSimd128Register(0));
+ i.TempSimd128Register(0), kScratchDoubleReg,
+ kScratchRegister);
break;
}
case kX64S128Load8Splat: {


@@ -2488,7 +2488,8 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
LiftoffRegister src) {
- I8x16Popcnt(dst.fp(), src.fp(), liftoff::kScratchDoubleReg2);
+ I8x16Popcnt(dst.fp(), src.fp(), kScratchDoubleReg,
+ liftoff::kScratchDoubleReg2, kScratchRegister);
}
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,