[wasm-simd] Share i8x16.popcnt implementation
No functional change; this moves the i8x16.popcnt algorithm into shared-macro-assembler. Bug: v8:11589 Change-Id: I3dd9d01589bf0176df1e33433f4c3c0c717c253d Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3122572 Reviewed-by: Adam Klein <adamk@chromium.org> Commit-Queue: Zhi An Ng <zhin@chromium.org> Cr-Commit-Position: refs/heads/main@{#76516}
This commit is contained in:
parent
f70cfb8840
commit
ba25a52e88
@ -631,76 +631,6 @@ void TurboAssembler::Cvttsd2ui(Register dst, Operand src, XMMRegister tmp) {
|
||||
add(dst, Immediate(0x80000000));
|
||||
}
|
||||
|
||||
// IA32 (pre-change) lowering of i8x16.popcnt: emits code that writes the
// per-byte population count of |src| into |dst|.
//   dst, src   - output / input vector registers; they may be the same
//                register (the ATOM path checks dst != src before copying).
//   tmp1, tmp2 - vector temporaries, each asserted to differ from dst and src.
//   scratch    - general-purpose register handed to ExternalReferenceAsOperand
//                when forming the operands for the constant tables.
// One of three instruction sequences is chosen from the CPU features:
// AVX, ATOM (no PSHUFB), or SSSE3.
// NOTE(review): every path keeps live values in tmp1 and tmp2 at the same
// time, so they look like they must be distinct registers, but that is not
// asserted here - confirm with callers.
void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
|
||||
XMMRegister tmp1, XMMRegister tmp2,
|
||||
Register scratch) {
|
||||
ASM_CODE_COMMENT(this);
|
||||
DCHECK_NE(dst, tmp1);
|
||||
DCHECK_NE(src, tmp1);
|
||||
DCHECK_NE(dst, tmp2);
|
||||
DCHECK_NE(src, tmp2);
|
||||
if (CpuFeatures::IsSupported(AVX)) {
|
||||
CpuFeatureScope avx_scope(this, AVX);
|
||||
// tmp1 = nibble mask (0x0f in every byte, per the constant's name).
vmovdqa(tmp1, ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
|
||||
scratch));
|
||||
// tmp2 = ~tmp1 & src (high nibbles); dst = tmp1 & src (low nibbles).
vpandn(tmp2, tmp1, src);
|
||||
vpand(dst, tmp1, src);
|
||||
// tmp1 is reused for the lookup table; presumably 16 entries holding the
// popcount of each nibble value - TODO confirm against the constant.
vmovdqa(tmp1, ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
|
||||
scratch));
|
||||
// 16-bit-lane shift; the low nibbles of tmp2 were cleared above, so no bits
// cross into a neighbouring byte.
vpsrlw(tmp2, tmp2, 4);
|
||||
// Table lookups: each byte of dst / tmp2 indexes into tmp1.
vpshufb(dst, tmp1, dst);
|
||||
vpshufb(tmp2, tmp1, tmp2);
|
||||
// Per-byte sum of the low-nibble and high-nibble counts.
vpaddb(dst, dst, tmp2);
|
||||
} else if (CpuFeatures::IsSupported(ATOM)) {
|
||||
// Pre-Goldmont low-power Intel microarchitectures have very slow
|
||||
// PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
|
||||
// algorithm on these processors. ATOM CPU feature captures exactly
|
||||
// the right set of processors.
|
||||
// Step 1: dst = x - ((x >> 1) & 0x55), giving 2-bit partial counts.
movaps(tmp1, src);
|
||||
psrlw(tmp1, 1);
|
||||
if (dst != src) {
|
||||
movaps(dst, src);
|
||||
}
|
||||
andps(tmp1,
|
||||
ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x55(), scratch));
|
||||
psubb(dst, tmp1);
|
||||
// Step 2: dst = (dst & 0x33) + ((dst >> 2) & 0x33), giving 4-bit counts.
// The operand is formed once and used twice below.
Operand splat_0x33 = ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
|
||||
movaps(tmp1, dst);
|
||||
andps(dst, splat_0x33);
|
||||
psrlw(tmp1, 2);
|
||||
andps(tmp1, splat_0x33);
|
||||
paddb(dst, tmp1);
|
||||
// Step 3: dst = (dst + (dst >> 4)) & 0x0f, giving the per-byte count.
movaps(tmp1, dst);
|
||||
psrlw(dst, 4);
|
||||
paddb(dst, tmp1);
|
||||
andps(dst,
|
||||
ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x0f(), scratch));
|
||||
} else {
|
||||
// SSSE3 path: same nibble-lookup scheme as the AVX path, written with
// two-operand (destructive) instructions.
CpuFeatureScope sse_scope(this, SSSE3);
|
||||
movaps(tmp1,
|
||||
ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x0f(), scratch));
|
||||
// Lookup-table operand, formed once and loaded twice below.
Operand mask = ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
|
||||
// Copy the nibble mask into tmp2 as well.
if (tmp2 != tmp1) {
|
||||
movaps(tmp2, tmp1);
|
||||
}
|
||||
// tmp1 = low nibbles of src; tmp2 = ~mask & src = high nibbles of src.
andps(tmp1, src);
|
||||
andnps(tmp2, src);
|
||||
psrlw(tmp2, 4);
|
||||
// src is no longer read past this point, so dst may alias it.
movaps(dst, mask);
|
||||
pshufb(dst, tmp1);
|
||||
movaps(tmp1, mask);
|
||||
pshufb(tmp1, tmp2);
|
||||
paddb(dst, tmp1);
|
||||
}
|
||||
}
|
||||
|
||||
void TurboAssembler::ShlPair(Register high, Register low, uint8_t shift) {
|
||||
DCHECK_GE(63, shift);
|
||||
if (shift >= 32) {
|
||||
|
@ -383,12 +383,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
|
||||
}
|
||||
void Cvttsd2ui(Register dst, Operand src, XMMRegister tmp);
|
||||
|
||||
// These Wasm SIMD ops do not have direct lowerings on IA32. These
|
||||
// helpers are optimized to produce the fastest and smallest codegen.
|
||||
// Defined here to allow usage on both TurboFan and Liftoff.
|
||||
// Emits the per-byte population count of |src| into |dst|. |tmp1| and |tmp2|
// are vector temporaries and |scratch| is a general-purpose scratch register.
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
|
||||
XMMRegister tmp2, Register scratch);
|
||||
|
||||
void Push(Register src) { push(src); }
|
||||
void Push(Operand src) { push(src); }
|
||||
void Push(Immediate value);
|
||||
|
@ -626,6 +626,75 @@ class V8_EXPORT_PRIVATE SharedTurboAssemblerBase : public SharedTurboAssembler {
|
||||
}
|
||||
}
|
||||
|
||||
// Shared lowering of i8x16.popcnt: emits code that writes the per-byte
// population count of |src| into |dst|.
//   dst, src   - output / input vector registers; they may be the same
//                register (the ATOM path checks dst != src before copying).
//   tmp1, tmp2 - vector temporaries, each asserted to differ from dst and src.
//   scratch    - general-purpose register handed to ExternalReferenceAsOperand
//                when forming the operands for the constant tables.
// One of three instruction sequences is chosen from the CPU features:
// AVX, ATOM (no PSHUFB), or SSSE3.
// NOTE(review): every path keeps live values in tmp1 and tmp2 at the same
// time, so they look like they must be distinct registers, but that is not
// asserted here - confirm with callers.
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
|
||||
XMMRegister tmp2, Register scratch) {
|
||||
ASM_CODE_COMMENT(this);
|
||||
DCHECK_NE(dst, tmp1);
|
||||
DCHECK_NE(src, tmp1);
|
||||
DCHECK_NE(dst, tmp2);
|
||||
DCHECK_NE(src, tmp2);
|
||||
if (CpuFeatures::IsSupported(AVX)) {
|
||||
CpuFeatureScope avx_scope(this, AVX);
|
||||
// tmp1 = nibble mask (0x0f in every byte, per the constant's name).
vmovdqa(tmp1, ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
|
||||
scratch));
|
||||
// tmp2 = ~tmp1 & src (high nibbles); dst = tmp1 & src (low nibbles).
vpandn(tmp2, tmp1, src);
|
||||
vpand(dst, tmp1, src);
|
||||
// tmp1 is reused for the lookup table; presumably 16 entries holding the
// popcount of each nibble value - TODO confirm against the constant.
vmovdqa(tmp1, ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
|
||||
scratch));
|
||||
// 16-bit-lane shift; the low nibbles of tmp2 were cleared above, so no bits
// cross into a neighbouring byte.
vpsrlw(tmp2, tmp2, 4);
|
||||
// Table lookups: each byte of dst / tmp2 indexes into tmp1.
vpshufb(dst, tmp1, dst);
|
||||
vpshufb(tmp2, tmp1, tmp2);
|
||||
// Per-byte sum of the low-nibble and high-nibble counts.
vpaddb(dst, dst, tmp2);
|
||||
} else if (CpuFeatures::IsSupported(ATOM)) {
|
||||
// Pre-Goldmont low-power Intel microarchitectures have very slow
|
||||
// PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
|
||||
// algorithm on these processors. ATOM CPU feature captures exactly
|
||||
// the right set of processors.
|
||||
// Step 1: dst = x - ((x >> 1) & 0x55), giving 2-bit partial counts.
movaps(tmp1, src);
|
||||
psrlw(tmp1, 1);
|
||||
if (dst != src) {
|
||||
movaps(dst, src);
|
||||
}
|
||||
andps(tmp1, ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x55(),
|
||||
scratch));
|
||||
psubb(dst, tmp1);
|
||||
// Step 2: dst = (dst & 0x33) + ((dst >> 2) & 0x33), giving 4-bit counts.
// The operand is formed once and used twice below.
Operand splat_0x33 = ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
|
||||
movaps(tmp1, dst);
|
||||
andps(dst, splat_0x33);
|
||||
psrlw(tmp1, 2);
|
||||
andps(tmp1, splat_0x33);
|
||||
paddb(dst, tmp1);
|
||||
// Step 3: dst = (dst + (dst >> 4)) & 0x0f, giving the per-byte count.
movaps(tmp1, dst);
|
||||
psrlw(dst, 4);
|
||||
paddb(dst, tmp1);
|
||||
andps(dst, ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
|
||||
scratch));
|
||||
} else {
|
||||
// SSSE3 path: same nibble-lookup scheme as the AVX path, written with
// two-operand (destructive) instructions.
CpuFeatureScope sse_scope(this, SSSE3);
|
||||
movaps(tmp1, ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
|
||||
scratch));
|
||||
// Lookup-table operand, formed once and loaded twice below.
Operand mask = ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
|
||||
// Copy the nibble mask into tmp2 as well.
if (tmp2 != tmp1) {
|
||||
movaps(tmp2, tmp1);
|
||||
}
|
||||
// tmp1 = low nibbles of src; tmp2 = ~mask & src = high nibbles of src.
andps(tmp1, src);
|
||||
andnps(tmp2, src);
|
||||
psrlw(tmp2, 4);
|
||||
// src is no longer read past this point, so dst may alias it.
movaps(dst, mask);
|
||||
pshufb(dst, tmp1);
|
||||
movaps(tmp1, mask);
|
||||
pshufb(tmp1, tmp2);
|
||||
paddb(dst, tmp1);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// All implementation-specific methods must be called through this.
|
||||
Impl* impl() { return static_cast<Impl*>(this); }
|
||||
|
@ -2202,65 +2202,6 @@ void TurboAssembler::Blendvpd(XMMRegister dst, XMMRegister src1,
|
||||
}
|
||||
}
|
||||
|
||||
// x64 (pre-change) lowering of i8x16.popcnt: emits code that writes the
// per-byte population count of |src| into |dst|.
//   dst, src - output / input vector registers; they may be the same register
//              (the ATOM path checks dst != src before copying).
//   tmp      - vector temporary, asserted to differ from dst, src and
//              kScratchDoubleReg.
// kScratchDoubleReg serves as the second temporary in the AVX and SSE paths.
// NOTE(review): nothing here asserts that dst/src differ from
// kScratchDoubleReg, which those paths overwrite - confirm with callers.
void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
|
||||
XMMRegister tmp) {
|
||||
DCHECK_NE(dst, tmp);
|
||||
DCHECK_NE(src, tmp);
|
||||
DCHECK_NE(kScratchDoubleReg, tmp);
|
||||
if (CpuFeatures::IsSupported(AVX)) {
|
||||
CpuFeatureScope avx_scope(this, AVX);
|
||||
// tmp = nibble mask (0x0f in every byte, per the constant's name).
vmovdqa(tmp, ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
|
||||
// kScratchDoubleReg = ~tmp & src (high nibbles); dst = tmp & src (low
// nibbles).
vpandn(kScratchDoubleReg, tmp, src);
|
||||
vpand(dst, tmp, src);
|
||||
// tmp is reused for the lookup table; presumably 16 entries holding the
// popcount of each nibble value - TODO confirm against the constant.
vmovdqa(tmp, ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
|
||||
// 16-bit-lane shift; the low nibbles were cleared above, so no bits cross
// into a neighbouring byte.
vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
|
||||
// Table lookups: each byte of dst / kScratchDoubleReg indexes into tmp.
vpshufb(dst, tmp, dst);
|
||||
vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
|
||||
// Per-byte sum of the low-nibble and high-nibble counts.
vpaddb(dst, dst, kScratchDoubleReg);
|
||||
} else if (CpuFeatures::IsSupported(ATOM)) {
|
||||
// Pre-Goldmont low-power Intel microarchitectures have very slow
|
||||
// PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
|
||||
// algorithm on these processors. ATOM CPU feature captures exactly
|
||||
// the right set of processors.
|
||||
// Step 1: dst = x - ((x >> 1) & 0x55), giving 2-bit partial counts.
movaps(tmp, src);
|
||||
psrlw(tmp, 1);
|
||||
if (dst != src) {
|
||||
movaps(dst, src);
|
||||
}
|
||||
andps(tmp, ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x55()));
|
||||
psubb(dst, tmp);
|
||||
// Step 2: dst = (dst & 0x33) + ((dst >> 2) & 0x33), giving 4-bit counts.
// The operand is formed once and used twice below.
Operand splat_0x33 = ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x33());
|
||||
movaps(tmp, dst);
|
||||
andps(dst, splat_0x33);
|
||||
psrlw(tmp, 2);
|
||||
andps(tmp, splat_0x33);
|
||||
paddb(dst, tmp);
|
||||
// Step 3: dst = (dst + (dst >> 4)) & 0x0f, giving the per-byte count.
movaps(tmp, dst);
|
||||
psrlw(dst, 4);
|
||||
paddb(dst, tmp);
|
||||
andps(dst, ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
|
||||
} else {
|
||||
// SSE path: same nibble-lookup scheme as the AVX path, written with
// two-operand (destructive) instructions.
movaps(tmp, ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
|
||||
// Lookup-table operand, formed once and loaded twice below.
Operand mask = ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_wasm_i8x16_popcnt_mask());
|
||||
// Copy the nibble mask into kScratchDoubleReg as well.
Move(kScratchDoubleReg, tmp);
|
||||
// tmp = low nibbles of src; kScratchDoubleReg = ~mask & src = high nibbles.
andps(tmp, src);
|
||||
andnps(kScratchDoubleReg, src);
|
||||
psrlw(kScratchDoubleReg, 4);
|
||||
// src is no longer read past this point, so dst may alias it.
movaps(dst, mask);
|
||||
pshufb(dst, tmp);
|
||||
movaps(tmp, mask);
|
||||
pshufb(tmp, kScratchDoubleReg);
|
||||
paddb(dst, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
void TurboAssembler::Abspd(XMMRegister dst) {
|
||||
Andps(dst, ExternalReferenceAsOperand(
|
||||
ExternalReference::address_of_double_abs_constant()));
|
||||
|
@ -470,11 +470,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
|
||||
void Blendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2,
|
||||
XMMRegister mask);
|
||||
|
||||
// These Wasm SIMD ops do not have direct lowerings on x64. These
|
||||
// helpers are optimized to produce the fastest and smallest codegen.
|
||||
// Defined here to allow usage on both TurboFan and Liftoff.
|
||||
// Emits the per-byte population count of |src| into |dst|; |tmp| is a vector
// temporary.
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
|
||||
|
||||
void Abspd(XMMRegister dst);
|
||||
void Negpd(XMMRegister dst);
|
||||
|
||||
|
@ -3718,7 +3718,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
}
|
||||
case kX64I8x16Popcnt: {
|
||||
__ I8x16Popcnt(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.TempSimd128Register(0));
|
||||
i.TempSimd128Register(0), kScratchDoubleReg,
|
||||
kScratchRegister);
|
||||
break;
|
||||
}
|
||||
case kX64S128Load8Splat: {
|
||||
|
@ -2488,7 +2488,8 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
|
||||
|
||||
// Liftoff entry point for i8x16.popcnt; forwards to I8x16Popcnt on the
// floating-point/SIMD registers of |dst| and |src|.
// NOTE: this is a diff rendering whose +/- markers were lost. The
// three-argument call below is the line removed by the change and the
// five-argument call is its replacement; a given revision of the file
// contains only one of them.
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
|
||||
LiftoffRegister src) {
|
||||
// Removed by the change: old x64-specific signature (dst, src, tmp).
I8x16Popcnt(dst.fp(), src.fp(), liftoff::kScratchDoubleReg2);
|
||||
// Added by the change: shared signature (dst, src, tmp1, tmp2, scratch).
I8x16Popcnt(dst.fp(), src.fp(), kScratchDoubleReg,
|
||||
liftoff::kScratchDoubleReg2, kScratchRegister);
|
||||
}
|
||||
|
||||
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
|
||||
|
Loading…
Reference in New Issue
Block a user