From ba25a52e88498843b9cd02eefa6f311ca43c23ff Mon Sep 17 00:00:00 2001
From: Ng Zhi An
Date: Thu, 26 Aug 2021 12:03:28 -0700
Subject: [PATCH] [wasm-simd] Share i8x16.popcnt implementation

No functionality change; moved the i8x16.popcnt algorithm into the
shared macro-assembler.

Bug: v8:11589
Change-Id: I3dd9d01589bf0176df1e33433f4c3c0c717c253d
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3122572
Reviewed-by: Adam Klein
Commit-Queue: Zhi An Ng
Cr-Commit-Position: refs/heads/main@{#76516}
---
 src/codegen/ia32/macro-assembler-ia32.cc      | 70 -------------------
 src/codegen/ia32/macro-assembler-ia32.h       |  6 --
 .../macro-assembler-shared-ia32-x64.h         | 69 ++++++++++++++++++
 src/codegen/x64/macro-assembler-x64.cc        | 59 ----------------
 src/codegen/x64/macro-assembler-x64.h         |  5 --
 .../backend/x64/code-generator-x64.cc         |  3 +-
 src/wasm/baseline/x64/liftoff-assembler-x64.h |  3 +-
 7 files changed, 73 insertions(+), 142 deletions(-)

diff --git a/src/codegen/ia32/macro-assembler-ia32.cc b/src/codegen/ia32/macro-assembler-ia32.cc
index 57effffea6..7f66b48960 100644
--- a/src/codegen/ia32/macro-assembler-ia32.cc
+++ b/src/codegen/ia32/macro-assembler-ia32.cc
@@ -631,76 +631,6 @@ void TurboAssembler::Cvttsd2ui(Register dst, Operand src, XMMRegister tmp) {
   add(dst, Immediate(0x80000000));
 }
 
-void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
-                                 XMMRegister tmp1, XMMRegister tmp2,
-                                 Register scratch) {
-  ASM_CODE_COMMENT(this);
-  DCHECK_NE(dst, tmp1);
-  DCHECK_NE(src, tmp1);
-  DCHECK_NE(dst, tmp2);
-  DCHECK_NE(src, tmp2);
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vmovdqa(tmp1, ExternalReferenceAsOperand(
-                      ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
-                      scratch));
-    vpandn(tmp2, tmp1, src);
-    vpand(dst, tmp1, src);
-    vmovdqa(tmp1, ExternalReferenceAsOperand(
-                      ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
-                      scratch));
-    vpsrlw(tmp2, tmp2, 4);
-    vpshufb(dst, tmp1, dst);
-    vpshufb(tmp2, tmp1, tmp2);
-    vpaddb(dst, dst, tmp2);
-  } else if (CpuFeatures::IsSupported(ATOM)) {
-    // Pre-Goldmont low-power Intel microarchitectures have very slow
-    // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
-    // algorithm on these processors. ATOM CPU feature captures exactly
-    // the right set of processors.
-    movaps(tmp1, src);
-    psrlw(tmp1, 1);
-    if (dst != src) {
-      movaps(dst, src);
-    }
-    andps(tmp1,
-          ExternalReferenceAsOperand(
-              ExternalReference::address_of_wasm_i8x16_splat_0x55(), scratch));
-    psubb(dst, tmp1);
-    Operand splat_0x33 = ExternalReferenceAsOperand(
-        ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
-    movaps(tmp1, dst);
-    andps(dst, splat_0x33);
-    psrlw(tmp1, 2);
-    andps(tmp1, splat_0x33);
-    paddb(dst, tmp1);
-    movaps(tmp1, dst);
-    psrlw(dst, 4);
-    paddb(dst, tmp1);
-    andps(dst,
-          ExternalReferenceAsOperand(
-              ExternalReference::address_of_wasm_i8x16_splat_0x0f(), scratch));
-  } else {
-    CpuFeatureScope sse_scope(this, SSSE3);
-    movaps(tmp1,
-           ExternalReferenceAsOperand(
-               ExternalReference::address_of_wasm_i8x16_splat_0x0f(), scratch));
-    Operand mask = ExternalReferenceAsOperand(
-        ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
-    if (tmp2 != tmp1) {
-      movaps(tmp2, tmp1);
-    }
-    andps(tmp1, src);
-    andnps(tmp2, src);
-    psrlw(tmp2, 4);
-    movaps(dst, mask);
-    pshufb(dst, tmp1);
-    movaps(tmp1, mask);
-    pshufb(tmp1, tmp2);
-    paddb(dst, tmp1);
-  }
-}
-
 void TurboAssembler::ShlPair(Register high, Register low, uint8_t shift) {
   DCHECK_GE(63, shift);
   if (shift >= 32) {
diff --git a/src/codegen/ia32/macro-assembler-ia32.h b/src/codegen/ia32/macro-assembler-ia32.h
index 236c56efeb..c254e76d10 100644
--- a/src/codegen/ia32/macro-assembler-ia32.h
+++ b/src/codegen/ia32/macro-assembler-ia32.h
@@ -383,12 +383,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
   }
   void Cvttsd2ui(Register dst, Operand src, XMMRegister tmp);
 
-  // These Wasm SIMD ops do not have direct lowerings on IA32. These
-  // helpers are optimized to produce the fastest and smallest codegen.
-  // Defined here to allow usage on both TurboFan and Liftoff.
-  void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
-                   XMMRegister tmp2, Register scratch);
-
   void Push(Register src) { push(src); }
   void Push(Operand src) { push(src); }
   void Push(Immediate value);
diff --git a/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h b/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h
index 853f7ef887..db78da5b74 100644
--- a/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h
+++ b/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h
@@ -626,6 +626,75 @@ class V8_EXPORT_PRIVATE SharedTurboAssemblerBase : public SharedTurboAssembler {
     }
   }
 
+  void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
+                   XMMRegister tmp2, Register scratch) {
+    ASM_CODE_COMMENT(this);
+    DCHECK_NE(dst, tmp1);
+    DCHECK_NE(src, tmp1);
+    DCHECK_NE(dst, tmp2);
+    DCHECK_NE(src, tmp2);
+    if (CpuFeatures::IsSupported(AVX)) {
+      CpuFeatureScope avx_scope(this, AVX);
+      vmovdqa(tmp1, ExternalReferenceAsOperand(
+                        ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
+                        scratch));
+      vpandn(tmp2, tmp1, src);
+      vpand(dst, tmp1, src);
+      vmovdqa(tmp1, ExternalReferenceAsOperand(
+                        ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
+                        scratch));
+      vpsrlw(tmp2, tmp2, 4);
+      vpshufb(dst, tmp1, dst);
+      vpshufb(tmp2, tmp1, tmp2);
+      vpaddb(dst, dst, tmp2);
+    } else if (CpuFeatures::IsSupported(ATOM)) {
+      // Pre-Goldmont low-power Intel microarchitectures have very slow
+      // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
+      // algorithm on these processors. ATOM CPU feature captures exactly
+      // the right set of processors.
+      movaps(tmp1, src);
+      psrlw(tmp1, 1);
+      if (dst != src) {
+        movaps(dst, src);
+      }
+      andps(tmp1, ExternalReferenceAsOperand(
+                      ExternalReference::address_of_wasm_i8x16_splat_0x55(),
+                      scratch));
+      psubb(dst, tmp1);
+      Operand splat_0x33 = ExternalReferenceAsOperand(
+          ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
+      movaps(tmp1, dst);
+      andps(dst, splat_0x33);
+      psrlw(tmp1, 2);
+      andps(tmp1, splat_0x33);
+      paddb(dst, tmp1);
+      movaps(tmp1, dst);
+      psrlw(dst, 4);
+      paddb(dst, tmp1);
+      andps(dst, ExternalReferenceAsOperand(
+                     ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
+                     scratch));
+    } else {
+      CpuFeatureScope sse_scope(this, SSSE3);
+      movaps(tmp1, ExternalReferenceAsOperand(
+                       ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
+                       scratch));
+      Operand mask = ExternalReferenceAsOperand(
+          ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
+      if (tmp2 != tmp1) {
+        movaps(tmp2, tmp1);
+      }
+      andps(tmp1, src);
+      andnps(tmp2, src);
+      psrlw(tmp2, 4);
+      movaps(dst, mask);
+      pshufb(dst, tmp1);
+      movaps(tmp1, mask);
+      pshufb(tmp1, tmp2);
+      paddb(dst, tmp1);
+    }
+  }
+
  private:
   // All implementation-specific methods must be called through this.
   Impl* impl() { return static_cast<Impl*>(this); }
diff --git a/src/codegen/x64/macro-assembler-x64.cc b/src/codegen/x64/macro-assembler-x64.cc
index 2990633a6a..19228ad66e 100644
--- a/src/codegen/x64/macro-assembler-x64.cc
+++ b/src/codegen/x64/macro-assembler-x64.cc
@@ -2202,65 +2202,6 @@ void TurboAssembler::Blendvpd(XMMRegister dst, XMMRegister src1,
   }
 }
 
-void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
-                                 XMMRegister tmp) {
-  DCHECK_NE(dst, tmp);
-  DCHECK_NE(src, tmp);
-  DCHECK_NE(kScratchDoubleReg, tmp);
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vmovdqa(tmp, ExternalReferenceAsOperand(
-                     ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
-    vpandn(kScratchDoubleReg, tmp, src);
-    vpand(dst, tmp, src);
-    vmovdqa(tmp, ExternalReferenceAsOperand(
-                     ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
-    vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
-    vpshufb(dst, tmp, dst);
-    vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
-    vpaddb(dst, dst, kScratchDoubleReg);
-  } else if (CpuFeatures::IsSupported(ATOM)) {
-    // Pre-Goldmont low-power Intel microarchitectures have very slow
-    // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
-    // algorithm on these processors. ATOM CPU feature captures exactly
-    // the right set of processors.
-    movaps(tmp, src);
-    psrlw(tmp, 1);
-    if (dst != src) {
-      movaps(dst, src);
-    }
-    andps(tmp, ExternalReferenceAsOperand(
-                   ExternalReference::address_of_wasm_i8x16_splat_0x55()));
-    psubb(dst, tmp);
-    Operand splat_0x33 = ExternalReferenceAsOperand(
-        ExternalReference::address_of_wasm_i8x16_splat_0x33());
-    movaps(tmp, dst);
-    andps(dst, splat_0x33);
-    psrlw(tmp, 2);
-    andps(tmp, splat_0x33);
-    paddb(dst, tmp);
-    movaps(tmp, dst);
-    psrlw(dst, 4);
-    paddb(dst, tmp);
-    andps(dst, ExternalReferenceAsOperand(
-                   ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
-  } else {
-    movaps(tmp, ExternalReferenceAsOperand(
-                    ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
-    Operand mask = ExternalReferenceAsOperand(
-        ExternalReference::address_of_wasm_i8x16_popcnt_mask());
-    Move(kScratchDoubleReg, tmp);
-    andps(tmp, src);
-    andnps(kScratchDoubleReg, src);
-    psrlw(kScratchDoubleReg, 4);
-    movaps(dst, mask);
-    pshufb(dst, tmp);
-    movaps(tmp, mask);
-    pshufb(tmp, kScratchDoubleReg);
-    paddb(dst, tmp);
-  }
-}
-
 void TurboAssembler::Abspd(XMMRegister dst) {
   Andps(dst, ExternalReferenceAsOperand(
                  ExternalReference::address_of_double_abs_constant()));
diff --git a/src/codegen/x64/macro-assembler-x64.h b/src/codegen/x64/macro-assembler-x64.h
index ce9ddcea1e..7fa4ffc142 100644
--- a/src/codegen/x64/macro-assembler-x64.h
+++ b/src/codegen/x64/macro-assembler-x64.h
@@ -470,11 +470,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
   void Blendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister mask);
 
-  // These Wasm SIMD ops do not have direct lowerings on x64. These
-  // helpers are optimized to produce the fastest and smallest codegen.
-  // Defined here to allow usage on both TurboFan and Liftoff.
-  void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
-
   void Abspd(XMMRegister dst);
   void Negpd(XMMRegister dst);
 
diff --git a/src/compiler/backend/x64/code-generator-x64.cc b/src/compiler/backend/x64/code-generator-x64.cc
index 3f22d7e5e7..b54bbbd29a 100644
--- a/src/compiler/backend/x64/code-generator-x64.cc
+++ b/src/compiler/backend/x64/code-generator-x64.cc
@@ -3718,7 +3718,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kX64I8x16Popcnt: {
       __ I8x16Popcnt(i.OutputSimd128Register(), i.InputSimd128Register(0),
-                     i.TempSimd128Register(0));
+                     i.TempSimd128Register(0), kScratchDoubleReg,
+                     kScratchRegister);
       break;
     }
    case kX64S128Load8Splat: {
diff --git a/src/wasm/baseline/x64/liftoff-assembler-x64.h b/src/wasm/baseline/x64/liftoff-assembler-x64.h
index 6636acd1d9..1180cd531f 100644
--- a/src/wasm/baseline/x64/liftoff-assembler-x64.h
+++ b/src/wasm/baseline/x64/liftoff-assembler-x64.h
@@ -2488,7 +2488,8 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
 
 void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
                                          LiftoffRegister src) {
-  I8x16Popcnt(dst.fp(), src.fp(), liftoff::kScratchDoubleReg2);
+  I8x16Popcnt(dst.fp(), src.fp(), kScratchDoubleReg,
+              liftoff::kScratchDoubleReg2, kScratchRegister);
 }
 
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
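
A note on the shared helper's fast path, in scalar form: the AVX and SSSE3 branches split every byte into its low and high nibble (vpand with the 0x0F splat; vpandn plus a 4-bit shift) and then use vpshufb as a 16-entry table lookup against the table behind address_of_wasm_i8x16_popcnt_mask(), adding the two lookups per byte. Below is a minimal per-byte sketch of that idea; it is not code from the patch, and the function and table names are illustrative only.

#include <cstdint>

// Per-byte sketch of the PSHUFB-based popcount used by the AVX/SSSE3 paths:
// look up the popcount of each 4-bit half of the byte and add the results.
inline uint8_t PopcntByteViaNibbleTable(uint8_t x) {
  // Popcount of every 4-bit value 0..15 -- the role the popcnt_mask table
  // plays in the SIMD code.
  static const uint8_t kNibblePopcnt[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                            1, 2, 2, 3, 2, 3, 3, 4};
  uint8_t lo = x & 0x0F;          // vpand with the 0x0F splat
  uint8_t hi = (x & ~0x0F) >> 4;  // vpandn, then vpsrlw by 4
  return kNibblePopcnt[lo] + kNibblePopcnt[hi];  // two vpshufb lookups + vpaddb
}

The SIMD code masks before shifting because there is no 8-bit vector shift; once the high nibbles are isolated, the 16-bit vpsrlw cannot pull set bits across byte boundaries.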
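
The ATOM branch avoids PSHUFB and instead uses the classic divide-and-conquer (SWAR) popcount, which is what the 0x55, 0x33, and 0x0F splat constants are for. A per-byte scalar sketch of those three steps, again illustrative rather than taken from the patch:

#include <cstdint>

// Per-byte sketch of the PSHUFB-free (ATOM) path: sum bits in pairs, then
// nibbles, then the whole byte -- the same mask-and-add steps the SIMD code
// performs on all 16 bytes at once.
inline uint8_t PopcntByteSwar(uint8_t b) {
  unsigned x = b;
  x = x - ((x >> 1) & 0x55);           // each 2-bit field now holds 0..2
  x = (x & 0x33) + ((x >> 2) & 0x33);  // each 4-bit field now holds 0..4
  x = (x + (x >> 4)) & 0x0F;           // the byte now holds the final 0..8
  return static_cast<uint8_t>(x);
}

In the vector version the shifts are 16-bit (psrlw), but each step's byte-lane mask (the 0x55, 0x33, and 0x0F splats) clears any bits shifted in from the neighboring byte, so the per-byte result is unaffected.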