From a723767935dec385818d1134ea729a4c3a3ddcfb Mon Sep 17 00:00:00 2001
From: Zhi An Ng
Date: Mon, 8 Feb 2021 19:02:46 +0000
Subject: [PATCH] Revert "[wasm-simd][x64][liftoff] Implement i8x16.popcnt"

This reverts commit 00babf0718492f0ed18744fa67f132af0d0ec584.

Reason for revert: Broke mac64 https://ci.chromium.org/ui/p/v8/builders/ci/V8%20Mac64/38510/overview

Original change's description:
> [wasm-simd][x64][liftoff] Implement i8x16.popcnt
>
> Extract i8x16.popcnt implementation into a macro-assembler function, and
> reuse it in Liftoff.
>
> Bug: v8:11002
> Change-Id: I86b2f5322c799d44f584cac28c70e0e393bf114f
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2676280
> Reviewed-by: Clemens Backes
> Reviewed-by: Deepti Gandluri
> Commit-Queue: Zhi An Ng
> Cr-Commit-Position: refs/heads/master@{#72565}

TBR=gdeepti@chromium.org,clemensb@chromium.org,zhin@chromium.org

Change-Id: I5795b71f65d59237db59907d40c34e4fa7779fe1
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Bug: v8:11002
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2682505
Reviewed-by: Zhi An Ng
Commit-Queue: Zhi An Ng
Cr-Commit-Position: refs/heads/master@{#72566}
---
 src/codegen/x64/macro-assembler-x64.cc        | 58 -----------------
 src/codegen/x64/macro-assembler-x64.h         |  2 -
 .../backend/x64/code-generator-x64.cc         | 63 ++++++++++++++++++-
 src/wasm/baseline/arm/liftoff-assembler-arm.h |  5 --
 .../baseline/arm64/liftoff-assembler-arm64.h  |  5 --
 .../baseline/ia32/liftoff-assembler-ia32.h    |  5 --
 src/wasm/baseline/liftoff-assembler.h         |  1 -
 src/wasm/baseline/liftoff-compiler.cc         |  2 -
 src/wasm/baseline/x64/liftoff-assembler-x64.h |  5 --
 9 files changed, 61 insertions(+), 85 deletions(-)
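Notes (editor's illustration, not part of the applied diff): both branches of
the reverted I8x16Popcnt compute a per-byte population count. The AVX and
plain-SSE branches split every byte into two nibbles and use PSHUFB as a
16-entry table lookup (address_of_wasm_i8x16_popcnt_mask points at that
table, address_of_wasm_i8x16_splat_0x0f at the nibble mask); the ATOM branch
uses a PSHUFB-free divide-and-conquer reduction built from the 0x55/0x33/0x0f
masks. The scalar C++ sketch below models those two ideas one byte at a time,
with invented helper names; the vector code applies the same arithmetic to
all 16 lanes of an XMM register at once, reaching for PAVGB and word shifts
because SSE has no per-byte shift. It is a sketch of the underlying
algorithms, not a transcription of the assembly sequences.

#include <cstdint>
#include <cstdio>

// Strategy of the AVX/SSE branches: look each nibble up in a table of the
// popcounts of 0..15. PSHUFB performs this lookup for 16 lanes in parallel.
uint8_t PopcntByNibbleTable(uint8_t x) {
  static const uint8_t kNibblePopcnt[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                            1, 2, 2, 3, 2, 3, 3, 4};
  return kNibblePopcnt[x & 0x0f] + kNibblePopcnt[x >> 4];
}

// Strategy of the ATOM branch, in its textbook scalar form: fold adjacent
// bit fields together with the 0x55, 0x33 and 0x0f masks.
uint8_t PopcntByDivideAndConquer(uint8_t x) {
  x = x - ((x >> 1) & 0x55);           // 2-bit sums of adjacent bits
  x = (x & 0x33) + ((x >> 2) & 0x33);  // 4-bit sums of adjacent 2-bit fields
  x = (x + (x >> 4)) & 0x0f;           // total for the byte
  return x;
}

int main() {
  for (int i = 0; i < 256; i++) {
    uint8_t b = static_cast<uint8_t>(i);
    if (PopcntByNibbleTable(b) != PopcntByDivideAndConquer(b)) {
      printf("mismatch at %d\n", i);
      return 1;
    }
  }
  printf("both strategies agree on all 256 byte values\n");
  return 0;
}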
diff --git a/src/codegen/x64/macro-assembler-x64.cc b/src/codegen/x64/macro-assembler-x64.cc
index f840e884ed..bf10fa007a 100644
--- a/src/codegen/x64/macro-assembler-x64.cc
+++ b/src/codegen/x64/macro-assembler-x64.cc
@@ -2297,64 +2297,6 @@ void TurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
   }
 }
 
-void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
-                                 XMMRegister tmp) {
-  DCHECK_NE(dst, tmp);
-  DCHECK_NE(src, tmp);
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vmovdqa(tmp, ExternalReferenceAsOperand(
-                     ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
-    vpandn(kScratchDoubleReg, tmp, src);
-    vpand(dst, tmp, src);
-    vmovdqa(tmp, ExternalReferenceAsOperand(
-                     ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
-    vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
-    vpshufb(dst, tmp, dst);
-    vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
-    vpaddb(dst, dst, kScratchDoubleReg);
-  } else if (CpuFeatures::IsSupported(ATOM)) {
-    // Pre-Goldmont low-power Intel microarchitectures have very slow
-    // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
-    // algorithm on these processors. ATOM CPU feature captures exactly
-    // the right set of processors.
-    xorps(tmp, tmp);
-    pavgb(tmp, src);
-    if (dst != src) {
-      movaps(dst, src);
-    }
-    andps(tmp, ExternalReferenceAsOperand(
-                   ExternalReference::address_of_wasm_i8x16_splat_0x55()));
-    psubb(dst, tmp);
-    Operand splat_0x33 = ExternalReferenceAsOperand(
-        ExternalReference::address_of_wasm_i8x16_splat_0x33());
-    movaps(tmp, dst);
-    andps(dst, splat_0x33);
-    psrlw(tmp, 2);
-    andps(tmp, splat_0x33);
-    paddb(dst, tmp);
-    movaps(tmp, dst);
-    psrlw(dst, 4);
-    paddb(dst, tmp);
-    andps(dst, ExternalReferenceAsOperand(
-                   ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
-  } else {
-    movaps(tmp, ExternalReferenceAsOperand(
-                    ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
-    Operand mask = ExternalReferenceAsOperand(
-        ExternalReference::address_of_wasm_i8x16_popcnt_mask());
-    Move(kScratchDoubleReg, tmp);
-    andps(tmp, src);
-    andnps(kScratchDoubleReg, src);
-    psrlw(kScratchDoubleReg, 4);
-    movaps(dst, mask);
-    pshufb(dst, tmp);
-    movaps(tmp, mask);
-    pshufb(tmp, kScratchDoubleReg);
-    paddb(dst, tmp);
-  }
-}
-
 void TurboAssembler::Abspd(XMMRegister dst) {
   Andps(dst, ExternalReferenceAsOperand(
                  ExternalReference::address_of_double_abs_constant()));
diff --git a/src/codegen/x64/macro-assembler-x64.h b/src/codegen/x64/macro-assembler-x64.h
index d397bbd55b..2536ea0ff0 100644
--- a/src/codegen/x64/macro-assembler-x64.h
+++ b/src/codegen/x64/macro-assembler-x64.h
@@ -608,8 +608,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
   void S128Store64Lane(Operand dst, XMMRegister src, uint8_t laneidx);
 
-  void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
-
   void Abspd(XMMRegister dst);
   void Negpd(XMMRegister dst);
 
diff --git a/src/compiler/backend/x64/code-generator-x64.cc b/src/compiler/backend/x64/code-generator-x64.cc
index 85b1298b89..71da878f98 100644
--- a/src/compiler/backend/x64/code-generator-x64.cc
+++ b/src/compiler/backend/x64/code-generator-x64.cc
@@ -3931,8 +3931,67 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64I8x16Popcnt: {
-      __ I8x16Popcnt(i.OutputSimd128Register(), i.InputSimd128Register(0),
-                     i.TempSimd128Register(0));
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src = i.InputSimd128Register(0);
+      XMMRegister tmp = i.TempSimd128Register(0);
+
+      if (CpuFeatures::IsSupported(AVX)) {
+        CpuFeatureScope avx_scope(tasm(), AVX);
+        __ vmovdqa(tmp,
+                   __ ExternalReferenceAsOperand(
+                       ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
+        __ vpandn(kScratchDoubleReg, tmp, src);
+        __ vpand(dst, tmp, src);
+        __ vmovdqa(tmp,
+                   __ ExternalReferenceAsOperand(
+                       ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
+        __ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
+        __ vpshufb(dst, tmp, dst);
+        __ vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
+        __ vpaddb(dst, dst, kScratchDoubleReg);
+      } else if (CpuFeatures::IsSupported(ATOM)) {
+        // Pre-Goldmont low-power Intel microarchitectures have very slow
+        // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
+        // algorithm on these processors. ATOM CPU feature captures exactly
+        // the right set of processors.
+        __ xorps(tmp, tmp);
+        __ pavgb(tmp, src);
+        if (dst != src) {
+          __ movaps(dst, src);
+        }
+        __ andps(tmp,
+                 __ ExternalReferenceAsOperand(
+                     ExternalReference::address_of_wasm_i8x16_splat_0x55()));
+        __ psubb(dst, tmp);
+        Operand splat_0x33 = __ ExternalReferenceAsOperand(
+            ExternalReference::address_of_wasm_i8x16_splat_0x33());
+        __ movaps(tmp, dst);
+        __ andps(dst, splat_0x33);
+        __ psrlw(tmp, 2);
+        __ andps(tmp, splat_0x33);
+        __ paddb(dst, tmp);
+        __ movaps(tmp, dst);
+        __ psrlw(dst, 4);
+        __ paddb(dst, tmp);
+        __ andps(dst,
+                 __ ExternalReferenceAsOperand(
+                     ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
+      } else {
+        __ movaps(tmp,
+                  __ ExternalReferenceAsOperand(
+                      ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
+        Operand mask = __ ExternalReferenceAsOperand(
+            ExternalReference::address_of_wasm_i8x16_popcnt_mask());
+        __ Move(kScratchDoubleReg, tmp);
+        __ andps(tmp, src);
+        __ andnps(kScratchDoubleReg, src);
+        __ psrlw(kScratchDoubleReg, 4);
+        __ movaps(dst, mask);
+        __ pshufb(dst, tmp);
+        __ movaps(tmp, mask);
+        __ pshufb(tmp, kScratchDoubleReg);
+        __ paddb(dst, tmp);
+      }
       break;
     }
     case kX64S128Load8Splat: {
diff --git a/src/wasm/baseline/arm/liftoff-assembler-arm.h b/src/wasm/baseline/arm/liftoff-assembler-arm.h
index bb47beb298..8474a9d2f7 100644
--- a/src/wasm/baseline/arm/liftoff-assembler-arm.h
+++ b/src/wasm/baseline/arm/liftoff-assembler-arm.h
@@ -3402,11 +3402,6 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
   }
 }
 
-void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
-                                         LiftoffRegister src) {
-  bailout(kSimd, "i8x16.popcnt");
-}
-
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                         LiftoffRegister src) {
   vdup(Neon8, liftoff::GetSimd128Register(dst), src.gp());
diff --git a/src/wasm/baseline/arm64/liftoff-assembler-arm64.h b/src/wasm/baseline/arm64/liftoff-assembler-arm64.h
index 2a0977d261..a08a4a5739 100644
--- a/src/wasm/baseline/arm64/liftoff-assembler-arm64.h
+++ b/src/wasm/baseline/arm64/liftoff-assembler-arm64.h
@@ -2452,11 +2452,6 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
   }
 }
 
-void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
-                                         LiftoffRegister src) {
-  bailout(kSimd, "i8x16.popcnt");
-}
-
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                         LiftoffRegister src) {
   Dup(dst.fp().V16B(), src.gp().W());
diff --git a/src/wasm/baseline/ia32/liftoff-assembler-ia32.h b/src/wasm/baseline/ia32/liftoff-assembler-ia32.h
index 9cfe31bd1f..4495676868 100644
--- a/src/wasm/baseline/ia32/liftoff-assembler-ia32.h
+++ b/src/wasm/baseline/ia32/liftoff-assembler-ia32.h
@@ -2874,11 +2874,6 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
   Pshufb(dst.fp(), lhs.fp(), mask);
 }
 
-void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
-                                         LiftoffRegister src) {
-  bailout(kSimd, "i8x16.popcnt");
-}
-
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                         LiftoffRegister src) {
   Movd(dst.fp(), src.gp());
diff --git a/src/wasm/baseline/liftoff-assembler.h b/src/wasm/baseline/liftoff-assembler.h
index 3a2e2b0bc5..f145486bc5 100644
--- a/src/wasm/baseline/liftoff-assembler.h
+++ b/src/wasm/baseline/liftoff-assembler.h
@@ -887,7 +887,6 @@ class LiftoffAssembler : public TurboAssembler {
                                 bool is_swizzle);
   inline void emit_i8x16_swizzle(LiftoffRegister dst, LiftoffRegister lhs,
                                  LiftoffRegister rhs);
-  inline void emit_i8x16_popcnt(LiftoffRegister dst, LiftoffRegister src);
   inline void emit_i8x16_splat(LiftoffRegister dst, LiftoffRegister src);
   inline void emit_i16x8_splat(LiftoffRegister dst, LiftoffRegister src);
   inline void emit_i32x4_splat(LiftoffRegister dst, LiftoffRegister src);
diff --git a/src/wasm/baseline/liftoff-compiler.cc b/src/wasm/baseline/liftoff-compiler.cc
index 439c97aa24..582c3f4b1e 100644
--- a/src/wasm/baseline/liftoff-compiler.cc
+++ b/src/wasm/baseline/liftoff-compiler.cc
@@ -2946,8 +2946,6 @@ class LiftoffCompiler {
     switch (opcode) {
       case wasm::kExprI8x16Swizzle:
         return EmitBinOp(&LiftoffAssembler::emit_i8x16_swizzle);
-      case wasm::kExprI8x16Popcnt:
-        return EmitUnOp(&LiftoffAssembler::emit_i8x16_popcnt);
       case wasm::kExprI8x16Splat:
         return EmitUnOp(&LiftoffAssembler::emit_i8x16_splat);
       case wasm::kExprI16x8Splat:
diff --git a/src/wasm/baseline/x64/liftoff-assembler-x64.h b/src/wasm/baseline/x64/liftoff-assembler-x64.h
index f6b88259ee..b1e9c197b1 100644
--- a/src/wasm/baseline/x64/liftoff-assembler-x64.h
+++ b/src/wasm/baseline/x64/liftoff-assembler-x64.h
@@ -2472,11 +2472,6 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
   Pshufb(dst.fp(), lhs.fp(), mask);
 }
 
-void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
-                                         LiftoffRegister src) {
-  I8x16Popcnt(dst.fp(), src.fp(), liftoff::kScratchDoubleReg2);
-}
-
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                         LiftoffRegister src) {
   Movd(dst.fp(), src.gp());