Revert "[wasm-simd][x64][liftoff] Implement i8x16.popcnt"

This reverts commit 00babf0718. Reason for revert: Broke mac64 https://ci.chromium.org/ui/p/v8/builders/ci/V8%20Mac64/38510/overview Original change's description: > [wasm-simd][x64][liftoff] Implement i8x16.popcnt > > Extract i8x16.popcnt implementation into a macro-assembler function, and > reuse it in Liftoff. > > Bug: v8:11002 > Change-Id: I86b2f5322c799d44f584cac28c70e0e393bf114f > Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2676280 > Reviewed-by: Clemens Backes <clemensb@chromium.org> > Reviewed-by: Deepti Gandluri <gdeepti@chromium.org> > Commit-Queue: Zhi An Ng <zhin@chromium.org> > Cr-Commit-Position: refs/heads/master@{#72565} TBR=gdeepti@chromium.org,clemensb@chromium.org,zhin@chromium.org Change-Id: I5795b71f65d59237db59907d40c34e4fa7779fe1 No-Presubmit: true No-Tree-Checks: true No-Try: true Bug: v8:11002 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2682505 Reviewed-by: Zhi An Ng <zhin@chromium.org> Commit-Queue: Zhi An Ng <zhin@chromium.org> Cr-Commit-Position: refs/heads/master@{#72566}
2021-02-08 19:02:46 +00:00 · 2021-02-08 19:02:46 +00:00 · a723767935
commit a723767935
parent 00babf0718
9 changed files with 61 additions and 85 deletions
--- a/src/codegen/x64/macro-assembler-x64.cc
+++ b/src/codegen/x64/macro-assembler-x64.cc
@ -2297,64 +2297,6 @@ void TurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
  }
 }

-void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
-                                 XMMRegister tmp) {
-  DCHECK_NE(dst, tmp);
-  DCHECK_NE(src, tmp);
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vmovdqa(tmp, ExternalReferenceAsOperand(
-                     ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
-    vpandn(kScratchDoubleReg, tmp, src);
-    vpand(dst, tmp, src);
-    vmovdqa(tmp, ExternalReferenceAsOperand(
-                     ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
-    vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
-    vpshufb(dst, tmp, dst);
-    vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
-    vpaddb(dst, dst, kScratchDoubleReg);
-  } else if (CpuFeatures::IsSupported(ATOM)) {
-    // Pre-Goldmont low-power Intel microarchitectures have very slow
-    // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
-    // algorithm on these processors. ATOM CPU feature captures exactly
-    // the right set of processors.
-    xorps(tmp, tmp);
-    pavgb(tmp, src);
-    if (dst != src) {
-      movaps(dst, src);
-    }
-    andps(tmp, ExternalReferenceAsOperand(
-                   ExternalReference::address_of_wasm_i8x16_splat_0x55()));
-    psubb(dst, tmp);
-    Operand splat_0x33 = ExternalReferenceAsOperand(
-        ExternalReference::address_of_wasm_i8x16_splat_0x33());
-    movaps(tmp, dst);
-    andps(dst, splat_0x33);
-    psrlw(tmp, 2);
-    andps(tmp, splat_0x33);
-    paddb(dst, tmp);
-    movaps(tmp, dst);
-    psrlw(dst, 4);
-    paddb(dst, tmp);
-    andps(dst, ExternalReferenceAsOperand(
-                   ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
-  } else {
-    movaps(tmp, ExternalReferenceAsOperand(
-                    ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
-    Operand mask = ExternalReferenceAsOperand(
-        ExternalReference::address_of_wasm_i8x16_popcnt_mask());
-    Move(kScratchDoubleReg, tmp);
-    andps(tmp, src);
-    andnps(kScratchDoubleReg, src);
-    psrlw(kScratchDoubleReg, 4);
-    movaps(dst, mask);
-    pshufb(dst, tmp);
-    movaps(tmp, mask);
-    pshufb(tmp, kScratchDoubleReg);
-    paddb(dst, tmp);
-  }
-}
-
 void TurboAssembler::Abspd(XMMRegister dst) {
  Andps(dst, ExternalReferenceAsOperand(
                 ExternalReference::address_of_double_abs_constant()));
--- a/src/codegen/x64/macro-assembler-x64.h
+++ b/src/codegen/x64/macro-assembler-x64.h
@ -608,8 +608,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
  void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
  void S128Store64Lane(Operand dst, XMMRegister src, uint8_t laneidx);

-  void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
-
  void Abspd(XMMRegister dst);
  void Negpd(XMMRegister dst);

--- a/src/compiler/backend/x64/code-generator-x64.cc
+++ b/src/compiler/backend/x64/code-generator-x64.cc
@ -3931,8 +3931,67 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      break;
    }
    case kX64I8x16Popcnt: {
-      __ I8x16Popcnt(i.OutputSimd128Register(), i.InputSimd128Register(0),
-                     i.TempSimd128Register(0));
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src = i.InputSimd128Register(0);
+      XMMRegister tmp = i.TempSimd128Register(0);
+
+      if (CpuFeatures::IsSupported(AVX)) {
+        CpuFeatureScope avx_scope(tasm(), AVX);
+        __ vmovdqa(tmp,
+                   __ ExternalReferenceAsOperand(
+                       ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
+        __ vpandn(kScratchDoubleReg, tmp, src);
+        __ vpand(dst, tmp, src);
+        __ vmovdqa(tmp,
+                   __ ExternalReferenceAsOperand(
+                       ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
+        __ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
+        __ vpshufb(dst, tmp, dst);
+        __ vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
+        __ vpaddb(dst, dst, kScratchDoubleReg);
+      } else if (CpuFeatures::IsSupported(ATOM)) {
+        // Pre-Goldmont low-power Intel microarchitectures have very slow
+        // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
+        // algorithm on these processors. ATOM CPU feature captures exactly
+        // the right set of processors.
+        __ xorps(tmp, tmp);
+        __ pavgb(tmp, src);
+        if (dst != src) {
+          __ movaps(dst, src);
+        }
+        __ andps(tmp,
+                 __ ExternalReferenceAsOperand(
+                     ExternalReference::address_of_wasm_i8x16_splat_0x55()));
+        __ psubb(dst, tmp);
+        Operand splat_0x33 = __ ExternalReferenceAsOperand(
+            ExternalReference::address_of_wasm_i8x16_splat_0x33());
+        __ movaps(tmp, dst);
+        __ andps(dst, splat_0x33);
+        __ psrlw(tmp, 2);
+        __ andps(tmp, splat_0x33);
+        __ paddb(dst, tmp);
+        __ movaps(tmp, dst);
+        __ psrlw(dst, 4);
+        __ paddb(dst, tmp);
+        __ andps(dst,
+                 __ ExternalReferenceAsOperand(
+                     ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
+      } else {
+        __ movaps(tmp,
+                  __ ExternalReferenceAsOperand(
+                      ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
+        Operand mask = __ ExternalReferenceAsOperand(
+            ExternalReference::address_of_wasm_i8x16_popcnt_mask());
+        __ Move(kScratchDoubleReg, tmp);
+        __ andps(tmp, src);
+        __ andnps(kScratchDoubleReg, src);
+        __ psrlw(kScratchDoubleReg, 4);
+        __ movaps(dst, mask);
+        __ pshufb(dst, tmp);
+        __ movaps(tmp, mask);
+        __ pshufb(tmp, kScratchDoubleReg);
+        __ paddb(dst, tmp);
+      }
      break;
    }
    case kX64S128Load8Splat: {
--- a/src/wasm/baseline/arm/liftoff-assembler-arm.h
+++ b/src/wasm/baseline/arm/liftoff-assembler-arm.h
@ -3402,11 +3402,6 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
  }
 }

-void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
-                                         LiftoffRegister src) {
-  bailout(kSimd, "i8x16.popcnt");
-}
-
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  vdup(Neon8, liftoff::GetSimd128Register(dst), src.gp());
--- a/src/wasm/baseline/arm64/liftoff-assembler-arm64.h
+++ b/src/wasm/baseline/arm64/liftoff-assembler-arm64.h
@ -2452,11 +2452,6 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
  }
 }

-void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
-                                         LiftoffRegister src) {
-  bailout(kSimd, "i8x16.popcnt");
-}
-
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Dup(dst.fp().V16B(), src.gp().W());
--- a/src/wasm/baseline/ia32/liftoff-assembler-ia32.h
+++ b/src/wasm/baseline/ia32/liftoff-assembler-ia32.h
@ -2874,11 +2874,6 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
  Pshufb(dst.fp(), lhs.fp(), mask);
 }

-void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
-                                         LiftoffRegister src) {
-  bailout(kSimd, "i8x16.popcnt");
-}
-
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Movd(dst.fp(), src.gp());
--- a/src/wasm/baseline/liftoff-assembler.h
+++ b/src/wasm/baseline/liftoff-assembler.h
@ -887,7 +887,6 @@ class LiftoffAssembler : public TurboAssembler {
                                 bool is_swizzle);
  inline void emit_i8x16_swizzle(LiftoffRegister dst, LiftoffRegister lhs,
                                 LiftoffRegister rhs);
-  inline void emit_i8x16_popcnt(LiftoffRegister dst, LiftoffRegister src);
  inline void emit_i8x16_splat(LiftoffRegister dst, LiftoffRegister src);
  inline void emit_i16x8_splat(LiftoffRegister dst, LiftoffRegister src);
  inline void emit_i32x4_splat(LiftoffRegister dst, LiftoffRegister src);
--- a/src/wasm/baseline/liftoff-compiler.cc
+++ b/src/wasm/baseline/liftoff-compiler.cc
@ -2946,8 +2946,6 @@ class LiftoffCompiler {
    switch (opcode) {
      case wasm::kExprI8x16Swizzle:
        return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_swizzle);
-      case wasm::kExprI8x16Popcnt:
-        return EmitUnOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_popcnt);
      case wasm::kExprI8x16Splat:
        return EmitUnOp<kI32, kS128>(&LiftoffAssembler::emit_i8x16_splat);
      case wasm::kExprI16x8Splat:
--- a/src/wasm/baseline/x64/liftoff-assembler-x64.h
+++ b/src/wasm/baseline/x64/liftoff-assembler-x64.h
@ -2472,11 +2472,6 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
  Pshufb(dst.fp(), lhs.fp(), mask);
 }

-void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
-                                         LiftoffRegister src) {
-  I8x16Popcnt(dst.fp(), src.fp(), liftoff::kScratchDoubleReg2);
-}
-
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Movd(dst.fp(), src.gp());