[wasm-simd][x64][liftoff] Implement i8x16.popcnt

Extract i8x16.popcnt implementation into a macro-assembler function, and reuse it in Liftoff. Bug: v8:11002 Change-Id: I86b2f5322c799d44f584cac28c70e0e393bf114f Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2676280 Reviewed-by: Clemens Backes <clemensb@chromium.org> Reviewed-by: Deepti Gandluri <gdeepti@chromium.org> Commit-Queue: Zhi An Ng <zhin@chromium.org> Cr-Commit-Position: refs/heads/master@{#72565}
2021-02-04 16:01:23 -08:00 · 2021-02-04 16:01:23 -08:00 · 00babf0718
commit 00babf0718
parent 2071cfd7b7
9 changed files with 85 additions and 61 deletions
--- a/src/codegen/x64/macro-assembler-x64.cc
+++ b/src/codegen/x64/macro-assembler-x64.cc
@ -2297,6 +2297,64 @@ void TurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
  }
 }

+void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
+                                 XMMRegister tmp) {
+  DCHECK_NE(dst, tmp);
+  DCHECK_NE(src, tmp);
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vmovdqa(tmp, ExternalReferenceAsOperand(
+                     ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
+    vpandn(kScratchDoubleReg, tmp, src);
+    vpand(dst, tmp, src);
+    vmovdqa(tmp, ExternalReferenceAsOperand(
+                     ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
+    vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
+    vpshufb(dst, tmp, dst);
+    vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
+    vpaddb(dst, dst, kScratchDoubleReg);
+  } else if (CpuFeatures::IsSupported(ATOM)) {
+    // Pre-Goldmont low-power Intel microarchitectures have very slow
+    // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
+    // algorithm on these processors. ATOM CPU feature captures exactly
+    // the right set of processors.
+    xorps(tmp, tmp);
+    pavgb(tmp, src);
+    if (dst != src) {
+      movaps(dst, src);
+    }
+    andps(tmp, ExternalReferenceAsOperand(
+                   ExternalReference::address_of_wasm_i8x16_splat_0x55()));
+    psubb(dst, tmp);
+    Operand splat_0x33 = ExternalReferenceAsOperand(
+        ExternalReference::address_of_wasm_i8x16_splat_0x33());
+    movaps(tmp, dst);
+    andps(dst, splat_0x33);
+    psrlw(tmp, 2);
+    andps(tmp, splat_0x33);
+    paddb(dst, tmp);
+    movaps(tmp, dst);
+    psrlw(dst, 4);
+    paddb(dst, tmp);
+    andps(dst, ExternalReferenceAsOperand(
+                   ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
+  } else {
+    movaps(tmp, ExternalReferenceAsOperand(
+                    ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
+    Operand mask = ExternalReferenceAsOperand(
+        ExternalReference::address_of_wasm_i8x16_popcnt_mask());
+    Move(kScratchDoubleReg, tmp);
+    andps(tmp, src);
+    andnps(kScratchDoubleReg, src);
+    psrlw(kScratchDoubleReg, 4);
+    movaps(dst, mask);
+    pshufb(dst, tmp);
+    movaps(tmp, mask);
+    pshufb(tmp, kScratchDoubleReg);
+    paddb(dst, tmp);
+  }
+}
+
 void TurboAssembler::Abspd(XMMRegister dst) {
  Andps(dst, ExternalReferenceAsOperand(
                 ExternalReference::address_of_double_abs_constant()));
--- a/src/codegen/x64/macro-assembler-x64.h
+++ b/src/codegen/x64/macro-assembler-x64.h
@ -608,6 +608,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
  void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
  void S128Store64Lane(Operand dst, XMMRegister src, uint8_t laneidx);

+  void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
+
  void Abspd(XMMRegister dst);
  void Negpd(XMMRegister dst);

--- a/src/compiler/backend/x64/code-generator-x64.cc
+++ b/src/compiler/backend/x64/code-generator-x64.cc
@ -3931,67 +3931,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      break;
    }
    case kX64I8x16Popcnt: {
-      XMMRegister dst = i.OutputSimd128Register();
-      XMMRegister src = i.InputSimd128Register(0);
-      XMMRegister tmp = i.TempSimd128Register(0);
-
-      if (CpuFeatures::IsSupported(AVX)) {
-        CpuFeatureScope avx_scope(tasm(), AVX);
-        __ vmovdqa(tmp,
-                   __ ExternalReferenceAsOperand(
-                       ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
-        __ vpandn(kScratchDoubleReg, tmp, src);
-        __ vpand(dst, tmp, src);
-        __ vmovdqa(tmp,
-                   __ ExternalReferenceAsOperand(
-                       ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
-        __ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
-        __ vpshufb(dst, tmp, dst);
-        __ vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
-        __ vpaddb(dst, dst, kScratchDoubleReg);
-      } else if (CpuFeatures::IsSupported(ATOM)) {
-        // Pre-Goldmont low-power Intel microarchitectures have very slow
-        // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
-        // algorithm on these processors. ATOM CPU feature captures exactly
-        // the right set of processors.
-        __ xorps(tmp, tmp);
-        __ pavgb(tmp, src);
-        if (dst != src) {
-          __ movaps(dst, src);
-        }
-        __ andps(tmp,
-                 __ ExternalReferenceAsOperand(
-                     ExternalReference::address_of_wasm_i8x16_splat_0x55()));
-        __ psubb(dst, tmp);
-        Operand splat_0x33 = __ ExternalReferenceAsOperand(
-            ExternalReference::address_of_wasm_i8x16_splat_0x33());
-        __ movaps(tmp, dst);
-        __ andps(dst, splat_0x33);
-        __ psrlw(tmp, 2);
-        __ andps(tmp, splat_0x33);
-        __ paddb(dst, tmp);
-        __ movaps(tmp, dst);
-        __ psrlw(dst, 4);
-        __ paddb(dst, tmp);
-        __ andps(dst,
-                 __ ExternalReferenceAsOperand(
-                     ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
-      } else {
-        __ movaps(tmp,
-                  __ ExternalReferenceAsOperand(
-                      ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
-        Operand mask = __ ExternalReferenceAsOperand(
-            ExternalReference::address_of_wasm_i8x16_popcnt_mask());
-        __ Move(kScratchDoubleReg, tmp);
-        __ andps(tmp, src);
-        __ andnps(kScratchDoubleReg, src);
-        __ psrlw(kScratchDoubleReg, 4);
-        __ movaps(dst, mask);
-        __ pshufb(dst, tmp);
-        __ movaps(tmp, mask);
-        __ pshufb(tmp, kScratchDoubleReg);
-        __ paddb(dst, tmp);
-      }
+      __ I8x16Popcnt(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                     i.TempSimd128Register(0));
      break;
    }
    case kX64S128Load8Splat: {
--- a/src/wasm/baseline/arm/liftoff-assembler-arm.h
+++ b/src/wasm/baseline/arm/liftoff-assembler-arm.h
@ -3402,6 +3402,11 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
  }
 }

+void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
+                                         LiftoffRegister src) {
+  bailout(kSimd, "i8x16.popcnt");
+}
+
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  vdup(Neon8, liftoff::GetSimd128Register(dst), src.gp());
--- a/src/wasm/baseline/arm64/liftoff-assembler-arm64.h
+++ b/src/wasm/baseline/arm64/liftoff-assembler-arm64.h
@ -2452,6 +2452,11 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
  }
 }

+void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
+                                         LiftoffRegister src) {
+  bailout(kSimd, "i8x16.popcnt");
+}
+
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Dup(dst.fp().V16B(), src.gp().W());
--- a/src/wasm/baseline/ia32/liftoff-assembler-ia32.h
+++ b/src/wasm/baseline/ia32/liftoff-assembler-ia32.h
@ -2874,6 +2874,11 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
  Pshufb(dst.fp(), lhs.fp(), mask);
 }

+void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
+                                         LiftoffRegister src) {
+  bailout(kSimd, "i8x16.popcnt");
+}
+
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Movd(dst.fp(), src.gp());
--- a/src/wasm/baseline/liftoff-assembler.h
+++ b/src/wasm/baseline/liftoff-assembler.h
@ -887,6 +887,7 @@ class LiftoffAssembler : public TurboAssembler {
                                 bool is_swizzle);
  inline void emit_i8x16_swizzle(LiftoffRegister dst, LiftoffRegister lhs,
                                 LiftoffRegister rhs);
+  inline void emit_i8x16_popcnt(LiftoffRegister dst, LiftoffRegister src);
  inline void emit_i8x16_splat(LiftoffRegister dst, LiftoffRegister src);
  inline void emit_i16x8_splat(LiftoffRegister dst, LiftoffRegister src);
  inline void emit_i32x4_splat(LiftoffRegister dst, LiftoffRegister src);
--- a/src/wasm/baseline/liftoff-compiler.cc
+++ b/src/wasm/baseline/liftoff-compiler.cc
@ -2946,6 +2946,8 @@ class LiftoffCompiler {
    switch (opcode) {
      case wasm::kExprI8x16Swizzle:
        return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_swizzle);
+      case wasm::kExprI8x16Popcnt:
+        return EmitUnOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_popcnt);
      case wasm::kExprI8x16Splat:
        return EmitUnOp<kI32, kS128>(&LiftoffAssembler::emit_i8x16_splat);
      case wasm::kExprI16x8Splat:
--- a/src/wasm/baseline/x64/liftoff-assembler-x64.h
+++ b/src/wasm/baseline/x64/liftoff-assembler-x64.h
@ -2472,6 +2472,11 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
  Pshufb(dst.fp(), lhs.fp(), mask);
 }

+void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
+                                         LiftoffRegister src) {
+  I8x16Popcnt(dst.fp(), src.fp(), liftoff::kScratchDoubleReg2);
+}
+
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Movd(dst.fp(), src.gp());