[wasm-simd] Share i8x16.popcnt implementation

No functionality change; this moves the i8x16.popcnt algorithm into the
shared (ia32/x64) macro-assembler so both ports use a single implementation.

Bug: v8:11589
Change-Id: I3dd9d01589bf0176df1e33433f4c3c0c717c253d
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3122572
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76516}
Author: Ng Zhi An, 2021-08-26 12:03:28 -07:00, committed by V8 LUCI CQ
Commit: ba25a52e88 (parent f70cfb8840)
7 changed files with 73 additions and 142 deletions
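For context, i8x16.popcnt replaces each of the 16 byte lanes of a 128-bit vector with the number of set bits in that byte. A minimal scalar sketch of that semantics (the function name is illustrative, not from the V8 sources):

#include <array>
#include <cstdint>

// Reference semantics of Wasm i8x16.popcnt: count the set bits in every
// byte lane. The assembler helper below computes the same result for all
// 16 lanes in parallel with SSE/AVX instructions.
std::array<uint8_t, 16> I8x16PopcntReference(const std::array<uint8_t, 16>& src) {
  std::array<uint8_t, 16> dst;
  for (int lane = 0; lane < 16; ++lane) {
    uint8_t count = 0;
    for (int bit = 0; bit < 8; ++bit) count += (src[lane] >> bit) & 1;
    dst[lane] = count;
  }
  return dst;
}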


@@ -631,76 +631,6 @@ void TurboAssembler::Cvttsd2ui(Register dst, Operand src, XMMRegister tmp) {
add(dst, Immediate(0x80000000));
}
void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
XMMRegister tmp1, XMMRegister tmp2,
Register scratch) {
ASM_CODE_COMMENT(this);
DCHECK_NE(dst, tmp1);
DCHECK_NE(src, tmp1);
DCHECK_NE(dst, tmp2);
DCHECK_NE(src, tmp2);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vmovdqa(tmp1, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
scratch));
vpandn(tmp2, tmp1, src);
vpand(dst, tmp1, src);
vmovdqa(tmp1, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
scratch));
vpsrlw(tmp2, tmp2, 4);
vpshufb(dst, tmp1, dst);
vpshufb(tmp2, tmp1, tmp2);
vpaddb(dst, dst, tmp2);
} else if (CpuFeatures::IsSupported(ATOM)) {
// Pre-Goldmont low-power Intel microarchitectures have very slow
// PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
// algorithm on these processors. ATOM CPU feature captures exactly
// the right set of processors.
movaps(tmp1, src);
psrlw(tmp1, 1);
if (dst != src) {
movaps(dst, src);
}
andps(tmp1,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x55(), scratch));
psubb(dst, tmp1);
Operand splat_0x33 = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
movaps(tmp1, dst);
andps(dst, splat_0x33);
psrlw(tmp1, 2);
andps(tmp1, splat_0x33);
paddb(dst, tmp1);
movaps(tmp1, dst);
psrlw(dst, 4);
paddb(dst, tmp1);
andps(dst,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(), scratch));
} else {
CpuFeatureScope sse_scope(this, SSSE3);
movaps(tmp1,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(), scratch));
Operand mask = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
if (tmp2 != tmp1) {
movaps(tmp2, tmp1);
}
andps(tmp1, src);
andnps(tmp2, src);
psrlw(tmp2, 4);
movaps(dst, mask);
pshufb(dst, tmp1);
movaps(tmp1, mask);
pshufb(tmp1, tmp2);
paddb(dst, tmp1);
}
}
void TurboAssembler::ShlPair(Register high, Register low, uint8_t shift) {
DCHECK_GE(63, shift);
if (shift >= 32) {
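The AVX and SSSE3 branches of the removed function implement the standard nibble lookup-table popcount: the 0x0f splat masks out the low nibble of every byte, the andn/shift-by-4 pair extracts the high nibble, and PSHUFB uses each nibble to index a 16-entry table of nibble popcounts before the two partial counts are added with paddb. A scalar sketch of the same idea, assuming the popcnt-mask external reference holds the table shown here:

#include <array>
#include <cstdint>

// Scalar sketch of the PSHUFB lookup-table popcount. Assumption: the
// wasm_i8x16_popcnt_mask constant is the 16-entry nibble table below and
// wasm_i8x16_splat_0x0f is a vector of 0x0F bytes; the SIMD code performs
// these per-byte steps on all 16 lanes at once.
std::array<uint8_t, 16> I8x16PopcntLut(const std::array<uint8_t, 16>& src) {
  static constexpr uint8_t kNibblePopcnt[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                                1, 2, 2, 3, 2, 3, 3, 4};
  std::array<uint8_t, 16> dst;
  for (int lane = 0; lane < 16; ++lane) {
    uint8_t low = src[lane] & 0x0F;  // pand/vpand with the 0x0F splat
    uint8_t high = src[lane] >> 4;   // pandn/andnps plus psrlw by 4
    dst[lane] = kNibblePopcnt[low] + kNibblePopcnt[high];  // pshufb + paddb
  }
  return dst;
}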


@@ -383,12 +383,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
}
void Cvttsd2ui(Register dst, Operand src, XMMRegister tmp);
// These Wasm SIMD ops do not have direct lowerings on IA32. These
// helpers are optimized to produce the fastest and smallest codegen.
// Defined here to allow usage on both TurboFan and Liftoff.
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
XMMRegister tmp2, Register scratch);
void Push(Register src) { push(src); }
void Push(Operand src) { push(src); }
void Push(Immediate value);


@@ -626,6 +626,75 @@ class V8_EXPORT_PRIVATE SharedTurboAssemblerBase : public SharedTurboAssembler {
}
}
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
XMMRegister tmp2, Register scratch) {
ASM_CODE_COMMENT(this);
DCHECK_NE(dst, tmp1);
DCHECK_NE(src, tmp1);
DCHECK_NE(dst, tmp2);
DCHECK_NE(src, tmp2);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vmovdqa(tmp1, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
scratch));
vpandn(tmp2, tmp1, src);
vpand(dst, tmp1, src);
vmovdqa(tmp1, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
scratch));
vpsrlw(tmp2, tmp2, 4);
vpshufb(dst, tmp1, dst);
vpshufb(tmp2, tmp1, tmp2);
vpaddb(dst, dst, tmp2);
} else if (CpuFeatures::IsSupported(ATOM)) {
// Pre-Goldmont low-power Intel microarchitectures have very slow
// PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
// algorithm on these processors. ATOM CPU feature captures exactly
// the right set of processors.
movaps(tmp1, src);
psrlw(tmp1, 1);
if (dst != src) {
movaps(dst, src);
}
andps(tmp1, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x55(),
scratch));
psubb(dst, tmp1);
Operand splat_0x33 = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
movaps(tmp1, dst);
andps(dst, splat_0x33);
psrlw(tmp1, 2);
andps(tmp1, splat_0x33);
paddb(dst, tmp1);
movaps(tmp1, dst);
psrlw(dst, 4);
paddb(dst, tmp1);
andps(dst, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
scratch));
} else {
CpuFeatureScope sse_scope(this, SSSE3);
movaps(tmp1, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
scratch));
Operand mask = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
if (tmp2 != tmp1) {
movaps(tmp2, tmp1);
}
andps(tmp1, src);
andnps(tmp2, src);
psrlw(tmp2, 4);
movaps(dst, mask);
pshufb(dst, tmp1);
movaps(tmp1, mask);
pshufb(tmp1, tmp2);
paddb(dst, tmp1);
}
}
private:
// All implementation-specific methods must be called through this.
Impl* impl() { return static_cast<Impl*>(this); }
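The ATOM branch keeps the PSHUFB-free divide-and-conquer algorithm described in the comment: set bits are summed pairwise, then per nibble, then per byte, using psrlw for the shifts and the 0x55/0x33/0x0f splats to confine each partial sum to its own byte. A scalar sketch of the per-byte arithmetic, assuming the splat constants hold those repeated byte patterns:

#include <cstdint>

// Scalar sketch of the divide-and-conquer popcount used on ATOM. Each step
// halves the number of partial sums: single bits -> 2-bit counts -> 4-bit
// counts -> the final byte count. The SIMD code applies the same arithmetic
// to all 16 bytes with psrlw/andps/psubb/paddb.
uint8_t BytePopcntSwar(uint8_t x) {
  x = x - ((x >> 1) & 0x55);           // 2-bit partial counts
  x = (x & 0x33) + ((x >> 2) & 0x33);  // 4-bit partial counts
  x = (x + (x >> 4)) & 0x0F;           // popcount of the whole byte
  return x;
}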


@@ -2202,65 +2202,6 @@ void TurboAssembler::Blendvpd(XMMRegister dst, XMMRegister src1,
}
}
void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
XMMRegister tmp) {
DCHECK_NE(dst, tmp);
DCHECK_NE(src, tmp);
DCHECK_NE(kScratchDoubleReg, tmp);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vmovdqa(tmp, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
vpandn(kScratchDoubleReg, tmp, src);
vpand(dst, tmp, src);
vmovdqa(tmp, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
vpshufb(dst, tmp, dst);
vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
vpaddb(dst, dst, kScratchDoubleReg);
} else if (CpuFeatures::IsSupported(ATOM)) {
// Pre-Goldmont low-power Intel microarchitectures have very slow
// PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
// algorithm on these processors. ATOM CPU feature captures exactly
// the right set of processors.
movaps(tmp, src);
psrlw(tmp, 1);
if (dst != src) {
movaps(dst, src);
}
andps(tmp, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x55()));
psubb(dst, tmp);
Operand splat_0x33 = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x33());
movaps(tmp, dst);
andps(dst, splat_0x33);
psrlw(tmp, 2);
andps(tmp, splat_0x33);
paddb(dst, tmp);
movaps(tmp, dst);
psrlw(dst, 4);
paddb(dst, tmp);
andps(dst, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
} else {
movaps(tmp, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
Operand mask = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask());
Move(kScratchDoubleReg, tmp);
andps(tmp, src);
andnps(kScratchDoubleReg, src);
psrlw(kScratchDoubleReg, 4);
movaps(dst, mask);
pshufb(dst, tmp);
movaps(tmp, mask);
pshufb(tmp, kScratchDoubleReg);
paddb(dst, tmp);
}
}
void TurboAssembler::Abspd(XMMRegister dst) {
Andps(dst, ExternalReferenceAsOperand(
ExternalReference::address_of_double_abs_constant()));


@@ -470,11 +470,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
void Blendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister mask);
// These Wasm SIMD ops do not have direct lowerings on x64. These
// helpers are optimized to produce the fastest and smallest codegen.
// Defined here to allow usage on both TurboFan and Liftoff.
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
void Abspd(XMMRegister dst);
void Negpd(XMMRegister dst);


@@ -3718,7 +3718,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I8x16Popcnt: {
__ I8x16Popcnt(i.OutputSimd128Register(), i.InputSimd128Register(0),
- i.TempSimd128Register(0));
+ i.TempSimd128Register(0), kScratchDoubleReg,
+ kScratchRegister);
break;
}
case kX64S128Load8Splat: {


@@ -2488,7 +2488,8 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
LiftoffRegister src) {
- I8x16Popcnt(dst.fp(), src.fp(), liftoff::kScratchDoubleReg2);
+ I8x16Popcnt(dst.fp(), src.fp(), kScratchDoubleReg,
+ liftoff::kScratchDoubleReg2, kScratchRegister);
}
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,