[Liftoff] Implement i64 popcnt

This is the last missing instruction from the WebAssembly MVP. This CL
adds support for ia32, x64, arm, and arm64.
For CPUs that do not support the POPCNT instruction, a fallback
implementation in C is used.

R=jkummerow@chromium.org

Bug: v8:9919
Change-Id: Ie7a79a46e91726e15379b9a21b59775bbf5de556
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1895569
Commit-Queue: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Cr-Commit-Position: refs/heads/master@{#64764}
This commit is contained in:
Clemens Backes 2019-11-04 15:53:49 +01:00 committed by Commit Bot
parent 6c0825aaa7
commit d710756a7f
6 changed files with 92 additions and 22 deletions

View File

@ -762,30 +762,36 @@ void LiftoffAssembler::emit_i32_ctz(Register dst, Register src) {
clz(dst, dst);
}
bool LiftoffAssembler::emit_i32_popcnt(Register dst, Register src) {
{
UseScratchRegisterScope temps(this);
LiftoffRegList pinned = LiftoffRegList::ForRegs(dst);
Register scratch = pinned.set(GetUnusedRegister(kGpReg, pinned)).gp();
Register scratch_2 = GetUnusedRegister(kGpReg, pinned).gp();
// x = x - ((x & (0x55555555 << 1)) >> 1)
and_(scratch, src, Operand(0xaaaaaaaa));
sub(dst, src, Operand(scratch, LSR, 1));
// x = (x & 0x33333333) + ((x & (0x33333333 << 2)) >> 2)
mov(scratch, Operand(0x33333333));
and_(scratch_2, dst, Operand(scratch, LSL, 2));
and_(scratch, dst, scratch);
add(dst, scratch, Operand(scratch_2, LSR, 2));
}
namespace liftoff {
// Emits a 32-bit population count of |src| into |dst| using the classic
// parallel bit-summing ("divide and conquer") sequence, since 32-bit ARM has
// no scalar popcount instruction.
// |dst| must not alias either scratch register (enforced by the DCHECK).
// If |src| aliases |scratch1|, the scratches are swapped so |src| is not
// clobbered before its last read.
inline void GeneratePopCnt(Assembler* assm, Register dst, Register src,
Register scratch1, Register scratch2) {
DCHECK(!AreAliased(dst, scratch1, scratch2));
if (src == scratch1) std::swap(scratch1, scratch2);
// x = x - ((x & (0x55555555 << 1)) >> 1)
assm->and_(scratch1, src, Operand(0xaaaaaaaa));
assm->sub(dst, src, Operand(scratch1, LSR, 1));
// x = (x & 0x33333333) + ((x & (0x33333333 << 2)) >> 2)
assm->mov(scratch1, Operand(0x33333333));
assm->and_(scratch2, dst, Operand(scratch1, LSL, 2));
assm->and_(scratch1, dst, scratch1);
assm->add(dst, scratch1, Operand(scratch2, LSR, 2));
// x = (x + (x >> 4)) & 0x0F0F0F0F
// NOTE(review): the non-"assm->" lines below appear to be pre-refactor
// residue from the diff extraction (each instruction appears again with the
// "assm->" prefix) — confirm against the original source file.
add(dst, dst, Operand(dst, LSR, 4));
and_(dst, dst, Operand(0x0f0f0f0f));
assm->add(dst, dst, Operand(dst, LSR, 4));
assm->and_(dst, dst, Operand(0x0f0f0f0f));
// x = x + (x >> 8)
add(dst, dst, Operand(dst, LSR, 8));
assm->add(dst, dst, Operand(dst, LSR, 8));
// x = x + (x >> 16)
add(dst, dst, Operand(dst, LSR, 16));
assm->add(dst, dst, Operand(dst, LSR, 16));
// x = x & 0x3F
and_(dst, dst, Operand(0x3f));
assm->and_(dst, dst, Operand(0x3f));
}
} // namespace liftoff
// Emits a 32-bit popcount of |src| into |dst|. Always returns true on arm:
// the bit-twiddling sequence works on every CPU, so no C fallback is needed.
bool LiftoffAssembler::emit_i32_popcnt(Register dst, Register src) {
// Allocate two scratch registers, pinning |dst| so neither scratch aliases
// it (a GeneratePopCnt precondition). |src| may alias a scratch; that case
// is handled inside GeneratePopCnt.
LiftoffRegList pinned = LiftoffRegList::ForRegs(dst);
Register scratch1 = pinned.set(GetUnusedRegister(kGpReg, pinned)).gp();
Register scratch2 = GetUnusedRegister(kGpReg, pinned).gp();
liftoff::GeneratePopCnt(this, dst, src, scratch1, scratch2);
return true;
}
@ -1001,6 +1007,23 @@ void LiftoffAssembler::emit_i64_ctz(LiftoffRegister dst, LiftoffRegister src) {
mov(dst.high_gp(), Operand(0)); // High word of result is always 0.
}
// Emits a 64-bit popcount on arm as popcount(low half) + popcount(high half),
// each computed with the 32-bit bit-twiddling sequence. Always returns true
// (no C fallback needed on arm).
bool LiftoffAssembler::emit_i64_popcnt(LiftoffRegister dst,
LiftoffRegister src) {
// Produce partial popcnts in the two dst registers, making sure not to
// overwrite the second src register before using it.
// If src.high aliases dst.low, count src.high first so its value is read
// before dst.low is written by the first GeneratePopCnt.
Register src1 = src.high_gp() == dst.low_gp() ? src.high_gp() : src.low_gp();
Register src2 = src.high_gp() == dst.low_gp() ? src.low_gp() : src.high_gp();
// Pin dst and the not-yet-consumed src half so the scratches alias neither.
LiftoffRegList pinned = LiftoffRegList::ForRegs(dst, src2);
Register scratch1 = pinned.set(GetUnusedRegister(kGpReg, pinned)).gp();
Register scratch2 = GetUnusedRegister(kGpReg, pinned).gp();
liftoff::GeneratePopCnt(this, dst.low_gp(), src1, scratch1, scratch2);
liftoff::GeneratePopCnt(this, dst.high_gp(), src2, scratch1, scratch2);
// Now add the two into the lower dst reg and clear the higher dst reg.
add(dst.low_gp(), dst.low_gp(), dst.high_gp());
mov(dst.high_gp(), Operand(0));
return true;
}
bool LiftoffAssembler::emit_f32_ceil(DoubleRegister dst, DoubleRegister src) {
if (CpuFeatures::IsSupported(ARMv8)) {
CpuFeatureScope scope(this, ARMv8);

View File

@ -588,6 +588,17 @@ void LiftoffAssembler::emit_i64_ctz(LiftoffRegister dst, LiftoffRegister src) {
Clz(dst.gp().X(), dst.gp().X());
}
// Emits a 64-bit popcount on arm64 via NEON: CNT counts set bits per byte,
// ADDV horizontally sums the eight byte counts. Always returns true (no C
// fallback needed).
bool LiftoffAssembler::emit_i64_popcnt(LiftoffRegister dst,
LiftoffRegister src) {
UseScratchRegisterScope temps(this);
VRegister scratch = temps.AcquireV(kFormat8B);
// Move the 64-bit input into a SIMD register.
Fmov(scratch.D(), src.gp().X());
// Per-byte population count.
Cnt(scratch, scratch);
// Horizontal add of the 8 byte counts.
Addv(scratch.B(), scratch);
// Move the result back to the general-purpose destination.
Fmov(dst.gp().X(), scratch.D());
return true;
}
void LiftoffAssembler::emit_i32_divs(Register dst, Register lhs, Register rhs,
Label* trap_div_by_zero,
Label* trap_div_unrepresentable) {

View File

@ -1083,6 +1083,21 @@ void LiftoffAssembler::emit_i64_ctz(LiftoffRegister dst, LiftoffRegister src) {
xor_(dst.high_gp(), dst.high_gp()); // High word of result is always 0.
}
// Emits a 64-bit popcount on ia32 as popcnt(low) + popcnt(high). Returns
// false when the CPU lacks POPCNT, signalling the caller to emit a call to
// the C fallback implementation instead.
bool LiftoffAssembler::emit_i64_popcnt(LiftoffRegister dst,
LiftoffRegister src) {
if (!CpuFeatures::IsSupported(POPCNT)) return false;
CpuFeatureScope scope(this, POPCNT);
// Produce partial popcnts in the two dst registers.
// If src.high aliases dst.low, count src.high first so it is read before
// the first popcnt overwrites dst.low.
Register src1 = src.high_gp() == dst.low_gp() ? src.high_gp() : src.low_gp();
Register src2 = src.high_gp() == dst.low_gp() ? src.low_gp() : src.high_gp();
popcnt(dst.low_gp(), src1);
popcnt(dst.high_gp(), src2);
// Add the two into the lower dst reg, clear the higher dst reg.
add(dst.low_gp(), dst.high_gp());
xor_(dst.high_gp(), dst.high_gp());
return true;
}
// Emits nothing: i32 and pointer-sized values are both 32 bits on ia32, so
// no extension is required.
void LiftoffAssembler::emit_i32_to_intptr(Register dst, Register src) {
// This is a nop on ia32.
}

View File

@ -462,6 +462,7 @@ class LiftoffAssembler : public TurboAssembler {
// i64 unops.
inline void emit_i64_clz(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i64_ctz(LiftoffRegister dst, LiftoffRegister src);
inline bool emit_i64_popcnt(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i32_to_intptr(Register dst, Register src);

View File

@ -839,8 +839,20 @@ class LiftoffCompiler {
});
break;
case kExprI64Popcnt:
// NOTE(review): the next two lines look like pre-change residue of the
// diff extraction (the old "unsupported" bailout that this change
// replaces) — confirm against the original source file.
return unsupported(decoder, kComplexOperation,
WasmOpcodes::OpcodeName(opcode));
// Try the architecture-specific instruction first; if the assembler
// reports it unavailable (emit_i64_popcnt returns false), fall back to
// a call to the C implementation.
EmitUnOp<kWasmI64, kWasmI64>(
[=](LiftoffRegister dst, LiftoffRegister src) {
if (__ emit_i64_popcnt(dst, src)) return;
// The c function returns i32. We will zero-extend later.
ValueType sig_i_l_reps[] = {kWasmI32, kWasmI64};
FunctionSig sig_i_l(1, 1, sig_i_l_reps);
// On targets needing an i64 register pair, the i32 C-call result
// lands in the low half only.
LiftoffRegister c_call_dst = kNeedI64RegPair ? dst.low() : dst;
GenerateCCall(&c_call_dst, &sig_i_l, kWasmStmt, &src,
ExternalReference::wasm_word64_popcnt());
// Now zero-extend the result to i64.
__ emit_type_conversion(kExprI64UConvertI32, dst, c_call_dst,
nullptr);
});
break;
case kExprI32SConvertSatF32:
case kExprI32UConvertSatF32:
case kExprI32SConvertSatF64:

View File

@ -903,6 +903,14 @@ void LiftoffAssembler::emit_i64_ctz(LiftoffRegister dst, LiftoffRegister src) {
Tzcntq(dst.gp(), src.gp());
}
// Emits a 64-bit popcount on x64 with a single POPCNT instruction. Returns
// false when the CPU lacks POPCNT, signalling the caller to emit a call to
// the C fallback implementation instead.
bool LiftoffAssembler::emit_i64_popcnt(LiftoffRegister dst,
LiftoffRegister src) {
if (!CpuFeatures::IsSupported(POPCNT)) return false;
CpuFeatureScope scope(this, POPCNT);
popcntq(dst.gp(), src.gp());
return true;
}
// Sign-extends the 32-bit value in |src| to the 64-bit |dst| (movsxlq),
// since pointers are 64 bits wide on x64.
void LiftoffAssembler::emit_i32_to_intptr(Register dst, Register src) {
movsxlq(dst, src);
}