[mips] Add Ctz and Popcnt as macro assembler instructions

Since these instructions will be used in Liftoff as well as in the code
generator, they are moved into the macro assembler.

Change-Id: I48e60ccc7586252374bc66b7b72bbe23c2d0c0a6
Reviewed-on: https://chromium-review.googlesource.com/924194
Reviewed-by: Ivica Bogosavljevic <ivica.bogosavljevic@mips.com>
Commit-Queue: Ivica Bogosavljevic <ivica.bogosavljevic@mips.com>
Cr-Commit-Position: refs/heads/master@{#51366}
This commit is contained in:
sreten.kovacevic 2018-02-19 13:01:04 +01:00 committed by Commit Bot
parent b8a727e14c
commit d4f73e7619
6 changed files with 215 additions and 169 deletions

View File

@ -1056,73 +1056,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kMipsCtz: {
Register src = i.InputRegister(0);
Register dst = i.OutputRegister();
if (IsMipsArchVariant(kMips32r6)) {
// We don't have an instruction to count the number of trailing zeroes.
// Start by flipping the bits end-for-end so we can count the number of
// leading zeroes instead.
__ Ror(dst, src, 16);
__ wsbh(dst, dst);
__ bitswap(dst, dst);
__ Clz(dst, dst);
} else {
// Convert trailing zeroes to trailing ones, and bits to their left
// to zeroes.
__ Addu(kScratchReg, src, -1);
__ Xor(dst, kScratchReg, src);
__ And(dst, dst, kScratchReg);
// Count number of leading zeroes.
__ Clz(dst, dst);
// Subtract number of leading zeroes from 32 to get number of trailing
// ones. Remember that the trailing ones were formerly trailing zeroes.
__ li(kScratchReg, 32);
__ Subu(dst, kScratchReg, dst);
}
__ Ctz(dst, src);
} break;
case kMipsPopcnt: {
// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
//
// A generalization of the best bit counting method to integers of
// bit-widths up to 128 (parameterized by type T) is this:
//
// v = v - ((v >> 1) & (T)~(T)0/3); // temp
// v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); // temp
// v = (v + (v >> 4)) & (T)~(T)0/255*15; // temp
// c = (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * BITS_PER_BYTE; //count
//
// For comparison, for 32-bit quantities, this algorithm can be executed
// using 20 MIPS instructions (the calls to LoadConst32() generate two
// machine instructions each for the values being used in this algorithm).
// A(n unrolled) loop-based algorithm requires 25 instructions.
//
// For 64-bit quantities, this algorithm gets executed twice, (once
// for in_lo, and again for in_hi), but saves a few instructions
// because the mask values only have to be loaded once. Using this
// algorithm the count for a 64-bit operand can be performed in 29
// instructions compared to a loop-based algorithm which requires 47
// instructions.
Register src = i.InputRegister(0);
Register dst = i.OutputRegister();
uint32_t B0 = 0x55555555; // (T)~(T)0/3
uint32_t B1 = 0x33333333; // (T)~(T)0/15*3
uint32_t B2 = 0x0F0F0F0F; // (T)~(T)0/255*15
uint32_t value = 0x01010101; // (T)~(T)0/255
uint32_t shift = 24; // (sizeof(T) - 1) * BITS_PER_BYTE
__ srl(kScratchReg, src, 1);
__ li(kScratchReg2, B0);
__ And(kScratchReg, kScratchReg, kScratchReg2);
__ Subu(kScratchReg, src, kScratchReg);
__ li(kScratchReg2, B1);
__ And(dst, kScratchReg, kScratchReg2);
__ srl(kScratchReg, kScratchReg, 2);
__ And(kScratchReg, kScratchReg, kScratchReg2);
__ Addu(kScratchReg, dst, kScratchReg);
__ srl(dst, kScratchReg, 4);
__ Addu(dst, dst, kScratchReg);
__ li(kScratchReg2, B2);
__ And(dst, dst, kScratchReg2);
__ li(kScratchReg, value);
__ Mul(dst, dst, kScratchReg);
__ srl(dst, dst, shift);
__ Popcnt(dst, src);
} break;
case kMipsShl:
if (instr->InputAt(1)->IsRegister()) {

View File

@ -1168,124 +1168,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kMips64Ctz: {
Register src = i.InputRegister(0);
Register dst = i.OutputRegister();
if (kArchVariant == kMips64r6) {
// We don't have an instruction to count the number of trailing zeroes.
// Start by flipping the bits end-for-end so we can count the number of
// leading zeroes instead.
__ rotr(dst, src, 16);
__ wsbh(dst, dst);
__ bitswap(dst, dst);
__ Clz(dst, dst);
} else {
// Convert trailing zeroes to trailing ones, and bits to their left
// to zeroes.
__ Daddu(kScratchReg, src, -1);
__ Xor(dst, kScratchReg, src);
__ And(dst, dst, kScratchReg);
// Count number of leading zeroes.
__ Clz(dst, dst);
// Subtract number of leading zeroes from 32 to get number of trailing
// ones. Remember that the trailing ones were formerly trailing zeroes.
__ li(kScratchReg, 32);
__ Subu(dst, kScratchReg, dst);
}
__ Ctz(dst, src);
} break;
case kMips64Dctz: {
Register src = i.InputRegister(0);
Register dst = i.OutputRegister();
if (kArchVariant == kMips64r6) {
// We don't have an instruction to count the number of trailing zeroes.
// Start by flipping the bits end-for-end so we can count the number of
// leading zeroes instead.
__ dsbh(dst, src);
__ dshd(dst, dst);
__ dbitswap(dst, dst);
__ dclz(dst, dst);
} else {
// Convert trailing zeroes to trailing ones, and bits to their left
// to zeroes.
__ Daddu(kScratchReg, src, -1);
__ Xor(dst, kScratchReg, src);
__ And(dst, dst, kScratchReg);
// Count number of leading zeroes.
__ dclz(dst, dst);
// Subtract number of leading zeroes from 64 to get number of trailing
// ones. Remember that the trailing ones were formerly trailing zeroes.
__ li(kScratchReg, 64);
__ Dsubu(dst, kScratchReg, dst);
}
__ Dctz(dst, src);
} break;
case kMips64Popcnt: {
// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
//
// A generalization of the best bit counting method to integers of
// bit-widths up to 128 (parameterized by type T) is this:
//
// v = v - ((v >> 1) & (T)~(T)0/3); // temp
// v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); // temp
// v = (v + (v >> 4)) & (T)~(T)0/255*15; // temp
// c = (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * BITS_PER_BYTE; //count
//
// For comparison, for 32-bit quantities, this algorithm can be executed
// using 20 MIPS instructions (the calls to LoadConst32() generate two
// machine instructions each for the values being used in this algorithm).
// A(n unrolled) loop-based algorithm requires 25 instructions.
//
// For a 64-bit operand this can be performed in 24 instructions compared
// to a(n unrolled) loop based algorithm which requires 38 instructions.
//
// There are algorithms which are faster in the cases where very few
// bits are set but the algorithm here attempts to minimize the total
// number of instructions executed even when a large number of bits
// are set.
Register src = i.InputRegister(0);
Register dst = i.OutputRegister();
uint32_t B0 = 0x55555555; // (T)~(T)0/3
uint32_t B1 = 0x33333333; // (T)~(T)0/15*3
uint32_t B2 = 0x0F0F0F0F; // (T)~(T)0/255*15
uint32_t value = 0x01010101; // (T)~(T)0/255
uint32_t shift = 24; // (sizeof(T) - 1) * BITS_PER_BYTE
__ srl(kScratchReg, src, 1);
__ li(kScratchReg2, B0);
__ And(kScratchReg, kScratchReg, kScratchReg2);
__ Subu(kScratchReg, src, kScratchReg);
__ li(kScratchReg2, B1);
__ And(dst, kScratchReg, kScratchReg2);
__ srl(kScratchReg, kScratchReg, 2);
__ And(kScratchReg, kScratchReg, kScratchReg2);
__ Addu(kScratchReg, dst, kScratchReg);
__ srl(dst, kScratchReg, 4);
__ Addu(dst, dst, kScratchReg);
__ li(kScratchReg2, B2);
__ And(dst, dst, kScratchReg2);
__ li(kScratchReg, value);
__ Mul(dst, dst, kScratchReg);
__ srl(dst, dst, shift);
__ Popcnt(dst, src);
} break;
case kMips64Dpopcnt: {
Register src = i.InputRegister(0);
Register dst = i.OutputRegister();
uint64_t B0 = 0x5555555555555555l; // (T)~(T)0/3
uint64_t B1 = 0x3333333333333333l; // (T)~(T)0/15*3
uint64_t B2 = 0x0F0F0F0F0F0F0F0Fl; // (T)~(T)0/255*15
uint64_t value = 0x0101010101010101l; // (T)~(T)0/255
uint64_t shift = 24; // (sizeof(T) - 1) * BITS_PER_BYTE
__ dsrl(kScratchReg, src, 1);
__ li(kScratchReg2, B0);
__ And(kScratchReg, kScratchReg, kScratchReg2);
__ Dsubu(kScratchReg, src, kScratchReg);
__ li(kScratchReg2, B1);
__ And(dst, kScratchReg, kScratchReg2);
__ dsrl(kScratchReg, kScratchReg, 2);
__ And(kScratchReg, kScratchReg, kScratchReg2);
__ Daddu(kScratchReg, dst, kScratchReg);
__ dsrl(dst, kScratchReg, 4);
__ Daddu(dst, dst, kScratchReg);
__ li(kScratchReg2, B2);
__ And(dst, dst, kScratchReg2);
__ li(kScratchReg, value);
__ Dmul(dst, dst, kScratchReg);
__ dsrl32(dst, dst, shift);
__ Dpopcnt(dst, src);
} break;
case kMips64Shl:
if (instr->InputAt(1)->IsRegister()) {

View File

@ -2311,6 +2311,79 @@ void TurboAssembler::Clz(Register rd, Register rs) {
}
}
void TurboAssembler::Ctz(Register rd, Register rs) {
  // Emits code that writes the number of trailing zero bits of |rs| to |rd|.
  // There is no dedicated instruction for this, so the count is derived from
  // a leading-zero count in both variants below.
  if (!IsMipsArchVariant(kMips32r6)) {
    UseScratchRegisterScope temps(this);
    Register tmp = temps.Acquire();
    // tmp = rs - 1: the trailing zeroes of rs become ones and the lowest set
    // bit becomes zero; everything above it is unchanged.
    Addu(tmp, rs, -1);
    // (tmp ^ rs) & tmp keeps exactly that run of trailing ones and clears
    // every bit to its left.
    Xor(rd, tmp, rs);
    And(rd, rd, tmp);
    // The run of trailing ones is 32 - clz(mask) bits long, which equals the
    // number of trailing zeroes of the original value.
    Clz(rd, rd);
    li(tmp, 32);
    Subu(rd, tmp, rd);
    return;
  }

  // r6: flip the bits end-for-end, after which the trailing zeroes of the
  // input are the leading zeroes of the flipped value.
  Ror(rd, rs, 16);
  wsbh(rd, rd);
  bitswap(rd, rd);
  Clz(rd, rd);
}
void TurboAssembler::Popcnt(Register rd, Register rs) {
  // Emits a branch-free population count of |rs| into |rd| using the
  // parallel bit-counting method described at
  // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  //
  // For a type T the method is:
  //
  //   v = v - ((v >> 1) & (T)~(T)0/3);                        // 2-bit sums
  //   v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);   // 4-bit sums
  //   v = (v + (v >> 4)) & (T)~(T)0/255*15;                   // 8-bit sums
  //   c = (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * BITS_PER_BYTE;
  //
  // The constants below are those expressions evaluated for a 32-bit T.
  const uint32_t kMaskPairs = 0x55555555;     // (T)~(T)0/3
  const uint32_t kMaskNibbles = 0x33333333;   // (T)~(T)0/15*3
  const uint32_t kMaskBytes = 0x0F0F0F0F;     // (T)~(T)0/255*15
  const uint32_t kByteSummer = 0x01010101;    // (T)~(T)0/255
  const uint32_t kTopByteShift = 24;  // (sizeof(T) - 1) * BITS_PER_BYTE

  UseScratchRegisterScope temps(this);
  Register tmp = temps.Acquire();
  // t8 is written directly; it is not obtained from the scratch scope.
  // NOTE(review): presumably callers must not pass t8 as |rd| or |rs| since
  // it is clobbered before |rs| is last read — confirm against call sites.
  Register mask = t8;

  // tmp = v - ((v >> 1) & kMaskPairs)
  srl(tmp, rs, 1);
  li(mask, kMaskPairs);
  And(tmp, tmp, mask);
  Subu(tmp, rs, tmp);

  // tmp = (tmp & kMaskNibbles) + ((tmp >> 2) & kMaskNibbles)
  li(mask, kMaskNibbles);
  And(rd, tmp, mask);
  srl(tmp, tmp, 2);
  And(tmp, tmp, mask);
  Addu(tmp, rd, tmp);

  // rd = (tmp + (tmp >> 4)) & kMaskBytes
  srl(rd, tmp, 4);
  Addu(rd, rd, tmp);
  li(mask, kMaskBytes);
  And(rd, rd, mask);

  // Multiplying by kByteSummer accumulates the per-byte counts into the top
  // byte, which the final shift moves down into the low bits of rd.
  li(tmp, kByteSummer);
  Mul(rd, rd, tmp);
  srl(rd, rd, kTopByteShift);
}
void MacroAssembler::EmitFPUTruncate(FPURoundingMode rounding_mode,
Register result,

View File

@ -562,6 +562,8 @@ class TurboAssembler : public Assembler {
void Movf(Register rd, Register rs, uint16_t cc = 0);
void Clz(Register rd, Register rs);
// Writes the number of trailing zero bits of rs to rd.
void Ctz(Register rd, Register rs);
// Writes the number of set bits (population count) of rs to rd.
void Popcnt(Register rd, Register rs);
// Int64Lowering instructions
void AddPair(Register dst_low, Register dst_high, Register left_low,

View File

@ -2777,6 +2777,136 @@ void TurboAssembler::Movf(Register rd, Register rs, uint16_t cc) {
void TurboAssembler::Clz(Register rd, Register rs) { clz(rd, rs); }
void TurboAssembler::Ctz(Register rd, Register rs) {
  // Emits code that writes the number of trailing zero bits of the 32-bit
  // value in |rs| to |rd|. There is no dedicated instruction for this, so the
  // count is derived from a 32-bit leading-zero count in both variants.
  if (kArchVariant != kMips64r6) {
    UseScratchRegisterScope temps(this);
    Register tmp = temps.Acquire();
    // tmp = rs - 1: the trailing zeroes of rs become ones and the lowest set
    // bit becomes zero; everything above it is unchanged.
    Daddu(tmp, rs, -1);
    // (tmp ^ rs) & tmp keeps exactly that run of trailing ones and clears
    // every bit to its left.
    Xor(rd, tmp, rs);
    And(rd, rd, tmp);
    // The run of trailing ones is 32 - clz(mask) bits long, which equals the
    // number of trailing zeroes of the original value.
    Clz(rd, rd);
    li(tmp, 32);
    Subu(rd, tmp, rd);
    return;
  }

  // r6: flip the bits end-for-end, after which the trailing zeroes of the
  // input are the leading zeroes of the flipped value.
  rotr(rd, rs, 16);
  wsbh(rd, rd);
  bitswap(rd, rd);
  Clz(rd, rd);
}
void TurboAssembler::Dctz(Register rd, Register rs) {
  // 64-bit counterpart of Ctz: emits code that writes the number of trailing
  // zero bits of the doubleword in |rs| to |rd|, derived from a 64-bit
  // leading-zero count (dclz).
  if (kArchVariant != kMips64r6) {
    UseScratchRegisterScope temps(this);
    Register tmp = temps.Acquire();
    // tmp = rs - 1: the trailing zeroes of rs become ones and the lowest set
    // bit becomes zero; everything above it is unchanged.
    Daddu(tmp, rs, -1);
    // (tmp ^ rs) & tmp keeps exactly that run of trailing ones and clears
    // every bit to its left.
    Xor(rd, tmp, rs);
    And(rd, rd, tmp);
    // The run of trailing ones is 64 - dclz(mask) bits long, which equals
    // the number of trailing zeroes of the original value.
    dclz(rd, rd);
    li(tmp, 64);
    Dsubu(rd, tmp, rd);
    return;
  }

  // r6: flip all 64 bits end-for-end, after which the trailing zeroes of the
  // input are the leading zeroes of the flipped value.
  dsbh(rd, rs);
  dshd(rd, rd);
  dbitswap(rd, rd);
  dclz(rd, rd);
}
void TurboAssembler::Popcnt(Register rd, Register rs) {
  // Emits a branch-free population count of the 32-bit value in |rs| into
  // |rd| using the parallel bit-counting method described at
  // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  //
  // For a type T the method is:
  //
  //   v = v - ((v >> 1) & (T)~(T)0/3);                        // 2-bit sums
  //   v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);   // 4-bit sums
  //   v = (v + (v >> 4)) & (T)~(T)0/255*15;                   // 8-bit sums
  //   c = (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * BITS_PER_BYTE;
  //
  // The constants below are those expressions evaluated for a 32-bit T.
  // Faster sequences exist when only a few bits are set; this one aims to
  // keep the executed instruction count low regardless of how many are set.
  const uint32_t kMaskPairs = 0x55555555;     // (T)~(T)0/3
  const uint32_t kMaskNibbles = 0x33333333;   // (T)~(T)0/15*3
  const uint32_t kMaskBytes = 0x0F0F0F0F;     // (T)~(T)0/255*15
  const uint32_t kByteSummer = 0x01010101;    // (T)~(T)0/255
  const uint32_t kTopByteShift = 24;  // (sizeof(T) - 1) * BITS_PER_BYTE

  UseScratchRegisterScope temps(this);
  Register tmp = temps.Acquire();
  // t8 is written directly; it is not obtained from the scratch scope.
  // NOTE(review): presumably callers must not pass t8 as |rd| or |rs| since
  // it is clobbered before |rs| is last read — confirm against call sites.
  Register mask = t8;

  // tmp = v - ((v >> 1) & kMaskPairs)
  srl(tmp, rs, 1);
  li(mask, kMaskPairs);
  And(tmp, tmp, mask);
  Subu(tmp, rs, tmp);

  // tmp = (tmp & kMaskNibbles) + ((tmp >> 2) & kMaskNibbles)
  li(mask, kMaskNibbles);
  And(rd, tmp, mask);
  srl(tmp, tmp, 2);
  And(tmp, tmp, mask);
  Addu(tmp, rd, tmp);

  // rd = (tmp + (tmp >> 4)) & kMaskBytes
  srl(rd, tmp, 4);
  Addu(rd, rd, tmp);
  li(mask, kMaskBytes);
  And(rd, rd, mask);

  // Multiplying by kByteSummer accumulates the per-byte counts into the top
  // byte, which the final shift moves down into the low bits of rd.
  li(tmp, kByteSummer);
  Mul(rd, rd, tmp);
  srl(rd, rd, kTopByteShift);
}
void TurboAssembler::Dpopcnt(Register rd, Register rs) {
  // 64-bit counterpart of Popcnt: emits a branch-free population count of
  // the doubleword in |rs| into |rd|, using the same parallel bit-counting
  // steps with the masks widened to 64 bits:
  //
  //   v = v - ((v >> 1) & kMaskPairs);
  //   v = (v & kMaskNibbles) + ((v >> 2) & kMaskNibbles);
  //   v = (v + (v >> 4)) & kMaskBytes;
  //   c = (v * kByteSummer) >> 56;
  const uint64_t kMaskPairs = 0x5555555555555555l;    // (T)~(T)0/3
  const uint64_t kMaskNibbles = 0x3333333333333333l;  // (T)~(T)0/15*3
  const uint64_t kMaskBytes = 0x0F0F0F0F0F0F0F0Fl;    // (T)~(T)0/255*15
  const uint64_t kByteSummer = 0x0101010101010101l;   // (T)~(T)0/255
  // The algorithm needs a shift of (sizeof(T) - 1) * BITS_PER_BYTE = 56.
  // It is emitted with dsrl32, which shifts by its immediate plus 32, hence
  // the value 24 here.
  const uint64_t kTopByteShiftMinus32 = 24;

  UseScratchRegisterScope temps(this);
  Register tmp = temps.Acquire();
  // t8 is written directly; it is not obtained from the scratch scope.
  // NOTE(review): presumably callers must not pass t8 as |rd| or |rs| since
  // it is clobbered before |rs| is last read — confirm against call sites.
  Register mask = t8;

  // tmp = v - ((v >> 1) & kMaskPairs)
  dsrl(tmp, rs, 1);
  li(mask, kMaskPairs);
  And(tmp, tmp, mask);
  Dsubu(tmp, rs, tmp);

  // tmp = (tmp & kMaskNibbles) + ((tmp >> 2) & kMaskNibbles)
  li(mask, kMaskNibbles);
  And(rd, tmp, mask);
  dsrl(tmp, tmp, 2);
  And(tmp, tmp, mask);
  Daddu(tmp, rd, tmp);

  // rd = (tmp + (tmp >> 4)) & kMaskBytes
  dsrl(rd, tmp, 4);
  Daddu(rd, rd, tmp);
  li(mask, kMaskBytes);
  And(rd, rd, mask);

  // Multiplying by kByteSummer accumulates the per-byte counts into the top
  // byte, which the final shift moves down into the low bits of rd.
  li(tmp, kByteSummer);
  Dmul(rd, rd, tmp);
  dsrl32(rd, rd, kTopByteShiftMinus32);
}
void MacroAssembler::EmitFPUTruncate(FPURoundingMode rounding_mode,
Register result,
DoubleRegister double_input,

View File

@ -604,6 +604,10 @@ class TurboAssembler : public Assembler {
void Movf(Register rd, Register rs, uint16_t cc = 0);
void Clz(Register rd, Register rs);
// Writes the number of trailing zero bits of the 32-bit value in rs to rd.
void Ctz(Register rd, Register rs);
// Writes the number of trailing zero bits of the 64-bit value in rs to rd.
void Dctz(Register rd, Register rs);
// Writes the number of set bits of the 32-bit value in rs to rd.
void Popcnt(Register rd, Register rs);
// Writes the number of set bits of the 64-bit value in rs to rd.
void Dpopcnt(Register rd, Register rs);
// MIPS64 R2 instruction macro.
void Ext(Register rt, Register rs, uint16_t pos, uint16_t size);