[wasm-simd][ia32] Optimize codegen when shift is constant

This optimizes i8x16 shifts when the shift value is a constant. It reduces
the instruction count from 10 to 6 (unsigned) and from 9 to 5 (signed).

We can use a word (16-bit) shift, then mask away the high (shru) or low
(shl) bits to achieve a byte shift. Most of the instructions are
dedicated to building the mask.

Bug: v8:10115
Change-Id: Ie602c2b0a7227181502fadb14e100bb0b92f322f
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2103445
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66755}
This commit is contained in:
Ng Zhi An 2020-03-17 11:33:19 -07:00 committed by Commit Bot
parent 06de28d280
commit e05b10b7d8
2 changed files with 108 additions and 58 deletions

View File

@ -3149,39 +3149,62 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
// i8x16 shift-left: shifts every byte lane of dst left by the shift amount
// taken modulo 8. dst doubles as input 0 (see the DCHECK below).
// NOTE(review): this span reads like a rendered diff whose +/- markers were
// lost. The unconditional sequence directly below repeats the `else` branch
// further down and is presumably the removed pre-change code — confirm
// against the real file before treating the span as one piece of code.
case kIA32I8x16Shl: {
XMMRegister dst = i.OutputSimd128Register();
DCHECK_EQ(dst, i.InputSimd128Register(0));
Register shift = i.InputRegister(1);
// Scratch registers: instruction temp 0 (general purpose) and temp 1 (SIMD).
Register tmp = i.ToRegister(instr->TempAt(0));
XMMRegister tmp_simd = i.TempSimd128Register(1);
// Take shift value modulo 8.
// Note that this rewrites the shift input register in place.
__ and_(shift, 7);
// Mask off the unwanted bits before word-shifting.
// All-ones words shifted right by (shift + 8) leave 0x00ff >> shift in every
// word; packing the words to bytes gives a per-byte mask of 0xff >> shift.
__ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
__ mov(tmp, shift);
__ add(tmp, Immediate(8));
__ Movd(tmp_simd, tmp);
__ Psrlw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
__ Packuswb(kScratchDoubleReg, kScratchDoubleReg);
// Clearing the top `shift` bits of each byte first keeps the 16-bit shift
// from carrying bits into the neighbouring byte.
__ Pand(dst, kScratchDoubleReg);
__ Movd(tmp_simd, shift);
__ Psllw(dst, dst, tmp_simd);
if (HasImmediateInput(instr, 1)) {
// Perform 16-bit shift, then mask away low bits.
// InputInt3 presumably yields the immediate reduced to 3 bits, i.e. the
// shift amount modulo 8 — TODO confirm.
uint8_t shift = i.InputInt3(1);
__ Psllw(dst, dst, static_cast<byte>(shift));
// bmask holds the bits of one byte that legitimately remain after the shift;
// mask repeats it in all four bytes of a 32-bit value.
uint8_t bmask = static_cast<uint8_t>(0xff << shift);
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
__ mov(tmp, mask);
__ Movd(tmp_simd, tmp);
// Shuffle selector 0 copies the low dword into all four dword lanes.
__ Pshufd(tmp_simd, tmp_simd, 0);
__ Pand(dst, tmp_simd);
} else {
// Shift amount is only known at run time: build the byte mask with SIMD
// instructions, apply it, then do the 16-bit shift.
Register shift = i.InputRegister(1);
// Take shift value modulo 8.
__ and_(shift, 7);
// Mask off the unwanted bits before word-shifting.
__ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
__ mov(tmp, shift);
__ add(tmp, Immediate(8));
__ Movd(tmp_simd, tmp);
__ Psrlw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
__ Packuswb(kScratchDoubleReg, kScratchDoubleReg);
__ Pand(dst, kScratchDoubleReg);
__ Movd(tmp_simd, shift);
__ Psllw(dst, dst, tmp_simd);
}
break;
}
// i8x16 arithmetic (sign-preserving) shift-right of every byte lane of dst by
// the shift amount taken modulo 8. dst doubles as input 0 (see the DCHECK).
// NOTE(review): this span reads like a rendered diff whose +/- markers were
// lost. The unconditional sequence directly below repeats the `else` branch
// further down and is presumably the removed pre-change code — confirm
// against the real file before treating the span as one piece of code.
case kIA32I8x16ShrS: {
XMMRegister dst = i.OutputSimd128Register();
DCHECK_EQ(dst, i.InputSimd128Register(0));
Register tmp = i.ToRegister(instr->TempAt(0));
XMMRegister tmp_simd = i.TempSimd128Register(1);
// Unpack the bytes into words, do arithmetic shifts, and repack.
// After unpacking, each source byte sits in the high byte of a 16-bit word:
// the upper eight bytes of dst go to the scratch register, the lower eight
// stay in dst.
__ punpckhbw(kScratchDoubleReg, dst);
__ punpcklbw(dst, dst);
__ mov(tmp, i.InputRegister(1));
// Take shift value modulo 8.
__ and_(tmp, 7);
// The extra 8 moves each byte from the high half of its word down to the low
// half, discarding whatever the unpack left in the low half.
__ add(tmp, Immediate(8));
__ movd(tmp_simd, tmp);
__ psraw(kScratchDoubleReg, tmp_simd);
__ psraw(dst, tmp_simd);
// Narrow both halves back to bytes and recombine them in dst.
__ packsswb(dst, kScratchDoubleReg);
if (HasImmediateInput(instr, 1)) {
// Constant shift amount: same unpack / shift / repack scheme, but the shift
// count is encoded directly in the instructions, so no temporaries are read.
__ punpckhbw(kScratchDoubleReg, dst);
__ punpcklbw(dst, dst);
// InputInt3 presumably yields the immediate modulo 8 — TODO confirm. The +8
// accounts for the byte sitting in the high half of each word.
uint8_t shift = i.InputInt3(1) + 8;
__ psraw(kScratchDoubleReg, shift);
__ psraw(dst, shift);
__ packsswb(dst, kScratchDoubleReg);
} else {
// Shift amount is only known at run time: compute (shift & 7) + 8 in a
// general-purpose temp and move it into a SIMD temp for psraw.
Register tmp = i.ToRegister(instr->TempAt(0));
XMMRegister tmp_simd = i.TempSimd128Register(1);
// Unpack the bytes into words, do arithmetic shifts, and repack.
__ punpckhbw(kScratchDoubleReg, dst);
__ punpcklbw(dst, dst);
__ mov(tmp, i.InputRegister(1));
// Take shift value modulo 8.
__ and_(tmp, 7);
__ add(tmp, Immediate(8));
__ movd(tmp_simd, tmp);
__ psraw(kScratchDoubleReg, tmp_simd);
__ psraw(dst, tmp_simd);
__ packsswb(dst, kScratchDoubleReg);
}
break;
}
case kSSEI8x16Add: {
@ -3423,21 +3446,35 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
// i8x16 logical (zero-filling) shift-right of every byte lane of dst by the
// shift amount taken modulo 8. dst doubles as input 0 (see the DCHECKs).
// NOTE(review): this span reads like a rendered diff whose +/- markers were
// lost. The two DCHECK_EQ lines look like the old and new spelling of the same
// check, and the unconditional sequence below repeats the `else` branch, so
// it is presumably the removed pre-change code — confirm against the real file.
case kIA32I8x16ShrU: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
XMMRegister dst = i.OutputSimd128Register();
DCHECK_EQ(dst, i.InputSimd128Register(0));
// Scratch registers: instruction temp 0 (general purpose) and temp 1 (SIMD).
Register tmp = i.ToRegister(instr->TempAt(0));
XMMRegister tmp_simd = i.TempSimd128Register(1);
// Unpack the bytes into words, do logical shifts, and repack.
// After unpacking, each source byte sits in the high byte of a 16-bit word.
__ punpckhbw(kScratchDoubleReg, dst);
__ punpcklbw(dst, dst);
__ mov(tmp, i.InputRegister(1));
// Take shift value modulo 8.
__ and_(tmp, 7);
// The extra 8 moves each byte down into the low half of its word.
__ add(tmp, Immediate(8));
__ movd(tmp_simd, tmp);
__ psrlw(kScratchDoubleReg, tmp_simd);
__ psrlw(dst, tmp_simd);
__ packuswb(dst, kScratchDoubleReg);
if (HasImmediateInput(instr, 1)) {
// Perform 16-bit shift, then mask away high bits.
// InputInt3 presumably yields the immediate reduced to 3 bits, i.e. the
// shift amount modulo 8 — TODO confirm.
uint8_t shift = i.InputInt3(1);
__ Psrlw(dst, dst, static_cast<byte>(shift));
// bmask holds the bits of one byte that legitimately remain after the shift;
// mask repeats it in all four bytes of a 32-bit value.
uint8_t bmask = 0xff >> shift;
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
__ mov(tmp, mask);
__ Movd(tmp_simd, tmp);
// Shuffle selector 0 copies the low dword into all four dword lanes.
__ Pshufd(tmp_simd, tmp_simd, 0);
__ Pand(dst, tmp_simd);
} else {
// Shift amount is only known at run time.
// Unpack the bytes into words, do logical shifts, and repack.
__ punpckhbw(kScratchDoubleReg, dst);
__ punpcklbw(dst, dst);
__ mov(tmp, i.InputRegister(1));
// Take shift value modulo 8.
__ and_(tmp, 7);
__ add(tmp, Immediate(8));
__ movd(tmp_simd, tmp);
__ psrlw(kScratchDoubleReg, tmp_simd);
__ psrlw(dst, tmp_simd);
__ packuswb(dst, kScratchDoubleReg);
}
break;
}
case kSSEI8x16MinU: {

View File

@ -320,8 +320,8 @@ void VisitRROSimdShift(InstructionSelector* selector, Node* node,
}
}
void VisitRROI8x16SimdRightShift(InstructionSelector* selector, Node* node,
ArchOpcode opcode) {
void VisitRROI8x16SimdShift(InstructionSelector* selector, Node* node,
ArchOpcode opcode) {
IA32OperandGenerator g(selector);
InstructionOperand operand0 = g.UseUniqueRegister(node->InputAt(0));
InstructionOperand operand1 = g.UseUniqueRegister(node->InputAt(1));
@ -329,7 +329,6 @@ void VisitRROI8x16SimdRightShift(InstructionSelector* selector, Node* node,
selector->Emit(opcode, g.DefineSameAsFirst(node), operand0, operand1,
arraysize(temps), temps);
}
} // namespace
void InstructionSelector::VisitStackSlot(Node* node) {
@ -2135,10 +2134,6 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(I16x8ShrS) \
V(I16x8ShrU)
// X-macro listing the two i8x16 right-shift opcodes (signed and unsigned);
// V is applied once to each name.
// NOTE(review): in this rendered diff the list appears to be on the removed
// side of the change — confirm against the real file.
#define SIMD_I8X16_RIGHT_SHIFT_OPCODES(V) \
V(I8x16ShrS) \
V(I8x16ShrU)
void InstructionSelector::VisitF64x2Min(Node* node) {
IA32OperandGenerator g(this);
InstructionOperand temps[] = {g.TempSimd128Register()};
@ -2361,14 +2356,6 @@ SIMD_SHIFT_OPCODES_UNIFED_SSE_AVX(VISIT_SIMD_SHIFT_UNIFIED_SSE_AVX)
#undef VISIT_SIMD_SHIFT_UNIFIED_SSE_AVX
#undef SIMD_SHIFT_OPCODES_UNIFED_SSE_AVX
// Stamps out one InstructionSelector::Visit<Opcode> per entry of
// SIMD_I8X16_RIGHT_SHIFT_OPCODES; each forwards to
// VisitRROI8x16SimdRightShift with the matching kIA32<Opcode>. Both helper
// macros are undefined again right after use.
// NOTE(review): in this rendered diff the block appears to be on the removed
// side of the change (hand-written VisitI8x16ShrS / VisitI8x16ShrU follow
// later in the text) — confirm against the real file.
#define VISIT_SIMD_I8x16_RIGHT_SHIFT(Opcode) \
void InstructionSelector::Visit##Opcode(Node* node) { \
VisitRROI8x16SimdRightShift(this, node, kIA32##Opcode); \
}
SIMD_I8X16_RIGHT_SHIFT_OPCODES(VISIT_SIMD_I8x16_RIGHT_SHIFT)
#undef SIMD_I8X16_RIGHT_SHIFT_OPCODES
#undef VISIT_SIMD_I8x16_RIGHT_SHIFT
#define VISIT_SIMD_UNOP(Opcode) \
void InstructionSelector::Visit##Opcode(Node* node) { \
IA32OperandGenerator g(this); \
@ -2448,11 +2435,37 @@ void InstructionSelector::VisitI8x16UConvertI16x8(Node* node) {
// Selects the ia32 instruction for an i8x16 shift-left.
// NOTE(review): this span reads like a rendered diff whose +/- markers were
// lost. The unconditional operand setup and Emit at the top are presumably
// the removed pre-change body, and the if/else below is its replacement —
// confirm against the real file before treating the span as one function body.
void InstructionSelector::VisitI8x16Shl(Node* node) {
IA32OperandGenerator g(this);
InstructionOperand operand0 = g.UseUniqueRegister(node->InputAt(0));
InstructionOperand operand1 = g.UseUniqueRegister(node->InputAt(1));
// One general-purpose and one SIMD temporary, in that order.
InstructionOperand temps[] = {g.TempRegister(), g.TempSimd128Register()};
Emit(kIA32I8x16Shl, g.DefineSameAsFirst(node), operand0, operand1,
arraysize(temps), temps);
if (g.CanBeImmediate(node->InputAt(1))) {
// Shift amount can be encoded as an immediate: pass it as an immediate
// operand, still requesting both temporaries.
InstructionOperand temps[] = {g.TempRegister(), g.TempSimd128Register()};
this->Emit(kIA32I8x16Shl, g.DefineSameAsFirst(node),
g.UseRegister(node->InputAt(0)),
g.UseImmediate(node->InputAt(1)), arraysize(temps), temps);
} else {
// Otherwise use the shared register-operand helper.
VisitRROI8x16SimdShift(this, node, kIA32I8x16Shl);
}
}
// Selects the ia32 instruction for a signed i8x16 right shift. A shift amount
// that can be encoded as an immediate is passed as an immediate operand;
// anything else goes through the shared register-operand helper.
void InstructionSelector::VisitI8x16ShrS(Node* node) {
  IA32OperandGenerator g(this);
  Node* const shift_amount = node->InputAt(1);

  if (!g.CanBeImmediate(shift_amount)) {
    VisitRROI8x16SimdShift(this, node, kIA32I8x16ShrS);
    return;
  }

  // The immediate form requests no temporaries.
  Emit(kIA32I8x16ShrS, g.DefineSameAsFirst(node),
       g.UseRegister(node->InputAt(0)), g.UseImmediate(shift_amount));
}
// Selects the ia32 instruction for an unsigned i8x16 right shift. A shift
// amount that can be encoded as an immediate is passed as an immediate
// operand; anything else goes through the shared register-operand helper.
void InstructionSelector::VisitI8x16ShrU(Node* node) {
  IA32OperandGenerator g(this);
  Node* const shift_amount = node->InputAt(1);

  if (!g.CanBeImmediate(shift_amount)) {
    VisitRROI8x16SimdShift(this, node, kIA32I8x16ShrU);
    return;
  }

  // The immediate form requests one general-purpose and one SIMD temporary,
  // in that order.
  InstructionOperand scratch[] = {g.TempRegister(), g.TempSimd128Register()};
  Emit(kIA32I8x16ShrU, g.DefineSameAsFirst(node),
       g.UseRegister(node->InputAt(0)), g.UseImmediate(shift_amount),
       arraysize(scratch), scratch);
}
void InstructionSelector::VisitInt32AbsWithOverflow(Node* node) {