[wasm-simd][ia32] Optimize codegen when shift is constant
This optimizes i8x16 shifts when the shift value is constant. It reduces the
instruction count from 10 to 6 (unsigned) and from 9 to 5 (signed). We can use
a word (16-bit) shift, then mask away the high (shru) or low (shl) bits to
achieve a byte shift. Most of the instructions are dedicated to building the
mask.

Bug: v8:10115
Change-Id: Ie602c2b0a7227181502fadb14e100bb0b92f322f
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2103445
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66755}
commit e05b10b7d8
parent 06de28d280
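To make the trick in the diff below easier to follow, here is a minimal scalar sketch (not part of this CL; the function names and the two-bytes-in-a-uint16_t model are invented for illustration) of how a wider 16-bit shift plus a per-byte mask reproduces a per-byte shift, which is what the Psllw/Psrlw + Pand sequences in the constant-shift paths do:

#include <cstdint>
#include <cstdio>

// Hypothetical scalar model: shift two packed bytes held in a uint16_t by a
// constant, using one 16-bit shift plus a per-byte mask.
uint16_t shl_bytes(uint16_t two_bytes, unsigned shift) {
  // A 16-bit left shift lets low-byte bits leak into the high byte ...
  uint16_t shifted = static_cast<uint16_t>(two_bytes << shift);
  // ... so clear the low `shift` bits of every byte (per-byte mask 0xff << shift).
  uint8_t bmask = static_cast<uint8_t>(0xff << shift);
  return shifted & static_cast<uint16_t>(bmask << 8 | bmask);
}

uint16_t shru_bytes(uint16_t two_bytes, unsigned shift) {
  // A 16-bit logical right shift lets high-byte bits leak into the low byte ...
  uint16_t shifted = static_cast<uint16_t>(two_bytes >> shift);
  // ... so clear the high `shift` bits of every byte (per-byte mask 0xff >> shift).
  uint8_t bmask = static_cast<uint8_t>(0xff >> shift);
  return shifted & static_cast<uint16_t>(bmask << 8 | bmask);
}

int main() {
  // Bytes 0xAB, 0xCD shifted per byte by 3: left -> 0x58, 0x68; right -> 0x15, 0x19.
  std::printf("%04x %04x\n", shl_bytes(0xABCD, 3), shru_bytes(0xABCD, 3));  // 5868 1519
  return 0;
}

The generated code builds the same per-byte mask once (mov/Movd/Pshufd) and applies it with Pand, which is where most of the remaining instructions go.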
@@ -3149,39 +3149,62 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     case kIA32I8x16Shl: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
-      Register shift = i.InputRegister(1);
       Register tmp = i.ToRegister(instr->TempAt(0));
       XMMRegister tmp_simd = i.TempSimd128Register(1);
-      // Take shift value modulo 8.
-      __ and_(shift, 7);
-      // Mask off the unwanted bits before word-shifting.
-      __ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
-      __ mov(tmp, shift);
-      __ add(tmp, Immediate(8));
-      __ Movd(tmp_simd, tmp);
-      __ Psrlw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
-      __ Packuswb(kScratchDoubleReg, kScratchDoubleReg);
-      __ Pand(dst, kScratchDoubleReg);
-      __ Movd(tmp_simd, shift);
-      __ Psllw(dst, dst, tmp_simd);
+
+      if (HasImmediateInput(instr, 1)) {
+        // Perform 16-bit shift, then mask away low bits.
+        uint8_t shift = i.InputInt3(1);
+        __ Psllw(dst, dst, static_cast<byte>(shift));
+
+        uint8_t bmask = static_cast<uint8_t>(0xff << shift);
+        uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
+        __ mov(tmp, mask);
+        __ Movd(tmp_simd, tmp);
+        __ Pshufd(tmp_simd, tmp_simd, 0);
+        __ Pand(dst, tmp_simd);
+      } else {
+        Register shift = i.InputRegister(1);
+        // Take shift value modulo 8.
+        __ and_(shift, 7);
+        // Mask off the unwanted bits before word-shifting.
+        __ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
+        __ mov(tmp, shift);
+        __ add(tmp, Immediate(8));
+        __ Movd(tmp_simd, tmp);
+        __ Psrlw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
+        __ Packuswb(kScratchDoubleReg, kScratchDoubleReg);
+        __ Pand(dst, kScratchDoubleReg);
+        __ Movd(tmp_simd, shift);
+        __ Psllw(dst, dst, tmp_simd);
+      }
       break;
     }
     case kIA32I8x16ShrS: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
-      Register tmp = i.ToRegister(instr->TempAt(0));
-      XMMRegister tmp_simd = i.TempSimd128Register(1);
-      // Unpack the bytes into words, do arithmetic shifts, and repack.
-      __ punpckhbw(kScratchDoubleReg, dst);
-      __ punpcklbw(dst, dst);
-      __ mov(tmp, i.InputRegister(1));
-      // Take shift value modulo 8.
-      __ and_(tmp, 7);
-      __ add(tmp, Immediate(8));
-      __ movd(tmp_simd, tmp);
-      __ psraw(kScratchDoubleReg, tmp_simd);
-      __ psraw(dst, tmp_simd);
-      __ packsswb(dst, kScratchDoubleReg);
+      if (HasImmediateInput(instr, 1)) {
+        __ punpckhbw(kScratchDoubleReg, dst);
+        __ punpcklbw(dst, dst);
+        uint8_t shift = i.InputInt3(1) + 8;
+        __ psraw(kScratchDoubleReg, shift);
+        __ psraw(dst, shift);
+        __ packsswb(dst, kScratchDoubleReg);
+      } else {
+        Register tmp = i.ToRegister(instr->TempAt(0));
+        XMMRegister tmp_simd = i.TempSimd128Register(1);
+        // Unpack the bytes into words, do arithmetic shifts, and repack.
+        __ punpckhbw(kScratchDoubleReg, dst);
+        __ punpcklbw(dst, dst);
+        __ mov(tmp, i.InputRegister(1));
+        // Take shift value modulo 8.
+        __ and_(tmp, 7);
+        __ add(tmp, Immediate(8));
+        __ movd(tmp_simd, tmp);
+        __ psraw(kScratchDoubleReg, tmp_simd);
+        __ psraw(dst, tmp_simd);
+        __ packsswb(dst, kScratchDoubleReg);
+      }
       break;
     }
     case kSSEI8x16Add: {
@@ -3423,21 +3446,35 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kIA32I8x16ShrU: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       XMMRegister dst = i.OutputSimd128Register();
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
       Register tmp = i.ToRegister(instr->TempAt(0));
       XMMRegister tmp_simd = i.TempSimd128Register(1);
-      // Unpack the bytes into words, do logical shifts, and repack.
-      __ punpckhbw(kScratchDoubleReg, dst);
-      __ punpcklbw(dst, dst);
-      __ mov(tmp, i.InputRegister(1));
-      // Take shift value modulo 8.
-      __ and_(tmp, 7);
-      __ add(tmp, Immediate(8));
-      __ movd(tmp_simd, tmp);
-      __ psrlw(kScratchDoubleReg, tmp_simd);
-      __ psrlw(dst, tmp_simd);
-      __ packuswb(dst, kScratchDoubleReg);
+
+      if (HasImmediateInput(instr, 1)) {
+        // Perform 16-bit shift, then mask away high bits.
+        uint8_t shift = i.InputInt3(1);
+        __ Psrlw(dst, dst, static_cast<byte>(shift));
+
+        uint8_t bmask = 0xff >> shift;
+        uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
+        __ mov(tmp, mask);
+        __ Movd(tmp_simd, tmp);
+        __ Pshufd(tmp_simd, tmp_simd, 0);
+        __ Pand(dst, tmp_simd);
+      } else {
+        // Unpack the bytes into words, do logical shifts, and repack.
+        __ punpckhbw(kScratchDoubleReg, dst);
+        __ punpcklbw(dst, dst);
+        __ mov(tmp, i.InputRegister(1));
+        // Take shift value modulo 8.
+        __ and_(tmp, 7);
+        __ add(tmp, Immediate(8));
+        __ movd(tmp_simd, tmp);
+        __ psrlw(kScratchDoubleReg, tmp_simd);
+        __ psrlw(dst, tmp_simd);
+        __ packuswb(dst, kScratchDoubleReg);
+      }
       break;
    }
     case kSSEI8x16MinU: {
@@ -320,8 +320,8 @@ void VisitRROSimdShift(InstructionSelector* selector, Node* node,
   }
 }

-void VisitRROI8x16SimdRightShift(InstructionSelector* selector, Node* node,
-                                 ArchOpcode opcode) {
+void VisitRROI8x16SimdShift(InstructionSelector* selector, Node* node,
+                            ArchOpcode opcode) {
   IA32OperandGenerator g(selector);
   InstructionOperand operand0 = g.UseUniqueRegister(node->InputAt(0));
   InstructionOperand operand1 = g.UseUniqueRegister(node->InputAt(1));
@@ -329,7 +329,6 @@ void VisitRROI8x16SimdRightShift(InstructionSelector* selector, Node* node,
   selector->Emit(opcode, g.DefineSameAsFirst(node), operand0, operand1,
                  arraysize(temps), temps);
 }

 }  // namespace

 void InstructionSelector::VisitStackSlot(Node* node) {
@@ -2135,10 +2134,6 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
   V(I16x8ShrS) \
   V(I16x8ShrU)

-#define SIMD_I8X16_RIGHT_SHIFT_OPCODES(V) \
-  V(I8x16ShrS) \
-  V(I8x16ShrU)
-
 void InstructionSelector::VisitF64x2Min(Node* node) {
   IA32OperandGenerator g(this);
   InstructionOperand temps[] = {g.TempSimd128Register()};
@@ -2361,14 +2356,6 @@ SIMD_SHIFT_OPCODES_UNIFED_SSE_AVX(VISIT_SIMD_SHIFT_UNIFIED_SSE_AVX)
 #undef VISIT_SIMD_SHIFT_UNIFIED_SSE_AVX
 #undef SIMD_SHIFT_OPCODES_UNIFED_SSE_AVX

-#define VISIT_SIMD_I8x16_RIGHT_SHIFT(Opcode) \
-  void InstructionSelector::Visit##Opcode(Node* node) { \
-    VisitRROI8x16SimdRightShift(this, node, kIA32##Opcode); \
-  }
-SIMD_I8X16_RIGHT_SHIFT_OPCODES(VISIT_SIMD_I8x16_RIGHT_SHIFT)
-#undef SIMD_I8X16_RIGHT_SHIFT_OPCODES
-#undef VISIT_SIMD_I8x16_RIGHT_SHIFT
-
 #define VISIT_SIMD_UNOP(Opcode) \
   void InstructionSelector::Visit##Opcode(Node* node) { \
     IA32OperandGenerator g(this); \
@@ -2448,11 +2435,37 @@ void InstructionSelector::VisitI8x16UConvertI16x8(Node* node) {

 void InstructionSelector::VisitI8x16Shl(Node* node) {
   IA32OperandGenerator g(this);
-  InstructionOperand operand0 = g.UseUniqueRegister(node->InputAt(0));
-  InstructionOperand operand1 = g.UseUniqueRegister(node->InputAt(1));
-  InstructionOperand temps[] = {g.TempRegister(), g.TempSimd128Register()};
-  Emit(kIA32I8x16Shl, g.DefineSameAsFirst(node), operand0, operand1,
-       arraysize(temps), temps);
+  if (g.CanBeImmediate(node->InputAt(1))) {
+    InstructionOperand temps[] = {g.TempRegister(), g.TempSimd128Register()};
+    this->Emit(kIA32I8x16Shl, g.DefineSameAsFirst(node),
+               g.UseRegister(node->InputAt(0)),
+               g.UseImmediate(node->InputAt(1)), arraysize(temps), temps);
+  } else {
+    VisitRROI8x16SimdShift(this, node, kIA32I8x16Shl);
+  }
 }

+void InstructionSelector::VisitI8x16ShrS(Node* node) {
+  IA32OperandGenerator g(this);
+  if (g.CanBeImmediate(node->InputAt(1))) {
+    this->Emit(kIA32I8x16ShrS, g.DefineSameAsFirst(node),
+               g.UseRegister(node->InputAt(0)),
+               g.UseImmediate(node->InputAt(1)));
+  } else {
+    VisitRROI8x16SimdShift(this, node, kIA32I8x16ShrS);
+  }
+}
+
+void InstructionSelector::VisitI8x16ShrU(Node* node) {
+  IA32OperandGenerator g(this);
+  if (g.CanBeImmediate(node->InputAt(1))) {
+    InstructionOperand temps[] = {g.TempRegister(), g.TempSimd128Register()};
+    this->Emit(kIA32I8x16ShrU, g.DefineSameAsFirst(node),
+               g.UseRegister(node->InputAt(0)),
+               g.UseImmediate(node->InputAt(1)), arraysize(temps), temps);
+  } else {
+    VisitRROI8x16SimdShift(this, node, kIA32I8x16ShrU);
+  }
+}
+
 void InstructionSelector::VisitInt32AbsWithOverflow(Node* node) {