[wasm simd] Handle more shuffles
- Handles zip, unzip, and transpose shuffles/swizzles. - Adds punpck* instructions to assembler. Bug: v8:6020 Change-Id: If124b7a7462ffd0470347b54ce4a93c01667e384 Reviewed-on: https://chromium-review.googlesource.com/1084069 Reviewed-by: Deepti Gandluri <gdeepti@chromium.org> Commit-Queue: Bill Budge <bbudge@chromium.org> Cr-Commit-Position: refs/heads/master@{#53947}
This commit is contained in:
parent
edfcba0407
commit
51ded9a743
@ -420,6 +420,30 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen,
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Emits the unpack instruction named by |opcode| for the current instruction.
// The second source is input 1 when the instruction has exactly two inputs;
// otherwise input 0 is used for both sources. When AVX is supported the
// three-operand v<opcode> form is emitted. Otherwise the two-operand form is
// emitted, which requires the output register to be the same as input 0.
#define ASSEMBLE_SIMD_PUNPCK_SHUFFLE(opcode)                        \
  do {                                                              \
    XMMRegister lhs = i.InputSimd128Register(0);                    \
    Operand rhs = i.InputOperand(instr->InputCount() == 2 ? 1 : 0); \
    XMMRegister dst = i.OutputSimd128Register();                    \
    if (!CpuFeatures::IsSupported(AVX)) {                           \
      DCHECK_EQ(dst, lhs);                                          \
      __ opcode(dst, rhs);                                          \
    } else {                                                        \
      CpuFeatureScope avx_scope(tasm(), AVX);                       \
      __ v##opcode(dst, lhs, rhs);                                  \
    }                                                               \
  } while (false)
|
||||
|
||||
// Emits the two-source, immediate-taking SIMD instruction named by |opcode|
// (used in this file for pblendw and palignr). When AVX is supported the
// three-operand v<opcode> form is emitted. Otherwise the two-operand form is
// emitted under a CpuFeatureScope for |SSELevel|, which requires the output
// register to be the same as input 0.
//
// The body is wrapped in do { ... } while (false) so that an invocation
// followed by ';' is exactly one statement and cannot capture a following
// 'else' when used inside an unbraced if/else.
#define ASSEMBLE_SIMD_IMM_SHUFFLE(opcode, SSELevel, imm)                 \
  do {                                                                   \
    if (CpuFeatures::IsSupported(AVX)) {                                 \
      CpuFeatureScope avx_scope(tasm(), AVX);                            \
      __ v##opcode(i.OutputSimd128Register(), i.InputSimd128Register(0), \
                   i.InputOperand(1), imm);                              \
    } else {                                                             \
      CpuFeatureScope sse_scope(tasm(), SSELevel);                       \
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));   \
      __ opcode(i.OutputSimd128Register(), i.InputOperand(1), imm);      \
    }                                                                    \
  } while (false)
|
||||
|
||||
void CodeGenerator::AssembleDeconstructFrame() {
|
||||
__ mov(esp, ebp);
|
||||
__ pop(ebp);
|
||||
@ -3195,23 +3219,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
}
|
||||
case kIA32S32x4Shuffle: {
|
||||
DCHECK_EQ(4, instr->InputCount()); // Swizzles should be handled above.
|
||||
__ Pshufd(kScratchDoubleReg, i.InputOperand(1), i.InputInt8(2));
|
||||
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(2));
|
||||
int8_t shuffle = i.InputInt8(2);
|
||||
DCHECK_NE(0xe4, shuffle); // A simple blend should be handled below.
|
||||
__ Pshufd(kScratchDoubleReg, i.InputOperand(1), shuffle);
|
||||
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), shuffle);
|
||||
__ Pblendw(i.OutputSimd128Register(), kScratchDoubleReg, i.InputInt8(3));
|
||||
break;
|
||||
}
|
||||
case kSSES16x8Blend: {
|
||||
CpuFeatureScope sse_scope(tasm(), SSE4_1);
|
||||
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
|
||||
__ pblendw(i.OutputSimd128Register(), i.InputOperand(1), i.InputInt8(2));
|
||||
case kIA32S16x8Blend:
|
||||
ASSEMBLE_SIMD_IMM_SHUFFLE(pblendw, SSE4_1, i.InputInt8(2));
|
||||
break;
|
||||
}
|
||||
case kAVXS16x8Blend: {
|
||||
CpuFeatureScope sse_scope(tasm(), AVX);
|
||||
__ vpblendw(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputOperand(1), i.InputInt8(2));
|
||||
break;
|
||||
}
|
||||
case kIA32S16x8HalfShuffle1: {
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
__ Pshuflw(dst, i.InputOperand(0), i.InputInt8(1));
|
||||
@ -3227,18 +3244,202 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
__ Pblendw(dst, kScratchDoubleReg, i.InputInt8(4));
|
||||
break;
|
||||
}
|
||||
case kSSES8x16Alignr: {
|
||||
CpuFeatureScope sse_scope(tasm(), SSSE3);
|
||||
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
|
||||
__ palignr(i.OutputSimd128Register(), i.InputOperand(1), i.InputInt8(2));
|
||||
case kIA32S8x16Alignr:
|
||||
ASSEMBLE_SIMD_IMM_SHUFFLE(palignr, SSSE3, i.InputInt8(2));
|
||||
break;
|
||||
case kIA32S64x2UnpackHigh:
|
||||
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhqdq);
|
||||
break;
|
||||
case kIA32S32x4UnpackHigh:
|
||||
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhdq);
|
||||
break;
|
||||
case kIA32S16x8UnpackHigh:
|
||||
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhwd);
|
||||
break;
|
||||
case kIA32S8x16UnpackHigh:
|
||||
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhbw);
|
||||
break;
|
||||
case kIA32S64x2UnpackLow:
|
||||
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklqdq);
|
||||
break;
|
||||
case kIA32S32x4UnpackLow:
|
||||
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckldq);
|
||||
break;
|
||||
case kIA32S16x8UnpackLow:
|
||||
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklwd);
|
||||
break;
|
||||
case kIA32S8x16UnpackLow:
|
||||
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklbw);
|
||||
break;
|
||||
case kSSES16x8UnzipHigh: {
|
||||
CpuFeatureScope sse_scope(tasm(), SSE4_1);
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
XMMRegister src2 = dst;
|
||||
DCHECK_EQ(dst, i.InputSimd128Register(0));
|
||||
if (instr->InputCount() == 2) {
|
||||
__ movups(kScratchDoubleReg, i.InputOperand(1));
|
||||
__ psrld(kScratchDoubleReg, 16);
|
||||
src2 = kScratchDoubleReg;
|
||||
}
|
||||
__ psrld(dst, 16);
|
||||
__ packusdw(dst, src2);
|
||||
break;
|
||||
}
|
||||
case kAVXS8x16Alignr: {
|
||||
case kAVXS16x8UnzipHigh: {
|
||||
CpuFeatureScope avx_scope(tasm(), AVX);
|
||||
__ vpalignr(i.OutputSimd128Register(), i.InputSimd128Register(0),
|
||||
i.InputOperand(1), i.InputInt8(2));
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
XMMRegister src2 = dst;
|
||||
if (instr->InputCount() == 2) {
|
||||
__ vpsrld(kScratchDoubleReg, i.InputSimd128Register(1), 16);
|
||||
src2 = kScratchDoubleReg;
|
||||
}
|
||||
__ vpsrld(dst, i.InputSimd128Register(0), 16);
|
||||
__ vpackusdw(dst, dst, src2);
|
||||
break;
|
||||
}
|
||||
case kSSES16x8UnzipLow: {
|
||||
CpuFeatureScope sse_scope(tasm(), SSE4_1);
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
XMMRegister src2 = dst;
|
||||
DCHECK_EQ(dst, i.InputSimd128Register(0));
|
||||
__ pxor(kScratchDoubleReg, kScratchDoubleReg);
|
||||
if (instr->InputCount() == 2) {
|
||||
__ pblendw(kScratchDoubleReg, i.InputOperand(1), 0x55);
|
||||
src2 = kScratchDoubleReg;
|
||||
}
|
||||
__ pblendw(dst, kScratchDoubleReg, 0xaa);
|
||||
__ packusdw(dst, src2);
|
||||
break;
|
||||
}
|
||||
case kAVXS16x8UnzipLow: {
|
||||
CpuFeatureScope avx_scope(tasm(), AVX);
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
XMMRegister src2 = dst;
|
||||
__ vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
|
||||
if (instr->InputCount() == 2) {
|
||||
__ vpblendw(kScratchDoubleReg, kScratchDoubleReg, i.InputOperand(1),
|
||||
0x55);
|
||||
src2 = kScratchDoubleReg;
|
||||
}
|
||||
__ vpblendw(dst, kScratchDoubleReg, i.InputSimd128Register(0), 0x55);
|
||||
__ vpackusdw(dst, dst, src2);
|
||||
break;
|
||||
}
|
||||
case kSSES8x16UnzipHigh: {
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
XMMRegister src2 = dst;
|
||||
DCHECK_EQ(dst, i.InputSimd128Register(0));
|
||||
if (instr->InputCount() == 2) {
|
||||
__ movups(kScratchDoubleReg, i.InputOperand(1));
|
||||
__ psrlw(kScratchDoubleReg, 8);
|
||||
src2 = kScratchDoubleReg;
|
||||
}
|
||||
__ psrlw(dst, 8);
|
||||
__ packuswb(dst, src2);
|
||||
break;
|
||||
}
|
||||
case kAVXS8x16UnzipHigh: {
|
||||
CpuFeatureScope avx_scope(tasm(), AVX);
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
XMMRegister src2 = dst;
|
||||
if (instr->InputCount() == 2) {
|
||||
__ vpsrlw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
|
||||
src2 = kScratchDoubleReg;
|
||||
}
|
||||
__ vpsrlw(dst, i.InputSimd128Register(0), 8);
|
||||
__ vpackuswb(dst, dst, src2);
|
||||
break;
|
||||
}
|
||||
case kSSES8x16UnzipLow: {
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
XMMRegister src2 = dst;
|
||||
DCHECK_EQ(dst, i.InputSimd128Register(0));
|
||||
if (instr->InputCount() == 2) {
|
||||
__ movups(kScratchDoubleReg, i.InputOperand(1));
|
||||
__ psllw(kScratchDoubleReg, 8);
|
||||
__ psrlw(kScratchDoubleReg, 8);
|
||||
src2 = kScratchDoubleReg;
|
||||
}
|
||||
__ psllw(dst, 8);
|
||||
__ psrlw(dst, 8);
|
||||
__ packuswb(dst, src2);
|
||||
break;
|
||||
}
|
||||
case kAVXS8x16UnzipLow: {
|
||||
CpuFeatureScope avx_scope(tasm(), AVX);
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
XMMRegister src2 = dst;
|
||||
if (instr->InputCount() == 2) {
|
||||
__ vpsllw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
|
||||
__ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 8);
|
||||
src2 = kScratchDoubleReg;
|
||||
}
|
||||
__ vpsllw(dst, i.InputSimd128Register(0), 8);
|
||||
__ vpsrlw(dst, dst, 8);
|
||||
__ vpackuswb(dst, dst, src2);
|
||||
break;
|
||||
}
|
||||
case kSSES8x16TransposeLow: {
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
DCHECK_EQ(dst, i.InputSimd128Register(0));
|
||||
__ psllw(dst, 8);
|
||||
if (instr->InputCount() == 1) {
|
||||
__ movups(kScratchDoubleReg, dst);
|
||||
} else {
|
||||
DCHECK_EQ(2, instr->InputCount());
|
||||
__ movups(kScratchDoubleReg, i.InputOperand(1));
|
||||
__ psllw(kScratchDoubleReg, 8);
|
||||
}
|
||||
__ psrlw(dst, 8);
|
||||
__ por(dst, kScratchDoubleReg);
|
||||
break;
|
||||
}
|
||||
case kAVXS8x16TransposeLow: {
|
||||
CpuFeatureScope avx_scope(tasm(), AVX);
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
if (instr->InputCount() == 1) {
|
||||
__ vpsllw(kScratchDoubleReg, i.InputSimd128Register(0), 8);
|
||||
__ vpsrlw(dst, kScratchDoubleReg, 8);
|
||||
} else {
|
||||
DCHECK_EQ(2, instr->InputCount());
|
||||
__ vpsllw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
|
||||
__ vpsllw(dst, i.InputSimd128Register(0), 8);
|
||||
__ vpsrlw(dst, dst, 8);
|
||||
}
|
||||
__ vpor(dst, dst, kScratchDoubleReg);
|
||||
break;
|
||||
}
|
||||
case kSSES8x16TransposeHigh: {
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
DCHECK_EQ(dst, i.InputSimd128Register(0));
|
||||
__ psrlw(dst, 8);
|
||||
if (instr->InputCount() == 1) {
|
||||
__ movups(kScratchDoubleReg, dst);
|
||||
} else {
|
||||
DCHECK_EQ(2, instr->InputCount());
|
||||
__ movups(kScratchDoubleReg, i.InputOperand(1));
|
||||
__ psrlw(kScratchDoubleReg, 8);
|
||||
}
|
||||
__ psllw(kScratchDoubleReg, 8);
|
||||
__ por(dst, kScratchDoubleReg);
|
||||
break;
|
||||
}
|
||||
case kAVXS8x16TransposeHigh: {
|
||||
CpuFeatureScope avx_scope(tasm(), AVX);
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
if (instr->InputCount() == 1) {
|
||||
__ vpsrlw(dst, i.InputSimd128Register(0), 8);
|
||||
__ vpsllw(kScratchDoubleReg, dst, 8);
|
||||
} else {
|
||||
DCHECK_EQ(2, instr->InputCount());
|
||||
__ vpsrlw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
|
||||
__ vpsrlw(dst, i.InputSimd128Register(0), 8);
|
||||
__ vpsllw(kScratchDoubleReg, kScratchDoubleReg, 8);
|
||||
}
|
||||
__ vpor(dst, dst, kScratchDoubleReg);
|
||||
break;
|
||||
}
|
||||
|
||||
case kIA32S1x4AnyTrue:
|
||||
case kIA32S1x8AnyTrue:
|
||||
case kIA32S1x16AnyTrue: {
|
||||
@ -4136,6 +4337,8 @@ void CodeGenerator::AssembleJumpTable(Label** targets, size_t target_count) {
|
||||
#undef ASSEMBLE_BINOP
|
||||
#undef ASSEMBLE_ATOMIC_BINOP
|
||||
#undef ASSEMBLE_MOVX
|
||||
#undef ASSEMBLE_SIMD_PUNPCK_SHUFFLE
|
||||
#undef ASSEMBLE_SIMD_IMM_SHUFFLE
|
||||
|
||||
} // namespace compiler
|
||||
} // namespace internal
|
||||
|
@ -303,12 +303,30 @@ namespace compiler {
|
||||
V(IA32S8x16Shuffle) \
|
||||
V(IA32S32x4Swizzle) \
|
||||
V(IA32S32x4Shuffle) \
|
||||
V(SSES16x8Blend) \
|
||||
V(AVXS16x8Blend) \
|
||||
V(IA32S16x8Blend) \
|
||||
V(IA32S16x8HalfShuffle1) \
|
||||
V(IA32S16x8HalfShuffle2) \
|
||||
V(SSES8x16Alignr) \
|
||||
V(AVXS8x16Alignr) \
|
||||
V(IA32S8x16Alignr) \
|
||||
V(SSES16x8UnzipHigh) \
|
||||
V(AVXS16x8UnzipHigh) \
|
||||
V(SSES16x8UnzipLow) \
|
||||
V(AVXS16x8UnzipLow) \
|
||||
V(SSES8x16UnzipHigh) \
|
||||
V(AVXS8x16UnzipHigh) \
|
||||
V(SSES8x16UnzipLow) \
|
||||
V(AVXS8x16UnzipLow) \
|
||||
V(IA32S64x2UnpackHigh) \
|
||||
V(IA32S32x4UnpackHigh) \
|
||||
V(IA32S16x8UnpackHigh) \
|
||||
V(IA32S8x16UnpackHigh) \
|
||||
V(IA32S64x2UnpackLow) \
|
||||
V(IA32S32x4UnpackLow) \
|
||||
V(IA32S16x8UnpackLow) \
|
||||
V(IA32S8x16UnpackLow) \
|
||||
V(SSES8x16TransposeLow) \
|
||||
V(AVXS8x16TransposeLow) \
|
||||
V(SSES8x16TransposeHigh) \
|
||||
V(AVXS8x16TransposeHigh) \
|
||||
V(IA32S1x4AnyTrue) \
|
||||
V(IA32S1x4AllTrue) \
|
||||
V(IA32S1x8AnyTrue) \
|
||||
|
@ -285,12 +285,30 @@ int InstructionScheduler::GetTargetInstructionFlags(
|
||||
case kIA32S8x16Shuffle:
|
||||
case kIA32S32x4Swizzle:
|
||||
case kIA32S32x4Shuffle:
|
||||
case kSSES16x8Blend:
|
||||
case kAVXS16x8Blend:
|
||||
case kIA32S16x8Blend:
|
||||
case kIA32S16x8HalfShuffle1:
|
||||
case kIA32S16x8HalfShuffle2:
|
||||
case kSSES8x16Alignr:
|
||||
case kAVXS8x16Alignr:
|
||||
case kIA32S8x16Alignr:
|
||||
case kSSES16x8UnzipHigh:
|
||||
case kAVXS16x8UnzipHigh:
|
||||
case kSSES16x8UnzipLow:
|
||||
case kAVXS16x8UnzipLow:
|
||||
case kSSES8x16UnzipHigh:
|
||||
case kAVXS8x16UnzipHigh:
|
||||
case kSSES8x16UnzipLow:
|
||||
case kAVXS8x16UnzipLow:
|
||||
case kIA32S64x2UnpackHigh:
|
||||
case kIA32S32x4UnpackHigh:
|
||||
case kIA32S16x8UnpackHigh:
|
||||
case kIA32S8x16UnpackHigh:
|
||||
case kIA32S64x2UnpackLow:
|
||||
case kIA32S32x4UnpackLow:
|
||||
case kIA32S16x8UnpackLow:
|
||||
case kIA32S8x16UnpackLow:
|
||||
case kSSES8x16TransposeLow:
|
||||
case kAVXS8x16TransposeLow:
|
||||
case kSSES8x16TransposeHigh:
|
||||
case kAVXS8x16TransposeHigh:
|
||||
case kIA32S1x4AnyTrue:
|
||||
case kIA32S1x4AllTrue:
|
||||
case kIA32S1x8AnyTrue:
|
||||
|
@ -2059,18 +2059,125 @@ uint8_t PackBlend4(const uint8_t* shuffle32x4) {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns true if the 16x8 shuffle can be decomposed into two 16x4 half
// shuffles followed by a 16x8 blend, i.e. no lane moves from the low 4 lane
// positions to the high 4 lane positions or vice versa.
// E.g. [3 2 1 0 15 14 13 12].
//
// |shuffle16x8| points to 8 lane indices and is only read.
// On success, bit i of |*blend_mask| is set when the index for output lane i
// is greater than 7. On failure |*blend_mask| may hold a partial result and
// should be ignored.
bool TryMatch16x8HalfShuffle(const uint8_t* shuffle16x8, uint8_t* blend_mask) {
  *blend_mask = 0;
  for (int i = 0; i < 8; i++) {
    // Bit 2 of a lane index tells which half (lanes 0-3 or 4-7) it refers to;
    // it has to agree with the half that output lane i belongs to.
    if ((shuffle16x8[i] & 0x4) != (i & 0x4)) return false;
    *blend_mask |= (shuffle16x8[i] > 7 ? 1 : 0) << i;
  }
  return true;
}
|
||||
|
||||
// One row of a shuffle-matching table: a fixed lane pattern together with the
// opcodes that implement it and the operand constraints those opcodes need.
struct ShuffleEntry {
|
||||
// Lane pattern to compare against, one index per lane (kSimd128Size lanes).
uint8_t shuffle[kSimd128Size];
|
||||
// Opcode to select when AVX is not used.
ArchOpcode opcode;
|
||||
// Opcode to select when AVX is used.
ArchOpcode avx_opcode;
|
||||
// True if the first source has to be in a register.
bool src0_needs_reg;
|
||||
// True if the second source has to be in a register.
bool src1_needs_reg;
|
||||
};
|
||||
|
||||
// Shuffles that map to architecture-specific instruction sequences. These are
|
||||
// matched very early, so we shouldn't include shuffles that match better in
|
||||
// later tests, like 32x4 and 16x8 shuffles. In general, these patterns should
|
||||
// map to either a single instruction, or be finer grained, such as zip/unzip or
|
||||
// transpose patterns.
|
||||
static const ShuffleEntry arch_shuffles[] = {
|
||||
{{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23},
|
||||
kIA32S64x2UnpackLow,
|
||||
kIA32S64x2UnpackLow,
|
||||
true,
|
||||
false},
|
||||
{{8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
|
||||
kIA32S64x2UnpackHigh,
|
||||
kIA32S64x2UnpackHigh,
|
||||
true,
|
||||
false},
|
||||
{{0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23},
|
||||
kIA32S32x4UnpackLow,
|
||||
kIA32S32x4UnpackLow,
|
||||
true,
|
||||
false},
|
||||
{{8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31},
|
||||
kIA32S32x4UnpackHigh,
|
||||
kIA32S32x4UnpackHigh,
|
||||
true,
|
||||
false},
|
||||
{{0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23},
|
||||
kIA32S16x8UnpackLow,
|
||||
kIA32S16x8UnpackLow,
|
||||
true,
|
||||
false},
|
||||
{{8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31},
|
||||
kIA32S16x8UnpackHigh,
|
||||
kIA32S16x8UnpackHigh,
|
||||
true,
|
||||
false},
|
||||
{{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23},
|
||||
kIA32S8x16UnpackLow,
|
||||
kIA32S8x16UnpackLow,
|
||||
true,
|
||||
false},
|
||||
{{8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31},
|
||||
kIA32S8x16UnpackHigh,
|
||||
kIA32S8x16UnpackHigh,
|
||||
true,
|
||||
false},
|
||||
|
||||
{{0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29},
|
||||
kSSES16x8UnzipLow,
|
||||
kAVXS16x8UnzipLow,
|
||||
true,
|
||||
false},
|
||||
{{2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31},
|
||||
kSSES16x8UnzipHigh,
|
||||
kAVXS16x8UnzipHigh,
|
||||
true,
|
||||
true},
|
||||
{{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30},
|
||||
kSSES8x16UnzipLow,
|
||||
kAVXS8x16UnzipLow,
|
||||
true,
|
||||
true},
|
||||
{{1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31},
|
||||
kSSES8x16UnzipHigh,
|
||||
kAVXS8x16UnzipHigh,
|
||||
true,
|
||||
true},
|
||||
|
||||
{{0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30},
|
||||
kSSES8x16TransposeLow,
|
||||
kAVXS8x16TransposeLow,
|
||||
true,
|
||||
true},
|
||||
{{1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31},
|
||||
kSSES8x16TransposeHigh,
|
||||
kAVXS8x16TransposeHigh,
|
||||
true,
|
||||
true}};
|
||||
|
||||
bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
|
||||
size_t num_entries, bool is_swizzle,
|
||||
const ShuffleEntry** arch_shuffle) {
|
||||
uint8_t mask = is_swizzle ? kSimd128Size - 1 : 2 * kSimd128Size - 1;
|
||||
for (size_t i = 0; i < num_entries; ++i) {
|
||||
const ShuffleEntry& entry = table[i];
|
||||
int j = 0;
|
||||
for (; j < kSimd128Size; ++j) {
|
||||
if ((entry.shuffle[j] & mask) != (shuffle[j] & mask)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (j == kSimd128Size) {
|
||||
*arch_shuffle = &entry;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// TODO(bbudge) Make sure identity shuffle emits no instructions.
|
||||
@ -2090,40 +2197,51 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
|
||||
bool use_avx = CpuFeatures::IsSupported(AVX);
|
||||
// AVX and swizzles don't generally need DefineSameAsFirst to avoid a move.
|
||||
bool no_same_as_first = use_avx || is_swizzle;
|
||||
// We generally need UseRegister for the first source.
|
||||
bool no_use_register = false;
|
||||
// We generally need UseRegister for input0, Use for input1.
|
||||
bool src0_needs_reg = true;
|
||||
bool src1_needs_reg = false;
|
||||
ArchOpcode opcode = kIA32S8x16Shuffle; // general shuffle is the default
|
||||
|
||||
uint8_t offset;
|
||||
uint8_t shuffle32x4[4];
|
||||
uint8_t shuffle16x8[8];
|
||||
const ShuffleEntry* arch_shuffle;
|
||||
if (TryMatchConcat(shuffle, &offset)) {
|
||||
// Swap inputs from the normal order for (v)palignr.
|
||||
SwapShuffleInputs(node);
|
||||
is_swizzle = false; // It's simpler to just handle the general case.
|
||||
no_same_as_first = use_avx; // SSE requires same-as-first.
|
||||
opcode = use_avx ? kAVXS8x16Alignr : kSSES8x16Alignr;
|
||||
opcode = kIA32S8x16Alignr;
|
||||
// palignr takes a single imm8 offset.
|
||||
imms[imm_count++] = offset;
|
||||
} else if (TryMatchArchShuffle(shuffle, arch_shuffles,
|
||||
arraysize(arch_shuffles), is_swizzle,
|
||||
&arch_shuffle)) {
|
||||
opcode = use_avx ? arch_shuffle->avx_opcode : arch_shuffle->opcode;
|
||||
src0_needs_reg = arch_shuffle->src0_needs_reg;
|
||||
// SSE can't take advantage of both operands in registers and needs
|
||||
// same-as-first.
|
||||
src1_needs_reg = use_avx && arch_shuffle->src1_needs_reg;
|
||||
no_same_as_first = use_avx;
|
||||
} else if (TryMatch32x4Shuffle(shuffle, shuffle32x4)) {
|
||||
uint8_t shuffle_mask = PackShuffle4(shuffle32x4);
|
||||
if (is_swizzle) {
|
||||
// pshufd takes a single imm8 shuffle mask.
|
||||
opcode = kIA32S32x4Swizzle;
|
||||
no_same_as_first = true;
|
||||
no_use_register = true;
|
||||
src0_needs_reg = false;
|
||||
imms[imm_count++] = shuffle_mask;
|
||||
} else {
|
||||
// 2 operand shuffle
|
||||
// A blend is more efficient than a general 32x4 shuffle; try it first.
|
||||
if (TryMatchBlend(shuffle)) {
|
||||
opcode = use_avx ? kAVXS16x8Blend : kSSES16x8Blend;
|
||||
opcode = kIA32S16x8Blend;
|
||||
uint8_t blend_mask = PackBlend4(shuffle32x4);
|
||||
imms[imm_count++] = blend_mask;
|
||||
} else {
|
||||
opcode = kIA32S32x4Shuffle;
|
||||
no_same_as_first = true;
|
||||
no_use_register = true;
|
||||
src0_needs_reg = false;
|
||||
imms[imm_count++] = shuffle_mask;
|
||||
int8_t blend_mask = PackBlend4(shuffle32x4);
|
||||
imms[imm_count++] = blend_mask;
|
||||
@ -2132,14 +2250,14 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
|
||||
} else if (TryMatch16x8Shuffle(shuffle, shuffle16x8)) {
|
||||
uint8_t blend_mask;
|
||||
if (TryMatchBlend(shuffle)) {
|
||||
opcode = use_avx ? kAVXS16x8Blend : kSSES16x8Blend;
|
||||
opcode = kIA32S16x8Blend;
|
||||
blend_mask = PackBlend8(shuffle16x8);
|
||||
imms[imm_count++] = blend_mask;
|
||||
} else if (Is16x8BlendedShuffle(shuffle16x8, &blend_mask)) {
|
||||
} else if (TryMatch16x8HalfShuffle(shuffle16x8, &blend_mask)) {
|
||||
opcode = is_swizzle ? kIA32S16x8HalfShuffle1 : kIA32S16x8HalfShuffle2;
|
||||
// Half-shuffles don't need DefineSameAsFirst or UseRegister(src0).
|
||||
no_same_as_first = true;
|
||||
no_use_register = true;
|
||||
src0_needs_reg = false;
|
||||
uint8_t mask_lo = PackShuffle4(shuffle16x8);
|
||||
uint8_t mask_hi = PackShuffle4(shuffle16x8 + 4);
|
||||
imms[imm_count++] = mask_lo;
|
||||
@ -2150,7 +2268,7 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
|
||||
if (opcode == kIA32S8x16Shuffle) {
|
||||
// Use same-as-first for general swizzle, but not shuffle.
|
||||
no_same_as_first = !is_swizzle;
|
||||
no_use_register = no_same_as_first;
|
||||
src0_needs_reg = !no_same_as_first;
|
||||
imms[imm_count++] = Pack4Lanes(shuffle);
|
||||
imms[imm_count++] = Pack4Lanes(shuffle + 4);
|
||||
imms[imm_count++] = Pack4Lanes(shuffle + 8);
|
||||
@ -2164,13 +2282,15 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
|
||||
InstructionOperand dst =
|
||||
no_same_as_first ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
|
||||
InstructionOperand src0 =
|
||||
no_use_register ? g.Use(input0) : g.UseRegister(input0);
|
||||
src0_needs_reg ? g.UseRegister(input0) : g.Use(input0);
|
||||
|
||||
int input_count = 0;
|
||||
InstructionOperand inputs[2 + kMaxImms + kMaxTemps];
|
||||
inputs[input_count++] = src0;
|
||||
if (!is_swizzle) {
|
||||
inputs[input_count++] = g.Use(node->InputAt(1));
|
||||
Node* input1 = node->InputAt(1);
|
||||
inputs[input_count++] =
|
||||
src1_needs_reg ? g.UseRegister(input1) : g.Use(input1);
|
||||
}
|
||||
for (int i = 0; i < imm_count; ++i) {
|
||||
inputs[input_count++] = g.UseImmediate(imms[i]);
|
||||
|
@ -42,8 +42,14 @@
|
||||
V(psubsw, 66, 0F, E9) \
|
||||
V(psubusb, 66, 0F, D8) \
|
||||
V(psubusw, 66, 0F, D9) \
|
||||
V(punpckhdq, 66, 0F, 6A) \
|
||||
V(punpcklbw, 66, 0F, 60) \
|
||||
V(punpcklwd, 66, 0F, 61) \
|
||||
V(punpckldq, 66, 0F, 62) \
|
||||
V(punpcklqdq, 66, 0F, 6C) \
|
||||
V(punpckhbw, 66, 0F, 68) \
|
||||
V(punpckhwd, 66, 0F, 69) \
|
||||
V(punpckhdq, 66, 0F, 6A) \
|
||||
V(punpckhqdq, 66, 0F, 6D) \
|
||||
V(pxor, 66, 0F, EF)
|
||||
|
||||
#define SSSE3_INSTRUCTION_LIST(V) \
|
||||
|
@ -1874,6 +1874,7 @@ WASM_SIMD_COMPILED_AND_LOWERED_TEST(S16x8TransposeRight) {
|
||||
{{2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31}});
|
||||
}
|
||||
|
||||
// TODO(simd) 'Reverse' tests should be 2-operand shuffles, not swizzles.
|
||||
WASM_SIMD_COMPILED_AND_LOWERED_TEST(S16x4Reverse) {
|
||||
RunShuffleOpTest<int8_t>(
|
||||
execution_mode, lower_simd, kExprS8x16Shuffle,
|
||||
|
Loading…
Reference in New Issue
Block a user