[wasm-simd][x64] Add AVX codegen

For a bunch of s8x16, s16x8, and s32x4 shuffle ops (generated by
s8x16.shuffle).

Bug: v8:9561
Change-Id: I0e5cd8a90edba8bc15918c0ca1dc830475db2769
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2110952
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66865}
Author:    Ng Zhi An
Date:      2020-03-19 17:05:30 -07:00
Committer: Commit Bot
parent 8e8d61b38b
commit 2f83184db3
6 changed files with 110 additions and 82 deletions
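
At a glance: the codegen changes below swap direct SSE emitters (pshufd, pblendw, ...) for their capitalized macro-assembler wrappers (Pshufd, Pblendw, ...), which emit the VEX-encoded AVX form when the CPU supports it and fall back to the legacy SSE encoding otherwise. A minimal sketch of that dispatch shape, assuming a simple boolean feature flag (the real TurboAssembler generates these wrappers via the AVX_OP macro; the types and names below are illustrative, not V8's):

    #include <cstdint>

    struct AssemblerSketch {
      bool have_avx;  // assumption: filled in from CPUID feature detection

      // Shape of a wrapper generated by AVX_OP(Pshufhw, pshufhw).
      void Pshufhw(int dst, int src, uint8_t imm8) {
        if (have_avx) {
          vpshufhw(dst, src, imm8);  // VEX form: 3-operand capable, no SSE/AVX transition stall
        } else {
          pshufhw(dst, src, imm8);   // legacy SSE2 encoding
        }
      }

      // Encoders stubbed out; only the dispatch matters here.
      void vpshufhw(int, int, uint8_t) {}
      void pshufhw(int, int, uint8_t) {}
    };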


@@ -1523,6 +1523,10 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
     vinstr(0x70, dst, xmm0, src, k66, k0F, kWIG);
     emit(imm8);
   }
+  void vpshufd(XMMRegister dst, Operand src, uint8_t imm8) {
+    vinstr(0x70, dst, xmm0, src, k66, k0F, kWIG);
+    emit(imm8);
+  }
   void vpshuflw(XMMRegister dst, XMMRegister src, uint8_t imm8) {
     vinstr(0x70, dst, xmm0, src, kF2, k0F, kWIG);
     emit(imm8);
@@ -1531,6 +1535,14 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
     vinstr(0x70, dst, xmm0, src, kF2, k0F, kWIG);
     emit(imm8);
   }
+  void vpshufhw(XMMRegister dst, XMMRegister src, uint8_t imm8) {
+    vinstr(0x70, dst, xmm0, src, kF3, k0F, kWIG);
+    emit(imm8);
+  }
+  void vpshufhw(XMMRegister dst, Operand src, uint8_t imm8) {
+    vinstr(0x70, dst, xmm0, src, kF3, k0F, kWIG);
+    emit(imm8);
+  }
   void vpblendw(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 uint8_t mask) {
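
All four shuffles in this family share opcode 0x70 and differ only in the mandatory prefix: 66 0F 70 is pshufd, F2 0F 70 is pshuflw, and F3 0F 70 is pshufhw, which is why the overloads above pass k66, kF2, and kF3 into vinstr. As a semantic reference, a scalar model of pshufhw (a standalone sketch, not V8 code):

    #include <cstdint>
    #include <cstdio>

    // pshufhw: the low four 16-bit lanes pass through; high lane i picks
    // high source lane (imm8 >> 2*i) & 3.
    void pshufhw_model(uint16_t dst[8], const uint16_t src[8], uint8_t imm8) {
      for (int i = 0; i < 4; ++i) dst[i] = src[i];
      for (int i = 0; i < 4; ++i) dst[4 + i] = src[4 + ((imm8 >> (2 * i)) & 3)];
    }

    int main() {
      uint16_t v[8] = {0, 1, 2, 3, 4, 5, 6, 7}, r[8];
      pshufhw_model(r, v, 0x1B);              // 0x1B reverses the high words
      for (uint16_t x : r) printf("%u ", x);  // prints: 0 1 2 3 7 6 5 4
    }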


@@ -216,10 +216,18 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   AVX_OP(Mulps, mulps)
   AVX_OP(Divps, divps)
   AVX_OP(Pshuflw, pshuflw)
+  AVX_OP(Pshufhw, pshufhw)
   AVX_OP(Packsswb, packsswb)
   AVX_OP(Packuswb, packuswb)
   AVX_OP(Packssdw, packssdw)
+  AVX_OP(Punpcklbw, punpcklbw)
+  AVX_OP(Punpcklwd, punpcklwd)
+  AVX_OP(Punpckldq, punpckldq)
+  AVX_OP(Punpckhbw, punpckhbw)
+  AVX_OP(Punpckhwd, punpckhwd)
+  AVX_OP(Punpckhdq, punpckhdq)
   AVX_OP(Punpcklqdq, punpcklqdq)
+  AVX_OP(Punpckhqdq, punpckhqdq)
   AVX_OP(Pshufd, pshufd)
   AVX_OP(Cmpps, cmpps)
   AVX_OP(Cmppd, cmppd)
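
The new Punpck* entries round out the interleave family so the code generator below can drop its explicit SSE scopes. For reference, a scalar model of the low-byte interleave these instructions perform, as used by the kX64S8x16Dup lowering (a sketch, not V8 code):

    #include <cstdint>

    // punpcklbw dst, src: interleave the low eight bytes of dst and src;
    // with src == dst this duplicates each of the low eight bytes.
    void punpcklbw_model(uint8_t dst[16], const uint8_t src[16]) {
      uint8_t out[16];
      for (int i = 0; i < 8; ++i) {
        out[2 * i] = dst[i];
        out[2 * i + 1] = src[i];
      }
      for (int i = 0; i < 16; ++i) dst[i] = out[i];
    }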


@@ -62,6 +62,10 @@ class InstructionOperandConverter {
     return static_cast<int8_t>(InputInt32(index));
   }
+
+  uint8_t InputUint8(size_t index) {
+    return bit_cast<uint8_t>(InputInt8(index));
+  }
   int16_t InputInt16(size_t index) {
     return static_cast<int16_t>(InputInt32(index));
   }
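
InputUint8 exists because shuffle immediates such as 0xaa are stored as int32 instruction inputs; narrowing to int8_t makes them negative, and reinterpreting the bits recovers the 0..255 value the emitters expect. A standalone model of the conversion, using memcpy where V8 uses bit_cast:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    uint8_t input_uint8_model(int32_t raw) {
      int8_t as_signed = static_cast<int8_t>(raw);  // 0xaa becomes -86
      uint8_t as_unsigned;
      std::memcpy(&as_unsigned, &as_signed, 1);     // bit_cast equivalent
      return as_unsigned;                           // 0xaa == 170 again
    }

    int main() { printf("%u\n", input_uint8_model(0xaa)); }  // prints 170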


@ -580,9 +580,8 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen,
ASSEMBLE_SIMD_INSTR(opcode, dst, input_index); \ ASSEMBLE_SIMD_INSTR(opcode, dst, input_index); \
} while (false) } while (false)
#define ASSEMBLE_SIMD_IMM_SHUFFLE(opcode, SSELevel, imm) \ #define ASSEMBLE_SIMD_IMM_SHUFFLE(opcode, imm) \
do { \ do { \
CpuFeatureScope sse_scope(tasm(), SSELevel); \
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0)); \ DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0)); \
__ opcode(i.OutputSimd128Register(), i.InputSimd128Register(1), imm); \ __ opcode(i.OutputSimd128Register(), i.InputSimd128Register(1), imm); \
} while (false) } while (false)
@@ -3747,129 +3746,126 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kX64S32x4Swizzle: {
       DCHECK_EQ(2, instr->InputCount());
-      ASSEMBLE_SIMD_IMM_INSTR(pshufd, i.OutputSimd128Register(), 0,
-                              i.InputInt8(1));
+      ASSEMBLE_SIMD_IMM_INSTR(Pshufd, i.OutputSimd128Register(), 0,
+                              i.InputUint8(1));
       break;
     }
     case kX64S32x4Shuffle: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       DCHECK_EQ(4, instr->InputCount());  // Swizzles should be handled above.
-      int8_t shuffle = i.InputInt8(2);
+      uint8_t shuffle = i.InputUint8(2);
       DCHECK_NE(0xe4, shuffle);  // A simple blend should be handled below.
-      ASSEMBLE_SIMD_IMM_INSTR(pshufd, kScratchDoubleReg, 1, shuffle);
-      ASSEMBLE_SIMD_IMM_INSTR(pshufd, i.OutputSimd128Register(), 0, shuffle);
-      __ pblendw(i.OutputSimd128Register(), kScratchDoubleReg, i.InputInt8(3));
+      ASSEMBLE_SIMD_IMM_INSTR(Pshufd, kScratchDoubleReg, 1, shuffle);
+      ASSEMBLE_SIMD_IMM_INSTR(Pshufd, i.OutputSimd128Register(), 0, shuffle);
+      __ Pblendw(i.OutputSimd128Register(), kScratchDoubleReg, i.InputUint8(3));
       break;
     }
     case kX64S16x8Blend: {
-      ASSEMBLE_SIMD_IMM_SHUFFLE(pblendw, SSE4_1, i.InputInt8(2));
+      ASSEMBLE_SIMD_IMM_SHUFFLE(Pblendw, i.InputUint8(2));
       break;
     }
     case kX64S16x8HalfShuffle1: {
       XMMRegister dst = i.OutputSimd128Register();
-      ASSEMBLE_SIMD_IMM_INSTR(pshuflw, dst, 0, i.InputInt8(1));
-      __ pshufhw(dst, dst, i.InputInt8(2));
+      ASSEMBLE_SIMD_IMM_INSTR(Pshuflw, dst, 0, i.InputUint8(1));
+      __ Pshufhw(dst, dst, i.InputUint8(2));
       break;
     }
     case kX64S16x8HalfShuffle2: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
-      ASSEMBLE_SIMD_IMM_INSTR(pshuflw, kScratchDoubleReg, 1, i.InputInt8(2));
-      __ pshufhw(kScratchDoubleReg, kScratchDoubleReg, i.InputInt8(3));
-      ASSEMBLE_SIMD_IMM_INSTR(pshuflw, dst, 0, i.InputInt8(2));
-      __ pshufhw(dst, dst, i.InputInt8(3));
-      __ pblendw(dst, kScratchDoubleReg, i.InputInt8(4));
+      ASSEMBLE_SIMD_IMM_INSTR(Pshuflw, kScratchDoubleReg, 1, i.InputUint8(2));
+      __ Pshufhw(kScratchDoubleReg, kScratchDoubleReg, i.InputUint8(3));
+      ASSEMBLE_SIMD_IMM_INSTR(Pshuflw, dst, 0, i.InputUint8(2));
+      __ Pshufhw(dst, dst, i.InputUint8(3));
+      __ Pblendw(dst, kScratchDoubleReg, i.InputUint8(4));
       break;
     }
     case kX64S8x16Alignr: {
-      ASSEMBLE_SIMD_IMM_SHUFFLE(palignr, SSSE3, i.InputInt8(2));
+      ASSEMBLE_SIMD_IMM_SHUFFLE(Palignr, i.InputUint8(2));
      break;
     }
     case kX64S16x8Dup: {
       XMMRegister dst = i.OutputSimd128Register();
-      int8_t lane = i.InputInt8(1) & 0x7;
-      int8_t lane4 = lane & 0x3;
-      int8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
+      uint8_t lane = i.InputInt8(1) & 0x7;
+      uint8_t lane4 = lane & 0x3;
+      uint8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
       if (lane < 4) {
-        ASSEMBLE_SIMD_IMM_INSTR(pshuflw, dst, 0, half_dup);
-        __ pshufd(dst, dst, 0);
+        ASSEMBLE_SIMD_IMM_INSTR(Pshuflw, dst, 0, half_dup);
+        __ Pshufd(dst, dst, static_cast<uint8_t>(0));
       } else {
-        ASSEMBLE_SIMD_IMM_INSTR(pshufhw, dst, 0, half_dup);
-        __ pshufd(dst, dst, 0xaa);
+        ASSEMBLE_SIMD_IMM_INSTR(Pshufhw, dst, 0, half_dup);
+        __ Pshufd(dst, dst, static_cast<uint8_t>(0xaa));
       }
       break;
     }
     case kX64S8x16Dup: {
       XMMRegister dst = i.OutputSimd128Register();
-      int8_t lane = i.InputInt8(1) & 0xf;
+      uint8_t lane = i.InputInt8(1) & 0xf;
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       if (lane < 8) {
-        __ punpcklbw(dst, dst);
+        __ Punpcklbw(dst, dst);
       } else {
-        __ punpckhbw(dst, dst);
+        __ Punpckhbw(dst, dst);
       }
       lane &= 0x7;
-      int8_t lane4 = lane & 0x3;
-      int8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
+      uint8_t lane4 = lane & 0x3;
+      uint8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
       if (lane < 4) {
-        __ pshuflw(dst, dst, half_dup);
-        __ pshufd(dst, dst, 0);
+        __ Pshuflw(dst, dst, half_dup);
+        __ Pshufd(dst, dst, static_cast<uint8_t>(0));
       } else {
-        __ pshufhw(dst, dst, half_dup);
-        __ pshufd(dst, dst, 0xaa);
+        __ Pshufhw(dst, dst, half_dup);
+        __ Pshufd(dst, dst, static_cast<uint8_t>(0xaa));
       }
       break;
     }
     case kX64S64x2UnpackHigh:
-      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhqdq);
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(Punpckhqdq);
       break;
     case kX64S32x4UnpackHigh:
-      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhdq);
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(Punpckhdq);
       break;
     case kX64S16x8UnpackHigh:
-      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhwd);
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(Punpckhwd);
       break;
     case kX64S8x16UnpackHigh:
-      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhbw);
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(Punpckhbw);
       break;
     case kX64S64x2UnpackLow:
-      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklqdq);
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(Punpcklqdq);
       break;
     case kX64S32x4UnpackLow:
-      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckldq);
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(Punpckldq);
       break;
     case kX64S16x8UnpackLow:
-      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklwd);
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(Punpcklwd);
       break;
     case kX64S8x16UnpackLow:
-      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklbw);
+      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(Punpcklbw);
       break;
     case kX64S16x8UnzipHigh: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src2 = dst;
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       if (instr->InputCount() == 2) {
-        ASSEMBLE_SIMD_INSTR(movups, kScratchDoubleReg, 1);
-        __ psrld(kScratchDoubleReg, 16);
+        ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
+        __ Psrld(kScratchDoubleReg, static_cast<byte>(16));
         src2 = kScratchDoubleReg;
       }
-      __ psrld(dst, 16);
-      __ packusdw(dst, src2);
+      __ Psrld(dst, static_cast<byte>(16));
+      __ Packusdw(dst, src2);
       break;
     }
     case kX64S16x8UnzipLow: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src2 = dst;
       DCHECK_EQ(dst, i.InputSimd128Register(0));
-      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
+      __ Pxor(kScratchDoubleReg, kScratchDoubleReg);
       if (instr->InputCount() == 2) {
-        ASSEMBLE_SIMD_IMM_INSTR(pblendw, kScratchDoubleReg, 1, 0x55);
+        ASSEMBLE_SIMD_IMM_INSTR(Pblendw, kScratchDoubleReg, 1,
+                                static_cast<uint8_t>(0x55));
         src2 = kScratchDoubleReg;
       }
-      __ pblendw(dst, kScratchDoubleReg, 0xaa);
-      __ packusdw(dst, src2);
+      __ Pblendw(dst, kScratchDoubleReg, static_cast<uint8_t>(0xaa));
+      __ Packusdw(dst, src2);
       break;
     }
     case kX64S8x16UnzipHigh: {
@@ -3877,12 +3873,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister src2 = dst;
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       if (instr->InputCount() == 2) {
-        ASSEMBLE_SIMD_INSTR(movups, kScratchDoubleReg, 1);
-        __ psrlw(kScratchDoubleReg, 8);
+        ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
+        __ Psrlw(kScratchDoubleReg, static_cast<byte>(8));
         src2 = kScratchDoubleReg;
       }
-      __ psrlw(dst, 8);
-      __ packuswb(dst, src2);
+      __ Psrlw(dst, static_cast<byte>(8));
+      __ Packuswb(dst, src2);
       break;
     }
     case kX64S8x16UnzipLow: {
@@ -3890,44 +3886,44 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister src2 = dst;
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       if (instr->InputCount() == 2) {
-        ASSEMBLE_SIMD_INSTR(movups, kScratchDoubleReg, 1);
-        __ psllw(kScratchDoubleReg, 8);
-        __ psrlw(kScratchDoubleReg, 8);
+        ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
+        __ Psllw(kScratchDoubleReg, static_cast<byte>(8));
+        __ Psrlw(kScratchDoubleReg, static_cast<byte>(8));
         src2 = kScratchDoubleReg;
       }
-      __ psllw(dst, 8);
-      __ psrlw(dst, 8);
-      __ packuswb(dst, src2);
+      __ Psllw(dst, static_cast<byte>(8));
+      __ Psrlw(dst, static_cast<byte>(8));
+      __ Packuswb(dst, src2);
       break;
     }
     case kX64S8x16TransposeLow: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
-      __ psllw(dst, 8);
+      __ Psllw(dst, static_cast<byte>(8));
       if (instr->InputCount() == 1) {
-        __ movups(kScratchDoubleReg, dst);
+        __ Movups(kScratchDoubleReg, dst);
       } else {
         DCHECK_EQ(2, instr->InputCount());
-        ASSEMBLE_SIMD_INSTR(movups, kScratchDoubleReg, 1);
-        __ psllw(kScratchDoubleReg, 8);
+        ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
+        __ Psllw(kScratchDoubleReg, static_cast<byte>(8));
       }
-      __ psrlw(dst, 8);
-      __ por(dst, kScratchDoubleReg);
+      __ Psrlw(dst, static_cast<byte>(8));
+      __ Por(dst, kScratchDoubleReg);
       break;
     }
     case kX64S8x16TransposeHigh: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
-      __ psrlw(dst, 8);
+      __ Psrlw(dst, static_cast<byte>(8));
       if (instr->InputCount() == 1) {
-        __ movups(kScratchDoubleReg, dst);
+        __ Movups(kScratchDoubleReg, dst);
       } else {
         DCHECK_EQ(2, instr->InputCount());
-        ASSEMBLE_SIMD_INSTR(movups, kScratchDoubleReg, 1);
-        __ psrlw(kScratchDoubleReg, 8);
+        ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
+        __ Psrlw(kScratchDoubleReg, static_cast<byte>(8));
       }
-      __ psllw(kScratchDoubleReg, 8);
-      __ por(dst, kScratchDoubleReg);
+      __ Psllw(kScratchDoubleReg, static_cast<byte>(8));
+      __ Por(dst, kScratchDoubleReg);
       break;
     }
     case kX64S8x8Reverse:
@@ -3938,14 +3934,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       if (arch_opcode != kX64S8x2Reverse) {
         // First shuffle words into position.
-        int8_t shuffle_mask = arch_opcode == kX64S8x4Reverse ? 0xB1 : 0x1B;
-        __ pshuflw(dst, dst, shuffle_mask);
-        __ pshufhw(dst, dst, shuffle_mask);
+        uint8_t shuffle_mask = arch_opcode == kX64S8x4Reverse ? 0xB1 : 0x1B;
+        __ Pshuflw(dst, dst, shuffle_mask);
+        __ Pshufhw(dst, dst, shuffle_mask);
       }
-      __ movaps(kScratchDoubleReg, dst);
-      __ psrlw(kScratchDoubleReg, 8);
-      __ psllw(dst, 8);
-      __ por(dst, kScratchDoubleReg);
+      __ Movaps(kScratchDoubleReg, dst);
+      __ Psrlw(kScratchDoubleReg, static_cast<byte>(8));
+      __ Psllw(dst, static_cast<byte>(8));
+      __ Por(dst, kScratchDoubleReg);
       break;
     }
     case kX64S1x2AnyTrue:
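
The S8x{2,4,8}Reverse cases above first shuffle 16-bit words into place (for the 4- and 8-byte variants), then swap the two bytes inside every word with a shift/shift/or sequence. A scalar model of that byte-swap step (a sketch, not V8 code):

    #include <cstdint>

    void swap_bytes_in_words(uint16_t v[8]) {
      for (int i = 0; i < 8; ++i) {
        uint16_t hi = static_cast<uint16_t>(v[i] >> 8);  // Psrlw scratch, 8
        uint16_t lo = static_cast<uint16_t>(v[i] << 8);  // Psllw dst, 8
        v[i] = static_cast<uint16_t>(lo | hi);           // Por dst, scratch
      }
    }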


@@ -1096,6 +1096,11 @@ int DisassemblerX64::AVXInstruction(byte* data) {
         AppendToBuffer("vmovdqu %s,", NameOfXMMRegister(regop));
         current += PrintRightXMMOperand(current);
         break;
+      case 0x70:
+        AppendToBuffer("vpshufhw %s,", NameOfXMMRegister(regop));
+        current += PrintRightXMMOperand(current);
+        AppendToBuffer(",0x%x", *current++);
+        break;
       case 0x7F:
         AppendToBuffer("vmovdqu ");
         current += PrintRightXMMOperand(current);
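
For concreteness, one plausible byte sequence the new case decodes, worked out by hand from the two-byte VEX rules (treat the exact bytes as an unverified illustration; the disassembler test below exercises the real encoder):

    // c5  two-byte VEX prefix
    // fa  R=1, vvvv=1111 (unused), L=0 (128-bit), pp=10 (F3 prefix)
    // 70  opcode shared by the pshuf* family
    // ca  ModRM: mod=11, reg=xmm1, rm=xmm2
    // 55  imm8 shuffle control
    //
    // => vpshufhw xmm1,xmm2,0x55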


@@ -783,8 +783,11 @@ TEST(DisasmX64) {
     __ vpinsrq(xmm1, xmm2, rax, 9);
     __ vpinsrq(xmm1, xmm2, Operand(rbx, rcx, times_4, 10000), 9);
     __ vpshufd(xmm1, xmm2, 85);
+    __ vpshufd(xmm1, Operand(rbx, rcx, times_4, 10000), 85);
     __ vpshuflw(xmm1, xmm2, 85);
     __ vpshuflw(xmm1, Operand(rbx, rcx, times_4, 10000), 85);
+    __ vpshufhw(xmm1, xmm2, 85);
+    __ vpshufhw(xmm1, Operand(rbx, rcx, times_4, 10000), 85);
     __ vshufps(xmm3, xmm2, xmm3, 3);
     __ vpblendw(xmm1, xmm2, xmm3, 23);
     __ vpblendw(xmm1, xmm2, Operand(rbx, rcx, times_4, 10000), 23);