[wasm-simd] Implement f64x2 splat extract_lane replace_lane for ia32

Bug: v8:9728
Change-Id: I8d993368fc23ab9e8cc08e31f4405678ec4ce824
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1803790
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#63955}
Author: Ng Zhi An
Date: 2019-09-20 14:54:49 -07:00
Committed by: Commit Bot
Parent: 9cd54cd4f5
Commit: 8a5386f240

9 files changed, 166 insertions(+), 3 deletions(-)

@@ -2405,6 +2405,16 @@ void Assembler::shufps(XMMRegister dst, XMMRegister src, byte imm8) {
EMIT(imm8);
}
void Assembler::shufpd(XMMRegister dst, XMMRegister src, byte imm8) {
DCHECK(is_uint8(imm8));
EnsureSpace ensure_space(this);
EMIT(0x66);
EMIT(0x0F);
EMIT(0xC6);
emit_sse_operand(dst, src);
EMIT(imm8);
}
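// Editor's sketch (not part of the commit): a scalar model of what the
// emitted SSE2 shufpd (66 0F C6 /r ib) computes. imm8 bit 0 selects result
// lane 0 from the first operand, bit 1 selects result lane 1 from the
// second; the destructive SSE form passes the same register twice.
// Assumes <array> and <cstdint>; ShufpdModel is a hypothetical name.
inline std::array<double, 2> ShufpdModel(const std::array<double, 2>& a,
                                         const std::array<double, 2>& b,
                                         uint8_t imm8) {
  return {{a[imm8 & 1], b[(imm8 >> 1) & 1]}};
}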
void Assembler::movdqa(Operand dst, XMMRegister src) {
EnsureSpace ensure_space(this);
EMIT(0x66);
@@ -2818,6 +2828,13 @@ void Assembler::vpd(byte op, XMMRegister dst, XMMRegister src1, Operand src2) {
vinstr(op, dst, src1, src2, k66, k0F, kWIG);
}
void Assembler::vshufpd(XMMRegister dst, XMMRegister src1, Operand src2,
byte imm8) {
DCHECK(is_uint8(imm8));
vpd(0xC6, dst, src1, src2);
EMIT(imm8);
}
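// Editor's note: the AVX form reuses opcode 0xC6 via vpd(), i.e. the
// VEX.66.0F.WIG encoding path shown above, and appends imm8 after the
// ModRM operand, mirroring the existing vshufps.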
void Assembler::vcmpps(XMMRegister dst, XMMRegister src1, Operand src2,
uint8_t cmp) {
vps(0xC2, dst, src1, src2);

@@ -850,6 +850,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void movups(XMMRegister dst, Operand src);
void movups(Operand dst, XMMRegister src);
void shufps(XMMRegister dst, XMMRegister src, byte imm8);
void shufpd(XMMRegister dst, XMMRegister src, byte imm8);
void maxss(XMMRegister dst, XMMRegister src) { maxss(dst, Operand(src)); }
void maxss(XMMRegister dst, Operand src);
@@ -1319,12 +1320,18 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
}
void vmovaps(XMMRegister dst, XMMRegister src) { vmovaps(dst, Operand(src)); }
void vmovaps(XMMRegister dst, Operand src) { vps(0x28, dst, xmm0, src); }
void vmovapd(XMMRegister dst, XMMRegister src) { vmovapd(dst, Operand(src)); }
void vmovapd(XMMRegister dst, Operand src) { vpd(0x28, dst, xmm0, src); }
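// Editor's note: vmovapd is added for the kAVXF64x2ExtractLane lane-0 case
// in the code generator, where extracting lane 0 is just a register move.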
void vmovups(XMMRegister dst, XMMRegister src) { vmovups(dst, Operand(src)); }
void vmovups(XMMRegister dst, Operand src) { vps(0x10, dst, xmm0, src); }
void vshufps(XMMRegister dst, XMMRegister src1, XMMRegister src2, byte imm8) {
vshufps(dst, src1, Operand(src2), imm8);
}
void vshufps(XMMRegister dst, XMMRegister src1, Operand src2, byte imm8);
void vshufpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, byte imm8) {
vshufpd(dst, src1, Operand(src2), imm8);
}
void vshufpd(XMMRegister dst, XMMRegister src1, Operand src2, byte imm8);
void vpsllw(XMMRegister dst, XMMRegister src, uint8_t imm8);
void vpslld(XMMRegister dst, XMMRegister src, uint8_t imm8);

@@ -1825,6 +1825,79 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
break;
}
case kSSEF64x2Splat: {
DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
XMMRegister dst = i.OutputSimd128Register();
__ shufpd(dst, dst, 0x0);
break;
}
case kAVXF64x2Splat: {
CpuFeatureScope avx_scope(tasm(), AVX);
XMMRegister src = i.InputDoubleRegister(0);
__ vshufpd(i.OutputSimd128Register(), src, src, 0x0);
break;
}
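// Editor's note: both splat paths lower to shufpd/vshufpd with imm8 == 0;
// each imm8 bit then selects lane 0, so {x, y} becomes {x, x} (cf. the
// ShufpdModel sketch above).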
case kSSEF64x2ExtractLane: {
DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
XMMRegister dst = i.OutputDoubleRegister();
int8_t lane = i.InputInt8(1);
if (lane != 0) {
DCHECK_LT(lane, 2);
__ shufpd(dst, dst, lane);
}
break;
}
case kAVXF64x2ExtractLane: {
CpuFeatureScope avx_scope(tasm(), AVX);
XMMRegister dst = i.OutputDoubleRegister();
XMMRegister src = i.InputSimd128Register(0);
int8_t lane = i.InputInt8(1);
if (lane == 0) {
if (dst != src) __ vmovapd(dst, src);
} else {
DCHECK_LT(lane, 2);
__ vshufpd(dst, src, src, lane);
}
break;
}
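// Editor's note: lane 0 needs no shuffle; the value is already in the low
// 64 bits (the AVX path emits at most a vmovapd). For lane 1, imm8 == 1
// routes source lane 1 into result lane 0 (imm8 bit 0 == 1), which is all
// an extract_lane consumer reads.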
case kSSEF64x2ReplaceLane: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_1);
XMMRegister dst = i.OutputSimd128Register();
int8_t lane = i.InputInt8(1);
DoubleRegister rep = i.InputDoubleRegister(2);
// insertps takes an imm8 mask which contains (from high to low):
// - 2 bits specifying the source float element to copy
// - 2 bits specifying the destination float element to write to
// - 4 bits specifying which elements of the destination to zero
DCHECK_LT(lane, 2);
if (lane == 0) {
__ insertps(dst, rep, 0b00000000);
__ insertps(dst, rep, 0b01010000);
} else {
__ insertps(dst, rep, 0b00100000);
__ insertps(dst, rep, 0b01110000);
}
break;
}
case kAVXF64x2ReplaceLane: {
CpuFeatureScope avx_scope(tasm(), AVX);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
int8_t lane = i.InputInt8(1);
DoubleRegister rep = i.InputDoubleRegister(2);
DCHECK_LT(lane, 2);
if (lane == 0) {
__ vinsertps(dst, src, rep, 0b00000000);
__ vinsertps(dst, dst, rep, 0b01010000);
} else {
__ vinsertps(dst, src, rep, 0b00100000);
__ vinsertps(dst, dst, rep, 0b01110000);
}
break;
}
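// Editor's sketch (hypothetical helper, not in the commit): the insertps
// imm8 layout described in the comment above, checked against the four
// masks used here. Assumes <cstdint>.
constexpr uint8_t InsertPsImm(uint8_t src_elem, uint8_t dst_elem,
                              uint8_t zero_mask) {
  return static_cast<uint8_t>((src_elem << 6) | (dst_elem << 4) | zero_mask);
}
static_assert(InsertPsImm(0, 0, 0) == 0b00000000, "rep[0] -> dst[0]");
static_assert(InsertPsImm(1, 1, 0) == 0b01010000, "rep[1] -> dst[1]");
static_assert(InsertPsImm(0, 2, 0) == 0b00100000, "rep[0] -> dst[2]");
static_assert(InsertPsImm(1, 3, 0) == 0b01110000, "rep[1] -> dst[3]");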
case kSSEF32x4Splat: {
DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
XMMRegister dst = i.OutputSimd128Register();

@@ -116,6 +116,12 @@ namespace compiler {
V(IA32PushSimd128) \
V(IA32Poke) \
V(IA32Peek) \
V(SSEF64x2Splat) \
V(AVXF64x2Splat) \
V(SSEF64x2ExtractLane) \
V(AVXF64x2ExtractLane) \
V(SSEF64x2ReplaceLane) \
V(AVXF64x2ReplaceLane) \
V(SSEF32x4Splat) \
V(AVXF32x4Splat) \
V(SSEF32x4ExtractLane) \

@@ -97,6 +97,12 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kAVXFloat32Neg:
case kIA32BitcastFI:
case kIA32BitcastIF:
case kSSEF64x2Splat:
case kAVXF64x2Splat:
case kSSEF64x2ExtractLane:
case kAVXF64x2ExtractLane:
case kSSEF64x2ReplaceLane:
case kAVXF64x2ReplaceLane:
case kSSEF32x4Splat:
case kAVXF32x4Splat:
case kSSEF32x4ExtractLane:

@@ -2000,6 +2000,14 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(I8x16ShrS) \
V(I8x16ShrU)
void InstructionSelector::VisitF64x2Splat(Node* node) {
VisitRRSimd(this, node, kAVXF64x2Splat, kSSEF64x2Splat);
}
void InstructionSelector::VisitF64x2ExtractLane(Node* node) {
VisitRRISimd(this, node, kAVXF64x2ExtractLane, kSSEF64x2ExtractLane);
}
void InstructionSelector::VisitF32x4Splat(Node* node) {
VisitRRSimd(this, node, kAVXF32x4Splat, kSSEF32x4Splat);
}
@@ -2091,6 +2099,28 @@ VISIT_SIMD_REPLACE_LANE(F32x4)
#undef VISIT_SIMD_REPLACE_LANE
#undef SIMD_INT_TYPES
// The difference between this and VISIT_SIMD_REPLACE_LANE is that this
// forces operand2 to be UseRegister, because the code generator relies on
// insertps taking the replacement value in a register.
// TODO(v8:9764) Remove this UseRegister requirement
#define VISIT_SIMD_REPLACE_LANE_USE_REG(Type) \
void InstructionSelector::Visit##Type##ReplaceLane(Node* node) { \
IA32OperandGenerator g(this); \
InstructionOperand operand0 = g.UseRegister(node->InputAt(0)); \
InstructionOperand operand1 = \
g.UseImmediate(OpParameter<int32_t>(node->op())); \
InstructionOperand operand2 = g.UseRegister(node->InputAt(1)); \
if (IsSupported(AVX)) { \
Emit(kAVX##Type##ReplaceLane, g.DefineAsRegister(node), operand0, \
operand1, operand2); \
} else { \
Emit(kSSE##Type##ReplaceLane, g.DefineSameAsFirst(node), operand0, \
operand1, operand2); \
} \
}
VISIT_SIMD_REPLACE_LANE_USE_REG(F64x2)
#undef VISIT_SIMD_REPLACE_LANE_USE_REG
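// Editor's note: the SSE path uses DefineSameAsFirst because insertps
// writes into its first operand (destructive), while the AVX path can
// DefineAsRegister because vinsertps is a non-destructive three-operand
// form.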
#define VISIT_SIMD_SHIFT(Opcode) \
void InstructionSelector::Visit##Opcode(Node* node) { \
VisitRROSimdShift(this, node, kAVX##Opcode, kSSE##Opcode); \

@@ -2621,9 +2621,11 @@ void InstructionSelector::VisitWord64AtomicCompareExchange(Node* node) {
#if !V8_TARGET_ARCH_X64
#if !V8_TARGET_ARCH_ARM64
#if !V8_TARGET_ARCH_IA32
void InstructionSelector::VisitF64x2Splat(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2ExtractLane(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2ReplaceLane(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_IA32
void InstructionSelector::VisitF64x2Abs(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Neg(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Sqrt(Node* node) { UNIMPLEMENTED(); }

@@ -1216,6 +1216,13 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
AppendToBuffer(",%d", Imm8(current));
current++;
break;
case 0xC6:
AppendToBuffer("vshufpd %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv));
current += PrintRightXMMOperand(current);
AppendToBuffer(",%d", Imm8(current));
current++;
break;
#define DECLARE_SSE_AVX_DIS_CASE(instruction, notUsed1, notUsed2, opcode) \
case 0x##opcode: { \
AppendToBuffer("v" #instruction " %s,%s,", NameOfXMMRegister(regop), \
@@ -2269,6 +2276,15 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
data += PrintRightOperand(data);
AppendToBuffer(",%d", Imm8(data));
data++;
} else if (*data == 0xC6) {
// shufpd xmm, xmm/m128, imm8
data++;
int mod, regop, rm;
get_modrm(*data, &mod, &regop, &rm);
AppendToBuffer("shufpd %s,", NameOfXMMRegister(regop));
data += PrintRightXMMOperand(data);
AppendToBuffer(",%d", Imm8(data));
data++;
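// Editor's example (assumed byte sequence): 66 0F C6 CA 01 decodes here as
// "shufpd xmm1,xmm2,1" (ModRM 0xCA: regop = 1 -> xmm1, rm = 2 -> xmm2).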
} else if (*data == 0xE7) {
data++;
int mod, regop, rm;

@@ -877,6 +877,7 @@ WASM_SIMD_TEST_NO_LOWERING(F32x4Qfms) {
}
#endif // V8_TARGET_ARCH_X64
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_IA32
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
WASM_SIMD_TEST_NO_LOWERING(I64x2Splat) {
WasmRunner<int32_t, int64_t> r(execution_tier, lower_simd);
@@ -1072,6 +1073,7 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2GeU) {
RunI64x2BinOpTest(execution_tier, lower_simd, kExprI64x2GeU,
UnsignedGreaterEqual);
}
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
WASM_SIMD_TEST_NO_LOWERING(F64x2Splat) {
WasmRunner<int32_t, double> r(execution_tier, lower_simd);
@@ -1095,6 +1097,7 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2Splat) {
}
}
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
WASM_SIMD_TEST_NO_LOWERING(F64x2ExtractLaneWithI64x2) {
WasmRunner<int64_t> r(execution_tier, lower_simd);
BUILD(r, WASM_IF_ELSE_L(
@@ -1104,6 +1107,7 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2ExtractLaneWithI64x2) {
WASM_I64V(1), WASM_I64V(0)));
CHECK_EQ(1, r.Call());
}
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
WASM_SIMD_TEST_NO_LOWERING(F64x2ExtractLane) {
WasmRunner<double, double> r(execution_tier, lower_simd);
@@ -1127,6 +1131,7 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2ExtractLane) {
}
}
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
WASM_SIMD_TEST_NO_LOWERING(I64x2ExtractWithF64x2) {
WasmRunner<int64_t> r(execution_tier, lower_simd);
BUILD(r, WASM_IF_ELSE_L(
@@ -1136,6 +1141,7 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2ExtractWithF64x2) {
WASM_I64V(1), WASM_I64V(0)));
CHECK_EQ(1, r.Call());
}
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
WASM_SIMD_TEST_NO_LOWERING(F64x2ReplaceLane) {
WasmRunner<int32_t> r(execution_tier, lower_simd);
@@ -1156,6 +1162,7 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2ReplaceLane) {
}
}
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
bool IsExtreme(double x) {
double abs_x = std::fabs(x);
const double kSmallFloatThreshold = 1.0e-298;
@@ -1278,11 +1285,9 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2Neg) {
RunF64x2UnOpTest(execution_tier, lower_simd, kExprF64x2Neg, Negate);
}
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
WASM_SIMD_TEST_NO_LOWERING(F64x2Sqrt) {
RunF64x2UnOpTest(execution_tier, lower_simd, kExprF64x2Sqrt, Sqrt);
}
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
void RunF64x2BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode opcode, DoubleBinOp expected_op) {
@@ -1413,6 +1418,7 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2Mul) {
RunI64x2BinOpTest(execution_tier, lower_simd, kExprI64x2Mul,
base::MulWithWraparound);
}
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
#if V8_TARGET_ARCH_X64
WASM_SIMD_TEST_NO_LOWERING(I64x2MinS) {
@@ -1481,7 +1487,7 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2Qfms) {
}
}
#endif // V8_TARGET_ARCH_X64
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_IA32
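// Editor's note: the outer guard now includes IA32, so the f64x2 splat,
// extract_lane, and replace_lane tests run on ia32, while the i64x2 tests
// and the remaining f64x2 ops stay behind the narrower X64/ARM64 (or
// X64-only) guards adjusted above.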
WASM_SIMD_TEST(I32x4Splat) {
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);