[x64] Implement 256-bit assembly for vroundps/pd, vpblendw, vpalignr

Bug: v8:12228
Change-Id: Ifd813e6bff92e6a08cc41eb8f5b1848abe849cd3
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3178540
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Commit-Queue: Jing Bao <jing.bao@intel.com>
Cr-Commit-Position: refs/heads/main@{#77070}
This commit is contained in:
jing.bao 2021-09-23 17:15:31 +08:00 committed by V8 LUCI CQ
parent ce23293e75
commit dd1e168caf
2 changed files with 59 additions and 15 deletions

View File

@ -1500,10 +1500,18 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
vinstr(0x08, dst, xmm0, src, k66, k0F3A, kWIG);
emit(static_cast<byte>(mode) | 0x8); // Mask precision exception.
}
// 256-bit (YMM) form of vroundps: opcode 0x08 with the 66/0F3A prefix set,
// gated on the AVX feature. ymm0 is passed in the first-source position
// because the instruction takes only one source register operand.
void vroundps(YMMRegister dst, YMMRegister src, RoundingMode mode) {
  // OR-ing in 0x8 masks the precision exception.
  const byte rounding_imm = static_cast<byte>(static_cast<byte>(mode) | 0x8);
  vinstr(0x08, dst, ymm0, src, k66, k0F3A, kWIG, AVX);
  emit(rounding_imm);
}
// 128-bit (XMM) form of vroundpd: opcode 0x09 with the 66/0F3A prefix set.
// xmm0 is passed in the first-source position because the instruction takes
// only one source register operand.
void vroundpd(XMMRegister dst, XMMRegister src, RoundingMode mode) {
  // OR-ing in 0x8 masks the precision exception.
  const byte rounding_imm = static_cast<byte>(static_cast<byte>(mode) | 0x8);
  vinstr(0x09, dst, xmm0, src, k66, k0F3A, kWIG);
  emit(rounding_imm);
}
// 256-bit (YMM) form of vroundpd: opcode 0x09 with the 66/0F3A prefix set,
// gated on the AVX feature. ymm0 is passed in the first-source position
// because the instruction takes only one source register operand.
void vroundpd(YMMRegister dst, YMMRegister src, RoundingMode mode) {
  // OR-ing in 0x8 masks the precision exception.
  const byte rounding_imm = static_cast<byte>(static_cast<byte>(mode) | 0x8);
  vinstr(0x09, dst, ymm0, src, k66, k0F3A, kWIG, AVX);
  emit(rounding_imm);
}
void vsd(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vinstr(op, dst, src1, src2, kF2, k0F, kWIG);
@ -1693,20 +1701,38 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
vinstr(0x0E, dst, src1, src2, k66, k0F3A, kWIG);
emit(mask);
}
// 256-bit vpblendw, register-register form: emits opcode 0x0E (66/0F3A)
// followed by `mask` as the trailing immediate byte.
void vpblendw(YMMRegister dst, YMMRegister src1, YMMRegister src2,
              uint8_t mask) {
  constexpr uint8_t kBlendWordOpcode = 0x0E;
  vinstr(kBlendWordOpcode, dst, src1, src2, k66, k0F3A, kWIG);
  emit(mask);
}
// 128-bit vpblendw with a memory second source: emits opcode 0x0E (66/0F3A)
// followed by `mask` as the trailing immediate byte.
void vpblendw(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t mask) {
  constexpr uint8_t kBlendWordOpcode = 0x0E;
  vinstr(kBlendWordOpcode, dst, src1, src2, k66, k0F3A, kWIG);
  emit(mask);
}
// 256-bit vpblendw with a memory second source: emits opcode 0x0E (66/0F3A)
// followed by `mask` as the trailing immediate byte.
void vpblendw(YMMRegister dst, YMMRegister src1, Operand src2, uint8_t mask) {
  constexpr uint8_t kBlendWordOpcode = 0x0E;
  vinstr(kBlendWordOpcode, dst, src1, src2, k66, k0F3A, kWIG);
  emit(mask);
}
// 128-bit vpalignr, register-register form: emits opcode 0x0F (66/0F3A)
// followed by `imm8` as the trailing immediate byte.
void vpalignr(XMMRegister dst, XMMRegister src1, XMMRegister src2,
              uint8_t imm8) {
  constexpr uint8_t kAlignRightOpcode = 0x0F;
  vinstr(kAlignRightOpcode, dst, src1, src2, k66, k0F3A, kWIG);
  emit(imm8);
}
// 256-bit vpalignr, register-register form: emits opcode 0x0F (66/0F3A)
// followed by `imm8` as the trailing immediate byte.
void vpalignr(YMMRegister dst, YMMRegister src1, YMMRegister src2,
              uint8_t imm8) {
  constexpr uint8_t kAlignRightOpcode = 0x0F;
  vinstr(kAlignRightOpcode, dst, src1, src2, k66, k0F3A, kWIG);
  emit(imm8);
}
// 128-bit vpalignr with a memory second source: emits opcode 0x0F (66/0F3A)
// followed by `imm8` as the trailing immediate byte.
void vpalignr(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8) {
  constexpr uint8_t kAlignRightOpcode = 0x0F;
  vinstr(kAlignRightOpcode, dst, src1, src2, k66, k0F3A, kWIG);
  emit(imm8);
}
// 256-bit vpalignr with a memory second source: emits opcode 0x0F (66/0F3A)
// followed by `imm8` as the trailing immediate byte.
void vpalignr(YMMRegister dst, YMMRegister src1, Operand src2, uint8_t imm8) {
  constexpr uint8_t kAlignRightOpcode = 0x0F;
  vinstr(kAlignRightOpcode, dst, src1, src2, k66, k0F3A, kWIG);
  emit(imm8);
}
// Opcode-parameterized three-register helpers, declared once for XMM and once
// for YMM operands. Only the declarations appear here; the definitions live
// outside this excerpt.
// NOTE(review): presumably the shared emitter for packed-single AVX ops, going
// by the name — confirm against the definition.
void vps(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vps(byte op, YMMRegister dst, YMMRegister src1, YMMRegister src2);

View File

@ -2562,7 +2562,7 @@ TEST(AssemblerX64Regmove256bit) {
CHECK_EQ(0, memcmp(expected, desc.buffer, sizeof(expected)));
}
TEST(AssemblerX64Shuffle256bit) {
TEST(AssemblerX64LaneOp256bit) {
if (!CpuFeatures::IsSupported(AVX2)) return;
CcTest::InitializeVM();
v8::HandleScope scope(CcTest::isolate());
@ -2577,6 +2577,10 @@ TEST(AssemblerX64Shuffle256bit) {
__ vpshuflw(ymm9, Operand(rbx, rcx, times_4, 10000), 85);
__ vpshufhw(ymm1, ymm2, 85);
__ vpshufhw(ymm1, Operand(rbx, rcx, times_4, 10000), 85);
__ vpblendw(ymm2, ymm3, ymm4, 23);
__ vpblendw(ymm2, ymm3, Operand(rbx, rcx, times_4, 10000), 23);
__ vpalignr(ymm10, ymm11, ymm12, 4);
__ vpalignr(ymm10, ymm11, Operand(rbx, rcx, times_4, 10000), 4);
CodeDesc desc;
masm.GetCode(isolate, &desc);
@ -2587,19 +2591,27 @@ TEST(AssemblerX64Shuffle256bit) {
code->Print(os);
#endif
byte expected[] = {// vpshufd ymm1, ymm2, 85
0xC5, 0xFD, 0x70, 0xCA, 0x55,
// vpshufd ymm1,YMMWORD PTR [rbx+rcx*4+0x2710], 85
0xC5, 0xFD, 0x70, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x55,
// vpshuflw ymm9, ymm10, 85,
0xC4, 0x41, 0x7F, 0x70, 0xCA, 0x55,
// vpshuflw ymm9,YMMWORD PTR [rbx+rcx*4+0x2710], 85
0xC5, 0x7F, 0x70, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x55,
// vpshufhw ymm1, ymm2, 85
0xC5, 0xFE, 0x70, 0xCA, 0x55,
// vpshufhw ymm1,YMMWORD PTR [rbx+rcx*4+0x2710], 85
0xC5, 0xFE, 0x70, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00,
0x55};
byte expected[] = {
// vpshufd ymm1, ymm2, 85
0xC5, 0xFD, 0x70, 0xCA, 0x55,
// vpshufd ymm1,YMMWORD PTR [rbx+rcx*4+0x2710], 85
0xC5, 0xFD, 0x70, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x55,
// vpshuflw ymm9, ymm10, 85,
0xC4, 0x41, 0x7F, 0x70, 0xCA, 0x55,
// vpshuflw ymm9,YMMWORD PTR [rbx+rcx*4+0x2710], 85
0xC5, 0x7F, 0x70, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x55,
// vpshufhw ymm1, ymm2, 85
0xC5, 0xFE, 0x70, 0xCA, 0x55,
// vpshufhw ymm1,YMMWORD PTR [rbx+rcx*4+0x2710], 85
0xC5, 0xFE, 0x70, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x55,
// vpblendw ymm2, ymm3, ymm4, 23
0xC4, 0xE3, 0x65, 0x0E, 0xD4, 0x17,
// vpblendw ymm2, ymm3, YMMWORD PTR [rbx+rcx*4+0x2710], 23
0xC4, 0xE3, 0x65, 0x0E, 0x94, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x17,
// vpalignr ymm10, ymm11, ymm12, 4
0xC4, 0x43, 0x25, 0x0F, 0xD4, 0x04,
// vpalignr ymm10, ymm11, YMMWORD PTR [rbx+rcx*4+0x2710], 4
0xC4, 0x63, 0x25, 0x0F, 0x94, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x04};
CHECK_EQ(0, memcmp(expected, desc.buffer, sizeof(expected)));
}
@ -2615,6 +2627,8 @@ TEST(AssemblerX64FloatingPoint256bit) {
__ vsqrtps(ymm0, ymm1);
__ vunpcklps(ymm2, ymm3, ymm14);
__ vsubps(ymm10, ymm11, ymm12);
__ vroundps(ymm9, ymm2, kRoundUp);
__ vroundpd(ymm9, ymm2, kRoundToNearest);
CodeDesc desc;
masm.GetCode(isolate, &desc);
@ -2630,7 +2644,11 @@ TEST(AssemblerX64FloatingPoint256bit) {
// VUNPCKLPS
0xC4, 0xC1, 0x64, 0x14, 0xD6,
// VSUBPS
0xC4, 0x41, 0x24, 0x5C, 0xD4};
0xC4, 0x41, 0x24, 0x5C, 0xD4,
// vroundps ymm9, ymm2, 0xA
0xC4, 0x63, 0x7D, 0x08, 0xCA, 0x0A,
// vroundpd ymm9, ymm2, 0x8
0xC4, 0x63, 0x7D, 0x09, 0xCA, 0x08};
CHECK_EQ(0, memcmp(expected, desc.buffer, sizeof(expected)));
}