From dd1e168cafe1493805406ec681dea6ca698efdfb Mon Sep 17 00:00:00 2001 From: "jing.bao" <jing.bao@intel.com> Date: Thu, 23 Sep 2021 17:15:31 +0800 Subject: [PATCH] [x64] Implement 256-bit assembly for vroundps/pd, vpblendw, vpalignr Bug: v8:12228 Change-Id: Ifd813e6bff92e6a08cc41eb8f5b1848abe849cd3 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3178540 Reviewed-by: Zhi An Ng <zhin@chromium.org> Commit-Queue: Jing Bao <jing.bao@intel.com> Cr-Commit-Position: refs/heads/main@{#77070} --- src/codegen/x64/assembler-x64.h | 26 +++++++++++++++++ test/cctest/test-assembler-x64.cc | 48 +++++++++++++++++++++---------- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/src/codegen/x64/assembler-x64.h b/src/codegen/x64/assembler-x64.h index 2dfdb88a3b..11fb013406 100644 --- a/src/codegen/x64/assembler-x64.h +++ b/src/codegen/x64/assembler-x64.h @@ -1500,10 +1500,18 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { vinstr(0x08, dst, xmm0, src, k66, k0F3A, kWIG); emit(static_cast<byte>(mode) | 0x8); // Mask precision exception. } + void vroundps(YMMRegister dst, YMMRegister src, RoundingMode mode) { + vinstr(0x08, dst, ymm0, src, k66, k0F3A, kWIG, AVX); + emit(static_cast<byte>(mode) | 0x8); // Mask precision exception. + } void vroundpd(XMMRegister dst, XMMRegister src, RoundingMode mode) { vinstr(0x09, dst, xmm0, src, k66, k0F3A, kWIG); emit(static_cast<byte>(mode) | 0x8); // Mask precision exception. } + void vroundpd(YMMRegister dst, YMMRegister src, RoundingMode mode) { + vinstr(0x09, dst, ymm0, src, k66, k0F3A, kWIG, AVX); + emit(static_cast<byte>(mode) | 0x8); // Mask precision exception. 
+ } void vsd(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2) { vinstr(op, dst, src1, src2, kF2, k0F, kWIG); @@ -1693,20 +1701,38 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { vinstr(0x0E, dst, src1, src2, k66, k0F3A, kWIG); emit(mask); } + void vpblendw(YMMRegister dst, YMMRegister src1, YMMRegister src2, + uint8_t mask) { + vinstr(0x0E, dst, src1, src2, k66, k0F3A, kWIG); + emit(mask); + } void vpblendw(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t mask) { vinstr(0x0E, dst, src1, src2, k66, k0F3A, kWIG); emit(mask); } + void vpblendw(YMMRegister dst, YMMRegister src1, Operand src2, uint8_t mask) { + vinstr(0x0E, dst, src1, src2, k66, k0F3A, kWIG); + emit(mask); + } void vpalignr(XMMRegister dst, XMMRegister src1, XMMRegister src2, uint8_t imm8) { vinstr(0x0F, dst, src1, src2, k66, k0F3A, kWIG); emit(imm8); } + void vpalignr(YMMRegister dst, YMMRegister src1, YMMRegister src2, + uint8_t imm8) { + vinstr(0x0F, dst, src1, src2, k66, k0F3A, kWIG); + emit(imm8); + } void vpalignr(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8) { vinstr(0x0F, dst, src1, src2, k66, k0F3A, kWIG); emit(imm8); } + void vpalignr(YMMRegister dst, YMMRegister src1, Operand src2, uint8_t imm8) { + vinstr(0x0F, dst, src1, src2, k66, k0F3A, kWIG); + emit(imm8); + } void vps(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2); void vps(byte op, YMMRegister dst, YMMRegister src1, YMMRegister src2); diff --git a/test/cctest/test-assembler-x64.cc b/test/cctest/test-assembler-x64.cc index b023d2b416..6b9ecbd5ff 100644 --- a/test/cctest/test-assembler-x64.cc +++ b/test/cctest/test-assembler-x64.cc @@ -2562,7 +2562,7 @@ TEST(AssemblerX64Regmove256bit) { CHECK_EQ(0, memcmp(expected, desc.buffer, sizeof(expected))); } -TEST(AssemblerX64Shuffle256bit) { +TEST(AssemblerX64LaneOp256bit) { if (!CpuFeatures::IsSupported(AVX2)) return; CcTest::InitializeVM(); v8::HandleScope scope(CcTest::isolate()); @@ -2577,6 +2577,10 @@ 
TEST(AssemblerX64Shuffle256bit) { __ vpshuflw(ymm9, Operand(rbx, rcx, times_4, 10000), 85); __ vpshufhw(ymm1, ymm2, 85); __ vpshufhw(ymm1, Operand(rbx, rcx, times_4, 10000), 85); + __ vpblendw(ymm2, ymm3, ymm4, 23); + __ vpblendw(ymm2, ymm3, Operand(rbx, rcx, times_4, 10000), 23); + __ vpalignr(ymm10, ymm11, ymm12, 4); + __ vpalignr(ymm10, ymm11, Operand(rbx, rcx, times_4, 10000), 4); CodeDesc desc; masm.GetCode(isolate, &desc); @@ -2587,19 +2591,27 @@ TEST(AssemblerX64Shuffle256bit) { code->Print(os); #endif - byte expected[] = {// vpshufd ymm1, ymm2, 85 - 0xC5, 0xFD, 0x70, 0xCA, 0x55, - // vpshufd ymm1,YMMWORD PTR [rbx+rcx*4+0x2710], 85 - 0xC5, 0xFD, 0x70, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x55, - // vpshuflw ymm9, ymm10, 85, - 0xC4, 0x41, 0x7F, 0x70, 0xCA, 0x55, - // vpshuflw ymm9,YMMWORD PTR [rbx+rcx*4+0x2710], 85 - 0xC5, 0x7F, 0x70, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x55, - // vpshufhw ymm1, ymm2, 85 - 0xC5, 0xFE, 0x70, 0xCA, 0x55, - // vpshufhw ymm1,YMMWORD PTR [rbx+rcx*4+0x2710], 85 - 0xC5, 0xFE, 0x70, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00, - 0x55}; + byte expected[] = { + // vpshufd ymm1, ymm2, 85 + 0xC5, 0xFD, 0x70, 0xCA, 0x55, + // vpshufd ymm1,YMMWORD PTR [rbx+rcx*4+0x2710], 85 + 0xC5, 0xFD, 0x70, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x55, + // vpshuflw ymm9, ymm10, 85, + 0xC4, 0x41, 0x7F, 0x70, 0xCA, 0x55, + // vpshuflw ymm9,YMMWORD PTR [rbx+rcx*4+0x2710], 85 + 0xC5, 0x7F, 0x70, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x55, + // vpshufhw ymm1, ymm2, 85 + 0xC5, 0xFE, 0x70, 0xCA, 0x55, + // vpshufhw ymm1,YMMWORD PTR [rbx+rcx*4+0x2710], 85 + 0xC5, 0xFE, 0x70, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x55, + // vpblendw ymm2, ymm3, ymm4, 23 + 0xC4, 0xE3, 0x65, 0x0E, 0xD4, 0x17, + // vpblendw ymm2, ymm3, YMMWORD PTR [rbx+rcx*4+0x2710], 23 + 0xC4, 0xE3, 0x65, 0x0E, 0x94, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x17, + // vpalignr ymm10, ymm11, ymm12, 4 + 0xC4, 0x43, 0x25, 0x0F, 0xD4, 0x04, + // vpalignr ymm10, ymm11, YMMWORD PTR [rbx+rcx*4+0x2710], 4 + 0xC4, 0x63, 0x25, 0x0F, 
0x94, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x04}; CHECK_EQ(0, memcmp(expected, desc.buffer, sizeof(expected))); } @@ -2615,6 +2627,8 @@ TEST(AssemblerX64FloatingPoint256bit) { __ vsqrtps(ymm0, ymm1); __ vunpcklps(ymm2, ymm3, ymm14); __ vsubps(ymm10, ymm11, ymm12); + __ vroundps(ymm9, ymm2, kRoundUp); + __ vroundpd(ymm9, ymm2, kRoundToNearest); CodeDesc desc; masm.GetCode(isolate, &desc); @@ -2630,7 +2644,11 @@ TEST(AssemblerX64FloatingPoint256bit) { // VUNPCKLPS 0xC4, 0xC1, 0x64, 0x14, 0xD6, // VSUBPS - 0xC4, 0x41, 0x24, 0x5C, 0xD4}; + 0xC4, 0x41, 0x24, 0x5C, 0xD4, + // vroundps ymm9, ymm2, 0xA + 0xC4, 0x63, 0x7D, 0x08, 0xCA, 0x0A, + // vroundpd ymm9, ymm2, 0x8 + 0xC4, 0x63, 0x7D, 0x09, 0xCA, 0x08}; CHECK_EQ(0, memcmp(expected, desc.buffer, sizeof(expected))); }