[ia32] Unify F32x4UConvertI32x4 SSE and AVX opcodes
Drive-by cleanup: IWYU for macro-assembler-ia32.h and instruction-selector-ia32.cc. Ran using `iwyu_tool.py -p out/ia32.debug <filename>`, with a local build of llvm and iwyu. Bug: v8:11217,v8:7490 Change-Id: I4f8e95fa9be2f51f6764c994bb4da9ae86854c4d Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2583671 Reviewed-by: Bill Budge <bbudge@chromium.org> Commit-Queue: Zhi An Ng <zhin@chromium.org> Cr-Commit-Position: refs/heads/master@{#71728}
This commit is contained in:
parent
f22a6474c8
commit
d628e5e1dd
@ -9,14 +9,36 @@
|
||||
#ifndef V8_CODEGEN_IA32_MACRO_ASSEMBLER_IA32_H_
|
||||
#define V8_CODEGEN_IA32_MACRO_ASSEMBLER_IA32_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "include/v8-internal.h"
|
||||
#include "src/base/logging.h"
|
||||
#include "src/base/macros.h"
|
||||
#include "src/builtins/builtins.h"
|
||||
#include "src/codegen/assembler.h"
|
||||
#include "src/codegen/bailout-reason.h"
|
||||
#include "src/codegen/cpu-features.h"
|
||||
#include "src/codegen/ia32/assembler-ia32.h"
|
||||
#include "src/codegen/ia32/register-ia32.h"
|
||||
#include "src/codegen/label.h"
|
||||
#include "src/codegen/reglist.h"
|
||||
#include "src/codegen/reloc-info.h"
|
||||
#include "src/codegen/turbo-assembler.h"
|
||||
#include "src/common/globals.h"
|
||||
#include "src/execution/frames.h"
|
||||
#include "src/handles/handles.h"
|
||||
#include "src/objects/heap-object.h"
|
||||
#include "src/objects/smi.h"
|
||||
#include "src/roots/roots.h"
|
||||
#include "src/runtime/runtime.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
class Code;
|
||||
class ExternalReference;
|
||||
class StatsCounter;
|
||||
|
||||
// Convenience for platform-independent signatures. We do not normally
|
||||
// distinguish memory operands from other operands on ia32.
|
||||
using MemOperand = Operand;
|
||||
@ -408,6 +430,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
|
||||
AVX_PACKED_OP3(Pmaddwd, pmaddwd)
|
||||
AVX_PACKED_OP3(Paddd, paddd)
|
||||
AVX_PACKED_OP3(Paddq, paddq)
|
||||
AVX_PACKED_OP3(Psubd, psubd)
|
||||
AVX_PACKED_OP3(Psubq, psubq)
|
||||
AVX_PACKED_OP3(Pmuludq, pmuludq)
|
||||
AVX_PACKED_OP3(Pavgb, pavgb)
|
||||
|
@ -2383,34 +2383,17 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
__ Cvtdq2ps(i.OutputSimd128Register(), i.InputOperand(0));
|
||||
break;
|
||||
}
|
||||
case kSSEF32x4UConvertI32x4: {
|
||||
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
|
||||
CpuFeatureScope sse_scope(tasm(), SSE4_1);
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
__ pxor(kScratchDoubleReg, kScratchDoubleReg); // zeros
|
||||
__ pblendw(kScratchDoubleReg, dst, 0x55); // get lo 16 bits
|
||||
__ psubd(dst, kScratchDoubleReg); // get hi 16 bits
|
||||
__ cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg); // convert lo exactly
|
||||
__ psrld(dst, 1); // divide by 2 to get in unsigned range
|
||||
__ cvtdq2ps(dst, dst); // convert hi exactly
|
||||
__ addps(dst, dst); // double hi, exactly
|
||||
__ addps(dst, kScratchDoubleReg); // add hi and lo, may round.
|
||||
break;
|
||||
}
|
||||
case kAVXF32x4UConvertI32x4: {
|
||||
CpuFeatureScope avx_scope(tasm(), AVX);
|
||||
case kIA32F32x4UConvertI32x4: {
|
||||
XMMRegister dst = i.OutputSimd128Register();
|
||||
XMMRegister src = i.InputSimd128Register(0);
|
||||
__ vpxor(kScratchDoubleReg, kScratchDoubleReg,
|
||||
kScratchDoubleReg); // zeros
|
||||
__ vpblendw(kScratchDoubleReg, kScratchDoubleReg, src,
|
||||
0x55); // get lo 16 bits
|
||||
__ vpsubd(dst, src, kScratchDoubleReg); // get hi 16 bits
|
||||
__ vcvtdq2ps(kScratchDoubleReg, kScratchDoubleReg); // convert lo exactly
|
||||
__ vpsrld(dst, dst, 1); // divide by 2 to get in unsigned range
|
||||
__ vcvtdq2ps(dst, dst); // convert hi exactly
|
||||
__ vaddps(dst, dst, dst); // double hi, exactly
|
||||
__ vaddps(dst, dst, kScratchDoubleReg); // add hi and lo, may round.
|
||||
__ Pxor(kScratchDoubleReg, kScratchDoubleReg); // zeros
|
||||
__ Pblendw(kScratchDoubleReg, src, 0x55); // get lo 16 bits
|
||||
__ Psubd(dst, src, kScratchDoubleReg); // get hi 16 bits
|
||||
__ Cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg); // convert lo exactly
|
||||
__ Psrld(dst, dst, 1); // divide by 2 to get in unsigned range
|
||||
__ Cvtdq2ps(dst, dst); // convert hi exactly
|
||||
__ Addps(dst, dst, dst); // double hi, exactly
|
||||
__ Addps(dst, dst, kScratchDoubleReg); // add hi and lo, may round.
|
||||
break;
|
||||
}
|
||||
case kSSEF32x4Abs: {
|
||||
|
@ -159,8 +159,7 @@ namespace compiler {
|
||||
V(AVXF32x4ExtractLane) \
|
||||
V(IA32Insertps) \
|
||||
V(IA32F32x4SConvertI32x4) \
|
||||
V(SSEF32x4UConvertI32x4) \
|
||||
V(AVXF32x4UConvertI32x4) \
|
||||
V(IA32F32x4UConvertI32x4) \
|
||||
V(SSEF32x4Abs) \
|
||||
V(AVXF32x4Abs) \
|
||||
V(SSEF32x4Neg) \
|
||||
|
@ -138,8 +138,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
|
||||
case kAVXF32x4ExtractLane:
|
||||
case kIA32Insertps:
|
||||
case kIA32F32x4SConvertI32x4:
|
||||
case kSSEF32x4UConvertI32x4:
|
||||
case kAVXF32x4UConvertI32x4:
|
||||
case kIA32F32x4UConvertI32x4:
|
||||
case kSSEF32x4Abs:
|
||||
case kAVXF32x4Abs:
|
||||
case kSSEF32x4Neg:
|
||||
|
@ -2,11 +2,43 @@
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
#include "src/base/flags.h"
|
||||
#include "src/base/iterator.h"
|
||||
#include "src/base/logging.h"
|
||||
#include "src/base/macros.h"
|
||||
#include "src/base/platform/wrappers.h"
|
||||
#include "src/codegen/cpu-features.h"
|
||||
#include "src/codegen/ia32/assembler-ia32.h"
|
||||
#include "src/codegen/ia32/register-ia32.h"
|
||||
#include "src/codegen/machine-type.h"
|
||||
#include "src/codegen/turbo-assembler.h"
|
||||
#include "src/common/globals.h"
|
||||
#include "src/compiler/backend/instruction-codes.h"
|
||||
#include "src/compiler/backend/instruction-selector-impl.h"
|
||||
#include "src/compiler/backend/instruction-selector.h"
|
||||
#include "src/compiler/backend/instruction.h"
|
||||
#include "src/compiler/common-operator.h"
|
||||
#include "src/compiler/frame.h"
|
||||
#include "src/compiler/globals.h"
|
||||
#include "src/compiler/linkage.h"
|
||||
#include "src/compiler/machine-operator.h"
|
||||
#include "src/compiler/node-matchers.h"
|
||||
#include "src/compiler/node-properties.h"
|
||||
#include "src/compiler/node.h"
|
||||
#include "src/compiler/opcodes.h"
|
||||
#include "src/compiler/operator.h"
|
||||
#include "src/compiler/write-barrier-kind.h"
|
||||
#include "src/flags/flags.h"
|
||||
#include "src/utils/utils.h"
|
||||
#include "src/wasm/simd-shuffle.h"
|
||||
#include "src/zone/zone-containers.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
@ -279,6 +311,10 @@ void VisitRRSimd(InstructionSelector* selector, Node* node,
|
||||
}
|
||||
}
|
||||
|
||||
// Convenience overload of VisitRRSimd for callers that have a single opcode:
// it forwards `opcode` as both opcode arguments of the four-argument overload.
void VisitRRSimd(InstructionSelector* selector, Node* node, ArchOpcode opcode) {
|
||||
VisitRRSimd(selector, node, opcode, opcode);
|
||||
}
|
||||
|
||||
// TODO(v8:9198): Like VisitRROFloat, but for SIMD. SSE requires operand1 to be
|
||||
// a register as we don't have memory alignment yet. For AVX, memory operands
|
||||
// are fine, but can have performance issues if not aligned to 16/32 bytes
|
||||
@ -2399,7 +2435,7 @@ void InstructionSelector::VisitF32x4ExtractLane(Node* node) {
|
||||
}
|
||||
|
||||
void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {
|
||||
VisitRRSimd(this, node, kAVXF32x4UConvertI32x4, kSSEF32x4UConvertI32x4);
|
||||
VisitRRSimd(this, node, kIA32F32x4UConvertI32x4);
|
||||
}
|
||||
|
||||
void InstructionSelector::VisitI32x4SConvertF32x4(Node* node) {
|
||||
|
Loading…
Reference in New Issue
Block a user