[wasm][revec] Add RevectorizePhase in WASM compilation pipeline

Bug: v8:12716

Change-Id: I7ef53709e9757b58951086fc01af6b2eda296b27
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3411357
Reviewed-by: Maya Lekova <mslekova@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Jie Pan <jie.pan@intel.com>
Cr-Commit-Position: refs/heads/main@{#84888}
Author: jiepan, 2022-12-16 12:55:40 +08:00; committed by V8 LUCI CQ
Parent: cf4b096065
Commit: 256546319c
21 changed files with 1390 additions and 260 deletions
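For orientation (an editorial sketch, not part of the commit): revectorization merges pairs of adjacent simd128 operations into a single simd256 operation. In C++ intrinsic terms the transformation is analogous to the following; the function names are illustrative only.

#include <immintrin.h>

// Before: two independent 128-bit adds over adjacent memory.
void add_f32x4_pair(const float* a, const float* b, float* c) {
  _mm_storeu_ps(c, _mm_add_ps(_mm_loadu_ps(a), _mm_loadu_ps(b)));
  _mm_storeu_ps(c + 4, _mm_add_ps(_mm_loadu_ps(a + 4), _mm_loadu_ps(b + 4)));
}

// After revectorization: one 256-bit add in a YMM register (requires AVX).
void add_f32x8(const float* a, const float* b, float* c) {
  _mm256_storeu_ps(c, _mm256_add_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b)));
}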


@ -554,6 +554,10 @@ assert(!v8_enable_static_roots ||
v8_enable_webassembly && v8_enable_i18n_support),
"Trying to enable static roots in a configuration that is not supported")
if (v8_enable_webassembly && !target_is_simulator && v8_current_cpu == "x64") {
v8_enable_wasm_simd256_revec = true
}
assert(!v8_disable_write_barriers || v8_enable_single_generation,
"Disabling write barriers works only with single generation")
@ -1091,6 +1095,9 @@ config("features") {
if (v8_value_deserializer_hard_fail) {
defines += [ "V8_VALUE_DESERIALIZER_HARD_FAIL" ]
}
if (v8_enable_wasm_simd256_revec) {
defines += [ "V8_ENABLE_WASM_SIMD256_REVEC" ]
}
}
config("toolchain") {
@ -3746,7 +3753,10 @@ v8_header_set("v8_internal_headers") {
}
if (v8_enable_wasm_simd256_revec) {
sources += [ "src/compiler/linear-scheduler.h" ]
sources += [
"src/compiler/linear-scheduler.h",
"src/compiler/revectorizer.h",
]
}
if (!v8_enable_third_party_heap) {
@ -4258,7 +4268,10 @@ if (v8_enable_webassembly) {
}
if (v8_enable_wasm_simd256_revec) {
- v8_compiler_sources += [ "src/compiler/linear-scheduler.cc" ]
+ v8_compiler_sources += [
+   "src/compiler/linear-scheduler.cc",
+   "src/compiler/revectorizer.cc",
+ ]
}
# The src/compiler files with optimizations.


@ -588,7 +588,8 @@ class V8_EXPORT_PRIVATE InstructionSelector final {
#define DECLARE_GENERATOR(x) void Visit##x(Node* node);
MACHINE_OP_LIST(DECLARE_GENERATOR)
- MACHINE_SIMD_OP_LIST(DECLARE_GENERATOR)
+ MACHINE_SIMD128_OP_LIST(DECLARE_GENERATOR)
+ MACHINE_SIMD256_OP_LIST(DECLARE_GENERATOR)
#undef DECLARE_GENERATOR
// Visit the load node with a value and opcode to replace with.


@ -20,7 +20,9 @@ Graph::Graph(Zone* zone)
end_(nullptr),
mark_max_(0),
next_node_id_(0),
- decorators_(zone) {
+ decorators_(zone),
+ has_simd_(false),
+ simd_stores_(zone) {
// Nodes use compressed pointers, so zone must support pointer compression.
// If the check fails, ensure the zone is created with kCompressGraphZone
// flag.
@ -78,6 +80,10 @@ NodeId Graph::NextNodeId() {
void Graph::Print() const { StdoutStream{} << AsRPO(*this); }
void Graph::RecordSimdStore(Node* store) { simd_stores_.push_back(store); }
ZoneVector<Node*> const& Graph::GetSimdStoreNodes() { return simd_stores_; }
} // namespace compiler
} // namespace internal
} // namespace v8


@ -95,6 +95,12 @@ class V8_EXPORT_PRIVATE Graph final : public NON_EXPORTED_BASE(ZoneObject) {
// Very simple print API usable in a debugger.
void Print() const;
bool HasSimd() const { return has_simd_; }
void SetSimd(bool has_simd) { has_simd_ = has_simd; }
void RecordSimdStore(Node* store);
ZoneVector<Node*> const& GetSimdStoreNodes();
private:
friend class NodeMarkerBase;
@ -106,6 +112,8 @@ class V8_EXPORT_PRIVATE Graph final : public NON_EXPORTED_BASE(ZoneObject) {
Mark mark_max_;
NodeId next_node_id_;
ZoneVector<GraphDecorator*> decorators_;
bool has_simd_;
ZoneVector<Node*> simd_stores_;
};
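These hooks are fed by the WASM graph builder later in this commit: each simd128 store is recorded as a revectorization seed, following this pattern (excerpted from the wasm-compiler.cc hunk below):

if (mem_rep == MachineRepresentation::kSimd128) {
  graph()->RecordSimdStore(store);
}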


@ -99,8 +99,8 @@ Node* LinearScheduler::GetEarlySchedulePosition(Node* node) {
NodeState& use = stack.top();
if (use.early_schedule_position == nullptr ||
GetControlLevel(use.early_schedule_position) <
-     GetControlLevel(top.early_schedule_position)) {
-   use.early_schedule_position = top.early_schedule_position;
+     GetControlLevel(early_schedule_position)) {
+   use.early_schedule_position = early_schedule_position;
}
}
}


@ -126,6 +126,11 @@ std::ostream& operator<<(std::ostream& os, LoadTransformation rep) {
return os << "kS128Load32Zero";
case LoadTransformation::kS128Load64Zero:
return os << "kS128Load64Zero";
// Simd256
case LoadTransformation::kS256Load32Splat:
return os << "kS256Load32Splat";
case LoadTransformation::kS256Load64Splat:
return os << "kS256Load64Splat";
}
UNREACHABLE();
}
@ -637,7 +642,18 @@ std::ostream& operator<<(std::ostream& os, TruncateKind kind) {
V(I32x4RelaxedTruncF64x2UZero, Operator::kNoProperties, 1, 0, 1) \
V(I16x8RelaxedQ15MulRS, Operator::kCommutative, 2, 0, 1) \
V(I16x8DotI8x16I7x16S, Operator::kCommutative, 2, 0, 1) \
- V(I32x4DotI8x16I7x16AddS, Operator::kNoProperties, 3, 0, 1)
+ V(I32x4DotI8x16I7x16AddS, Operator::kNoProperties, 3, 0, 1) \
+ V(F32x8Add, Operator::kCommutative, 2, 0, 1) \
+ V(F32x8Sub, Operator::kNoProperties, 2, 0, 1) \
+ V(F32x8Mul, Operator::kCommutative, 2, 0, 1) \
+ V(F32x8Div, Operator::kNoProperties, 2, 0, 1) \
+ V(F32x8Pmin, Operator::kNoProperties, 2, 0, 1) \
+ V(F32x8Pmax, Operator::kNoProperties, 2, 0, 1) \
+ V(F32x8Eq, Operator::kCommutative, 2, 0, 1) \
+ V(F32x8Ne, Operator::kCommutative, 2, 0, 1) \
+ V(F32x8Lt, Operator::kNoProperties, 2, 0, 1) \
+ V(F32x8Le, Operator::kNoProperties, 2, 0, 1) \
+ V(S256Select, Operator::kNoProperties, 3, 0, 1)
// The format is:
// V(Name, properties, value_input_count, control_input_count, output_count)
@ -729,7 +745,9 @@ std::ostream& operator<<(std::ostream& os, TruncateKind kind) {
V(S128Load32x2S) \
V(S128Load32x2U) \
V(S128Load32Zero) \
- V(S128Load64Zero)
+ V(S128Load64Zero) \
+ V(S256Load32Splat) \
+ V(S256Load64Splat)
#if TAGGED_SIZE_8_BYTES
@ -2226,6 +2244,21 @@ StackCheckKind StackCheckKindOf(Operator const* op) {
return OpParameter<StackCheckKind>(op);
}
const Operator* MachineOperatorBuilder::ExtractF128(int32_t lane_index) {
DCHECK(0 <= lane_index && lane_index < 2);
class ExtractF128Operator final : public Operator1<int32_t> {
public:
explicit ExtractF128Operator(int32_t lane_index)
: Operator1<int32_t>(IrOpcode::kExtractF128, Operator::kPure,
"ExtractF128", 1, 0, 0, 1, 0, 0, lane_index) {
lane_index_ = lane_index;
}
int32_t lane_index_;
};
return zone_->New<ExtractF128Operator>(lane_index);
}
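A hypothetical use of the new operator (editorial sketch, mirroring the graph()->NewNode() style of the unit test at the end of this commit; graph, machine, and v256 are placeholders): ExtractF128 takes a 256-bit value and yields one of its two 128-bit halves.

// Lane 0 selects the low 128 bits, lane 1 the high 128 bits (see the DCHECK).
Node* low = graph->NewNode(machine->ExtractF128(0), v256);
Node* high = graph->NewNode(machine->ExtractF128(1), v256);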
#undef PURE_BINARY_OP_LIST_32
#undef PURE_BINARY_OP_LIST_64
#undef MACHINE_PURE_OP_LIST


@ -126,6 +126,8 @@ enum class LoadTransformation {
kS128Load32x2U,
kS128Load32Zero,
kS128Load64Zero,
kS256Load32Splat,
kS256Load64Splat,
};
size_t hash_value(LoadTransformation);
@ -964,6 +966,22 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* TraceInstruction(uint32_t markid);
// SIMD256
const Operator* F32x8Add();
const Operator* F32x8Sub();
const Operator* F32x8Mul();
const Operator* F32x8Div();
const Operator* F32x8Min();
const Operator* F32x8Max();
const Operator* F32x8Pmin();
const Operator* F32x8Pmax();
const Operator* F32x8Eq();
const Operator* F32x8Ne();
const Operator* F32x8Lt();
const Operator* F32x8Le();
const Operator* S256Select();
const Operator* ExtractF128(int32_t lane_index);
// load [base + index]
const Operator* Load(LoadRepresentation rep);
const Operator* LoadImmutable(LoadRepresentation rep);


@ -117,6 +117,9 @@ class V8_EXPORT_PRIVATE NodeProperties {
static bool IsPhi(Node* node) {
return IrOpcode::IsPhiOpcode(node->opcode());
}
static bool IsSimd128Operation(Node* node) {
return IrOpcode::IsSimd128Opcode(node->opcode());
}
// Determines whether exceptions thrown by the given node are handled locally
// within the graph (i.e. an IfException projection is present). Optionally


@ -825,248 +825,264 @@
V(StackPointerGreaterThan) \
V(TraceInstruction)
-#define MACHINE_SIMD_OP_LIST(V) \
+#define MACHINE_SIMD128_OP_LIST(V) \
V(F64x2Splat) \
V(F64x2ExtractLane) \
V(F64x2ReplaceLane) \
V(F64x2Abs) \
V(F64x2Neg) \
V(F64x2Sqrt) \
V(F64x2Add) \
V(F64x2Sub) \
V(F64x2Mul) \
V(F64x2Div) \
V(F64x2Min) \
V(F64x2Max) \
V(F64x2Eq) \
V(F64x2Ne) \
V(F64x2Lt) \
V(F64x2Le) \
V(F64x2Qfma) \
V(F64x2Qfms) \
V(F64x2Pmin) \
V(F64x2Pmax) \
V(F64x2Ceil) \
V(F64x2Floor) \
V(F64x2Trunc) \
V(F64x2NearestInt) \
V(F64x2ConvertLowI32x4S) \
V(F64x2ConvertLowI32x4U) \
V(F64x2PromoteLowF32x4) \
V(F32x4Splat) \
V(F32x4ExtractLane) \
V(F32x4ReplaceLane) \
V(F32x4SConvertI32x4) \
V(F32x4UConvertI32x4) \
V(F32x4Abs) \
V(F32x4Neg) \
V(F32x4Sqrt) \
V(F32x4Add) \
V(F32x4Sub) \
V(F32x4Mul) \
V(F32x4Div) \
V(F32x4Min) \
V(F32x4Max) \
V(F32x4Eq) \
V(F32x4Ne) \
V(F32x4Lt) \
V(F32x4Le) \
V(F32x4Gt) \
V(F32x4Ge) \
V(F32x4Qfma) \
V(F32x4Qfms) \
V(F32x4Pmin) \
V(F32x4Pmax) \
V(F32x4Ceil) \
V(F32x4Floor) \
V(F32x4Trunc) \
V(F32x4NearestInt) \
V(F32x4DemoteF64x2Zero) \
V(I64x2Splat) \
V(I64x2SplatI32Pair) \
V(I64x2ExtractLane) \
V(I64x2ReplaceLane) \
V(I64x2ReplaceLaneI32Pair) \
V(I64x2Abs) \
V(I64x2Neg) \
V(I64x2SConvertI32x4Low) \
V(I64x2SConvertI32x4High) \
V(I64x2UConvertI32x4Low) \
V(I64x2UConvertI32x4High) \
V(I64x2BitMask) \
V(I64x2Shl) \
V(I64x2ShrS) \
V(I64x2Add) \
V(I64x2Sub) \
V(I64x2Mul) \
V(I64x2Eq) \
V(I64x2Ne) \
V(I64x2GtS) \
V(I64x2GeS) \
V(I64x2ShrU) \
V(I64x2ExtMulLowI32x4S) \
V(I64x2ExtMulHighI32x4S) \
V(I64x2ExtMulLowI32x4U) \
V(I64x2ExtMulHighI32x4U) \
V(I32x4Splat) \
V(I32x4ExtractLane) \
V(I32x4ReplaceLane) \
V(I32x4SConvertF32x4) \
V(I32x4SConvertI16x8Low) \
V(I32x4SConvertI16x8High) \
V(I32x4Neg) \
V(I32x4Shl) \
V(I32x4ShrS) \
V(I32x4Add) \
V(I32x4Sub) \
V(I32x4Mul) \
V(I32x4MinS) \
V(I32x4MaxS) \
V(I32x4Eq) \
V(I32x4Ne) \
V(I32x4LtS) \
V(I32x4LeS) \
V(I32x4GtS) \
V(I32x4GeS) \
V(I32x4UConvertF32x4) \
V(I32x4UConvertI16x8Low) \
V(I32x4UConvertI16x8High) \
V(I32x4ShrU) \
V(I32x4MinU) \
V(I32x4MaxU) \
V(I32x4LtU) \
V(I32x4LeU) \
V(I32x4GtU) \
V(I32x4GeU) \
V(I32x4Abs) \
V(I32x4BitMask) \
V(I32x4DotI16x8S) \
V(I32x4ExtMulLowI16x8S) \
V(I32x4ExtMulHighI16x8S) \
V(I32x4ExtMulLowI16x8U) \
V(I32x4ExtMulHighI16x8U) \
V(I32x4ExtAddPairwiseI16x8S) \
V(I32x4ExtAddPairwiseI16x8U) \
V(I32x4TruncSatF64x2SZero) \
V(I32x4TruncSatF64x2UZero) \
V(I16x8Splat) \
V(I16x8ExtractLaneU) \
V(I16x8ExtractLaneS) \
V(I16x8ReplaceLane) \
V(I16x8SConvertI8x16Low) \
V(I16x8SConvertI8x16High) \
V(I16x8Neg) \
V(I16x8Shl) \
V(I16x8ShrS) \
V(I16x8SConvertI32x4) \
V(I16x8Add) \
V(I16x8AddSatS) \
V(I16x8Sub) \
V(I16x8SubSatS) \
V(I16x8Mul) \
V(I16x8MinS) \
V(I16x8MaxS) \
V(I16x8Eq) \
V(I16x8Ne) \
V(I16x8LtS) \
V(I16x8LeS) \
V(I16x8GtS) \
V(I16x8GeS) \
V(I16x8UConvertI8x16Low) \
V(I16x8UConvertI8x16High) \
V(I16x8ShrU) \
V(I16x8UConvertI32x4) \
V(I16x8AddSatU) \
V(I16x8SubSatU) \
V(I16x8MinU) \
V(I16x8MaxU) \
V(I16x8LtU) \
V(I16x8LeU) \
V(I16x8GtU) \
V(I16x8GeU) \
V(I16x8RoundingAverageU) \
V(I16x8Q15MulRSatS) \
V(I16x8Abs) \
V(I16x8BitMask) \
V(I16x8ExtMulLowI8x16S) \
V(I16x8ExtMulHighI8x16S) \
V(I16x8ExtMulLowI8x16U) \
V(I16x8ExtMulHighI8x16U) \
V(I16x8ExtAddPairwiseI8x16S) \
V(I16x8ExtAddPairwiseI8x16U) \
V(I8x16Splat) \
V(I8x16ExtractLaneU) \
V(I8x16ExtractLaneS) \
V(I8x16ReplaceLane) \
V(I8x16SConvertI16x8) \
V(I8x16Neg) \
V(I8x16Shl) \
V(I8x16ShrS) \
V(I8x16Add) \
V(I8x16AddSatS) \
V(I8x16Sub) \
V(I8x16SubSatS) \
V(I8x16MinS) \
V(I8x16MaxS) \
V(I8x16Eq) \
V(I8x16Ne) \
V(I8x16LtS) \
V(I8x16LeS) \
V(I8x16GtS) \
V(I8x16GeS) \
V(I8x16UConvertI16x8) \
V(I8x16AddSatU) \
V(I8x16SubSatU) \
V(I8x16ShrU) \
V(I8x16MinU) \
V(I8x16MaxU) \
V(I8x16LtU) \
V(I8x16LeU) \
V(I8x16GtU) \
V(I8x16GeU) \
V(I8x16RoundingAverageU) \
V(I8x16Popcnt) \
V(I8x16Abs) \
V(I8x16BitMask) \
V(S128Zero) \
V(S128Const) \
V(S128Not) \
V(S128And) \
V(S128Or) \
V(S128Xor) \
V(S128Select) \
V(S128AndNot) \
V(I8x16Swizzle) \
V(I8x16RelaxedLaneSelect) \
V(I16x8RelaxedLaneSelect) \
V(I32x4RelaxedLaneSelect) \
V(I64x2RelaxedLaneSelect) \
V(F32x4RelaxedMin) \
V(F32x4RelaxedMax) \
V(F64x2RelaxedMin) \
V(F64x2RelaxedMax) \
V(I32x4RelaxedTruncF32x4S) \
V(I32x4RelaxedTruncF32x4U) \
V(I32x4RelaxedTruncF64x2SZero) \
V(I32x4RelaxedTruncF64x2UZero) \
V(I16x8RelaxedQ15MulRS) \
V(I16x8DotI8x16I7x16S) \
V(I32x4DotI8x16I7x16AddS) \
V(I8x16Shuffle) \
V(V128AnyTrue) \
V(I64x2AllTrue) \
V(I32x4AllTrue) \
V(I16x8AllTrue) \
V(I8x16AllTrue) \
V(LoadTransform) \
V(LoadLane) \
V(StoreLane)
// SIMD256 for AVX
#define MACHINE_SIMD256_OP_LIST(V) \
V(F32x8Add) \
V(F32x8Sub) \
V(F32x8Mul) \
V(F32x8Div) \
V(F32x8Pmin) \
V(F32x8Pmax) \
V(F32x8Eq) \
V(F32x8Ne) \
V(F32x8Lt) \
V(F32x8Le) \
V(S256Select) \
V(ExtractF128)
#define VALUE_OP_LIST(V) \
COMMON_OP_LIST(V) \
SIMPLIFIED_OP_LIST(V) \
MACHINE_OP_LIST(V) \
- MACHINE_SIMD_OP_LIST(V) \
+ MACHINE_SIMD128_OP_LIST(V) \
+ MACHINE_SIMD256_OP_LIST(V) \
JS_OP_LIST(V)
// The combination of all operators at all levels and the common operators.
@ -1252,6 +1268,18 @@ class V8_EXPORT_PRIVATE IrOpcode {
}
UNREACHABLE();
}
static bool IsSimd128Opcode(Value value) {
#define CASE(Name, ...) case k##Name:
switch (value) {
MACHINE_SIMD128_OP_LIST(CASE)
return true;
default:
return false;
}
#undef CASE
UNREACHABLE();
}
};
V8_EXPORT_PRIVATE std::ostream& operator<<(std::ostream&, IrOpcode::Value);


@ -126,7 +126,8 @@ bool OperatorProperties::NeedsExactContext(const Operator* op) {
COMMON_OP_LIST(CASE)
CONTROL_OP_LIST(CASE)
MACHINE_OP_LIST(CASE)
- MACHINE_SIMD_OP_LIST(CASE)
+ MACHINE_SIMD128_OP_LIST(CASE)
+ MACHINE_SIMD256_OP_LIST(CASE)
SIMPLIFIED_OP_LIST(CASE)
break;
#undef CASE


@ -129,6 +129,10 @@
#include "src/wasm/wasm-engine.h"
#endif // V8_ENABLE_WEBASSEMBLY
#if V8_ENABLE_WASM_SIMD256_REVEC
#include "src/compiler/revectorizer.h"
#endif // V8_ENABLE_WASM_SIMD256_REVEC
namespace v8 {
namespace internal {
namespace compiler {
@ -728,6 +732,10 @@ class PipelineImpl final {
// Substep B.1. Produce a scheduled graph.
void ComputeScheduledGraph();
#if V8_ENABLE_WASM_SIMD256_REVEC
void Revectorize();
#endif // V8_ENABLE_WASM_SIMD256_REVEC
// Substep B.2. Select instructions from a scheduled graph.
bool SelectInstructions(Linkage* linkage);
@ -2355,6 +2363,17 @@ struct ComputeSchedulePhase {
}
};
#if V8_ENABLE_WASM_SIMD256_REVEC
struct RevectorizePhase {
DECL_PIPELINE_PHASE_CONSTANTS(Revectorizer)
void Run(PipelineData* data, Zone* temp_zone) {
Revectorizer revec(temp_zone, data->graph(), data->mcgraph());
revec.TryRevectorize(data->info()->GetDebugName().get());
}
};
#endif // V8_ENABLE_WASM_SIMD256_REVEC
struct InstructionRangesAsJSON {
const InstructionSequence* sequence;
const ZoneVector<std::pair<int, int>>* instr_origins;
@ -3458,6 +3477,13 @@ void Pipeline::GenerateCodeForWasmFunction(
pipeline.RunPrintAndVerify("V8.WasmMachineCode", true);
#if V8_ENABLE_WASM_SIMD256_REVEC
if (v8_flags.experimental_wasm_revectorize) {
pipeline.Revectorize();
pipeline.RunPrintAndVerify("V8.WasmRevec", true);
}
#endif // V8_ENABLE_WASM_SIMD256_REVEC
data.BeginPhaseKind("V8.WasmOptimization");
if (v8_flags.wasm_inlining) {
pipeline.Run<WasmInliningPhase>(env, function_index, wire_bytes_storage,
@ -3763,6 +3789,10 @@ void PipelineImpl::ComputeScheduledGraph() {
TraceScheduleAndVerify(data->info(), data, data->schedule(), "schedule");
}
#if V8_ENABLE_WASM_SIMD256_REVEC
void PipelineImpl::Revectorize() { Run<RevectorizePhase>(); }
#endif // V8_ENABLE_WASM_SIMD256_REVEC
bool PipelineImpl::SelectInstructions(Linkage* linkage) {
auto call_descriptor = linkage->GetIncomingDescriptor();
PipelineData* data = this->data_;


@ -0,0 +1,647 @@
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/compiler/revectorizer.h"
#include "src/base/cpu.h"
#include "src/base/logging.h"
#include "src/compiler/all-nodes.h"
#include "src/compiler/machine-operator.h"
#include "src/compiler/verifier.h"
namespace v8 {
namespace internal {
namespace compiler {
#define TRACE(...) \
do { \
if (v8_flags.trace_wasm_revectorize) { \
PrintF("Revec: "); \
PrintF(__VA_ARGS__); \
} \
} while (false)
namespace {
#ifdef DEBUG
// Currently, only Load/ProtectedLoad/LoadTransform are supported.
// TODO(jiepan): add support for UnalignedLoad, LoadLane
bool IsSupportedLoad(const Node* node) {
if (node->opcode() == IrOpcode::kProtectedLoad ||
node->opcode() == IrOpcode::kLoad ||
node->opcode() == IrOpcode::kLoadTransform) {
return true;
}
return false;
}
bool IsSupportedLoad(const ZoneVector<Node*>& node_group) {
for (auto node : node_group) {
if (!IsSupportedLoad(node)) return false;
}
return true;
}
#endif
// Returns the value of an Int64Constant node, or -1 if it is not one.
int64_t GetConstantValue(const Node* node) {
int64_t value = -1;
if (node->opcode() == IrOpcode::kInt64Constant) {
value = OpParameter<int64_t>(node->op());
}
return value;
}
// Returns the constant offset of a load/store node's address, or -1 if it
// cannot be determined.
int64_t GetMemoryOffsetValue(const Node* node) {
DCHECK(node->opcode() == IrOpcode::kProtectedLoad ||
node->opcode() == IrOpcode::kStore ||
node->opcode() == IrOpcode::kProtectedStore);
Node* offset = node->InputAt(0);
if (offset->opcode() == IrOpcode::kLoadFromObject ||
offset->opcode() == IrOpcode::kLoad) {
return 0;
}
int64_t offset_value = -1;
if (offset->opcode() == IrOpcode::kInt64Add) {
if (NodeProperties::IsConstant(offset->InputAt(0))) {
offset_value = GetConstantValue(offset->InputAt(0));
} else if (NodeProperties::IsConstant(offset->InputAt(1))) {
offset_value = GetConstantValue(offset->InputAt(1));
}
}
return offset_value;
}
// We want to combine load/store nodes that access continuous memory. For a
// load/store node, input(0) is memory_start + offset and input(1) is the
// index. We currently use the index as the address of the node; nodes with
// the same index and continuous offsets can be combined together.
Node* GetNodeAddress(const Node* node) {
Node* address = node->InputAt(1);
// The index is changed to Uint64 for memory32
if (address->opcode() == IrOpcode::kChangeUint32ToUint64) {
address = address->InputAt(0);
}
return address;
}
bool IsContinuousAccess(const ZoneVector<Node*>& node_group) {
DCHECK_GT(node_group.size(), 0);
int64_t previous_offset = GetMemoryOffsetValue(node_group[0]);
for (size_t i = 1; i < node_group.size(); ++i) {
int64_t current_offset = GetMemoryOffsetValue(node_group[i]);
int64_t diff = current_offset - previous_offset;
if (diff != kSimd128Size) {
TRACE("Non-continuous store!");
return false;
}
previous_offset = current_offset;
}
return true;
}
// Returns true if all of the nodes in node_group are constants.
bool AllConstant(const ZoneVector<Node*>& node_group) {
for (Node* node : node_group) {
if (!NodeProperties::IsConstant(node)) {
return false;
}
}
return true;
}
// Returns true if all the addresses of the nodes in node_group are identical.
bool AllSameAddress(const ZoneVector<Node*>& nodes) {
Node* address = GetNodeAddress(nodes[0]);
for (size_t i = 1; i < nodes.size(); i++) {
if (GetNodeAddress(nodes[i]) != address) {
TRACE("Diff address #%d,#%d!\n", address->id(),
GetNodeAddress(nodes[i])->id());
return false;
}
}
return true;
}
// Returns true if all of the nodes in node_group are identical.
// The splat opcode in WASM SIMD is used to create a vector with identical
// lanes.
bool IsSplat(const ZoneVector<Node*>& node_group) {
for (ZoneVector<Node*>::size_type i = 1; i < node_group.size(); ++i) {
if (node_group[i] != node_group[0]) {
return false;
}
}
return true;
}
// Returns true if all of the nodes in node_group have the same operator.
bool AllSameOperator(const ZoneVector<Node*>& node_group) {
auto op = node_group[0]->op();
for (ZoneVector<Node*>::size_type i = 1; i < node_group.size(); i++) {
if (node_group[i]->op() != op) {
return false;
}
}
return true;
}
} // anonymous namespace
// Sort load/store nodes by offset.
bool MemoryOffsetComparer::operator()(const Node* lhs, const Node* rhs) const {
return GetMemoryOffsetValue(lhs) < GetMemoryOffsetValue(rhs);
}
void PackNode::Print() const {
if (revectorized_node_ != nullptr) {
TRACE("0x%p #%d:%s(%d %d, %s)\n", this, revectorized_node_->id(),
revectorized_node_->op()->mnemonic(), nodes_[0]->id(),
nodes_[1]->id(), nodes_[0]->op()->mnemonic());
} else {
TRACE("0x%p null(%d %d, %s)\n", this, nodes_[0]->id(), nodes_[1]->id(),
nodes_[0]->op()->mnemonic());
}
}
bool SLPTree::CanBePacked(const ZoneVector<Node*>& node_group) {
DCHECK_EQ(node_group.size(), 2);
if (!SameBasicBlock(node_group[0], node_group[1])) {
TRACE("%s(#%d, #%d) not in same BB!\n", node_group[0]->op()->mnemonic(),
node_group[0]->id(), node_group[1]->id());
return false;
}
if (!AllSameOperator(node_group)) {
TRACE("%s(#%d, #%d) have different operator!\n",
node_group[0]->op()->mnemonic(), node_group[0]->id(),
node_group[1]->id());
return false;
}
// TODO(jiepan): add support for Constant
if (AllConstant(node_group)) {
TRACE("%s(#%d, #%d) are constantant, not supported yet!\n",
node_group[0]->op()->mnemonic(), node_group[0]->id(),
node_group[1]->id());
return false;
}
// Only support simd128 operators, or common operators with simd128
// MachineRepresentation. The MachineRepresentation of the root has already
// been checked, and leaf nodes will be checked later. Here we omit the
// MachineRepresentation check and only check the opcode itself.
IrOpcode::Value op = node_group[0]->opcode();
if (NodeProperties::IsSimd128Operation(node_group[0]) ||
(op == IrOpcode::kStore) || (op == IrOpcode::kProtectedStore) ||
(op == IrOpcode::kLoad) || (op == IrOpcode::kProtectedLoad) ||
(op == IrOpcode::kPhi) || (op == IrOpcode::kLoopExitValue) ||
(op == IrOpcode::kExtractF128)) {
return true;
}
return false;
}
PackNode* SLPTree::NewPackNode(const ZoneVector<Node*>& node_group) {
TRACE("PackNode %s(#%d:, #%d)\n", node_group[0]->op()->mnemonic(),
node_group[0]->id(), node_group[1]->id());
PackNode* pnode = zone_->New<PackNode>(zone_, node_group);
for (Node* node : node_group) {
node_to_packnode_[node] = pnode;
}
return pnode;
}
PackNode* SLPTree::NewPackNodeAndRecurs(const ZoneVector<Node*>& node_group,
int start_index, int count,
unsigned recursion_depth) {
PackNode* pnode = NewPackNode(node_group);
for (int i = start_index; i < start_index + count; ++i) {
ZoneVector<Node*> operands(zone_);
// Prepare the operand vector.
for (size_t j = 0; j < node_group.size(); j++) {
Node* node = node_group[j];
operands.push_back(NodeProperties::GetValueInput(node, i));
}
PackNode* child = BuildTreeRec(operands, recursion_depth + 1);
if (child) {
pnode->SetOperand(i, child);
} else {
return nullptr;
}
}
return pnode;
}
PackNode* SLPTree::GetPackNode(Node* node) {
auto I = node_to_packnode_.find(node);
if (I != node_to_packnode_.end()) {
return I->second;
}
return nullptr;
}
void SLPTree::PushStack(const ZoneVector<Node*>& node_group) {
TRACE("Stack Push (%d %s, %d %s)\n", node_group[0]->id(),
node_group[0]->op()->mnemonic(), node_group[1]->id(),
node_group[1]->op()->mnemonic());
for (auto node : node_group) {
on_stack_.insert(node);
}
stack_.push({node_group});
}
void SLPTree::PopStack() {
const ZoneVector<Node*>& node_group = stack_.top();
DCHECK_EQ(node_group.size(), 2);
TRACE("Stack Pop (%d %s, %d %s)\n", node_group[0]->id(),
node_group[0]->op()->mnemonic(), node_group[1]->id(),
node_group[1]->op()->mnemonic());
for (auto node : node_group) {
on_stack_.erase(node);
}
stack_.pop();
}
bool SLPTree::OnStack(Node* node) {
return on_stack_.find(node) != on_stack_.end();
}
// Returns true if any node in node_group is already on the stack.
bool SLPTree::AllOnStack(const ZoneVector<Node*>& node_group) {
for (auto node : node_group) {
if (OnStack(node)) return true;
}
return false;
}
bool SLPTree::StackTopIsPhi() {
const ZoneVector<Node*>& node_group = stack_.top();
DCHECK_EQ(node_group.size(), 2);
return NodeProperties::IsPhi(node_group[0]);
}
void SLPTree::ClearStack() {
stack_ = ZoneStack<ZoneVector<Node*>>(zone_);
on_stack_.clear();
}
bool SLPTree::IsSideEffectFreeLoad(const ZoneVector<Node*>& node_group) {
DCHECK(IsSupportedLoad(node_group));
DCHECK_EQ(node_group.size(), 2);
TRACE("Enter IsSideEffectFreeLoad (%d %s, %d %s)\n", node_group[0]->id(),
node_group[0]->op()->mnemonic(), node_group[1]->id(),
node_group[1]->op()->mnemonic());
std::stack<Node*> to_visit;
std::unordered_set<Node*> visited;
// Visit all the inputs (except for control inputs) of Loads.
for (size_t i = 0, e = node_group.size(); i < e; i++) {
Node* load = node_group[i];
for (int j = 0; j < NodeProperties::FirstControlIndex(load); ++j) {
Node* input = load->InputAt(j);
if (std::find(node_group.begin(), node_group.end(), input) ==
node_group.end()) {
to_visit.push(input);
}
}
}
// Check the inputs of the Loads to see whether they are connected to nodes
// already in the SLPTree. If they are, the Loads have a dependency on the
// tree and such Loads cannot be merged.
while (!to_visit.empty()) {
Node* input = to_visit.top();
to_visit.pop();
TRACE("IsSideEffectFreeLoad visit (%d %s)\n", input->id(),
input->op()->mnemonic());
if (visited.find(input) == visited.end()) {
visited.insert(input);
if (OnStack(input)) {
TRACE("Has internal dependency because (%d %s) on stack\n", input->id(),
input->op()->mnemonic());
return false;
}
// If the input is not in the same basic block as the Loads, it cannot be
// in the SLPTree. Otherwise, recursively visit all of the input's inputs
// to see whether they are connected to the SLPTree.
if (SameBasicBlock(input, node_group[0])) {
for (int i = 0; i < NodeProperties::FirstControlIndex(input); ++i) {
to_visit.push(input->InputAt(i));
}
}
}
}
return true;
}
PackNode* SLPTree::BuildTree(const ZoneVector<Node*>& roots) {
TRACE("Enter %s\n", __func__);
DeleteTree();
root_ = BuildTreeRec(roots, 0);
return root_;
}
PackNode* SLPTree::BuildTreeRec(const ZoneVector<Node*>& node_group,
unsigned recursion_depth) {
TRACE("Enter %s\n", __func__);
DCHECK_EQ(node_group.size(), 2);
Node* node0 = node_group[0];
Node* node1 = node_group[1];
if (recursion_depth == RecursionMaxDepth) {
TRACE("Failed due to max recursion depth!\n");
return nullptr;
}
if (AllOnStack(node_group)) {
if (!StackTopIsPhi()) {
TRACE("Failed due to (%d %s, %d %s) on stack!\n", node0->id(),
node0->op()->mnemonic(), node1->id(), node1->op()->mnemonic());
return nullptr;
}
}
PushStack(node_group);
if (!CanBePacked(node_group)) {
return nullptr;
}
DCHECK(AllConstant(node_group) || AllSameOperator(node_group));
// Check if this is a duplicate of another entry.
for (Node* node : node_group) {
if (PackNode* p = GetPackNode(node)) {
if (!p->IsSame(node_group)) {
// TODO(jiepan): Gathering due to partial overlap
TRACE("Failed due to partial overlap at #%d,%s!\n", node->id(),
node->op()->mnemonic());
return nullptr;
}
PopStack();
TRACE("Perfect diamond merge at #%d,%s\n", node->id(),
node->op()->mnemonic());
return p;
}
}
if (node0->opcode() == IrOpcode::kExtractF128) {
Node* source = node0->InputAt(0);
TRACE("Extract leaf node from #%d,%s!\n", source->id(),
source->op()->mnemonic());
// For 256-bit revectorization, both extracts must come from the same source,
// with consecutive lane indices (lane, lane + 1); for a LoadTransform source
// the two nodes must be identical.
if (node0->InputAt(0) == node1->InputAt(0) &&
(node0->InputAt(0)->opcode() == IrOpcode::kLoadTransform
? node0 == node1
: OpParameter<int32_t>(node0->op()) + 1 ==
OpParameter<int32_t>(node1->op()))) {
TRACE("Added a pair of Extract.\n");
PackNode* pnode = NewPackNode(node_group);
PopStack();
return pnode;
}
TRACE("Failed due to ExtractF128!\n");
return nullptr;
}
if (node0->opcode() == IrOpcode::kProtectedLoad ||
node0->opcode() == IrOpcode::kLoadTransform) {
TRACE("Load leaf node\n");
if (!AllSameAddress(node_group)) {
TRACE("Failed due to different load addr!\n");
return nullptr;
}
if (node0->opcode() == IrOpcode::kProtectedLoad) {
MachineRepresentation rep =
LoadRepresentationOf(node0->op()).representation();
if (rep != MachineRepresentation::kSimd128) {
return nullptr;
}
// Sort loads by offset.
ZoneVector<Node*> sorted_node_group(node_group.size(), zone_);
std::partial_sort_copy(std::begin(node_group), std::end(node_group),
std::begin(sorted_node_group), std::end(sorted_node_group),
MemoryOffsetComparer());
if (!IsContinuousAccess(sorted_node_group)) {
TRACE("Failed due to non-continuous load!\n");
return nullptr;
}
}
if (node0->opcode() == IrOpcode::kLoadTransform) {
if (!IsSplat(node_group)) {
TRACE("LoadTransform Failed due to IsSplat!\n");
return nullptr;
}
LoadTransformParameters params = LoadTransformParametersOf(node0->op());
// TODO(jiepan): Support more LoadTransformation types
if (params.transformation != LoadTransformation::kS128Load32Splat &&
params.transformation != LoadTransformation::kS128Load64Splat) {
TRACE("LoadTransform failed due to unsupported type #%d!\n",
node0->id());
return nullptr;
}
}
if (!IsSideEffectFreeLoad(node_group)) {
TRACE("Failed due to dependency check\n");
return nullptr;
}
PackNode* p = NewPackNode(node_group);
PopStack();
return p;
}
int value_in_count = node0->op()->ValueInputCount();
switch (node0->opcode()) {
case IrOpcode::kPhi: {
TRACE("Added a vector of PHI nodes.\n");
MachineRepresentation rep = PhiRepresentationOf(node0->op());
if (rep != MachineRepresentation::kSimd128) {
return nullptr;
}
PackNode* pnode =
NewPackNodeAndRecurs(node_group, 0, value_in_count, recursion_depth);
PopStack();
return pnode;
}
case IrOpcode::kLoopExitValue: {
MachineRepresentation rep = LoopExitValueRepresentationOf(node0->op());
if (rep != MachineRepresentation::kSimd128) {
return nullptr;
}
PackNode* pnode =
NewPackNodeAndRecurs(node_group, 0, value_in_count, recursion_depth);
PopStack();
return pnode;
}
case IrOpcode::kF32x4Add:
case IrOpcode::kF32x4Mul: {
TRACE("Added a vector of un/bin/ter op.\n");
PackNode* pnode =
NewPackNodeAndRecurs(node_group, 0, value_in_count, recursion_depth);
PopStack();
return pnode;
}
// TODO(jiepan): UnalignedStore,
case IrOpcode::kStore:
case IrOpcode::kProtectedStore: {
TRACE("Added a vector of stores.\n");
if (!AllSameAddress(node_group)) {
TRACE("Failed due to different store addr!\n");
return nullptr;
}
PackNode* pnode = NewPackNodeAndRecurs(node_group, 2, 1, recursion_depth);
PopStack();
return pnode;
}
default:
TRACE("Default branch #%d:%s\n", node0->id(), node0->op()->mnemonic());
break;
}
return nullptr;
}
void SLPTree::DeleteTree() {
ClearStack();
node_to_packnode_.clear();
}
void SLPTree::Print(const char* info) {
TRACE("%s, Packed node:\n", info);
if (!v8_flags.trace_wasm_revectorize) {
return;
}
std::unordered_set<PackNode const*> visited;
for (auto& entry : node_to_packnode_) {
PackNode const* pnode = entry.second;
if (!pnode || visited.find(pnode) != visited.end()) {
continue;
}
pnode->Print();
visited.insert(pnode);
}
}
//////////////////////////////////////////////////////
void Revectorizer::DetectCPUFeatures() {
base::CPU cpu;
if (cpu.has_avx2()) {
support_simd256_ = true;
}
}
bool Revectorizer::TryRevectorize(const char* function) {
bool success = false;
if (support_simd256_ && graph_->GetSimdStoreNodes().size()) {
TRACE("TryRevectorize %s\n", function);
CollectSeeds();
for (auto entry : group_of_stores_) {
ZoneMap<Node*, StoreNodeSet>* store_chains = entry.second;
if (store_chains != nullptr) {
PrintStores(store_chains);
if (ReduceStoreChains(store_chains)) {
TRACE("Successful revectorize %s\n", function);
success = true;
}
}
}
TRACE("Finish revectorize %s\n", function);
}
return success;
}
void Revectorizer::CollectSeeds() {
for (auto it = graph_->GetSimdStoreNodes().begin();
it != graph_->GetSimdStoreNodes().end(); ++it) {
Node* node = *it;
Node* dominator = slp_tree_->GetEarlySchedulePosition(node);
if ((GetMemoryOffsetValue(node) % kSimd128Size) != 0) {
continue;
}
Node* address = GetNodeAddress(node);
ZoneMap<Node*, StoreNodeSet>* store_nodes;
auto first_level_iter = group_of_stores_.find(dominator);
if (first_level_iter == group_of_stores_.end()) {
store_nodes = zone_->New<ZoneMap<Node*, StoreNodeSet>>(zone_);
group_of_stores_[dominator] = store_nodes;
} else {
store_nodes = first_level_iter->second;
}
auto second_level_iter = store_nodes->find(address);
if (second_level_iter == store_nodes->end()) {
second_level_iter =
store_nodes->insert({address, StoreNodeSet(zone())}).first;
}
second_level_iter->second.insert(node);
}
}
bool Revectorizer::ReduceStoreChains(
ZoneMap<Node*, StoreNodeSet>* store_chains) {
TRACE("Enter %s\n", __func__);
bool changed = false;
for (auto chain_iter = store_chains->cbegin();
chain_iter != store_chains->cend(); ++chain_iter) {
if (chain_iter->second.size() >= 2 && chain_iter->second.size() % 2 == 0) {
ZoneVector<Node*> store_chain(chain_iter->second.begin(),
chain_iter->second.end(), zone_);
for (auto it = store_chain.begin(); it < store_chain.end(); it = it + 2) {
ZoneVector<Node*> stores_unit(it, it + 2, zone_);
if (ReduceStoreChain(stores_unit)) {
changed = true;
}
}
}
}
return changed;
}
bool Revectorizer::ReduceStoreChain(const ZoneVector<Node*>& Stores) {
TRACE("Enter %s, root@ (#%d,#%d)\n", __func__, Stores[0]->id(),
Stores[1]->id());
if (!IsContinuousAccess(Stores)) {
return false;
}
PackNode* root = slp_tree_->BuildTree(Stores);
if (!root) {
TRACE("Build tree failed!\n");
return false;
}
slp_tree_->Print("After build tree");
TRACE("\n");
return true;
}
void Revectorizer::PrintStores(ZoneMap<Node*, StoreNodeSet>* store_chains) {
if (!v8_flags.trace_wasm_revectorize) {
return;
}
TRACE("Enter %s\n", __func__);
for (auto it = store_chains->cbegin(); it != store_chains->cend(); ++it) {
if (it->second.size() > 0) {
TRACE("address = #%d:%s \n", it->first->id(),
it->first->op()->mnemonic());
for (auto node : it->second) {
TRACE("#%d:%s, ", node->id(), node->op()->mnemonic());
}
TRACE("\n");
}
}
}
} // namespace compiler
} // namespace internal
} // namespace v8
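Editorial note: in this patch, ReduceStoreChain stops after the SLPTree is built and printed; replacing the PackNodes with actual simd256 machine nodes is not yet wired up here and appears to be left for follow-up work.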

src/compiler/revectorizer.h

@ -0,0 +1,197 @@
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_COMPILER_REVECTORIZER_H_
#define V8_COMPILER_REVECTORIZER_H_
// Revectorizer is an optimization that promotes pairs of simd128 nodes to
// new simd256 nodes, accelerated by the wider vectors available in hardware,
// e.g. the YMM registers of the AVX2 instruction set, when possible and
// beneficial. The main algorithm is based on the Superword Level Parallelism
// (SLP) vectorization technique.
#include <vector>
#include "src/base/small-vector.h"
#include "src/compiler/graph.h"
#include "src/compiler/linear-scheduler.h"
#include "src/compiler/machine-graph.h"
#include "src/compiler/machine-operator.h"
#include "src/compiler/node-marker.h"
#include "src/compiler/node-properties.h"
#include "src/compiler/node.h"
#include "src/compiler/schedule.h"
#include "src/zone/zone-containers.h"
namespace v8 {
namespace internal {
namespace compiler {
struct V8_EXPORT_PRIVATE MemoryOffsetComparer {
bool operator()(const Node* lhs, const Node* rhs) const;
};
using StoreNodeSet = ZoneSet<Node*, MemoryOffsetComparer>;
// A PackNode consists of a fixed number of isomorphic simd128 nodes which can
// execute in parallel and be converted to a 256-bit simd node later. The
// nodes in a PackNode must be schedulable in the same basic block and
// mutually independent.
class PackNode final : public NON_EXPORTED_BASE(ZoneObject) {
public:
explicit PackNode(Zone* zone, const ZoneVector<Node*>& node_group)
: nodes_(node_group.cbegin(), node_group.cend(), zone),
operands_(zone),
revectorized_node_(nullptr) {}
bool IsSame(const ZoneVector<Node*>& node_group) const {
return nodes_ == node_group;
}
const Node* RevectorizedNode() const { return revectorized_node_; }
// Returns the operand at {index} of this PackNode.
PackNode* GetOperand(size_t index) {
DCHECK_LT(index, operands_.size());
return operands_[index];
}
ZoneVector<PackNode*>::size_type GetOperandsSize() const {
return operands_.size();
}
void SetOperand(size_t index, PackNode* pnode) {
if (operands_.size() < index + 1) operands_.resize(index + 1);
operands_[index] = pnode;
}
void Print() const;
private:
ZoneVector<Node*> nodes_;
ZoneVector<PackNode*> operands_;
Node* revectorized_node_;
};
// An auxiliary tree structure with a set of PackNodes based on the Superword
// Level Parallelism (SLP) vectorization technique. The BuildTree method
// starts from a selected root, e.g. a group of consecutive stores, and
// extends through value inputs to create new PackNodes if the inputs are
// valid, or concludes that the current PackNode is a leaf and terminates the
// tree.
// Below is an example of SLPTree where loads and stores in each PackNode are
// all consecutive.
// [Load0, Load1] [Load2, Load3]
// \ /
// [Add0, Add1]
// |
// [Store0, Store1]
class SLPTree : public NON_EXPORTED_BASE(ZoneObject) {
public:
explicit SLPTree(Zone* zone, Graph* graph)
: zone_(zone),
graph_(graph),
root_(nullptr),
on_stack_(zone),
stack_(zone),
node_to_packnode_(zone) {
scheduler_ = zone->New<LinearScheduler>(zone, graph);
}
PackNode* BuildTree(const ZoneVector<Node*>& roots);
void DeleteTree();
PackNode* GetPackNode(Node* node);
void Print(const char* info);
Node* GetEarlySchedulePosition(Node* node) {
return scheduler_->GetEarlySchedulePosition(node);
}
private:
friend class LinearScheduler;
// This is the recursive part of BuildTree.
PackNode* BuildTreeRec(const ZoneVector<Node*>& node_group, unsigned depth);
// Baseline: create a new PackNode, and return.
PackNode* NewPackNode(const ZoneVector<Node*>& node_group);
// Recursion: create a new PackNode and call BuildTreeRec recursively
PackNode* NewPackNodeAndRecurs(const ZoneVector<Node*>& node_group,
int start_index, int count, unsigned depth);
bool CanBePacked(const ZoneVector<Node*>& node_group);
Graph* graph() const { return graph_; }
// Node stack operations.
void PopStack();
void PushStack(const ZoneVector<Node*>& node_group);
void ClearStack();
bool OnStack(Node* node);
bool AllOnStack(const ZoneVector<Node*>& node_group);
bool StackTopIsPhi();
bool IsSideEffectFreeLoad(const ZoneVector<Node*>& node_group);
bool SameBasicBlock(Node* node0, Node* node1) {
return scheduler_->SameBasicBlock(node0, node1);
}
Zone* const zone_;
Graph* const graph_;
PackNode* root_;
LinearScheduler* scheduler_;
ZoneSet<Node*> on_stack_;
ZoneStack<ZoneVector<Node*>> stack_;
// Maps a specific node to PackNode.
ZoneUnorderedMap<Node*, PackNode*> node_to_packnode_;
static constexpr size_t RecursionMaxDepth = 1000;
};
// The Revectorizer pass first collects seeds, i.e. valid groups of
// consecutive stores, as roots to build SLPTrees. If an SLPTree is built
// successfully, it estimates the cost of the 256-bit transformation for each
// PackNode and conducts the final revectorization if beneficial.
class V8_EXPORT_PRIVATE Revectorizer final
: public NON_EXPORTED_BASE(ZoneObject) {
public:
Revectorizer(Zone* zone, Graph* graph, MachineGraph* mcgraph)
: zone_(zone),
graph_(graph),
mcgraph_(mcgraph),
group_of_stores_(zone),
support_simd256_(false) {
DetectCPUFeatures();
slp_tree_ = zone_->New<SLPTree>(zone, graph);
}
void DetectCPUFeatures();
bool TryRevectorize(const char* name);
private:
void CollectSeeds();
bool ReduceStoreChains(ZoneMap<Node*, StoreNodeSet>* store_chains);
bool ReduceStoreChain(const ZoneVector<Node*>& Stores);
void PrintStores(ZoneMap<Node*, StoreNodeSet>* store_chains);
Zone* zone() const { return zone_; }
Graph* graph() const { return graph_; }
MachineGraph* mcgraph() const { return mcgraph_; }
PackNode* GetPackNode(Node* node) const {
return slp_tree_->GetPackNode(node);
}
Zone* const zone_;
Graph* const graph_;
MachineGraph* const mcgraph_;
ZoneMap<Node*, ZoneMap<Node*, StoreNodeSet>*> group_of_stores_;
SLPTree* slp_tree_;
bool support_simd256_;
};
} // namespace compiler
} // namespace internal
} // namespace v8
#endif // V8_COMPILER_REVECTORIZER_H_
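For reference, the RevectorizePhase added in pipeline.cc drives this class as follows:

Revectorizer revec(temp_zone, data->graph(), data->mcgraph());
revec.TryRevectorize(data->info()->GetDebugName().get());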


@ -720,7 +720,8 @@ void SimplifiedLoweringVerifier::VisitNode(Node* node,
// TODO(nicohartmann@): These operators might need to be supported.
break;
}
- MACHINE_SIMD_OP_LIST(CASE)
+ MACHINE_SIMD128_OP_LIST(CASE)
+ MACHINE_SIMD256_OP_LIST(CASE)
IF_WASM(SIMPLIFIED_WASM_OP_LIST, CASE) {
// SIMD operators should not be in the graph, yet.
UNREACHABLE();


@ -125,7 +125,8 @@ class Typer::Visitor : public Reducer {
SIMPLIFIED_CHANGE_OP_LIST(DECLARE_IMPOSSIBLE_CASE)
SIMPLIFIED_CHECKED_OP_LIST(DECLARE_IMPOSSIBLE_CASE)
IF_WASM(SIMPLIFIED_WASM_OP_LIST, DECLARE_IMPOSSIBLE_CASE)
- MACHINE_SIMD_OP_LIST(DECLARE_IMPOSSIBLE_CASE)
+ MACHINE_SIMD128_OP_LIST(DECLARE_IMPOSSIBLE_CASE)
+ MACHINE_SIMD256_OP_LIST(DECLARE_IMPOSSIBLE_CASE)
MACHINE_UNOP_32_LIST(DECLARE_IMPOSSIBLE_CASE)
DECLARE_IMPOSSIBLE_CASE(Word32Xor)
DECLARE_IMPOSSIBLE_CASE(Word32Sar)


@ -1947,7 +1947,8 @@ void Verifier::Visitor::Check(Node* node, const AllNodes& all) {
case IrOpcode::kTraceInstruction:
#define SIMD_MACHINE_OP_CASE(Name) case IrOpcode::k##Name:
- MACHINE_SIMD_OP_LIST(SIMD_MACHINE_OP_CASE)
+ MACHINE_SIMD128_OP_LIST(SIMD_MACHINE_OP_CASE)
+ MACHINE_SIMD256_OP_LIST(SIMD_MACHINE_OP_CASE)
#undef SIMD_MACHINE_OP_CASE
// TODO(rossberg): Check.


@ -3804,15 +3804,23 @@ void WasmGraphBuilder::StoreMem(MachineRepresentation mem_rep, Node* index,
gasm_->StoreUnaligned(UnalignedStoreRepresentation{mem_rep},
MemBuffer(capped_offset), index, val);
break;
- case MemoryAccessKind::kProtected:
-   SetSourcePosition(
-       gasm_->ProtectedStore(mem_rep, MemBuffer(capped_offset), index, val),
-       position);
-   break;
- case MemoryAccessKind::kNormal:
-   gasm_->Store(StoreRepresentation{mem_rep, kNoWriteBarrier},
-                MemBuffer(capped_offset), index, val);
-   break;
+ case MemoryAccessKind::kProtected: {
+   Node* store =
+       gasm_->ProtectedStore(mem_rep, MemBuffer(capped_offset), index, val);
+   SetSourcePosition(store, position);
+   if (mem_rep == MachineRepresentation::kSimd128) {
+     graph()->RecordSimdStore(store);
+   }
+   break;
+ }
+ case MemoryAccessKind::kNormal: {
+   Node* store = gasm_->Store(StoreRepresentation{mem_rep, kNoWriteBarrier},
+                              MemBuffer(capped_offset), index, val);
+   if (mem_rep == MachineRepresentation::kSimd128) {
+     graph()->RecordSimdStore(store);
+   }
+   break;
+ }
}
if (v8_flags.trace_wasm_memory) {
@ -8498,6 +8506,12 @@ bool BuildGraphForWasmFunction(wasm::CompilationEnv* env,
WasmGraphBuilder::kCalledFromWasm);
builder.LowerInt64(sig);
#ifdef V8_ENABLE_WASM_SIMD256_REVEC
if (v8_flags.experimental_wasm_revectorize && builder.has_simd()) {
mcgraph->graph()->SetSimd(true);
}
#endif
return true;
}


@ -1232,6 +1232,13 @@ DEFINE_BOOL(trace_wasm_gdb_remote, false, "trace Webassembly GDB-remote server")
DEFINE_DEBUG_BOOL(trace_wasm_instances, false,
"trace creation and collection of wasm instances")
// Flags for WASM SIMD256 revectorize
#ifdef V8_ENABLE_WASM_SIMD256_REVEC
DEFINE_BOOL(experimental_wasm_revectorize, false,
"enable 128 to 256 bit revectorization for Webassembly SIMD")
DEFINE_BOOL(trace_wasm_revectorize, false, "trace wasm revectorize")
#endif // V8_ENABLE_WASM_SIMD256_REVEC
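In a build where v8_enable_wasm_simd256_revec is set (and hence V8_ENABLE_WASM_SIMD256_REVEC is defined), the pass remains opt-in at runtime via --experimental-wasm-revectorize, and its decisions can be traced with --trace-wasm-revectorize.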
#endif // V8_ENABLE_WEBASSEMBLY
DEFINE_INT(stress_sampling_allocation_profiler, 0,


@ -475,6 +475,7 @@ class RuntimeCallTimer final {
V(OptimizeFinalizePipelineJob) \
V(OptimizeHeapBrokerInitialization) \
V(OptimizeNonConcurrent) \
V(OptimizeRevectorizer) \
V(OptimizeSerialization) \
V(OptimizeSerializeMetadata) \
V(ParseEval) \


@ -4,6 +4,17 @@
import("../../gni/v8.gni")
if (v8_enable_webassembly) {
# Specifies whether the target build is a simulator build. We compare target
# cpu with the v8 target cpu so that simulator builds used to create
# cross-compile snapshots are not affected.
target_is_simulator = (target_cpu != v8_target_cpu && !v8_multi_arch_build) ||
(current_cpu != v8_current_cpu && v8_multi_arch_build)
if (!target_is_simulator && v8_current_cpu == "x64") {
v8_enable_wasm_simd256_revec = true
}
}
if (is_fuchsia) {
import("//build/config/fuchsia/generate_runner_scripts.gni")
import("//third_party/fuchsia-sdk/sdk/build/component.gni")
@ -586,7 +597,10 @@ v8_source_set("unittests_sources") {
}
if (v8_enable_wasm_simd256_revec) {
sources += [ "compiler/linear-scheduler-unittest.cc" ]
sources += [
"compiler/linear-scheduler-unittest.cc",
"compiler/revec-unittest.cc",
]
}
if (v8_enable_wasm_gdb_remote_debugging) {


@ -0,0 +1,106 @@
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/codegen/machine-type.h"
#include "src/compiler/common-operator.h"
#include "src/compiler/machine-graph.h"
#include "src/compiler/machine-operator.h"
#include "src/compiler/node-properties.h"
#include "src/compiler/node.h"
#include "src/compiler/revectorizer.h"
#include "src/compiler/wasm-compiler.h"
#include "src/wasm/wasm-module.h"
#include "test/unittests/compiler/graph-unittest.h"
#include "test/unittests/compiler/node-test-utils.h"
#include "testing/gmock-support.h"
using testing::AllOf;
using testing::Capture;
using testing::CaptureEq;
namespace v8 {
namespace internal {
namespace compiler {
class RevecTest : public TestWithIsolateAndZone {
public:
RevecTest()
: TestWithIsolateAndZone(kCompressGraphZone),
graph_(zone()),
common_(zone()),
machine_(zone(), MachineRepresentation::kWord64,
MachineOperatorBuilder::Flag::kAllOptionalOps),
mcgraph_(&graph_, &common_, &machine_) {}
Graph* graph() { return &graph_; }
CommonOperatorBuilder* common() { return &common_; }
MachineOperatorBuilder* machine() { return &machine_; }
MachineGraph* mcgraph() { return &mcgraph_; }
private:
Graph graph_;
CommonOperatorBuilder common_;
MachineOperatorBuilder machine_;
MachineGraph mcgraph_;
};
// Create a graph which adds two 256-bit vectors (a, b) and stores the result
// in c:
//   simd128 *a, *b, *c;
//   *c = *a + *b;
//   *(c+1) = *(a+1) + *(b+1);
// In revectorization, the two simd128 nodes can be combined into one simd256
// node:
//   simd256 *d, *e, *f;
//   *f = *d + *e;
TEST_F(RevecTest, F32x8Add) {
Node* start = graph()->NewNode(common()->Start(5));
graph()->SetStart(start);
Node* zero = graph()->NewNode(common()->Int32Constant(0));
Node* sixteen = graph()->NewNode(common()->Int64Constant(16));
// offset of memory start field in WASM instance object.
Node* offset = graph()->NewNode(common()->Int64Constant(23));
Node* p0 = graph()->NewNode(common()->Parameter(0), start);
Node* p1 = graph()->NewNode(common()->Parameter(1), start);
Node* p2 = graph()->NewNode(common()->Parameter(2), start);
Node* p3 = graph()->NewNode(common()->Parameter(3), start);
StoreRepresentation store_rep(MachineRepresentation::kSimd128,
WriteBarrierKind::kNoWriteBarrier);
LoadRepresentation load_rep(MachineType::Simd128());
Node* load0 = graph()->NewNode(machine()->Load(MachineType::Int64()), p0,
offset, start, start);
Node* mem_buffer1 = graph()->NewNode(machine()->Int64Add(), load0, sixteen);
Node* mem_buffer2 = graph()->NewNode(machine()->Int64Add(), load0, sixteen);
Node* mem_store = graph()->NewNode(machine()->Int64Add(), load0, sixteen);
Node* load1 = graph()->NewNode(machine()->ProtectedLoad(load_rep), load0, p1,
load0, start);
Node* load2 = graph()->NewNode(machine()->ProtectedLoad(load_rep),
mem_buffer1, p1, load1, start);
Node* load3 = graph()->NewNode(machine()->ProtectedLoad(load_rep), load0, p2,
load2, start);
Node* load4 = graph()->NewNode(machine()->ProtectedLoad(load_rep),
mem_buffer2, p2, load3, start);
Node* add1 = graph()->NewNode(machine()->F32x4Add(), load1, load3);
Node* add2 = graph()->NewNode(machine()->F32x4Add(), load2, load4);
Node* store1 = graph()->NewNode(machine()->Store(store_rep), load0, p3, add1,
load4, start);
Node* store2 = graph()->NewNode(machine()->Store(store_rep), mem_store, p3,
add2, store1, start);
Node* ret = graph()->NewNode(common()->Return(0), zero, store2, start);
Node* end = graph()->NewNode(common()->End(1), ret);
graph()->SetEnd(end);
graph()->RecordSimdStore(store1);
graph()->RecordSimdStore(store2);
graph()->SetSimd(true);
// Test whether the graph can be revectorized
Revectorizer revec(zone(), graph(), mcgraph());
EXPECT_TRUE(revec.TryRevectorize(nullptr));
}
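Note that DetectCPUFeatures() gates TryRevectorize() on AVX2 support, so this expectation holds only on hosts where base::CPU reports AVX2.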
} // namespace compiler
} // namespace internal
} // namespace v8