// Copyright (c) 2018 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "source/opt/loop_fission.h"

#include <set>

#include "source/opt/register_pressure.h"

// Implement loop fission with an optional parameter to split only
// if the register pressure in a given loop meets certain criteria. This is
// controlled via the constructors of LoopFissionPass.
//
// 1 - Build a list of loops to be split; these are innermost loops (loops
// without child loops of their own) which meet the register pressure
// criteria, as determined by the ShouldSplitLoop method of LoopFissionPass.
//
// 2 - For each loop in the list, group each instruction into a set of related
// instructions by traversing each instruction's users and operands
// recursively. We stop if we encounter an instruction we have seen before or
// an instruction which we don't consider relevant (e.g. OpLoopMerge). We then
// merge these groups into two different sets, one for the first loop and one
// for the second.
//
// 3 - We then run CanPerformSplit to check that it would be legal to split a
// loop using those two sets. We check that we haven't altered the relative
// order in which loads/stores appear in the binary and that we aren't
// breaking any dependency between loads/stores by splitting them into two
// loops. We also check that none of the OpBranch instructions are dependent
// on a load, as we leave the control flow structure intact and move only
// instructions in the body, so we want to avoid any loads with side effects
// or aliasing.
//
// 4 - We then split the loop by calling SplitLoop. This function clones the
// loop, attaches the clone to the preheader, and connects the new loop's
// merge block to the current loop's header block. We then use the two sets
// built in step 2 to remove instructions from each loop. If an instruction
// appears in the first set it is removed from the second loop and vice versa.
//
// 5 - If the multiple split passes flag is set, we check whether each of the
// new loops still meets the register pressure criteria. If it does, we add it
// to the list of loops to be split (created in step one) to allow loops to be
// split multiple times.
//
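// As a purely illustrative sketch (hand-written, not taken from the SPIR-V
// form this pass operates on), splitting
//
//   for (i = 0; i < N; ++i) { a[i] = x[i] + 1; b[i] = y[i] * 2; }
//
// produces two loops with the same control flow, each keeping one of the two
// instruction groups:
//
//   for (i = 0; i < N; ++i) { a[i] = x[i] + 1; }
//   for (i = 0; i < N; ++i) { b[i] = y[i] * 2; }
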
namespace spvtools {
namespace opt {

class LoopFissionImpl {
 public:
  LoopFissionImpl(IRContext* context, Loop* loop)
      : context_(context), loop_(loop), load_used_in_condition_(false) {}

  // Group each instruction in the loop into sets of instructions related by
  // their use-def chains. An instruction which uses another will appear in
  // the same set. Then merge those sets into just two sets. Returns false if
  // one or fewer sets were created.
  bool GroupInstructionsByUseDef();

  // Check if the sets built by GroupInstructionsByUseDef violate any data
  // dependence rules.
  bool CanPerformSplit();

  // Split the loop and return a pointer to the new loop.
  Loop* SplitLoop();

  // Checks if |inst| is safe to move. We can only move instructions which
  // have no side effects, plus OpLoads and OpStores.
  bool MovableInstruction(const Instruction& inst) const;

 private:
  // Traverse the def-use chain of |inst| and add the users and uses of |inst|
  // which are in the same loop to the |returned_set|.
  void TraverseUseDef(Instruction* inst, std::set<Instruction*>* returned_set,
                      bool ignore_phi_users = false, bool report_loads = false);

  // We group the instructions in the block into two different groups: the
  // instructions to be kept in the original loop and the ones to be cloned
  // into the new loop. As the cloned loop is attached to the preheader it
  // will be the first loop and the second loop will be the original.
  std::set<Instruction*> cloned_loop_instructions_;
  std::set<Instruction*> original_loop_instructions_;

  // We need a set of all the instructions seen so far so we can break any
  // recursion and also so we can ignore certain instructions by preemptively
  // adding them to this set.
  std::set<Instruction*> seen_instructions_;

  // A map of instructions to their relative position in the function.
  std::map<Instruction*, size_t> instruction_order_;

  IRContext* context_;

  Loop* loop_;

  // This is set to true by TraverseUseDef when traversing the instructions
  // related to the loop condition and any if conditions, should any of those
  // instructions be a load.
  bool load_used_in_condition_;
};

bool LoopFissionImpl::MovableInstruction(const Instruction& inst) const {
  return inst.opcode() == spv::Op::OpLoad ||
         inst.opcode() == spv::Op::OpStore ||
         inst.opcode() == spv::Op::OpSelectionMerge ||
         inst.opcode() == spv::Op::OpPhi || inst.IsOpcodeCodeMotionSafe();
}

void LoopFissionImpl::TraverseUseDef(Instruction* inst,
                                     std::set<Instruction*>* returned_set,
                                     bool ignore_phi_users, bool report_loads) {
  assert(returned_set && "Set to be returned cannot be null.");

  analysis::DefUseManager* def_use = context_->get_def_use_mgr();
  std::set<Instruction*>& inst_set = *returned_set;

  // We create this functor to traverse the use-def chain to build the
  // grouping of related instructions. The lambda captures the std::function
  // to allow it to recurse.
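  // A minimal sketch of this recursion idiom (illustrative, unrelated to the
  // pass itself):
  //   std::function<void(int)> visit;
  //   visit = [&visit](int n) { if (n > 0) visit(n - 1); };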
  std::function<void(Instruction*)> traverser_functor;
  traverser_functor = [this, def_use, &inst_set, &traverser_functor,
                       ignore_phi_users, report_loads](Instruction* user) {
    // If we've seen the instruction before or it is not inside the loop, end
    // the traversal.
    if (!user || seen_instructions_.count(user) != 0 ||
        !context_->get_instr_block(user) ||
        !loop_->IsInsideLoop(context_->get_instr_block(user))) {
      return;
    }

    // Don't include labels or loop merge instructions in the instruction
    // sets. Including them would mean we group instructions related only by
    // using the same labels (i.e. phis). We already preempt the inclusion of
    // OpSelectionMerge by adding related instructions to the
    // seen_instructions_ set.
    if (user->opcode() == spv::Op::OpLoopMerge ||
        user->opcode() == spv::Op::OpLabel)
      return;

    // If the |report_loads| flag is set, set the class field
    // load_used_in_condition_ to true. This is used to check that none of the
    // condition checks in the loop rely on loads.
    if (user->opcode() == spv::Op::OpLoad && report_loads) {
      load_used_in_condition_ = true;
    }

    // Add the instruction to the set of instructions already seen; this
    // breaks recursion and allows us to ignore certain instructions.
    seen_instructions_.insert(user);

    inst_set.insert(user);

    // Wrapper functor to traverse the operands of each instruction.
    auto traverse_operand = [&traverser_functor, def_use](const uint32_t* id) {
      traverser_functor(def_use->GetDef(*id));
    };
    user->ForEachInOperand(traverse_operand);

    // For the first traversal we want to ignore the users of the phi.
    if (ignore_phi_users && user->opcode() == spv::Op::OpPhi) return;

    // Traverse each user with this lambda.
    def_use->ForEachUser(user, traverser_functor);

    // Wrapper functor for the use traversal.
    auto traverse_use = [&traverser_functor](Instruction* use, uint32_t) {
      traverser_functor(use);
    };
    def_use->ForEachUse(user, traverse_use);
  };

  // We start the traversal of the use-def graph by invoking the above
  // lambda with the |inst| parameter.
  traverser_functor(inst);
}

bool LoopFissionImpl::GroupInstructionsByUseDef() {
  std::vector<std::set<Instruction*>> sets{};

  // We want to ignore all the instructions stemming from the loop condition
  // instruction.
  BasicBlock* condition_block = loop_->FindConditionBlock();

  if (!condition_block) return false;
  Instruction* condition = &*condition_block->tail();

  // We iterate over the blocks of the enclosing function so that we visit
  // them in the same order in which they appear in the binary.
  Function& function = *loop_->GetHeaderBlock()->GetParent();

  // Create a temporary set to ignore certain groups of instructions within
  // the loop. We don't want any instructions related to control flow to be
  // removed from either loop, only instructions within the control-flow
  // bodies.
  std::set<Instruction*> instructions_to_ignore{};
  TraverseUseDef(condition, &instructions_to_ignore, true, true);

  // Traverse control flow instructions to ensure they are added to the
  // seen_instructions_ set and will be ignored when TraverseUseDef is called
  // with the actual sets.
  for (BasicBlock& block : function) {
    if (!loop_->IsInsideLoop(block.id())) continue;

    for (Instruction& inst : block) {
      // Ignore all instructions related to control flow.
      if (inst.opcode() == spv::Op::OpSelectionMerge || inst.IsBranch()) {
        TraverseUseDef(&inst, &instructions_to_ignore, true, true);
      }
    }
  }

  // Traverse the instructions and generate the sets, automatically ignoring
  // any instructions in instructions_to_ignore.
  for (BasicBlock& block : function) {
    if (!loop_->IsInsideLoop(block.id()) ||
        loop_->GetHeaderBlock()->id() == block.id())
      continue;

    for (Instruction& inst : block) {
      // Record the order in which each load/store is seen.
      if (inst.opcode() == spv::Op::OpLoad ||
          inst.opcode() == spv::Op::OpStore) {
        instruction_order_[&inst] = instruction_order_.size();
      }

      // Ignore instructions already seen in a traversal.
      if (seen_instructions_.count(&inst) != 0) {
        continue;
      }

      // Build the set.
      std::set<Instruction*> inst_set{};
      TraverseUseDef(&inst, &inst_set);
      if (!inst_set.empty()) sets.push_back(std::move(inst_set));
    }
  }

  // If we have one or zero sets, return false to indicate that there were not
  // enough instructions to split the loop into two groups, so the loop can't
  // be split any further.
  if (sets.size() < 2) {
    return false;
  }

  // Merge the sets into two different sets. In CanPerformSplit we will
  // validate that doing so does not break the relative ordering of
  // loads/stores.
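  // For instance (purely illustrative): if four use-def groups S0..S3 were
  // built, in program order, the cloned loop receives S0 and S1 while the
  // original loop keeps S2 and S3.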
  for (size_t index = 0; index < sets.size() / 2; ++index) {
    cloned_loop_instructions_.insert(sets[index].begin(), sets[index].end());
  }
  for (size_t index = sets.size() / 2; index < sets.size(); ++index) {
    original_loop_instructions_.insert(sets[index].begin(), sets[index].end());
  }

  return true;
}

bool LoopFissionImpl::CanPerformSplit() {
  // Return false if any of the condition instructions in the loop depend on a
  // load.
  if (load_used_in_condition_) {
    return false;
  }

  // Build a list of all parent loops of this loop. Loop dependence analysis
  // needs this structure.
  std::vector<const Loop*> loops;
  Loop* parent_loop = loop_;
  while (parent_loop) {
    loops.push_back(parent_loop);
    parent_loop = parent_loop->GetParent();
  }

  LoopDependenceAnalysis analysis{context_, loops};

  // A list of all the stores in the cloned loop.
  std::vector<Instruction*> set_one_stores{};

  // A list of all the loads in the cloned loop.
  std::vector<Instruction*> set_one_loads{};

  // Populate the above lists.
  for (Instruction* inst : cloned_loop_instructions_) {
    if (inst->opcode() == spv::Op::OpStore) {
      set_one_stores.push_back(inst);
    } else if (inst->opcode() == spv::Op::OpLoad) {
      set_one_loads.push_back(inst);
    }

    // If we find any instruction which we can't move (such as a barrier),
    // return false.
    if (!MovableInstruction(*inst)) return false;
  }

  // We need to calculate the depth of the loop to create the loop dependency
  // distance vectors.
  const size_t loop_depth = loop_->GetDepth();

  // Check the dependencies between loads in the cloned loop and stores in the
  // original and vice versa.
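  // Purely illustrative example (hand-written, not from this file) of a
  // dependence that makes a split illegal:
  //
  //   for (i) { a[i] = b[i];       // would end up in the first loop
  //             c[i] = a[i + 1]; } // would end up in the second loop
  //
  // After fission, the second loop would read values of a[] that the first
  // loop has already overwritten, whereas the original loop reads a[i + 1]
  // before iteration i + 1 writes it.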
  for (Instruction* inst : original_loop_instructions_) {
    // If we find any instruction which we can't move (such as a barrier),
    // return false.
    if (!MovableInstruction(*inst)) return false;

    // Look at the dependency between the loads in the original and stores in
    // the cloned loops.
    if (inst->opcode() == spv::Op::OpLoad) {
      for (Instruction* store : set_one_stores) {
        DistanceVector vec{loop_depth};

        // If the store actually should appear after the load, return false.
        // This means the store has been placed in the wrong grouping.
        if (instruction_order_[store] > instruction_order_[inst]) {
          return false;
        }
        // If not independent, check the distance vector.
        if (!analysis.GetDependence(store, inst, &vec)) {
          for (DistanceEntry& entry : vec.GetEntries()) {
            // A distance greater than zero means that the store in the cloned
            // loop has a dependency on the load in the original loop.
            if (entry.distance > 0) return false;
          }
        }
      }
    } else if (inst->opcode() == spv::Op::OpStore) {
      for (Instruction* load : set_one_loads) {
        DistanceVector vec{loop_depth};

        // If the load actually should appear after the store, return false.
        if (instruction_order_[load] > instruction_order_[inst]) {
          return false;
        }

        // If not independent, check the distance vector.
        if (!analysis.GetDependence(inst, load, &vec)) {
          for (DistanceEntry& entry : vec.GetEntries()) {
            // A distance less than zero means the load in the cloned loop is
            // dependent on the store instruction in the original loop.
            if (entry.distance < 0) return false;
          }
        }
      }
    }
  }
  return true;
}

Loop* LoopFissionImpl::SplitLoop() {
  // Clone the loop.
  LoopUtils util{context_, loop_};
  LoopUtils::LoopCloningResult clone_results;
  Loop* cloned_loop = util.CloneAndAttachLoopToHeader(&clone_results);

  // Update the OpLoopMerge in the cloned loop.
  cloned_loop->UpdateLoopMergeInst();

  // Add the cloned blocks to the function, directly after the original loop's
  // preheader.
  // TODO(1841): Handle failure to create pre-header.
  Function::iterator it =
      util.GetFunction()->FindBlock(loop_->GetOrCreatePreHeaderBlock()->id());
  util.GetFunction()->AddBasicBlocks(clone_results.cloned_bb_.begin(),
                                     clone_results.cloned_bb_.end(), ++it);
  loop_->SetPreHeaderBlock(cloned_loop->GetMergeBlock());

  std::vector<Instruction*> instructions_to_kill{};

  // Remove from the original loop the instructions which should only appear
  // in the cloned loop.
  for (uint32_t id : loop_->GetBlocks()) {
    BasicBlock* block = context_->cfg()->block(id);

    for (Instruction& inst : *block) {
      // If the instruction appears in the cloned loop instruction group, kill
      // it.
      if (cloned_loop_instructions_.count(&inst) == 1 &&
          original_loop_instructions_.count(&inst) == 0) {
        instructions_to_kill.push_back(&inst);
        if (inst.opcode() == spv::Op::OpPhi) {
          context_->ReplaceAllUsesWith(
              inst.result_id(), clone_results.value_map_[inst.result_id()]);
        }
      }
    }
  }

  // Remove from the cloned loop the instructions which should only appear in
  // the original loop.
  for (uint32_t id : cloned_loop->GetBlocks()) {
    BasicBlock* block = context_->cfg()->block(id);
    for (Instruction& inst : *block) {
      Instruction* old_inst = clone_results.ptr_map_[&inst];
      // If the instruction belongs to the original loop instruction group,
      // kill it.
      if (cloned_loop_instructions_.count(old_inst) == 0 &&
          original_loop_instructions_.count(old_inst) == 1) {
        instructions_to_kill.push_back(&inst);
      }
    }
  }

  for (Instruction* i : instructions_to_kill) {
    context_->KillInst(i);
  }

  return cloned_loop;
}

LoopFissionPass::LoopFissionPass(const size_t register_threshold_to_split,
                                 bool split_multiple_times)
    : split_multiple_times_(split_multiple_times) {
  // Split if the number of registers in the loop exceeds
  // |register_threshold_to_split|.
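  // For instance (illustrative), LoopFissionPass(10) requests a split only
  // when the loop's register pressure (used_registers_) exceeds 10.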
  split_criteria_ =
      [register_threshold_to_split](
          const RegisterLiveness::RegionRegisterLiveness& liveness) {
        return liveness.used_registers_ > register_threshold_to_split;
      };
}

LoopFissionPass::LoopFissionPass() : split_multiple_times_(false) {
  // Split by default.
  split_criteria_ = [](const RegisterLiveness::RegionRegisterLiveness&) {
    return true;
  };
}

bool LoopFissionPass::ShouldSplitLoop(const Loop& loop, IRContext* c) {
  LivenessAnalysis* analysis = c->GetLivenessAnalysis();

  RegisterLiveness::RegionRegisterLiveness liveness{};

  Function* function = loop.GetHeaderBlock()->GetParent();
  analysis->Get(function)->ComputeLoopRegisterPressure(loop, &liveness);

  return split_criteria_(liveness);
}

Pass::Status LoopFissionPass::Process() {
  bool changed = false;

  for (Function& f : *context()->module()) {
    // Collect all the innermost loops in the function and run the loop
    // splitting util on each. We collect them up front because creating new
    // loops would invalidate the loop iterator.
    std::vector<Loop*> inner_most_loops{};
    LoopDescriptor& loop_descriptor = *context()->GetLoopDescriptor(&f);
    for (Loop& loop : loop_descriptor) {
      if (!loop.HasChildren() && ShouldSplitLoop(loop, context())) {
        inner_most_loops.push_back(&loop);
      }
    }

    // List of new loops which meet the criteria to be split again.
    std::vector<Loop*> new_loops_to_split{};

    while (!inner_most_loops.empty()) {
      for (Loop* loop : inner_most_loops) {
        LoopFissionImpl impl{context(), loop};

        // Group the instructions in the loop into two different sets of
        // related instructions. If we can't group the instructions into the
        // two sets then we can't split the loop any further.
        if (!impl.GroupInstructionsByUseDef()) {
          continue;
        }

        if (impl.CanPerformSplit()) {
          Loop* second_loop = impl.SplitLoop();
          changed = true;
          context()->InvalidateAnalysesExceptFor(
              IRContext::kAnalysisLoopAnalysis);

          // If the newly created loop meets the criteria to be split, split it
          // again.
          if (ShouldSplitLoop(*second_loop, context()))
            new_loops_to_split.push_back(second_loop);

          // If the original loop (now split) still meets the criteria to be
          // split, split it again.
          if (ShouldSplitLoop(*loop, context()))
            new_loops_to_split.push_back(loop);
        }
      }

      // If the split-multiple-times flag has been set, add the new loops
      // which meet the splitting criteria into the list of loops to be split
      // on the next iteration.
      if (split_multiple_times_) {
        inner_most_loops = std::move(new_loops_to_split);
      } else {
        break;
      }
    }
  }

  return changed ? Pass::Status::SuccessWithChange
                 : Pass::Status::SuccessWithoutChange;
}

}  // namespace opt
}  // namespace spvtools