SPIRV-Tools/source/opt/loop_fission.cpp

509 lines
19 KiB
C++
Raw Normal View History

// Copyright (c) 2018 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "opt/loop_fission.h"
#include "opt/register_pressure.h"
// Implement loop fission with an optional parameter to split only
// if the register pressure in a given loop meets a certain criteria. This is
// controlled via the constructors of LoopFissionPass.
//
// 1 - Build a list of loops to be split, these are top level loops (loops
// without child loops themselves) which meet the register pressure criteria, as
// determined by the ShouldSplitLoop method of LoopFissionPass.
//
// 2 - For each loop in the list, group each instruction into a set of related
// instructions by traversing each instruction's users and operands
// recursively. We stop if we encounter an instruction we have seen before or
// an instruction which we don't consider relevant (i.e. OpLoopMerge). We then
// merge these groups into two different sets, one for the first loop and one
// for the second.
//
// 3 - We then run CanPerformSplit to check that it would be legal to split a
// loop using those two sets. We check that we haven't altered the relative
// order in which loads/stores appear in the binary and that we aren't breaking
// any dependency between loads/stores by splitting them into two loops. We
// also check that none of the OpBranch instructions are dependent on a load,
// as we leave the control flow structure intact and move only instructions in
// the body, so we want to avoid any loads with side effects or aliasing.
//
// 4 - We then split the loop by calling SplitLoop. This function clones the
// loop and attaches it to the preheader and connects the new loop's merge
// block to the current loop header block. We then use the two sets built in
// step 2 to remove instructions from each loop. If an instruction appears in
// the first set it is removed from the second loop and vice versa.
//
// 5 - If the multiple split passes flag is set we check if each of the loops
// still meet the register pressure criteria. If they do then we add them to the
// list of loops to be split (created in step one) to allow for loops to be
// split multiple times.
//
namespace spvtools {
namespace opt {
// Performs the fission of one loop: groups the loop's instructions, checks
// that the grouping can legally be split, and finally splits the loop.
class LoopFissionImpl {
 public:
  LoopFissionImpl(ir::IRContext* context, ir::Loop* loop)
      : context_(context), loop_(loop) {}

  // Partitions the instructions of the loop into groups connected through
  // their use-def chains (an instruction and anything it uses end up in the
  // same group) and then folds those groups into exactly two sets. Returns
  // false when fewer than two groups were found.
  bool GroupInstructionsByUseDef();

  // Returns true if the two sets produced by GroupInstructionsByUseDef do not
  // violate any data dependence rule.
  bool CanPerformSplit();

  // Splits the loop and returns a pointer to the newly created loop.
  ir::Loop* SplitLoop();

  // Returns true if |inst| may be moved: OpLoad, OpStore, and instructions
  // without side effects qualify.
  bool MovableInstruction(const ir::Instruction& inst) const;

 private:
  // Walks the def-use chain starting at |inst| and stores every user and
  // every use that belongs to the same loop into |returned_set|.
  void TraverseUseDef(ir::Instruction* inst,
                      std::set<ir::Instruction*>* returned_set,
                      bool ignore_phi_users = false, bool report_loads = false);

  // The instructions of the loop are divided between these two sets: those
  // which go to the cloned loop and those which stay in the original loop.
  // Because the clone is attached to the preheader, the cloned loop runs
  // first and the original loop runs second.
  std::set<ir::Instruction*> cloned_loop_instructions_;
  std::set<ir::Instruction*> original_loop_instructions_;

  // Every instruction visited so far. It stops the traversal from recursing
  // forever, and instructions can be excluded from the grouping by placing
  // them in this set ahead of time.
  std::set<ir::Instruction*> seen_instructions_;

  // Relative position in the function of each recorded instruction.
  std::map<ir::Instruction*, size_t> instruction_order_;

  ir::IRContext* context_;
  ir::Loop* loop_;

  // TraverseUseDef raises this flag when one of the instructions feeding the
  // loop condition (or any if condition) turns out to be a load.
  bool load_used_in_condition_ = false;
};
// Returns true when |inst| may be moved between the two loops. Loads, stores,
// selection merges and phis are always accepted; every other opcode is
// accepted only if the instruction reports itself as code-motion safe.
bool LoopFissionImpl::MovableInstruction(const ir::Instruction& inst) const {
  switch (inst.opcode()) {
    case SpvOp::SpvOpLoad:
    case SpvOp::SpvOpStore:
    case SpvOp::SpvOpSelectionMerge:
    case SpvOp::SpvOpPhi:
      return true;
    default:
      return inst.IsOpcodeCodeMotionSafe();
  }
}
// Gathers into |returned_set| the instructions of the loop that are connected
// to |inst| through operands and users. Anything already recorded in
// seen_instructions_ ends the walk, and each newly visited instruction is
// recorded there too. With |ignore_phi_users| set, the users of an OpPhi are
// not followed. With |report_loads| set, visiting an OpLoad sets
// load_used_in_condition_ to true.
void LoopFissionImpl::TraverseUseDef(ir::Instruction* inst,
                                     std::set<ir::Instruction*>* returned_set,
                                     bool ignore_phi_users, bool report_loads) {
  assert(returned_set && "Set to be returned cannot be null.");

  opt::analysis::DefUseManager* def_use = context_->get_def_use_mgr();

  // The visitor lives in a std::function so that the lambda can capture it
  // and call itself while walking the use-def graph.
  std::function<void(ir::Instruction*)> visit;
  visit = [this, def_use, returned_set, &visit, ignore_phi_users,
           report_loads](ir::Instruction* current) {
    // Nothing to do for a missing instruction or one handled earlier.
    if (!current) return;
    if (seen_instructions_.count(current) != 0) return;

    // Only instructions that sit in a block belonging to the loop qualify.
    ir::BasicBlock* parent_block = context_->get_instr_block(current);
    if (!parent_block || !loop_->IsInsideLoop(parent_block)) return;

    // Labels and loop merges are left out of the sets; otherwise instructions
    // would be grouped together merely because they mention the same label
    // (as phis do). OpSelectionMerge is kept out separately, by having its
    // related instructions placed in seen_instructions_ beforehand.
    const auto opcode = current->opcode();
    if (opcode == SpvOp::SpvOpLoopMerge || opcode == SpvOp::SpvOpLabel) return;

    // When asked to report loads, remember that a load was reached. This lets
    // the caller verify that no condition in the loop relies on a load.
    if (report_loads && opcode == SpvOp::SpvOpLoad) {
      load_used_in_condition_ = true;
    }

    // Mark as visited before recursing so that cycles terminate.
    seen_instructions_.insert(current);
    returned_set->insert(current);

    // Follow the definitions of every input operand.
    auto visit_operand = [&visit, def_use](const uint32_t* id) {
      visit(def_use->GetDef(*id));
    };
    current->ForEachInOperand(visit_operand);

    // On the first traversal the users of a phi are deliberately skipped.
    if (ignore_phi_users && opcode == SpvOp::SpvOpPhi) return;

    // Follow every user of this instruction.
    def_use->ForEachUser(current, visit);

    // Follow every use as well; the operand index is not needed.
    auto visit_use = [&visit](ir::Instruction* use, uint32_t) { visit(use); };
    def_use->ForEachUse(current, visit_use);
  };

  // Kick off the walk from the requested instruction.
  visit(inst);
}
// Groups the instructions of the loop by their use-def relationships and then
// distributes the groups over cloned_loop_instructions_ and
// original_loop_instructions_. Also records in instruction_order_ the order in
// which loads and stores outside the header block are encountered.
//
// Returns false if the loop has no condition block or if fewer than two
// groups were found, in which case the loop cannot be split.
bool LoopFissionImpl::GroupInstructionsByUseDef() {
  std::vector<std::set<ir::Instruction*>> sets{};

  // We want to ignore all the instructions stemming from the loop condition
  // instruction.
  ir::BasicBlock* condition_block = loop_->FindConditionBlock();
  if (!condition_block) return false;
  ir::Instruction* condition = &*condition_block->tail();

  // We iterate over the blocks via iterating over all the blocks in the
  // function, we do this so we are iterating in the same order which the
  // blocks appear in the binary.
  ir::Function& function = *loop_->GetHeaderBlock()->GetParent();

  // Create a temporary set to ignore certain groups of instructions within the
  // loop. We don't want any instructions related to control flow to be removed
  // from either loop, only instructions within the control flow bodies.
  std::set<ir::Instruction*> instructions_to_ignore{};
  TraverseUseDef(condition, &instructions_to_ignore, true, true);

  // Traverse control flow instructions to ensure they are added to the
  // seen_instructions_ set and will be ignored when TraverseUseDef is called
  // with the actual sets.
  for (ir::BasicBlock& block : function) {
    if (!loop_->IsInsideLoop(block.id())) continue;

    for (ir::Instruction& inst : block) {
      // Ignore all instructions related to control flow.
      if (inst.opcode() == SpvOp::SpvOpSelectionMerge || inst.IsBranch()) {
        TraverseUseDef(&inst, &instructions_to_ignore, true, true);
      }
    }
  }

  // The header block is skipped below; its id does not change while grouping.
  const auto header_id = loop_->GetHeaderBlock()->id();

  // Traverse the instructions and generate the sets, automatically ignoring
  // any instructions in instructions_to_ignore.
  for (ir::BasicBlock& block : function) {
    if (!loop_->IsInsideLoop(block.id()) || header_id == block.id()) continue;

    for (ir::Instruction& inst : block) {
      // Record the order that each load/store is seen. The position is read
      // before the map is touched: writing
      // `instruction_order_[&inst] = instruction_order_.size()` leaves it
      // unspecified (before C++17) whether the element is inserted before or
      // after size() is evaluated.
      if (inst.opcode() == SpvOp::SpvOpLoad ||
          inst.opcode() == SpvOp::SpvOpStore) {
        const size_t position = instruction_order_.size();
        instruction_order_[&inst] = position;
      }

      // Ignore instructions already seen in a traversal.
      if (seen_instructions_.count(&inst) != 0) {
        continue;
      }

      // Build the set.
      std::set<ir::Instruction*> inst_set{};
      TraverseUseDef(&inst, &inst_set);
      if (!inst_set.empty()) sets.push_back(std::move(inst_set));
    }
  }

  // If we have one or zero sets return false to indicate that due to
  // insufficient instructions we couldn't split the loop into two groups and
  // thus the loop can't be split any further.
  if (sets.size() < 2) {
    return false;
  }

  // Merge the loop sets into two different sets: the first half goes to the
  // cloned loop, the second half stays in the original loop. CanPerformSplit
  // validates that this does not break the relative ordering of loads/stores.
  const size_t half = sets.size() / 2;
  for (size_t index = 0; index < half; ++index) {
    cloned_loop_instructions_.insert(sets[index].begin(), sets[index].end());
  }
  for (size_t index = half; index < sets.size(); ++index) {
    original_loop_instructions_.insert(sets[index].begin(),
                                       sets[index].end());
  }

  return true;
}
bool LoopFissionImpl::CanPerformSplit() {
// Return false if any of the condition instructions in the loop depend on a
// load.
if (load_used_in_condition_) {
return false;
}
// Build a list of all parent loops of this loop. Loop dependence analysis
// needs this structure.
std::vector<const ir::Loop*> loops;
ir::Loop* parent_loop = loop_;
while (parent_loop) {
loops.push_back(parent_loop);
parent_loop = parent_loop->GetParent();
}
LoopDependenceAnalysis analysis{context_, loops};
// A list of all the stores in the cloned loop.
std::vector<ir::Instruction*> set_one_stores{};
// A list of all the loads in the cloned loop.
std::vector<ir::Instruction*> set_one_loads{};
// Populate the above lists.
for (ir::Instruction* inst : cloned_loop_instructions_) {
if (inst->opcode() == SpvOp::SpvOpStore) {
set_one_stores.push_back(inst);
} else if (inst->opcode() == SpvOp::SpvOpLoad) {
set_one_loads.push_back(inst);
}
// If we find any instruction which we can't move (such as a barrier),
// return false.
if (!MovableInstruction(*inst)) return false;
}
// We need to calculate the depth of the loop to create the loop dependency
// distance vectors.
const size_t loop_depth = loop_->GetDepth();
// Check the dependencies between loads in the cloned loop and stores in the
// original and vice versa.
for (ir::Instruction* inst : original_loop_instructions_) {
// If we find any instruction which we can't move (such as a barrier),
// return false.
if (!MovableInstruction(*inst)) return false;
// Look at the dependency between the loads in the original and stores in
// the cloned loops.
if (inst->opcode() == SpvOp::SpvOpLoad) {
for (ir::Instruction* store : set_one_stores) {
DistanceVector vec{loop_depth};
// If the store actually should appear after the load, return false.
// This means the store has been placed in the wrong grouping.
if (instruction_order_[store] > instruction_order_[inst]) {
return false;
}
// If not independent check the distance vector.
if (!analysis.GetDependence(store, inst, &vec)) {
for (DistanceEntry& entry : vec.GetEntries()) {
// A distance greater than zero means that the store in the cloned
// loop has a dependency on the load in the original loop.
if (entry.distance > 0) return false;
}
}
}
} else if (inst->opcode() == SpvOp::SpvOpStore) {
for (ir::Instruction* load : set_one_loads) {
DistanceVector vec{loop_depth};
// If the load actually should appear after the store, return false.
if (instruction_order_[load] > instruction_order_[inst]) {
return false;
}
// If not independent check the distance vector.
if (!analysis.GetDependence(inst, load, &vec)) {
for (DistanceEntry& entry : vec.GetEntries()) {
// A distance less than zero means the load in the cloned loop is
// dependent on the store instruction in the original loop.
if (entry.distance < 0) return false;
}
}
}
}
}
return true;
}
// Clones the loop, inserts the cloned blocks into the function after the
// preheader, and then deletes from each of the two loops the instructions that
// belong exclusively to the other one. Returns the cloned loop.
ir::Loop* LoopFissionImpl::SplitLoop() {
  // Make the copy of the loop.
  LoopUtils util{context_, loop_};
  LoopUtils::LoopCloningResult clone_results;
  ir::Loop* cloned_loop = util.CloneAndAttachLoopToHeader(&clone_results);

  // Bring the OpLoopMerge of the copy up to date.
  cloned_loop->UpdateLoopMergeInst();

  // Insert the cloned blocks into the function, right after the preheader of
  // the original loop.
  ir::Function::iterator it =
      util.GetFunction()->FindBlock(loop_->GetOrCreatePreHeaderBlock()->id());
  util.GetFunction()->AddBasicBlocks(clone_results.cloned_bb_.begin(),
                                     clone_results.cloned_bb_.end(), ++it);

  // The merge block of the cloned loop becomes the preheader of the original.
  loop_->SetPreHeaderBlock(cloned_loop->GetMergeBlock());

  // Membership tests: an instruction is exclusive to a group when it is found
  // in that group and not in the other one.
  const auto only_in_cloned_group = [this](ir::Instruction* candidate) {
    return cloned_loop_instructions_.count(candidate) != 0 &&
           original_loop_instructions_.count(candidate) == 0;
  };
  const auto only_in_original_group = [this](ir::Instruction* candidate) {
    return cloned_loop_instructions_.count(candidate) == 0 &&
           original_loop_instructions_.count(candidate) != 0;
  };

  // Instructions are collected first and killed once both loops were scanned.
  std::vector<ir::Instruction*> instructions_to_kill{};

  // Original loop: drop whatever is exclusive to the cloned group.
  for (uint32_t id : loop_->GetBlocks()) {
    ir::BasicBlock* block = context_->cfg()->block(id);
    for (ir::Instruction& inst : *block) {
      if (!only_in_cloned_group(&inst)) continue;

      instructions_to_kill.push_back(&inst);
      // Uses of a dropped phi are redirected to the value recorded for its
      // result id in the cloning value map.
      if (inst.opcode() == SpvOp::SpvOpPhi) {
        context_->ReplaceAllUsesWith(
            inst.result_id(), clone_results.value_map_[inst.result_id()]);
      }
    }
  }

  // Cloned loop: drop whatever is exclusive to the original group. The sets
  // hold instructions of the original loop, so each cloned instruction is
  // translated through ptr_map_ before the lookup.
  for (uint32_t id : cloned_loop->GetBlocks()) {
    ir::BasicBlock* block = context_->cfg()->block(id);
    for (ir::Instruction& inst : *block) {
      ir::Instruction* old_inst = clone_results.ptr_map_[&inst];
      if (only_in_original_group(old_inst)) {
        instructions_to_kill.push_back(&inst);
      }
    }
  }

  for (ir::Instruction* doomed : instructions_to_kill) {
    context_->KillInst(doomed);
  }

  return cloned_loop;
}
// Builds a pass which splits a loop only when the number of registers used in
// the loop is greater than |register_threshold_to_split|.
// |split_multiple_times| is stored in split_multiple_times_.
LoopFissionPass::LoopFissionPass(const size_t register_threshold_to_split,
                                 bool split_multiple_times)
    : split_multiple_times_(split_multiple_times) {
  // The threshold is copied into the predicate so it outlives this call.
  split_criteria_ =
      [register_threshold_to_split](
          const RegisterLiveness::RegionRegisterLiveness& region) -> bool {
    return register_threshold_to_split < region.used_registers_;
  };
}
// Builds a pass whose split criteria accepts every loop and which does not
// split a loop more than once.
LoopFissionPass::LoopFissionPass() : split_multiple_times_(false) {
  // The liveness information is ignored: the answer is always "split".
  split_criteria_ =
      [](const RegisterLiveness::RegionRegisterLiveness& /*unused*/) -> bool {
    return true;
  };
}
// Computes the register pressure of |loop| through the liveness analysis of
// |c| and returns the verdict of split_criteria_ on the result.
bool LoopFissionPass::ShouldSplitLoop(const ir::Loop& loop, ir::IRContext* c) {
  LivenessAnalysis* liveness_analysis = c->GetLivenessAnalysis();

  // The liveness information is looked up per function, so find the function
  // which owns the loop header.
  ir::Function* enclosing_function = loop.GetHeaderBlock()->GetParent();

  RegisterLiveness::RegionRegisterLiveness loop_pressure{};
  liveness_analysis->Get(enclosing_function)
      ->ComputeLoopRegisterPressure(loop, &loop_pressure);

  return split_criteria_(loop_pressure);
}
// Runs loop fission over every function of the module held by |c|.
//
// For each function, the loops without children which satisfy ShouldSplitLoop
// are collected and each one is split when its instructions can be grouped and
// the grouping is legal. When split_multiple_times_ is set, the two loops
// resulting from a split are re-examined and split again for as long as they
// satisfy ShouldSplitLoop.
//
// Returns SuccessWithChange if at least one loop was split, otherwise
// SuccessWithoutChange.
Pass::Status LoopFissionPass::Process(ir::IRContext* c) {
  bool changed = false;

  for (ir::Function& f : *c->module()) {
    // We collect all the inner most loops in the function and run the loop
    // splitting util on each. The reason we do this is to allow us to iterate
    // over each, as creating new loops will invalidate the loop iterator.
    std::vector<ir::Loop*> inner_most_loops{};
    ir::LoopDescriptor& loop_descriptor = *c->GetLoopDescriptor(&f);
    for (ir::Loop& loop : loop_descriptor) {
      if (!loop.HasChildren() && ShouldSplitLoop(loop, c)) {
        inner_most_loops.push_back(&loop);
      }
    }

    while (!inner_most_loops.empty()) {
      // Loops produced in this round which meet the criteria to be split
      // again. The list is scoped to the round so that every round starts
      // from an empty list instead of relying on the state of a moved-from
      // vector.
      std::vector<ir::Loop*> new_loops_to_split{};

      for (ir::Loop* loop : inner_most_loops) {
        LoopFissionImpl impl{c, loop};

        // Group the instructions in the loop into two different sets of
        // related instructions. If we can't group the instructions into the
        // two sets then we can't split the loop any further.
        if (!impl.GroupInstructionsByUseDef()) continue;

        if (!impl.CanPerformSplit()) continue;

        ir::Loop* second_loop = impl.SplitLoop();
        changed = true;
        c->InvalidateAnalysesExceptFor(ir::IRContext::kAnalysisLoopAnalysis);

        // The resulting loops are only ever revisited in multi-split mode, so
        // skip recomputing their register pressure otherwise.
        if (!split_multiple_times_) continue;

        // If the newly created loop meets the criteria to be split, split it
        // again.
        if (ShouldSplitLoop(*second_loop, c)) {
          new_loops_to_split.push_back(second_loop);
        }

        // If the original loop (now split) still meets the criteria to be
        // split, split it again.
        if (ShouldSplitLoop(*loop, c)) {
          new_loops_to_split.push_back(loop);
        }
      }

      // Without the split multiple times flag a single round is all we do.
      if (!split_multiple_times_) break;

      // Otherwise the loops gathered above form the work list of the next
      // round.
      inner_most_loops = std::move(new_loops_to_split);
    }
  }

  return changed ? Pass::Status::SuccessWithChange
                 : Pass::Status::SuccessWithoutChange;
}
} // namespace opt
} // namespace spvtools