Add loop peeling utility

The loop peeling utility takes a loop as input and creates a new one before
it. The iterator of the duplicated loop is then set to accommodate the number
of iterations required for the peeling.

The loop peeling pass that decides whether to peel, together with the
profitability analysis, is left for a follow-up PR.
Victor Lomuller 2018-03-07 18:21:11 +00:00 committed by Steven Perron
parent b3daa93b46
commit bdf421cf40
12 changed files with 1972 additions and 19 deletions
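For illustration only (this snippet is not part of the patch), peeling a simple counted loop

  for (int i = 0; i < count; i++) body(i);

roughly produces, for PeelBefore(N):

  int i = 0;
  // First loop (the clone): runs min(N, count) iterations.
  for (; i < N && i < count; i++) body(i);
  // Second loop (the original), protected: only entered if iterations remain.
  if (N < count)
    for (; i < count; i++) body(i);

and, for PeelAfter(N):

  int i = 0;
  // First loop (the clone), protected: skipped unless there are more than N iterations.
  if (N < count)
    for (; i + N < count; i++) body(i);
  // Second loop (the original): executes the remaining (at most N) iterations.
  for (; i < count; i++) body(i);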


@ -100,6 +100,7 @@ SPVTOOLS_OPT_SRC_FILES := \
source/opt/local_single_store_elim_pass.cpp \
source/opt/local_ssa_elim_pass.cpp \
source/opt/loop_descriptor.cpp \
source/opt/loop_peeling.cpp \
source/opt/loop_unroller.cpp \
source/opt/loop_unswitch_pass.cpp \
source/opt/loop_utils.cpp \


@ -58,6 +58,7 @@ add_library(SPIRV-Tools-opt
local_ssa_elim_pass.h
log.h
loop_descriptor.h
loop_peeling.h
loop_unroller.h
loop_utils.h
loop_unswitch_pass.h
@ -132,6 +133,7 @@ add_library(SPIRV-Tools-opt
local_single_store_elim_pass.cpp
local_ssa_elim_pass.cpp
loop_descriptor.cpp
loop_peeling.cpp
loop_utils.cpp
loop_unroller.cpp
loop_unswitch_pass.cpp


@ -15,6 +15,7 @@
#ifndef LIBSPIRV_OPT_CONSTRUCTS_H_
#define LIBSPIRV_OPT_CONSTRUCTS_H_
#include <algorithm>
#include <functional>
#include <memory>
#include <utility>
@ -92,6 +93,13 @@ class Function {
return const_iterator(&blocks_, blocks_.cend());
}
// Returns an iterator to the basic block whose id is |bb_id|.
iterator FindBlock(uint32_t bb_id) {
return std::find_if(begin(), end(), [bb_id](const ir::BasicBlock& it_bb) {
return bb_id == it_bb.id();
});
}
// Runs the given function |f| on each instruction in this function, and
// optionally on debug line instructions that might precede them.
void ForEachInst(const std::function<void(Instruction*)>& f,


@ -159,6 +159,61 @@ class InstructionBuilder {
return AddInstruction(std::move(phi_inst));
}
// Creates an addition instruction.
// The id |type| must be the id of the instruction's result type, which must
// be the same as the type of |op1| and |op2|.
// The id |op1| is the left hand side of the operation.
// The id |op2| is the right hand side of the operation.
ir::Instruction* AddIAdd(uint32_t type, uint32_t op1, uint32_t op2) {
std::unique_ptr<ir::Instruction> inst(new ir::Instruction(
GetContext(), SpvOpIAdd, type, GetContext()->TakeNextId(),
{{SPV_OPERAND_TYPE_ID, {op1}}, {SPV_OPERAND_TYPE_ID, {op2}}}));
return AddInstruction(std::move(inst));
}
// Creates a less than instruction for unsigned integers.
// The id |op1| is the left hand side of the operation.
// The id |op2| is the right hand side of the operation.
// It is assumed that |op1| and |op2| have the same underlying type.
ir::Instruction* AddULessThan(uint32_t op1, uint32_t op2) {
analysis::Bool bool_type;
uint32_t type = GetContext()->get_type_mgr()->GetId(&bool_type);
std::unique_ptr<ir::Instruction> inst(new ir::Instruction(
GetContext(), SpvOpULessThan, type, GetContext()->TakeNextId(),
{{SPV_OPERAND_TYPE_ID, {op1}}, {SPV_OPERAND_TYPE_ID, {op2}}}));
return AddInstruction(std::move(inst));
}
// Creates a less than instruction for signed integers.
// The id |op1| is the left hand side of the operation.
// The id |op2| is the right hand side of the operation.
// It is assumed that |op1| and |op2| have the same underlying type.
ir::Instruction* AddSLessThan(uint32_t op1, uint32_t op2) {
analysis::Bool bool_type;
uint32_t type = GetContext()->get_type_mgr()->GetId(&bool_type);
std::unique_ptr<ir::Instruction> inst(new ir::Instruction(
GetContext(), SpvOpSLessThan, type, GetContext()->TakeNextId(),
{{SPV_OPERAND_TYPE_ID, {op1}}, {SPV_OPERAND_TYPE_ID, {op2}}}));
return AddInstruction(std::move(inst));
}
// Creates an OpSLessThan or OpULessThan instruction depending on the
// signedness of |op1|. The id |op1| is the left hand side of the operation.
// The id |op2| is
// the right hand side of the operation. It is assumed that |op1| and |op2|
// have the same underlying type.
ir::Instruction* AddLessThan(uint32_t op1, uint32_t op2) {
ir::Instruction* op1_insn = context_->get_def_use_mgr()->GetDef(op1);
analysis::Type* type =
GetContext()->get_type_mgr()->GetType(op1_insn->type_id());
analysis::Integer* int_type = type->AsInteger();
assert(int_type && "Operand is not of int type");
if (int_type->IsSigned())
return AddSLessThan(op1, op2);
else
return AddULessThan(op1, op2);
}
// Creates a select instruction.
// |type| must match the types of |true_value| and |false_value|. It is up to
// the caller to ensure that |cond| is a correct type (bool or vector of
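As a rough usage sketch of these new helpers (the context, ids, and variable names below are hypothetical; they mirror how loop_peeling.cpp drives the builder later in this patch):

  // |ctx| is an ir::IRContext*, |insert_point| an ir::Instruction* to insert
  // before; |a_id| and |b_id| hold ids of values of the same 32-bit int type
  // whose type id is |int_type_id|.
  InstructionBuilder builder(ctx, insert_point,
                             ir::IRContext::kAnalysisDefUse |
                                 ir::IRContext::kAnalysisInstrToBlockMapping);
  ir::Instruction* sum = builder.AddIAdd(int_type_id, a_id, b_id);
  // AddLessThan dispatches to AddSLessThan or AddULessThan depending on the
  // signedness of its first operand's type.
  ir::Instruction* cmp = builder.AddLessThan(sum->result_id(), b_id);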


@ -289,13 +289,16 @@ void Loop::SetMergeBlock(BasicBlock* merge) {
}
void Loop::SetPreHeaderBlock(BasicBlock* preheader) {
assert(!IsInsideLoop(preheader) && "The preheader block is in the loop");
assert(preheader->tail()->opcode() == SpvOpBranch &&
"The preheader block does not unconditionally branch to the header "
"block");
assert(preheader->tail()->GetSingleWordOperand(0) == GetHeaderBlock()->id() &&
"The preheader block does not unconditionally branch to the header "
"block");
if (preheader) {
assert(!IsInsideLoop(preheader) && "The preheader block is in the loop");
assert(preheader->tail()->opcode() == SpvOpBranch &&
"The preheader block does not unconditionally branch to the header "
"block");
assert(preheader->tail()->GetSingleWordOperand(0) ==
GetHeaderBlock()->id() &&
"The preheader block does not unconditionally branch to the header "
"block");
}
loop_preheader_ = preheader;
}

source/opt/loop_peeling.cpp (new file, 563 lines added)

@ -0,0 +1,563 @@
// Copyright (c) 2018 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "ir_builder.h"
#include "ir_context.h"
#include "loop_descriptor.h"
#include "loop_peeling.h"
#include "loop_utils.h"
namespace spvtools {
namespace opt {
void LoopPeeling::DuplicateAndConnectLoop(
LoopUtils::LoopCloningResult* clone_results) {
ir::CFG& cfg = *context_->cfg();
analysis::DefUseManager* def_use_mgr = context_->get_def_use_mgr();
assert(CanPeelLoop() && "Cannot peel loop!");
std::vector<ir::BasicBlock*> ordered_loop_blocks;
ir::BasicBlock* pre_header = loop_->GetOrCreatePreHeaderBlock();
loop_->ComputeLoopStructuredOrder(&ordered_loop_blocks);
cloned_loop_ = loop_utils_.CloneLoop(clone_results, ordered_loop_blocks);
// Add the basic block to the function.
ir::Function::iterator it =
loop_utils_.GetFunction()->FindBlock(pre_header->id());
assert(it != loop_utils_.GetFunction()->end() &&
"Pre-header not found in the function.");
loop_utils_.GetFunction()->AddBasicBlocks(
clone_results->cloned_bb_.begin(), clone_results->cloned_bb_.end(), ++it);
// Make |loop_|'s pre-header the pre-header of |cloned_loop_|.
ir::BasicBlock* cloned_header = cloned_loop_->GetHeaderBlock();
pre_header->ForEachSuccessorLabel(
[cloned_header](uint32_t* succ) { *succ = cloned_header->id(); });
// Update cfg.
cfg.RemoveEdge(pre_header->id(), loop_->GetHeaderBlock()->id());
cloned_loop_->SetPreHeaderBlock(pre_header);
loop_->SetPreHeaderBlock(nullptr);
// When cloning the loop, we did not clone the merge block, so currently
// |cloned_loop_| shares the same merge block as |loop_|.
// We mutate all branches from |cloned_loop_| blocks targeting |loop_|'s merge
// into branches to |loop_|'s header (so the header will also act as the merge
// of |cloned_loop_|).
uint32_t cloned_loop_exit = 0;
for (uint32_t pred_id : cfg.preds(loop_->GetMergeBlock()->id())) {
if (loop_->IsInsideLoop(pred_id)) continue;
ir::BasicBlock* bb = cfg.block(pred_id);
assert(cloned_loop_exit == 0 && "The loop has multiple exits.");
cloned_loop_exit = bb->id();
bb->ForEachSuccessorLabel([this](uint32_t* succ) {
if (*succ == loop_->GetMergeBlock()->id())
*succ = loop_->GetHeaderBlock()->id();
});
}
// Update cfg.
cfg.RemoveNonExistingEdges(loop_->GetMergeBlock()->id());
cfg.AddEdge(cloned_loop_exit, loop_->GetHeaderBlock()->id());
// Patch the phi of the original loop header:
// - Set the loop entry branch to come from the cloned loop exit block;
// - Set the initial value of the phi using the corresponding cloned loop
// exit values.
//
// We patch the iterating value initializers of the original loop using the
// corresponding cloned loop exit values. This connects the cloned loop
// iterating values to the original loop and makes sure that the second loop
// starts with the last values of the first loop.
//
// For example, loops like:
//
// int z = 0;
// for (int i = 0; i++ < M; i += cst1) {
// if (cond)
// z += cst2;
// }
//
// Will become:
//
// int z = 0;
// int i = 0;
// for (; i++ < M; i += cst1) {
// if (cond)
// z += cst2;
// }
// for (; i++ < M; i += cst1) {
// if (cond)
// z += cst2;
// }
loop_->GetHeaderBlock()->ForEachPhiInst([cloned_loop_exit, def_use_mgr,
clone_results,
this](ir::Instruction* phi) {
for (uint32_t i = 0; i < phi->NumInOperands(); i += 2) {
if (!loop_->IsInsideLoop(phi->GetSingleWordInOperand(i + 1))) {
phi->SetInOperand(i,
{clone_results->value_map_.at(
exit_value_.at(phi->result_id())->result_id())});
phi->SetInOperand(i + 1, {cloned_loop_exit});
def_use_mgr->AnalyzeInstUse(phi);
return;
}
}
});
// Force the creation of a new preheader for the original loop and set it as
// the merge block for the cloned loop.
cloned_loop_->SetMergeBlock(loop_->GetOrCreatePreHeaderBlock());
}
void LoopPeeling::InsertCanonicalInductionVariable() {
ir::BasicBlock::iterator insert_point =
GetClonedLoop()->GetLatchBlock()->tail();
if (GetClonedLoop()->GetLatchBlock()->GetMergeInst()) {
--insert_point;
}
InstructionBuilder builder(context_, &*insert_point,
ir::IRContext::kAnalysisDefUse |
ir::IRContext::kAnalysisInstrToBlockMapping);
ir::Instruction* uint_1_cst =
builder.Add32BitConstantInteger<uint32_t>(1, int_type_->IsSigned());
// Create the increment.
// Note that we build "1 + 1" here; one of the operands should be the phi
// value, but we don't have it yet. That operand will be set later.
ir::Instruction* iv_inc = builder.AddIAdd(
uint_1_cst->type_id(), uint_1_cst->result_id(), uint_1_cst->result_id());
builder.SetInsertPoint(&*GetClonedLoop()->GetHeaderBlock()->begin());
canonical_induction_variable_ = builder.AddPhi(
uint_1_cst->type_id(),
{builder.Add32BitConstantInteger<uint32_t>(0, int_type_->IsSigned())
->result_id(),
GetClonedLoop()->GetPreHeaderBlock()->id(), iv_inc->result_id(),
GetClonedLoop()->GetLatchBlock()->id()});
// Connect everything.
iv_inc->SetInOperand(0, {canonical_induction_variable_->result_id()});
// Update def/use manager.
context_->get_def_use_mgr()->AnalyzeInstUse(iv_inc);
// If do-while form, use the incremented value.
if (do_while_form_) {
canonical_induction_variable_ = iv_inc;
}
}
void LoopPeeling::GetIteratorUpdateOperations(
const ir::Loop* loop, ir::Instruction* iterator,
std::unordered_set<ir::Instruction*>* operations) {
opt::analysis::DefUseManager* def_use_mgr = context_->get_def_use_mgr();
operations->insert(iterator);
iterator->ForEachInId([def_use_mgr, loop, operations, this](uint32_t* id) {
ir::Instruction* insn = def_use_mgr->GetDef(*id);
if (insn->opcode() == SpvOpLabel) {
return;
}
if (operations->count(insn)) {
return;
}
if (!loop->IsInsideLoop(insn)) {
return;
}
GetIteratorUpdateOperations(loop, insn, operations);
});
}
// Gathers the set of blocks on all paths from |entry| to |block|.
static void GetBlocksInPath(uint32_t block, uint32_t entry,
std::unordered_set<uint32_t>* blocks_in_path,
const ir::CFG& cfg) {
for (uint32_t pid : cfg.preds(block)) {
if (blocks_in_path->insert(pid).second) {
if (pid != entry) {
GetBlocksInPath(pid, entry, blocks_in_path, cfg);
}
}
}
}
bool LoopPeeling::IsConditionCheckSideEffectFree() const {
ir::CFG& cfg = *context_->cfg();
// The "do-while" form does not cause issues, the algorithm takes into account
// the first iteration.
if (!do_while_form_) {
uint32_t condition_block_id = cfg.preds(loop_->GetMergeBlock()->id())[0];
std::unordered_set<uint32_t> blocks_in_path;
blocks_in_path.insert(condition_block_id);
GetBlocksInPath(condition_block_id, loop_->GetHeaderBlock()->id(),
&blocks_in_path, cfg);
for (uint32_t bb_id : blocks_in_path) {
ir::BasicBlock* bb = cfg.block(bb_id);
if (!bb->WhileEachInst([this](ir::Instruction* insn) {
if (insn->IsBranch()) return true;
switch (insn->opcode()) {
case SpvOpLabel:
case SpvOpSelectionMerge:
case SpvOpLoopMerge:
return true;
default:
break;
}
return context_->IsCombinatorInstruction(insn);
})) {
return false;
}
}
}
return true;
}
void LoopPeeling::GetIteratingExitValues() {
ir::CFG& cfg = *context_->cfg();
loop_->GetHeaderBlock()->ForEachPhiInst([this](ir::Instruction* phi) {
exit_value_[phi->result_id()] = nullptr;
});
if (!loop_->GetMergeBlock()) {
return;
}
if (cfg.preds(loop_->GetMergeBlock()->id()).size() != 1) {
return;
}
opt::analysis::DefUseManager* def_use_mgr = context_->get_def_use_mgr();
uint32_t condition_block_id = cfg.preds(loop_->GetMergeBlock()->id())[0];
auto& header_pred = cfg.preds(loop_->GetHeaderBlock()->id());
do_while_form_ = std::find(header_pred.begin(), header_pred.end(),
condition_block_id) != header_pred.end();
if (do_while_form_) {
loop_->GetHeaderBlock()->ForEachPhiInst(
[condition_block_id, def_use_mgr, this](ir::Instruction* phi) {
std::unordered_set<ir::Instruction*> operations;
for (uint32_t i = 0; i < phi->NumInOperands(); i += 2) {
if (condition_block_id == phi->GetSingleWordInOperand(i + 1)) {
exit_value_[phi->result_id()] =
def_use_mgr->GetDef(phi->GetSingleWordInOperand(i));
}
}
});
} else {
DominatorTree* dom_tree =
&context_->GetDominatorAnalysis(loop_utils_.GetFunction(), cfg)
->GetDomTree();
ir::BasicBlock* condition_block = cfg.block(condition_block_id);
loop_->GetHeaderBlock()->ForEachPhiInst(
[dom_tree, condition_block, this](ir::Instruction* phi) {
std::unordered_set<ir::Instruction*> operations;
// Not the back-edge value, check if the phi instruction is the only
// possible candidate.
GetIteratorUpdateOperations(loop_, phi, &operations);
for (ir::Instruction* insn : operations) {
if (insn == phi) {
continue;
}
if (dom_tree->Dominates(context_->get_instr_block(insn),
condition_block)) {
return;
}
}
exit_value_[phi->result_id()] = phi;
});
}
}
void LoopPeeling::FixExitCondition(
const std::function<uint32_t(ir::Instruction*)>& condition_builder) {
ir::CFG& cfg = *context_->cfg();
uint32_t condition_block_id = 0;
for (uint32_t id : cfg.preds(GetClonedLoop()->GetMergeBlock()->id())) {
if (GetClonedLoop()->IsInsideLoop(id)) {
condition_block_id = id;
break;
}
}
assert(condition_block_id != 0 && "2nd loop is improperly connected");
ir::BasicBlock* condition_block = cfg.block(condition_block_id);
ir::Instruction* exit_condition = condition_block->terminator();
assert(exit_condition->opcode() == SpvOpBranchConditional);
ir::BasicBlock::iterator insert_point = condition_block->tail();
if (condition_block->GetMergeInst()) {
--insert_point;
}
exit_condition->SetInOperand(0, {condition_builder(&*insert_point)});
uint32_t to_continue_block_idx =
GetClonedLoop()->IsInsideLoop(exit_condition->GetSingleWordInOperand(1))
? 1
: 2;
exit_condition->SetInOperand(
1, {exit_condition->GetSingleWordInOperand(to_continue_block_idx)});
exit_condition->SetInOperand(2, {GetClonedLoop()->GetMergeBlock()->id()});
// Update def/use manager.
context_->get_def_use_mgr()->AnalyzeInstUse(exit_condition);
}
ir::BasicBlock* LoopPeeling::CreateBlockBefore(ir::BasicBlock* bb) {
analysis::DefUseManager* def_use_mgr = context_->get_def_use_mgr();
ir::CFG& cfg = *context_->cfg();
assert(cfg.preds(bb->id()).size() == 1 && "More than one predecessor");
std::unique_ptr<ir::BasicBlock> new_bb = MakeUnique<ir::BasicBlock>(
std::unique_ptr<ir::Instruction>(new ir::Instruction(
context_, SpvOpLabel, 0, context_->TakeNextId(), {})));
new_bb->SetParent(loop_utils_.GetFunction());
// Update the loop descriptor.
ir::Loop* in_loop = (*loop_utils_.GetLoopDescriptor())[bb];
if (in_loop) {
in_loop->AddBasicBlock(new_bb.get());
loop_utils_.GetLoopDescriptor()->SetBasicBlockToLoop(new_bb->id(), in_loop);
}
context_->set_instr_block(new_bb->GetLabelInst(), new_bb.get());
def_use_mgr->AnalyzeInstDefUse(new_bb->GetLabelInst());
ir::BasicBlock* bb_pred = cfg.block(cfg.preds(bb->id())[0]);
bb_pred->tail()->ForEachInId([bb, &new_bb](uint32_t* id) {
if (*id == bb->id()) {
*id = new_bb->id();
}
});
cfg.RemoveEdge(bb_pred->id(), bb->id());
cfg.AddEdge(bb_pred->id(), new_bb->id());
def_use_mgr->AnalyzeInstUse(&*bb_pred->tail());
// Update the incoming branch.
bb->ForEachPhiInst([&new_bb, def_use_mgr](ir::Instruction* phi) {
phi->SetInOperand(1, {new_bb->id()});
def_use_mgr->AnalyzeInstUse(phi);
});
InstructionBuilder(context_, new_bb.get(),
ir::IRContext::kAnalysisDefUse |
ir::IRContext::kAnalysisInstrToBlockMapping)
.AddBranch(bb->id());
cfg.AddEdge(new_bb->id(), bb->id());
// Add the basic block to the function.
ir::Function::iterator it = loop_utils_.GetFunction()->FindBlock(bb->id());
assert(it != loop_utils_.GetFunction()->end() &&
"Basic block not found in the function.");
ir::BasicBlock* ret = new_bb.get();
loop_utils_.GetFunction()->AddBasicBlock(std::move(new_bb), it);
return ret;
}
ir::BasicBlock* LoopPeeling::ProtectLoop(ir::Loop* loop,
ir::Instruction* condition,
ir::BasicBlock* if_merge) {
ir::BasicBlock* if_block = loop->GetOrCreatePreHeaderBlock();
// Will no longer be a pre-header because of the if.
loop->SetPreHeaderBlock(nullptr);
// Kill the branch to the header.
context_->KillInst(&*if_block->tail());
InstructionBuilder builder(context_, if_block,
ir::IRContext::kAnalysisDefUse |
ir::IRContext::kAnalysisInstrToBlockMapping);
builder.AddConditionalBranch(condition->result_id(),
loop->GetHeaderBlock()->id(), if_merge->id(),
if_merge->id());
return if_block;
}
void LoopPeeling::PeelBefore(uint32_t peel_factor) {
assert(CanPeelLoop() && "Cannot peel loop");
LoopUtils::LoopCloningResult clone_results;
// Clone the loop and insert the cloned one before the loop.
DuplicateAndConnectLoop(&clone_results);
// Add a canonical induction variable "canonical_induction_variable_".
InsertCanonicalInductionVariable();
InstructionBuilder builder(context_,
&*cloned_loop_->GetPreHeaderBlock()->tail(),
ir::IRContext::kAnalysisDefUse |
ir::IRContext::kAnalysisInstrToBlockMapping);
ir::Instruction* factor =
builder.Add32BitConstantInteger(peel_factor, int_type_->IsSigned());
ir::Instruction* has_remaining_iteration = builder.AddLessThan(
factor->result_id(), loop_iteration_count_->result_id());
ir::Instruction* max_iteration = builder.AddSelect(
factor->type_id(), has_remaining_iteration->result_id(),
factor->result_id(), loop_iteration_count_->result_id());
// Change the exit condition of the cloned loop; the loop exits when the
// following condition becomes false:
//   "canonical_induction_variable_" < min("factor", "loop_iteration_count_")
FixExitCondition([max_iteration, this](ir::Instruction* insert_before_point) {
return InstructionBuilder(context_, insert_before_point,
ir::IRContext::kAnalysisDefUse |
ir::IRContext::kAnalysisInstrToBlockMapping)
.AddLessThan(canonical_induction_variable_->result_id(),
max_iteration->result_id())
->result_id();
});
// "Protect" the second loop: the second loop can only be executed if
// |has_remaining_iteration| is true (i.e. factor < loop_iteration_count_).
ir::BasicBlock* if_merge_block = loop_->GetMergeBlock();
loop_->SetMergeBlock(CreateBlockBefore(loop_->GetMergeBlock()));
// Prevent the second loop from being executed if we already executed all the
// required iterations.
ir::BasicBlock* if_block =
ProtectLoop(loop_, has_remaining_iteration, if_merge_block);
// Patch the phi of the merge block.
if_merge_block->ForEachPhiInst(
[&clone_results, if_block, this](ir::Instruction* phi) {
// if_merge_block had previously only 1 predecessor.
uint32_t incoming_value = phi->GetSingleWordInOperand(0);
auto def_in_loop = clone_results.value_map_.find(incoming_value);
if (def_in_loop != clone_results.value_map_.end())
incoming_value = def_in_loop->second;
phi->AddOperand(
{spv_operand_type_t::SPV_OPERAND_TYPE_ID, {incoming_value}});
phi->AddOperand(
{spv_operand_type_t::SPV_OPERAND_TYPE_ID, {if_block->id()}});
context_->get_def_use_mgr()->AnalyzeInstUse(phi);
});
context_->InvalidateAnalysesExceptFor(
ir::IRContext::kAnalysisDefUse |
ir::IRContext::kAnalysisInstrToBlockMapping |
ir::IRContext::kAnalysisLoopAnalysis | ir::IRContext::kAnalysisCFG);
}
void LoopPeeling::PeelAfter(uint32_t peel_factor) {
assert(CanPeelLoop() && "Cannot peel loop");
LoopUtils::LoopCloningResult clone_results;
// Clone the loop and insert the cloned one before the loop.
DuplicateAndConnectLoop(&clone_results);
// Add a canonical induction variable "canonical_induction_variable_".
InsertCanonicalInductionVariable();
InstructionBuilder builder(context_,
&*cloned_loop_->GetPreHeaderBlock()->tail(),
ir::IRContext::kAnalysisDefUse |
ir::IRContext::kAnalysisInstrToBlockMapping);
ir::Instruction* factor =
builder.Add32BitConstantInteger(peel_factor, int_type_->IsSigned());
ir::Instruction* has_remaining_iteration = builder.AddLessThan(
factor->result_id(), loop_iteration_count_->result_id());
// Change the exit condition of the cloned loop; the loop exits when the
// following condition becomes false:
//   "canonical_induction_variable_" + "factor" < "loop_iteration_count_"
FixExitCondition([factor, this](ir::Instruction* insert_before_point) {
InstructionBuilder cond_builder(
context_, insert_before_point,
ir::IRContext::kAnalysisDefUse |
ir::IRContext::kAnalysisInstrToBlockMapping);
// Build the following check: canonical_induction_variable_ + factor <
// iteration_count
return cond_builder
.AddLessThan(cond_builder
.AddIAdd(canonical_induction_variable_->type_id(),
canonical_induction_variable_->result_id(),
factor->result_id())
->result_id(),
loop_iteration_count_->result_id())
->result_id();
});
// "Protect" the first loop: the first loop can only be executed if
// factor < loop_iteration_count_.
// The original loop's pre-header was acting as the cloned loop's merge block.
GetClonedLoop()->SetMergeBlock(
CreateBlockBefore(GetOriginalLoop()->GetPreHeaderBlock()));
// Use the second loop's pre-header as the if merge block.
// Skip the first loop entirely when the peeled iterations cover the whole
// iteration space.
ir::BasicBlock* if_block =
ProtectLoop(cloned_loop_, has_remaining_iteration,
GetOriginalLoop()->GetPreHeaderBlock());
// Patch the phi of the header block.
// We added an if to enclose the first loop, and because the phi nodes are
// connected to the exit values of the first loop, those definitions no longer
// dominate the pre-header.
// We have to add to the pre-header (our if merge block) the required phi
// instructions and patch the header phi.
GetOriginalLoop()->GetHeaderBlock()->ForEachPhiInst(
[&clone_results, if_block, this](ir::Instruction* phi) {
opt::analysis::DefUseManager* def_use_mgr = context_->get_def_use_mgr();
auto find_value_idx = [](ir::Instruction* phi_inst, ir::Loop* loop) {
uint32_t preheader_value_idx =
!loop->IsInsideLoop(phi_inst->GetSingleWordInOperand(1)) ? 0 : 2;
return preheader_value_idx;
};
ir::Instruction* cloned_phi =
def_use_mgr->GetDef(clone_results.value_map_.at(phi->result_id()));
uint32_t cloned_preheader_value = cloned_phi->GetSingleWordInOperand(
find_value_idx(cloned_phi, GetClonedLoop()));
ir::Instruction* new_phi =
InstructionBuilder(context_,
&*GetOriginalLoop()->GetPreHeaderBlock()->tail(),
ir::IRContext::kAnalysisDefUse |
ir::IRContext::kAnalysisInstrToBlockMapping)
.AddPhi(phi->type_id(),
{phi->GetSingleWordInOperand(
find_value_idx(phi, GetOriginalLoop())),
GetClonedLoop()->GetMergeBlock()->id(),
cloned_preheader_value, if_block->id()});
phi->SetInOperand(find_value_idx(phi, GetOriginalLoop()),
{new_phi->result_id()});
def_use_mgr->AnalyzeInstUse(phi);
});
context_->InvalidateAnalysesExceptFor(
ir::IRContext::kAnalysisDefUse |
ir::IRContext::kAnalysisInstrToBlockMapping |
ir::IRContext::kAnalysisLoopAnalysis | ir::IRContext::kAnalysisCFG);
}
} // namespace opt
} // namespace spvtools

source/opt/loop_peeling.h (new file, 223 lines added)

@ -0,0 +1,223 @@
// Copyright (c) 2018 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SOURCE_OPT_LOOP_PEELING_H_
#define SOURCE_OPT_LOOP_PEELING_H_
#include <algorithm>
#include <limits>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "opt/ir_context.h"
#include "opt/loop_descriptor.h"
#include "opt/loop_utils.h"
namespace spvtools {
namespace opt {
// Utility class to perform the peeling of a given loop.
// The loop peeling transformation makes a given number of loop iterations
// execute either before (peel before) or after (peel after) the transformed
// loop.
//
// For peeling cases the transformation does the following steps:
// - It clones the loop and inserts the cloned loop before the original loop;
// - It connects all iterating values of the cloned loop with the
// corresponding original loop values so that the second loop starts with
// the appropriate values.
// - It inserts a new canonical induction variable "i" into the cloned loop;
// it starts at 0 and is incremented by one at each iteration.
//
// The last step is specific to each case:
// - Peel before: the transformation is to peel the "N" first iterations.
// The exit condition of the cloned loop is changed so that the loop
// exits when "i < N" becomes false. The original loop is then protected to
// only execute if there is any iteration left to do.
// - Peel after: the transformation is to peel the "N" last iterations,
// then the exit condition of the cloned loop is changed so that the loop
// exits when "i + N < max_iteration" becomes false, where "max_iteration"
// is the upper bound of the loop. The cloned loop is then protected so that
// it only executes if there are iterations left that are not covered by the
// second loop.
//
// To be peelable:
// - The loop must be in LCSSA form;
// - The loop must not contain any breaks;
// - The loop must not have any ambiguous iterator updates (see
// "CanPeelLoop").
// The method "CanPeelLoop" checks that those constraints are met.
//
// FIXME(Victor): Allow the utility to accept an existing canonical induction
// variable rather than automatically creating one.
// FIXME(Victor): When possible, evaluate the initial values of the second
// loop's iterating values rather than using the exit values of the first loop.
// FIXME(Victor): Make the utility work out the upper bound on its own rather
// than requiring it to be provided. This should become easy once the scalar
// evolution analysis is in.
class LoopPeeling {
public:
// LoopPeeling constructor.
// |loop| is the loop to peel.
// |loop_iteration_count| is the instruction holding the |loop| iteration
// count; it must be invariant with respect to |loop| and must be of a 32-bit
// integer type (signed or unsigned).
LoopPeeling(ir::IRContext* context, ir::Loop* loop,
ir::Instruction* loop_iteration_count)
: context_(context),
loop_utils_(context, loop),
loop_(loop),
loop_iteration_count_(!loop->IsInsideLoop(loop_iteration_count)
? loop_iteration_count
: nullptr),
int_type_(nullptr),
canonical_induction_variable_(nullptr) {
if (loop_iteration_count_) {
int_type_ = context_->get_type_mgr()
->GetType(loop_iteration_count_->type_id())
->AsInteger();
}
GetIteratingExitValues();
}
// Returns true if the loop can be peeled.
// To be peelable, no operation involved in the update of the loop iterators
// may dominate the exit condition. This restriction is a workaround to avoid
// miscompiling code like:
//
// for (int i = 0; i + 1 < N; i++) {}
// for (int i = 0; ++i < N; i++) {}
//
// In such loops the increment happens before the exit condition is tested,
// producing loops that look very similar but iterate differently.
//
// This restriction does not apply if loop rotation is applied beforehand
// (i.e. the loop becomes a do-while loop).
bool CanPeelLoop() const {
ir::CFG& cfg = *context_->cfg();
if (!loop_iteration_count_) {
return false;
}
if (!int_type_) {
return false;
}
if (int_type_->width() != 32) {
return false;
}
if (!loop_->IsLCSSA()) {
return false;
}
if (!loop_->GetMergeBlock()) {
return false;
}
if (cfg.preds(loop_->GetMergeBlock()->id()).size() != 1) {
return false;
}
if (!IsConditionCheckSideEffectFree()) {
return false;
}
return !std::any_of(exit_value_.cbegin(), exit_value_.cend(),
[](std::pair<uint32_t, ir::Instruction*> it) {
return it.second == nullptr;
});
}
// Moves the execution of the |factor| first iterations of the loop into a
// dedicated loop.
void PeelBefore(uint32_t factor);
// Moves the execution of the |factor| last iterations of the loop into a
// dedicated loop.
void PeelAfter(uint32_t factor);
// Returns the cloned loop.
ir::Loop* GetClonedLoop() { return cloned_loop_; }
// Returns the original loop.
ir::Loop* GetOriginalLoop() { return loop_; }
private:
ir::IRContext* context_;
LoopUtils loop_utils_;
// The original loop.
ir::Loop* loop_;
// The initial |loop_| upper bound.
ir::Instruction* loop_iteration_count_;
// The int type to use for the canonical_induction_variable_.
analysis::Integer* int_type_;
// The cloned loop.
ir::Loop* cloned_loop_;
// This is set to true when the exit branch and the back-edge branch are the
// same instruction.
bool do_while_form_;
// The canonical induction variable of the cloned loop. The induction variable
// is initialized to 0 and incremented by 1 at each iteration.
ir::Instruction* canonical_induction_variable_;
// Map between the loop iterators (phi instructions of the loop header) and
// their exit values.
std::unordered_map<uint32_t, ir::Instruction*> exit_value_;
// Duplicates |loop_| and places the cloned loop before the original one.
// Iterating values from the cloned loop are then connected to the original
// loop as initializers.
void DuplicateAndConnectLoop(LoopUtils::LoopCloningResult* clone_results);
// Inserts a canonical induction variable into the first (cloned) loop as a
// simplified counter.
void InsertCanonicalInductionVariable();
// Fixes the exit condition of the cloned (before) loop. The function calls
// |condition_builder| to get the condition to use in the conditional branch
// of the loop exit. The loop is exited if the condition evaluates to true.
// |condition_builder| takes an ir::Instruction* that represents the insertion
// point.
void FixExitCondition(
const std::function<uint32_t(ir::Instruction*)>& condition_builder);
// Gathers all operations involved in the update of |iterator| into
// |operations|.
void GetIteratorUpdateOperations(
const ir::Loop* loop, ir::Instruction* iterator,
std::unordered_set<ir::Instruction*>* operations);
// Gathers the exiting iterator values. The function builds a map between each
// iterating value in the loop (a phi instruction in the loop header) and its
// SSA value when it exits the loop. If no exit value can be accurately found,
// it is mapped to nullptr (see comment on CanPeelLoop).
void GetIteratingExitValues();
// Returns true if a for-loop has no instructions with side effects before the
// condition check.
bool IsConditionCheckSideEffectFree() const;
// Creates a new basic block and inserts it between |bb| and the predecessor of
// |bb|.
ir::BasicBlock* CreateBlockBefore(ir::BasicBlock* bb);
// Inserts code so that |loop| is executed only if the given |condition| is true.
// |if_merge| is a suitable basic block to be used by the if condition as
// merge block.
// The function returns the if block protecting the loop.
ir::BasicBlock* ProtectLoop(ir::Loop* loop, ir::Instruction* condition,
ir::BasicBlock* if_merge);
};
} // namespace opt
} // namespace spvtools
#endif // SOURCE_OPT_LOOP_PEELING_H_
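A hedged sketch of how a future peeling pass might drive this utility; the helper name and the way |iteration_count| is obtained are hypothetical, since the commit message leaves the actual pass and its profitability analysis to a follow-up PR:

  // Peels the first |factor| iterations of |loop| if the utility's constraints
  // (LCSSA form, single exit, unambiguous iterator updates, ...) are satisfied.
  bool PeelFirstIterations(ir::IRContext* context, ir::Loop* loop,
                           ir::Instruction* iteration_count, uint32_t factor) {
    opt::LoopPeeling peeler(context, loop, iteration_count);
    if (!peeler.CanPeelLoop()) return false;
    peeler.PeelBefore(factor);  // Or peeler.PeelAfter(factor) for the last ones.
    return true;
  }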


@ -88,9 +88,7 @@ class LoopUnswitch {
// Return the iterator to the basic block |bb|.
ir::Function::iterator FindBasicBlockPosition(ir::BasicBlock* bb_to_find) {
ir::Function::iterator it = std::find_if(
function_->begin(), function_->end(),
[bb_to_find](const ir::BasicBlock& bb) { return bb_to_find == &bb; });
ir::Function::iterator it = function_->FindBlock(bb_to_find->id());
assert(it != function_->end() && "Basic Block not found");
return it;
}


@ -582,13 +582,20 @@ void LoopUtils::PopulateLoopDesc(
new_loop->SetLatchBlock(
cloning_result.old_to_new_bb_.at(old_loop->GetLatchBlock()->id()));
if (old_loop->GetMergeBlock()) {
ir::BasicBlock* bb =
cloning_result.old_to_new_bb_.at(old_loop->GetMergeBlock()->id());
auto it =
cloning_result.old_to_new_bb_.find(old_loop->GetMergeBlock()->id());
ir::BasicBlock* bb = it != cloning_result.old_to_new_bb_.end()
? it->second
: old_loop->GetMergeBlock();
new_loop->SetMergeBlock(bb);
}
if (old_loop->GetPreHeaderBlock())
new_loop->SetPreHeaderBlock(
cloning_result.old_to_new_bb_.at(old_loop->GetPreHeaderBlock()->id()));
if (old_loop->GetPreHeaderBlock()) {
auto it =
cloning_result.old_to_new_bb_.find(old_loop->GetPreHeaderBlock()->id());
if (it != cloning_result.old_to_new_bb_.end()) {
new_loop->SetPreHeaderBlock(it->second);
}
}
}
} // namespace opt


@ -87,11 +87,14 @@ class LoopUtils {
// Clone |loop_| and remap its instructions. Newly created blocks
// will be added to the |cloning_result.cloned_bb_| list, correctly ordered to
// be inserted into a function. If the loop is structured, the merge construct
// will also be cloned. The function preserves the def/use, cfg and instr to
// block analyses.
// be inserted into a function.
// It is assumed that |ordered_loop_blocks| is compatible with the result of
// |Loop::ComputeLoopStructuredOrder|. If the preheader and merge block are in
// the list they will also be cloned. If not, the resulting loop will share
// them with the original loop.
// The function preserves the def/use, cfg and instr to block analyses.
// The cloned loop nest will be added to the loop descriptor and will have
// owner ship.
// ownership.
ir::Loop* CloneLoop(
LoopCloningResult* cloning_result,
const std::vector<ir::BasicBlock*>& ordered_loop_blocks) const;
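For context, a typical call sequence (as LoopPeeling::DuplicateAndConnectLoop does earlier in this patch) looks roughly like this; |loop| and |loop_utils| are assumed to be set up already:

  LoopUtils::LoopCloningResult clone_results;
  std::vector<ir::BasicBlock*> order;
  loop->ComputeLoopStructuredOrder(&order);
  ir::Loop* clone = loop_utils.CloneLoop(&clone_results, order);
  // The caller still has to insert clone_results.cloned_bb_ into the function,
  // e.g. via ir::Function::AddBasicBlocks, and to wire up the CFG edges.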
@ -125,6 +128,15 @@ class LoopUtils {
// called, otherwise the analysis should be invalidated.
void Finalize();
// Returns the context associated with |loop_|.
ir::IRContext* GetContext() { return context_; }
// Returns the loop descriptor owning |loop_|.
ir::LoopDescriptor* GetLoopDescriptor() { return loop_desc_; }
// Returns the loop this object operates on.
ir::Loop* GetLoop() const { return loop_; }
// Returns the function that |loop_| belongs to.
ir::Function* GetFunction() const { return &function_; }
private:
ir::IRContext* context_;
ir::LoopDescriptor* loop_desc_;


@ -84,3 +84,9 @@ add_spvtools_unittest(TARGET unswitch_test
unswitch.cpp
LIBS SPIRV-Tools-opt
)
add_spvtools_unittest(TARGET peeling_test
SRCS ../function_utils.h
peeling.cpp
LIBS SPIRV-Tools-opt
)

File diff suppressed because it is too large.