diff --git a/include/spirv-tools/optimizer.hpp b/include/spirv-tools/optimizer.hpp index 2998ec1ec..60518103e 100644 --- a/include/spirv-tools/optimizer.hpp +++ b/include/spirv-tools/optimizer.hpp @@ -685,9 +685,9 @@ Optimizer::PassToken CreateSSARewritePass(); // any resulting half precision values back to float32 as needed. No variables // are changed. No image operations are changed. // -// Best if run late since it will generate better code with unneeded function -// scope loads and stores and composite inserts and extracts removed. Also best -// if followed by instruction simplification, redundancy elimination and DCE. +// Best if run after function scope store/load and composite operation +// eliminations are run. Also best if followed by instruction simplification, +// redundancy elimination and DCE. Optimizer::PassToken CreateConvertRelaxedToHalfPass(); // Create relax float ops pass. diff --git a/source/opt/convert_to_half_pass.cpp b/source/opt/convert_to_half_pass.cpp index 4c02c73e2..5022e1b68 100644 --- a/source/opt/convert_to_half_pass.cpp +++ b/source/opt/convert_to_half_pass.cpp @@ -42,7 +42,7 @@ bool ConvertToHalfPass::IsFloat(Instruction* inst, uint32_t width) { return Pass::IsFloat(ty_id, width); } -bool ConvertToHalfPass::IsRelaxed(Instruction* inst) { +bool ConvertToHalfPass::IsDecoratedRelaxed(Instruction* inst) { uint32_t r_id = inst->result_id(); for (auto r_inst : get_decoration_mgr()->GetDecorationsFor(r_id, false)) if (r_inst->opcode() == SpvOpDecorate && @@ -51,6 +51,12 @@ bool ConvertToHalfPass::IsRelaxed(Instruction* inst) { return false; } +bool ConvertToHalfPass::IsRelaxed(uint32_t id) { + return relaxed_ids_set_.count(id) > 0; +} + +void ConvertToHalfPass::AddRelaxed(uint32_t id) { relaxed_ids_set_.insert(id); } + analysis::Type* ConvertToHalfPass::FloatScalarType(uint32_t width) { analysis::Float float_ty(width); return context()->get_type_mgr()->GetRegisteredType(&float_ty); @@ -87,16 +93,19 @@ uint32_t 
ConvertToHalfPass::EquivFloatTypeId(uint32_t ty_id, uint32_t width) { } void ConvertToHalfPass::GenConvert(uint32_t* val_idp, uint32_t width, - InstructionBuilder* builder) { + Instruction* inst) { Instruction* val_inst = get_def_use_mgr()->GetDef(*val_idp); uint32_t ty_id = val_inst->type_id(); uint32_t nty_id = EquivFloatTypeId(ty_id, width); if (nty_id == ty_id) return; Instruction* cvt_inst; + InstructionBuilder builder( + context(), inst, + IRContext::kAnalysisDefUse | IRContext::kAnalysisInstrToBlockMapping); if (val_inst->opcode() == SpvOpUndef) - cvt_inst = builder->AddNullaryOp(nty_id, SpvOpUndef); + cvt_inst = builder.AddNullaryOp(nty_id, SpvOpUndef); else - cvt_inst = builder->AddUnaryOp(nty_id, SpvOpFConvert, *val_idp); + cvt_inst = builder.AddUnaryOp(nty_id, SpvOpFConvert, *val_idp); *val_idp = cvt_inst->result_id(); } @@ -153,17 +162,15 @@ bool ConvertToHalfPass::GenHalfArith(Instruction* inst) { bool modified = false; // Convert all float32 based operands to float16 equivalent and change // instruction type to float16 equivalent. - InstructionBuilder builder( - context(), inst, - IRContext::kAnalysisDefUse | IRContext::kAnalysisInstrToBlockMapping); - inst->ForEachInId([&builder, &modified, this](uint32_t* idp) { + inst->ForEachInId([&inst, &modified, this](uint32_t* idp) { Instruction* op_inst = get_def_use_mgr()->GetDef(*idp); if (!IsFloat(op_inst, 32)) return; - GenConvert(idp, 16, &builder); + GenConvert(idp, 16, inst); modified = true; }); if (IsFloat(inst, 32)) { inst->SetResultType(EquivFloatTypeId(inst->type_id(), 16)); + converted_ids_.insert(inst->result_id()); modified = true; } if (modified) get_def_use_mgr()->AnalyzeInstUse(inst); @@ -171,23 +178,10 @@ bool ConvertToHalfPass::GenHalfArith(Instruction* inst) { } bool ConvertToHalfPass::ProcessPhi(Instruction* inst) { - // Skip if not float32 - if (!IsFloat(inst, 32)) return false; - // Skip if no relaxed operands. 
- bool relaxed_found = false; - uint32_t ocnt = 0; - inst->ForEachInId([&ocnt, &relaxed_found, this](uint32_t* idp) { - if (ocnt % 2 == 0) { - Instruction* val_inst = get_def_use_mgr()->GetDef(*idp); - if (IsRelaxed(val_inst)) relaxed_found = true; - } - ++ocnt; - }); - if (!relaxed_found) return false; // Add float16 converts of any float32 operands and change type // of phi to float16 equivalent. Operand converts need to be added to // preceeding blocks. - ocnt = 0; + uint32_t ocnt = 0; uint32_t* prev_idp; inst->ForEachInId([&ocnt, &prev_idp, this](uint32_t* idp) { if (ocnt % 2 == 0) { @@ -203,65 +197,32 @@ bool ConvertToHalfPass::ProcessPhi(Instruction* inst) { insert_before->opcode() != SpvOpLoopMerge) ++insert_before; } - InstructionBuilder builder(context(), &*insert_before, - IRContext::kAnalysisDefUse | - IRContext::kAnalysisInstrToBlockMapping); - GenConvert(prev_idp, 16, &builder); + GenConvert(prev_idp, 16, &*insert_before); } } ++ocnt; }); inst->SetResultType(EquivFloatTypeId(inst->type_id(), 16)); get_def_use_mgr()->AnalyzeInstUse(inst); + converted_ids_.insert(inst->result_id()); return true; } -bool ConvertToHalfPass::ProcessExtract(Instruction* inst) { - bool modified = false; - uint32_t comp_id = inst->GetSingleWordInOperand(0); - Instruction* comp_inst = get_def_use_mgr()->GetDef(comp_id); - // If extract is relaxed float32 based type and the composite is a relaxed - // float32 based type, convert it to float16 equivalent. This is slightly - // aggressive and pushes any likely conversion to apply to the whole - // composite rather than apply to each extracted component later. This - // can be a win if the platform can convert the entire composite in the same - // time as one component. It risks converting components that may not be - // used, although empirical data on a large set of real-world shaders seems - // to suggest this is not common and the composite convert is the best choice. 
- if (IsFloat(inst, 32) && IsRelaxed(inst) && IsFloat(comp_inst, 32) && - IsRelaxed(comp_inst)) { - InstructionBuilder builder( - context(), inst, - IRContext::kAnalysisDefUse | IRContext::kAnalysisInstrToBlockMapping); - GenConvert(&comp_id, 16, &builder); - inst->SetInOperand(0, {comp_id}); - comp_inst = get_def_use_mgr()->GetDef(comp_id); - modified = true; - } - // If the composite is a float16 based type, make sure the type of the - // extract agrees. - if (IsFloat(comp_inst, 16) && !IsFloat(inst, 16)) { - inst->SetResultType(EquivFloatTypeId(inst->type_id(), 16)); - modified = true; - } - if (modified) get_def_use_mgr()->AnalyzeInstUse(inst); - return modified; -} - bool ConvertToHalfPass::ProcessConvert(Instruction* inst) { // If float32 and relaxed, change to float16 convert - if (IsFloat(inst, 32) && IsRelaxed(inst)) { + if (IsFloat(inst, 32) && IsRelaxed(inst->result_id())) { inst->SetResultType(EquivFloatTypeId(inst->type_id(), 16)); get_def_use_mgr()->AnalyzeInstUse(inst); + converted_ids_.insert(inst->result_id()); } - // If operand and result types are the same, replace result with operand - // and change convert to copy to keep validator happy; DCE will clean it up + // If operand and result types are the same, change FConvert to CopyObject to + // keep validator happy; simplification and DCE will clean it up + // One way this can happen is if an FConvert generated during this pass + // (likely by ProcessPhi) is later encountered here and its operand has been + // changed to half. 
uint32_t val_id = inst->GetSingleWordInOperand(0); Instruction* val_inst = get_def_use_mgr()->GetDef(val_id); - if (inst->type_id() == val_inst->type_id()) { - context()->ReplaceAllUsesWith(inst->result_id(), val_id); - inst->SetOpcode(SpvOpCopyObject); - } + if (inst->type_id() == val_inst->type_id()) inst->SetOpcode(SpvOpCopyObject); return true; // modified } @@ -270,12 +231,8 @@ bool ConvertToHalfPass::ProcessImageRef(Instruction* inst) { // If image reference, only need to convert dref args back to float32 if (dref_image_ops_.count(inst->opcode()) != 0) { uint32_t dref_id = inst->GetSingleWordInOperand(kImageSampleDrefIdInIdx); - Instruction* dref_inst = get_def_use_mgr()->GetDef(dref_id); - if (IsFloat(dref_inst, 16) && IsRelaxed(dref_inst)) { - InstructionBuilder builder( - context(), inst, - IRContext::kAnalysisDefUse | IRContext::kAnalysisInstrToBlockMapping); - GenConvert(&dref_id, 32, &builder); + if (converted_ids_.count(dref_id) > 0) { + GenConvert(&dref_id, 32, inst); inst->SetInOperand(kImageSampleDrefIdInIdx, {dref_id}); get_def_use_mgr()->AnalyzeInstUse(inst); modified = true; @@ -288,32 +245,24 @@ bool ConvertToHalfPass::ProcessDefault(Instruction* inst) { bool modified = false; // If non-relaxed instruction has changed operands, need to convert // them back to float32 - InstructionBuilder builder( - context(), inst, - IRContext::kAnalysisDefUse | IRContext::kAnalysisInstrToBlockMapping); - inst->ForEachInId([&builder, &modified, this](uint32_t* idp) { - Instruction* op_inst = get_def_use_mgr()->GetDef(*idp); - if (!IsFloat(op_inst, 16)) return; - if (!IsRelaxed(op_inst)) return; + inst->ForEachInId([&inst, &modified, this](uint32_t* idp) { + if (converted_ids_.count(*idp) == 0) return; uint32_t old_id = *idp; - GenConvert(idp, 32, &builder); + GenConvert(idp, 32, inst); if (*idp != old_id) modified = true; }); if (modified) get_def_use_mgr()->AnalyzeInstUse(inst); return modified; } -bool ConvertToHalfPass::GenHalfCode(Instruction* inst) { +bool 
ConvertToHalfPass::GenHalfInst(Instruction* inst) { bool modified = false; // Remember id for later deletion of RelaxedPrecision decoration - bool inst_relaxed = IsRelaxed(inst); - if (inst_relaxed) relaxed_ids_.push_back(inst->result_id()); + bool inst_relaxed = IsRelaxed(inst->result_id()); if (IsArithmetic(inst) && inst_relaxed) modified = GenHalfArith(inst); - else if (inst->opcode() == SpvOpPhi) + else if (inst->opcode() == SpvOpPhi && inst_relaxed) modified = ProcessPhi(inst); - else if (inst->opcode() == SpvOpCompositeExtract) - modified = ProcessExtract(inst); else if (inst->opcode() == SpvOpFConvert) modified = ProcessConvert(inst); else if (image_ops_.count(inst->opcode()) != 0) @@ -323,13 +272,62 @@ bool ConvertToHalfPass::GenHalfCode(Instruction* inst) { return modified; } +bool ConvertToHalfPass::CloseRelaxInst(Instruction* inst) { + if (inst->result_id() == 0) return false; + if (IsRelaxed(inst->result_id())) return false; + if (!IsFloat(inst, 32)) return false; + if (IsDecoratedRelaxed(inst)) { + AddRelaxed(inst->result_id()); + return true; + } + if (closure_ops_.count(inst->opcode()) == 0) return false; + // Can relax if all float operands are relaxed + bool relax = true; + inst->ForEachInId([&relax, this](uint32_t* idp) { + Instruction* op_inst = get_def_use_mgr()->GetDef(*idp); + if (!IsFloat(op_inst, 32)) return; + if (!IsRelaxed(*idp)) relax = false; + }); + if (relax) { + AddRelaxed(inst->result_id()); + return true; + } + // Can relax if all uses are relaxed + relax = true; + get_def_use_mgr()->ForEachUser(inst, [&relax, this](Instruction* uinst) { + if (uinst->result_id() == 0 || !IsFloat(uinst, 32) || + (!IsDecoratedRelaxed(uinst) && !IsRelaxed(uinst->result_id()))) { + relax = false; + return; + } + }); + if (relax) { + AddRelaxed(inst->result_id()); + return true; + } + return false; +} + bool ConvertToHalfPass::ProcessFunction(Function* func) { + // Do a closure of Relaxed on composite and phi instructions + bool changed = true; + while 
(changed) { + changed = false; + cfg()->ForEachBlockInReversePostOrder( + func->entry().get(), [&changed, this](BasicBlock* bb) { + for (auto ii = bb->begin(); ii != bb->end(); ++ii) + changed |= CloseRelaxInst(&*ii); + }); + } + // Do convert of relaxed instructions to half precision bool modified = false; cfg()->ForEachBlockInReversePostOrder( func->entry().get(), [&modified, this](BasicBlock* bb) { for (auto ii = bb->begin(); ii != bb->end(); ++ii) - modified |= GenHalfCode(&*ii); + modified |= GenHalfInst(&*ii); }); + // Replace invalid converts of matrix into equivalent vector extracts, + // converts and finally a composite construct cfg()->ForEachBlockInReversePostOrder( func->entry().get(), [&modified, this](BasicBlock* bb) { for (auto ii = bb->begin(); ii != bb->end(); ++ii) @@ -346,7 +344,7 @@ Pass::Status ConvertToHalfPass::ProcessImpl() { // If modified, make sure module has Float16 capability if (modified) context()->AddCapability(SpvCapabilityFloat16); // Remove all RelaxedPrecision decorations from instructions and globals - for (auto c_id : relaxed_ids_) RemoveRelaxedDecoration(c_id); + for (auto c_id : relaxed_ids_set_) RemoveRelaxedDecoration(c_id); for (auto& val : get_module()->types_values()) { uint32_t v_id = val.result_id(); if (v_id != 0) RemoveRelaxedDecoration(v_id); @@ -366,6 +364,7 @@ void ConvertToHalfPass::Initialize() { SpvOpVectorShuffle, SpvOpCompositeConstruct, SpvOpCompositeInsert, + SpvOpCompositeExtract, SpvOpCopyObject, SpvOpTranspose, SpvOpConvertSToF, @@ -453,7 +452,19 @@ void ConvertToHalfPass::Initialize() { SpvOpImageSparseSampleProjDrefExplicitLod, SpvOpImageSparseDrefGather, }; - relaxed_ids_.clear(); + closure_ops_ = { + SpvOpVectorExtractDynamic, + SpvOpVectorInsertDynamic, + SpvOpVectorShuffle, + SpvOpCompositeConstruct, + SpvOpCompositeInsert, + SpvOpCompositeExtract, + SpvOpCopyObject, + SpvOpTranspose, + SpvOpPhi, + }; + relaxed_ids_set_.clear(); + converted_ids_.clear(); } } // namespace opt diff --git 
a/source/opt/convert_to_half_pass.h b/source/opt/convert_to_half_pass.h index 522584861..143aebfa2 100644 --- a/source/opt/convert_to_half_pass.h +++ b/source/opt/convert_to_half_pass.h @@ -38,7 +38,8 @@ class ConvertToHalfPass : public Pass { const char* name() const override { return "convert-to-half-pass"; } private: - // Return true if |inst| is an arithmetic op that can be of type float16 + // Return true if |inst| is an arithmetic, composite or phi op that can be + // of type float16 bool IsArithmetic(Instruction* inst); // Return true if |inst| returns scalar, vector or matrix type with base @@ -46,7 +47,13 @@ class ConvertToHalfPass : public Pass { bool IsFloat(Instruction* inst, uint32_t width); // Return true if |inst| is decorated with RelaxedPrecision - bool IsRelaxed(Instruction* inst); + bool IsDecoratedRelaxed(Instruction* inst); + + // Return true if |id| has been added to the relaxed id set + bool IsRelaxed(uint32_t id); + + // Add |id| to the relaxed id set + void AddRelaxed(uint32_t id); // Return type id for float with |width| analysis::Type* FloatScalarType(uint32_t width); @@ -64,19 +71,23 @@ class ConvertToHalfPass : public Pass { // Append instructions to builder to convert value |*val_idp| to type // |ty_id| but with |width|. Set |*val_idp| to the new id. - void GenConvert(uint32_t* val_idp, uint32_t width, - InstructionBuilder* builder); + void GenConvert(uint32_t* val_idp, uint32_t width, Instruction* inst); // Remove RelaxedPrecision decoration of |id|. void RemoveRelaxedDecoration(uint32_t id); + // Add |inst| to relaxed instruction set if warranted. Specifically, if + // it is float32 and either decorated relaxed or a composite or phi + // instruction where all operands are relaxed or all uses are relaxed. 
+ bool CloseRelaxInst(Instruction* inst); + // If |inst| is an arithmetic, phi, extract or convert instruction of float32 // base type and decorated with RelaxedPrecision, change it to the equivalent // float16 based type instruction. Specifically, insert instructions to // convert all operands to float16 (if needed) and change its type to the // equivalent float16 type. Otherwise, insert instructions to convert its // operands back to their original types, if needed. - bool GenHalfCode(Instruction* inst); + bool GenHalfInst(Instruction* inst); // Gen code for relaxed arithmetic |inst| bool GenHalfArith(Instruction* inst); @@ -84,9 +95,6 @@ class ConvertToHalfPass : public Pass { // Gen code for relaxed phi |inst| bool ProcessPhi(Instruction* inst); - // Gen code for relaxed extract |inst| - bool ProcessExtract(Instruction* inst); - // Gen code for relaxed convert |inst| bool ProcessConvert(Instruction* inst); @@ -98,11 +106,11 @@ class ConvertToHalfPass : public Pass { // If |inst| is an FConvert of a matrix type, decompose it to a series // of vector extracts, converts and inserts into an Undef. These are - // generated by GenHalfCode because they are easier to manipulate, but are + // generated by GenHalfInst because they are easier to manipulate, but are // invalid so we need to clean them up. bool MatConvertCleanup(Instruction* inst); - // Call GenHalfCode on every instruction in |func|. + // Call GenHalfInst on every instruction in |func|. // If code is generated for an instruction, replace the instruction // with the new instructions that are generated. 
bool ProcessFunction(Function* func); @@ -124,8 +132,14 @@ class ConvertToHalfPass : public Pass { // Set of dref sample operations std::unordered_set dref_image_ops_; + // Set of composite and copy ops that can be marked relaxed by closure + std::unordered_set closure_ops_; + + // Set of ids of all relaxed instructions + std::unordered_set relaxed_ids_set_; + // Ids of all converted instructions - std::vector relaxed_ids_; + std::unordered_set converted_ids_; }; } // namespace opt diff --git a/test/opt/convert_relaxed_to_half_test.cpp b/test/opt/convert_relaxed_to_half_test.cpp index 3ac80099f..c13815477 100644 --- a/test/opt/convert_relaxed_to_half_test.cpp +++ b/test/opt/convert_relaxed_to_half_test.cpp @@ -997,7 +997,7 @@ OpFunctionEnd %99 = OpFConvert %v4half %15 OpBranch %65 %65 = OpLabel -%96 = OpPhi %v4half %99 %5 %76 %71 +%96 = OpPhi %v4half %99 %5 %100 %71 %95 = OpPhi %int %int_0 %5 %78 %71 %70 = OpSLessThan %bool %95 %int_10 OpLoopMerge %66 %71 None @@ -1222,6 +1222,115 @@ OpFunctionEnd defs_after + func_after, true, true); } +TEST_F(ConvertToHalfTest, ConvertToHalfWithClosure) { + // Include as many contiguous composite instructions as possible into + // half-precision computations + // + // Compiled with glslang -V -Os + // + // clang-format off + // + // #version 410 core + // + // precision mediump float; + // + // layout(location = 1) in vec3 foo; + // layout(location = 2) in mat2 bar; + // layout(location = 1) out vec3 res; + // + // vec3 func(vec3 tap, mat2 M) { + // return vec3(M * tap.xy, 1.0); + // } + // + // void main() { + // res = func(foo, bar); + // } + // + // clang-format on + + const std::string defs = + R"(OpCapability Shader +; CHECK: OpCapability Float16 +%1 = OpExtInstImport "GLSL.std.450" +OpMemoryModel Logical GLSL450 +OpEntryPoint Fragment %main "main" %res %foo %bar +OpExecutionMode %main OriginUpperLeft +OpSource GLSL 410 +OpName %main "main" +OpName %res "res" +OpName %foo "foo" +OpName %bar "bar" +OpDecorate %res RelaxedPrecision +; CHECK-NOT: OpDecorate %res
RelaxedPrecision +OpDecorate %res Location 1 +OpDecorate %foo RelaxedPrecision +; CHECK-NOT: OpDecorate %foo RelaxedPrecision +OpDecorate %foo Location 1 +OpDecorate %bar RelaxedPrecision +; CHECK-NOT: OpDecorate %bar RelaxedPrecision +OpDecorate %bar Location 2 +OpDecorate %34 RelaxedPrecision +OpDecorate %36 RelaxedPrecision +OpDecorate %41 RelaxedPrecision +OpDecorate %42 RelaxedPrecision +; CHECK-NOT: OpDecorate %34 RelaxedPrecision +; CHECK-NOT: OpDecorate %36 RelaxedPrecision +; CHECK-NOT: OpDecorate %41 RelaxedPrecision +; CHECK-NOT: OpDecorate %42 RelaxedPrecision +%void = OpTypeVoid +%3 = OpTypeFunction %void +%float = OpTypeFloat 32 +%v3float = OpTypeVector %float 3 +%v2float = OpTypeVector %float 2 +%mat2v2float = OpTypeMatrix %v2float 2 +%float_1 = OpConstant %float 1 +%_ptr_Output_v3float = OpTypePointer Output %v3float +%res = OpVariable %_ptr_Output_v3float Output +%_ptr_Input_v3float = OpTypePointer Input %v3float +%foo = OpVariable %_ptr_Input_v3float Input +%_ptr_Input_mat2v2float = OpTypePointer Input %mat2v2float +%bar = OpVariable %_ptr_Input_mat2v2float Input +)"; + + const std::string func = + R"(%main = OpFunction %void None %3 +%5 = OpLabel +%34 = OpLoad %v3float %foo +%36 = OpLoad %mat2v2float %bar +; CHECK: %48 = OpFConvert %v3half %34 +; CHECK: %49 = OpFConvert %v3half %34 +%41 = OpVectorShuffle %v2float %34 %34 0 1 +; CHECK-NOT: %41 = OpVectorShuffle %v2float %34 %34 0 1 +; CHECK: %41 = OpVectorShuffle %v2half %48 %49 0 1 +%42 = OpMatrixTimesVector %v2float %36 %41 +; CHECK-NOT: %42 = OpMatrixTimesVector %v2float %36 %41 +; CHECK: %55 = OpCompositeExtract %v2float %36 0 +; CHECK: %56 = OpFConvert %v2half %55 +; CHECK: %57 = OpCompositeExtract %v2float %36 1 +; CHECK: %58 = OpFConvert %v2half %57 +; CHECK: %59 = OpCompositeConstruct %mat2v2half %56 %58 +; CHECK: %52 = OpCopyObject %mat2v2float %36 +; CHECK: %42 = OpMatrixTimesVector %v2half %59 %41 +%43 = OpCompositeExtract %float %42 0 +%44 = OpCompositeExtract %float %42 1 +; 
CHECK-NOT: %43 = OpCompositeExtract %float %42 0 +; CHECK-NOT: %44 = OpCompositeExtract %float %42 1 +; CHECK: %43 = OpCompositeExtract %half %42 0 +; CHECK: %44 = OpCompositeExtract %half %42 1 +%45 = OpCompositeConstruct %v3float %43 %44 %float_1 +; CHECK-NOT: %45 = OpCompositeConstruct %v3float %43 %44 %float_1 +; CHECK: %53 = OpFConvert %float %43 +; CHECK: %54 = OpFConvert %float %44 +; CHECK: %45 = OpCompositeConstruct %v3float %53 %54 %float_1 +OpStore %res %45 +OpReturn +OpFunctionEnd +)"; + + SetAssembleOptions(SPV_TEXT_TO_BINARY_OPTION_PRESERVE_NUMERIC_IDS); + SinglePassRunAndMatch(defs + func, true); +} + } // namespace } // namespace opt } // namespace spvtools