From 78cf86150e5a716b149046dcbab555ff7684a0a4 Mon Sep 17 00:00:00 2001
From: Andrey Tuganov <andreyt@google.com>
Date: Wed, 2 Aug 2017 16:47:25 -0400
Subject: [PATCH] Add id descriptor feature to SPIR-V

Id descriptors are computed as a recursive hash of all instructions used
to define an id. Descriptors are invarint of actual id values and
the similar code in different files would produce the same descriptors.

Multiple ids can have the same descriptor. For example
%1 = OpConstant %u32 1
%2 = OpConstant %u32 1
would produce two ids with the same descriptor. But
%3 = OpConstant %s32 1
%4 = OpConstant %u32 2
would have descriptors different from %1 and %2.

Descriptors will be used as handles of move-to-front sequences in SPIR-V
compression.
---
 source/CMakeLists.txt               |  2 +
 source/id_descriptor.cpp            | 78 +++++++++++++++++++++++++++++
 source/id_descriptor.h              | 59 ++++++++++++++++++++++
 source/spirv_stats.cpp              | 36 ++++++++++++-
 source/spirv_stats.h                | 14 +++++-
 test/stats/stats_aggregate_test.cpp | 51 +++++++++++++++++++
 tools/stats/stats.cpp               | 35 +++++++++----
 tools/stats/stats_analyzer.cpp      | 49 +++++++++++++++++-
 tools/stats/stats_analyzer.h        |  5 ++
 9 files changed, 316 insertions(+), 13 deletions(-)
 create mode 100644 source/id_descriptor.cpp
 create mode 100644 source/id_descriptor.h
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 6ea6ad260..44bb9731a 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -207,6 +207,7 @@ set(SPIRV_SOURCES
   ${CMAKE_CURRENT_SOURCE_DIR}/enum_string_mapping.h
   ${CMAKE_CURRENT_SOURCE_DIR}/ext_inst.h
   ${CMAKE_CURRENT_SOURCE_DIR}/extensions.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/id_descriptor.h
   ${CMAKE_CURRENT_SOURCE_DIR}/instruction.h
   ${CMAKE_CURRENT_SOURCE_DIR}/macro.h
   ${CMAKE_CURRENT_SOURCE_DIR}/name_mapper.h
@@ -234,6 +235,7 @@ set(SPIRV_SOURCES
   ${CMAKE_CURRENT_SOURCE_DIR}/enum_string_mapping.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/ext_inst.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/extensions.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/id_descriptor.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/libspirv.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/message.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/name_mapper.cpp
diff --git a/source/id_descriptor.cpp b/source/id_descriptor.cpp
new file mode 100644
index 000000000..d2c6220b2
--- /dev/null
+++ b/source/id_descriptor.cpp
@@ -0,0 +1,78 @@
+// Copyright (c) 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "id_descriptor.h"
+
+#include <cassert>
+#include <iostream>
+
+#include "opcode.h"
+#include "operand.h"
+
+namespace libspirv {
+
+namespace {
+
+// Hashes an array of words. Order of words is important.
+uint32_t HashU32Array(const std::vector<uint32_t>& words) {
+  // The hash function is a sum of hashes of each word seeded by word index.
+  // Knuth's multiplicative hash is used to hash the words.
+  const uint32_t kKnuthMulHash = 2654435761;
+  uint32_t val = 0;
+  for (uint32_t i = 0; i < words.size(); ++i) {
+    val += (words[i] + i + 123) * kKnuthMulHash;
+  }
+  return val;
+}
+
+}  // namespace
+
+uint32_t IdDescriptorCollection::ProcessInstruction(
+    const spv_parsed_instruction_t& inst) {
+  if (!inst.result_id)
+    return 0;
+
+  assert(words_.empty());
+  words_.push_back(inst.words[0]);
+
+  for (size_t operand_index = 0; operand_index < inst.num_operands;
+       ++operand_index) {
+    const auto &operand = inst.operands[operand_index];
+    if (spvIsIdType(operand.type)) {
+      const uint32_t id = inst.words[operand.offset];
+      const auto it = id_to_descriptor_.find(id);
+      // Forward declared ids are not hashed.
+      if (it != id_to_descriptor_.end()) {
+        words_.push_back(it->second);
+      }
+    } else {
+      for (size_t operand_word_index = 0;
+           operand_word_index < operand.num_words; ++operand_word_index) {
+        words_.push_back(inst.words[operand.offset + operand_word_index]);
+      }
+    }
+  }
+
+  const uint32_t descriptor = HashU32Array(words_);
+  assert(descriptor);
+
+  words_.clear();
+
+  const auto result = id_to_descriptor_.emplace(inst.result_id, descriptor);
+  assert(result.second);
+  (void)result;
+  return descriptor;
+}
+
+}  // namespace libspirv
diff --git a/source/id_descriptor.h b/source/id_descriptor.h
new file mode 100644
index 000000000..b4947a78f
--- /dev/null
+++ b/source/id_descriptor.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef LIBSPIRV_ID_DESCRIPTOR_H_
+#define LIBSPIRV_ID_DESCRIPTOR_H_
+
+#include <unordered_map>
+#include <vector>
+
+#include "spirv-tools/libspirv.hpp"
+
+namespace libspirv {
+
+// Computes and stores id descriptors.
+//
+// Descriptors are computed as hash of all words in the instruction where ids
+// were substituted with previously computed descriptors.
+class IdDescriptorCollection {
+ public:
+  IdDescriptorCollection() {
+    words_.reserve(16);
+  }
+
+  // Computes descriptor for the result id of the given instruction and
+  // registers it in id_to_descriptor_. Returns the computed descriptor.
+  // This function needs to be sequentially called for every instruction in the
+  // module.
+  uint32_t ProcessInstruction(const spv_parsed_instruction_t& inst);
+
+  // Returns a previously computed descriptor id.
+  uint32_t GetDescriptor(uint32_t id) const {
+    const auto it = id_to_descriptor_.find(id);
+    if (it == id_to_descriptor_.end())
+      return 0;
+    return it->second;
+  }
+
+ private:
+  std::unordered_map<uint32_t, uint32_t> id_to_descriptor_;
+
+  // Scratch buffer used for hashing. Class member to optimize on allocation.
+  std::vector<uint32_t> words_;
+};
+
+}  // namespace libspirv
+
+#endif  // LIBSPIRV_ID_DESCRIPTOR_H_
+
diff --git a/source/spirv_stats.cpp b/source/spirv_stats.cpp
index 0c628d5d6..34cab6d5b 100644
--- a/source/spirv_stats.cpp
+++ b/source/spirv_stats.cpp
@@ -26,6 +26,7 @@
 #include "enum_string_mapping.h"
 #include "extensions.h"
 #include "instruction.h"
+#include "id_descriptor.h"
 #include "opcode.h"
 #include "operand.h"
 #include "spirv-tools/libspirv.h"
@@ -35,6 +36,7 @@
 #include "val/instruction.h"
 #include "val/validation_state.h"
 
+using libspirv::IdDescriptorCollection;
 using libspirv::Instruction;
 using libspirv::SpirvStats;
 using libspirv::ValidationState_t;
@@ -77,10 +79,41 @@ class StatsAggregator {
     ProcessEnums();
     ProcessLiteralStrings();
     ProcessNonIdWords();
+    ProcessIdDescriptors();
 
     return SPV_SUCCESS;
   }
 
+  // Collects statistics of descriptors generated by IdDescriptorCollection.
+  void ProcessIdDescriptors() {
+    const Instruction& inst = GetCurrentInstruction();
+    const uint32_t new_descriptor =
+        id_descriptors_.ProcessInstruction(inst.c_inst());
+
+    if (new_descriptor) {
+      std::stringstream ss;
+      ss << spvOpcodeString(inst.opcode());
+      for (size_t i = 1; i < inst.words().size(); ++i) {
+        ss << " " << inst.word(i);
+      }
+      stats_->id_descriptor_labels.emplace(new_descriptor, ss.str());
+    }
+
+    uint32_t index = 0;
+    for (const auto& operand : inst.operands()) {
+      if (spvIsIdType(operand.type)) {
+        const uint32_t descriptor =
+            id_descriptors_.GetDescriptor(inst.word(operand.offset));
+        if (descriptor) {
+          ++stats_->id_descriptor_hist[descriptor];
+          ++stats_->operand_slot_id_descriptor_hist[
+              std::pair<uint32_t, uint32_t>(inst.opcode(), index)][descriptor];
+        }
+      }
+      ++index;
+    }
+  }
+
   // Collects statistics of enum words for operands of specific types.
   void ProcessEnums() {
     const Instruction& inst = GetCurrentInstruction();
@@ -135,7 +168,7 @@ class StatsAggregator {
     uint32_t index = 0;
     for (const auto& operand : inst.operands()) {
       if (operand.num_words == 1 && !spvIsIdType(operand.type)) {
-          ++stats_->non_id_words_hist[std::pair<uint32_t, uint32_t>(
+          ++stats_->operand_slot_non_id_words_hist[std::pair<uint32_t, uint32_t>(
               inst.opcode(), index)][inst.word(operand.offset)];
       }
       ++index;
@@ -240,6 +273,7 @@ class StatsAggregator {
   SpirvStats* stats_;
   spv_validator_options_t validator_options_;
   std::unique_ptr<ValidationState_t> vstate_;
+  IdDescriptorCollection id_descriptors_;
 };
 
 spv_result_t ProcessHeader(
diff --git a/source/spirv_stats.h b/source/spirv_stats.h
index 4b169514e..541dd0361 100644
--- a/source/spirv_stats.h
+++ b/source/spirv_stats.h
@@ -77,7 +77,19 @@ struct SpirvStats {
   // This is a generalization of enum_hist, also includes literal integers and
   // masks.
   std::map<std::pair<uint32_t, uint32_t>,
-      std::map<uint32_t, uint32_t>> non_id_words_hist;
+      std::map<uint32_t, uint32_t>> operand_slot_non_id_words_hist;
+
+  // Historgam of descriptors generated by IdDescriptorCollection.
+  // Descriptor -> count.
+  std::unordered_map<uint32_t, uint32_t> id_descriptor_hist;
+
+  // Debut labels for id descriptors, descriptor -> label.
+  std::unordered_map<uint32_t, std::string> id_descriptor_labels;
+
+  // Historgam of descriptors generated by IdDescriptorCollection for every
+  // operand slot. pair<opcode, operand index> -> descriptor -> count.
+  std::map<std::pair<uint32_t, uint32_t>,
+      std::map<uint32_t, uint32_t>> operand_slot_id_descriptor_hist;
 
   // Histogram of literal strings, sharded by opcodes, opcode -> string -> count.
   // This is suboptimal if an opcode has multiple literal string operands,
diff --git a/test/stats/stats_aggregate_test.cpp b/test/stats/stats_aggregate_test.cpp
index 43026e802..6ea38923c 100644
--- a/test/stats/stats_aggregate_test.cpp
+++ b/test/stats/stats_aggregate_test.cpp
@@ -433,4 +433,55 @@ OpMemoryModel Logical GLSL450
   EXPECT_EQ(1u, stats.s64_constant_hist.at(-64));
 }
 
+TEST(AggregateStats, IdDescriptor) {
+  const std::string code1 = R"(
+OpCapability Addresses
+OpCapability Kernel
+OpCapability GenericPointer
+OpCapability Linkage
+OpMemoryModel Physical32 OpenCL
+%u32 = OpTypeInt 32 0
+%f32 = OpTypeFloat 32
+%1 = OpConstant %f32 1
+%2 = OpConstant %f32 1
+%3 = OpConstant %u32 32
+)";
+
+  const std::string code2 = R"(
+OpCapability Shader
+OpCapability Linkage
+OpMemoryModel Logical GLSL450
+%f32 = OpTypeFloat 32
+%u32 = OpTypeInt 32 0
+%1 = OpConstant %f32 1
+%2 = OpConstant %f32 3
+%3 = OpConstant %u32 32
+)";
+
+  const uint32_t kF32 = 1951208733;
+  const uint32_t kU32 = 2430404313;
+  const uint32_t kF32_1 = 296981500;
+  const uint32_t kF32_3 = 1450415100;
+  const uint32_t kU32_32 = 827246872;
+
+  SpirvStats stats;
+
+  CompileAndAggregateStats(code1, &stats);
+
+  {
+    const std::unordered_map<uint32_t, uint32_t> expected = {
+      {kF32, 3}, {kU32, 2}, {kF32_1, 2}, {kU32_32, 1}
+    };
+    EXPECT_EQ(expected, stats.id_descriptor_hist);
+  }
+
+  CompileAndAggregateStats(code2, &stats);
+  {
+    const std::unordered_map<uint32_t, uint32_t> expected = {
+      {kF32, 6}, {kU32, 4}, {kF32_1, 3}, {kF32_3, 1}, {kU32_32, 2}
+    };
+    EXPECT_EQ(expected, stats.id_descriptor_hist);
+  }
+}
+
 }  // namespace
diff --git a/tools/stats/stats.cpp b/tools/stats/stats.cpp
index 6e6878d33..3eea98215 100644
--- a/tools/stats/stats.cpp
+++ b/tools/stats/stats.cpp
@@ -73,6 +73,11 @@ Options:
                    Output generated C++ code for Huffman codecs for
                    single-word non-id slots.
                    This flag disables non-C++ output.
+
+  --codegen_id_descriptor_huffman_codecs
+                   Output generated C++ code for Huffman codecs for
+                   common id descriptors.
+                   This flag disables non-C++ output.
 )",
       argv0, argv0, argv0);
 }
@@ -113,6 +118,7 @@ int main(int argc, char** argv) {
   bool codegen_opcode_and_num_operands_markov_huffman_codecs = false;
   bool codegen_literal_string_huffman_codecs = false;
   bool codegen_non_id_word_huffman_codecs = false;
+  bool codegen_id_descriptor_huffman_codecs = false;
 
   std::vector<const char*> paths;
   const char* output_path = nullptr;
@@ -128,27 +134,31 @@ int main(int argc, char** argv) {
         codegen_opcode_hist = true;
         export_text = false;
       } else if (0 == strcmp(cur_arg,
-			     "--codegen_opcode_and_num_operands_hist")) {
+                             "--codegen_opcode_and_num_operands_hist")) {
         codegen_opcode_and_num_operands_hist = true;
         export_text = false;
       } else if (strcmp(
-	  "--codegen_opcode_and_num_operands_markov_huffman_codecs",
-	  cur_arg) == 0) {
+          "--codegen_opcode_and_num_operands_markov_huffman_codecs",
+          cur_arg) == 0) {
         codegen_opcode_and_num_operands_markov_huffman_codecs = true;
         export_text = false;
       } else if (0 == strcmp(cur_arg,
-			     "--codegen_literal_string_huffman_codecs")) {
+                             "--codegen_literal_string_huffman_codecs")) {
         codegen_literal_string_huffman_codecs = true;
         export_text = false;
       } else if (0 == strcmp(cur_arg,
-			     "--codegen_non_id_word_huffman_codecs")) {
+                             "--codegen_non_id_word_huffman_codecs")) {
         codegen_non_id_word_huffman_codecs = true;
         export_text = false;
+      } else if (0 == strcmp(cur_arg,
+                             "--codegen_id_descriptor_huffman_codecs")) {
+        codegen_id_descriptor_huffman_codecs = true;
+        export_text = false;
       } else if (0 == strcmp(cur_arg, "--verbose") ||
-		 0 == strcmp(cur_arg, "-v")) {
+                 0 == strcmp(cur_arg, "-v")) {
         verbose = true;
       } else if (0 == strcmp(cur_arg, "--output") ||
-		 0 == strcmp(cur_arg, "-o")) {
+                 0 == strcmp(cur_arg, "-o")) {
         expect_output_path = true;
       } else {
         PrintUsage(argv[0]);
@@ -157,10 +167,10 @@ int main(int argc, char** argv) {
       }
     } else {
       if (expect_output_path) {
-	output_path = cur_arg;
-	expect_output_path = false;
+        output_path = cur_arg;
+        expect_output_path = false;
       } else {
-	paths.push_back(cur_arg);
+        paths.push_back(cur_arg);
       }
     }
   }
@@ -255,5 +265,10 @@ int main(int argc, char** argv) {
     analyzer.WriteCodegenNonIdWordHuffmanCodecs(out);
   }
 
+  if (codegen_id_descriptor_huffman_codecs) {
+    out << std::endl;
+    analyzer.WriteCodegenIdDescriptorHuffmanCodecs(out);
+  }
+
   return 0;
 }
diff --git a/tools/stats/stats_analyzer.cpp b/tools/stats/stats_analyzer.cpp
index e196e49ab..3f2e7ec0e 100644
--- a/tools/stats/stats_analyzer.cpp
+++ b/tools/stats/stats_analyzer.cpp
@@ -726,7 +726,7 @@ void StatsAnalyzer::WriteCodegenNonIdWordHuffmanCodecs(std::ostream& out) {
       << "  std::map<std::pair<uint32_t, uint32_t>, "
       << "std::unique_ptr<HuffmanCodec<uint64_t>>> codecs;\n";
 
-  for (const auto& kv : stats_.non_id_words_hist) {
+  for (const auto& kv : stats_.operand_slot_non_id_words_hist) {
     const auto& opcode_and_index = kv.first;
     const uint32_t opcode = opcode_and_index.first;
     const uint32_t index = opcode_and_index.second;
@@ -764,3 +764,50 @@ void StatsAnalyzer::WriteCodegenNonIdWordHuffmanCodecs(std::ostream& out) {
 
   out << "  return codecs;\n}\n";
 }
+
+void StatsAnalyzer::WriteCodegenIdDescriptorHuffmanCodecs(
+    std::ostream& out) {
+  out << "std::map<std::pair<uint32_t, uint32_t>, "
+      << "std::unique_ptr<HuffmanCodec<uint64_t>>>\n"
+      << "GetIdDescriptorHuffmanCodecs() {\n"
+      << "  std::map<std::pair<uint32_t, uint32_t>, "
+      << "std::unique_ptr<HuffmanCodec<uint64_t>>> codecs;\n";
+
+  for (const auto& kv : stats_.operand_slot_id_descriptor_hist) {
+    const auto& opcode_and_index = kv.first;
+    const uint32_t opcode = opcode_and_index.first;
+    const uint32_t index = opcode_and_index.second;
+
+    const double kOpcodeFrequentEnoughToAnalyze = 0.001;
+    if (opcode_freq_[opcode] < kOpcodeFrequentEnoughToAnalyze) continue;
+
+    const std::map<uint32_t, uint32_t>& hist = kv.second;
+
+    uint32_t total = 0;
+    for (const auto& pair : hist) {
+      total += pair.second;
+    }
+
+    out << "  {\n";
+    out << "    std::unique_ptr<HuffmanCodec<uint64_t>> "
+        << "codec(new HuffmanCodec<uint64_t>({\n";
+    for (const auto& pair : hist) {
+      const uint32_t descriptor = pair.first;
+      const uint32_t count = pair.second;
+      const double freq = double(count) / double(total);
+      const double kDescriptorFrequentEnoughToAnalyze = 0.005;
+      if (freq < kDescriptorFrequentEnoughToAnalyze) continue;
+      out << "      { " << descriptor << ", " << count << " },\n";
+    }
+
+    out << "      { kMarkvNoneOfTheAbove, " << 1 + int(total * 0.05) << " },\n";
+
+    out << "    }));\n" << std::endl;
+    out << "    codecs.emplace(std::pair<uint32_t, uint32_t>(SpvOp"
+        << spvOpcodeString(SpvOp(opcode))
+        << ", " << index << "), std::move(codec));\n";
+    out << "  }\n\n";
+  }
+
+  out << "  return codecs;\n}\n";
+}
diff --git a/tools/stats/stats_analyzer.h b/tools/stats/stats_analyzer.h
index 54e2c3d8d..6cff20a09 100644
--- a/tools/stats/stats_analyzer.h
+++ b/tools/stats/stats_analyzer.h
@@ -60,6 +60,11 @@ class StatsAnalyzer {
   // specific operand slot (opcode and operand number).
   void WriteCodegenNonIdWordHuffmanCodecs(std::ostream& out);
 
+  // Writes C++ code containing a function returning a map of Huffman codecs
+  // for common id descriptors. Each Huffman codec is created for a
+  // specific operand slot (opcode and operand number).
+  void WriteCodegenIdDescriptorHuffmanCodecs(std::ostream& out);
+
  private:
   const libspirv::SpirvStats& stats_;