From 6dee1eec3be1786a9816442c137221478d16c15d Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Wed, 22 Dec 2021 13:54:49 -0500
Subject: [PATCH] Refactor summary API and update nvbench/summary.cuh docs.

The string used when constructing a summary is no longer a
human-readable name, but rather a tag string (e.g.
"nv/cold/time/gpu/mean"). Tags make lookups easier and more stable
going forward.

The name vs. short_name distinction no longer exists. Now there is
just "name", which is used for column headings. The "description"
string may still be used for detailed information.

Updated the json tests and compare script to reflect these changes.
---
 nvbench/csv_printer.cu          | 22 ++++-----
 nvbench/detail/measure_cold.cu  | 57 +++++++++++------------
 nvbench/detail/measure_cupti.cu | 28 +++++-------
 nvbench/detail/measure_hot.cu   | 29 ++++++------
 nvbench/json_printer.cu         |  2 +-
 nvbench/markdown_printer.cu     | 22 ++++-----
 nvbench/state.cuh               |  8 ++--
 nvbench/state.cxx               | 81 ++++++++++++++++++++++-----------
 nvbench/summary.cuh             | 66 ++++++++++++++++-----------
 nvbench/type_axis.cxx           |  3 +-
 scripts/nvbench_compare.py      |  8 ++--
 testing/device/noisy_bench.cu   |  4 +-
 12 files changed, 182 insertions(+), 148 deletions(-)

diff --git a/nvbench/csv_printer.cu b/nvbench/csv_printer.cu
index df64518..6acb535 100644
--- a/nvbench/csv_printer.cu
+++ b/nvbench/csv_printer.cu
@@ -116,10 +116,10 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
         {
           continue;
         }
-        const std::string &key    = summ.get_name();
-        const std::string &header = summ.has_value("short_name")
-                                      ? summ.get_string("short_name")
-                                      : key;
+        const std::string &tag    = summ.get_tag();
+        const std::string &header = summ.has_value("name")
+                                      ? summ.get_string("name")
+                                      : tag;
 
         const std::string hint = summ.has_value("hint")
                                    ? summ.get_string("hint")
@@ -127,31 +127,31 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
         std::string value = std::visit(format_visitor, summ.get_value("value"));
         if (hint == "duration")
         {
-          table.add_cell(row, key, header + " (sec)", std::move(value));
+          table.add_cell(row, tag, header + " (sec)", std::move(value));
         }
         else if (hint == "item_rate")
         {
-          table.add_cell(row, key, header + " (elem/sec)", std::move(value));
+          table.add_cell(row, tag, header + " (elem/sec)", std::move(value));
         }
         else if (hint == "bytes")
         {
-          table.add_cell(row, key, header + " (bytes)", std::move(value));
+          table.add_cell(row, tag, header + " (bytes)", std::move(value));
         }
         else if (hint == "byte_rate")
         {
-          table.add_cell(row, key, header + " (bytes/sec)", std::move(value));
+          table.add_cell(row, tag, header + " (bytes/sec)", std::move(value));
         }
         else if (hint == "sample_size")
         {
-          table.add_cell(row, key, header, std::move(value));
+          table.add_cell(row, tag, header, std::move(value));
         }
         else if (hint == "percentage")
         {
-          table.add_cell(row, key, header, std::move(value));
+          table.add_cell(row, tag, header, std::move(value));
         }
         else
         {
-          table.add_cell(row, key, header, std::move(value));
+          table.add_cell(row, tag, header, std::move(value));
         }
       }
       row++;
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index 95a4c37..4e19362 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -175,53 +175,50 @@ void measure_cold_base::generate_summaries()
 {
   const auto d_samples = static_cast<double>(m_total_samples);
   {
-    auto &summ = m_state.add_summary("Number of Samples (Cold)");
+    auto &summ = m_state.add_summary("nv/cold/sample_size");
+    summ.set_string("name", "Samples");
     summ.set_string("hint", "sample_size");
-    summ.set_string("short_name", "Samples");
-    summ.set_string("description",
-                    "Number of kernel executions in cold time measurements.");
+    summ.set_string("description", "Number of isolated kernel executions");
     summ.set_int64("value", m_total_samples);
   }
 
   const auto avg_cpu_time = m_total_cpu_time / d_samples;
   {
-    auto &summ = m_state.add_summary("Average CPU Time (Cold)");
+    auto &summ = m_state.add_summary("nv/cold/time/cpu/mean");
+    summ.set_string("name", "CPU Time");
     summ.set_string("hint", "duration");
-    summ.set_string("short_name", "CPU Time");
     summ.set_string("description",
-                    "Average isolated kernel execution time observed "
-                    "from host.");
+                    "Mean isolated kernel execution time "
+                    "(measured on host CPU)");
     summ.set_float64("value", avg_cpu_time);
   }
 
   {
-    auto &summ = m_state.add_summary("CPU Relative Standard Deviation (Cold)");
+    auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/relative");
+    summ.set_string("name", "Noise");
     summ.set_string("hint", "percentage");
-    summ.set_string("short_name", "Noise");
     summ.set_string("description",
-                    "Relative standard deviation of the cold CPU execution "
-                    "time measurements.");
+                    "Relative standard deviation of isolated CPU times");
     summ.set_float64("value", m_cpu_noise);
   }
 
   const auto avg_cuda_time = m_total_cuda_time / d_samples;
   {
-    auto &summ = m_state.add_summary("Average GPU Time (Cold)");
+    auto &summ = m_state.add_summary("nv/cold/time/gpu/mean");
+    summ.set_string("name", "GPU Time");
     summ.set_string("hint", "duration");
-    summ.set_string("short_name", "GPU Time");
     summ.set_string("description",
-                    "Average isolated kernel execution time as measured "
-                    "by CUDA events.");
+                    "Mean isolated kernel execution time "
+                    "(measured with CUDA events)");
     summ.set_float64("value", avg_cuda_time);
   }
 
   {
-    auto &summ = m_state.add_summary("GPU Relative Standard Deviation (Cold)");
+    auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/relative");
+    summ.set_string("name", "Noise");
     summ.set_string("hint", "percentage");
-    summ.set_string("short_name", "Noise");
     summ.set_string("description",
-                    "Relative standard deviation of the cold GPU execution "
-                    "time measurements.");
+                    "Relative standard deviation of isolated GPU times");
     summ.set_float64("value",
                      m_noise_tracker.empty()
                        ? std::numeric_limits<nvbench::float64_t>::infinity()
@@ -230,11 +227,11 @@ void measure_cold_base::generate_summaries()
 
   if (const auto items = m_state.get_element_count(); items != 0)
   {
-    auto &summ = m_state.add_summary("Element Throughput");
+    auto &summ = m_state.add_summary("nv/cold/bw/item_rate");
+    summ.set_string("name", "Elem/s");
     summ.set_string("hint", "item_rate");
-    summ.set_string("short_name", "Elem/s");
     summ.set_string("description",
-                    "Number of input elements handled per second.");
+                    "Number of input elements processed per second");
     summ.set_float64("value", static_cast<double>(items) / avg_cuda_time);
   }
 
@@ -242,12 +239,12 @@ void measure_cold_base::generate_summaries()
   {
     const auto avg_used_gmem_bw = static_cast<double>(bytes) / avg_cuda_time;
     {
-      auto &summ = m_state.add_summary("Average Global Memory Throughput");
+      auto &summ = m_state.add_summary("nv/cold/bw/global/bytes_per_second");
+      summ.set_string("name", "GlobalMem BW");
       summ.set_string("hint", "byte_rate");
-      summ.set_string("short_name", "GlobalMem BW");
       summ.set_string("description",
                       "Number of bytes read/written per second to the CUDA "
-                      "device's global memory.");
+                      "device's global memory");
       summ.set_float64("value", avg_used_gmem_bw);
     }
 
@@ -255,12 +252,12 @@ void measure_cold_base::generate_summaries()
       const auto peak_gmem_bw = static_cast<double>(
         m_state.get_device()->get_global_memory_bus_bandwidth());
 
-      auto &summ = m_state.add_summary("Percent Peak Global Memory Throughput");
+      auto &summ = m_state.add_summary("nv/cold/bw/global/utilization");
+      summ.set_string("name", "BWUtil");
       summ.set_string("hint", "percentage");
-      summ.set_string("short_name", "BWPeak");
       summ.set_string("description",
-                      "Global device memory throughput as a percentage of the "
-                      "device's peak bandwidth.");
+                      "Global device memory utilization as a percentage of the "
+                      "device's peak bandwidth");
       summ.set_float64("value", avg_used_gmem_bw / peak_gmem_bw);
     }
   }
diff --git a/nvbench/detail/measure_cupti.cu b/nvbench/detail/measure_cupti.cu
index e2952fd..f208632 100644
--- a/nvbench/detail/measure_cupti.cu
+++ b/nvbench/detail/measure_cupti.cu
@@ -53,11 +53,8 @@ struct metric_traits<metric_id::dram_peak_sustained_throughput>
   static constexpr const char *metric_name =
     "dram__throughput.avg.pct_of_peak_sustained_elapsed";
 
-  static constexpr const char *summary =
-    "Peak Sustained Global Memory Throughput (HW)";
-
-  static constexpr const char *hint       = "percentage";
-  static constexpr const char *short_name = "HBWPeak";
+  static constexpr const char *name = "HBWPeak";
+  static constexpr const char *hint = "percentage";
 
   static constexpr const char *description =
     "The utilization level of the device memory relative to the peak "
@@ -77,9 +74,8 @@ struct metric_traits<metric_id::global_load_efficiency>
   static constexpr const char *metric_name =
     "smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct";
 
-  static constexpr const char *summary    = "Global Load Efficiency (HW)";
-  static constexpr const char *hint       = "percentage";
-  static constexpr const char *short_name = "LoadEff";
+  static constexpr const char *name = "LoadEff";
+  static constexpr const char *hint = "percentage";
 
   static constexpr const char *description =
     "Ratio of requested global memory load throughput to required global "
@@ -99,9 +95,8 @@ struct metric_traits<metric_id::global_store_efficiency>
   static constexpr const char *metric_name =
     "smsp__sass_average_data_bytes_per_sector_mem_global_op_st.pct";
 
-  static constexpr const char *summary    = "Global Store Efficiency (HW)";
-  static constexpr const char *hint       = "percentage";
-  static constexpr const char *short_name = "StoreEff";
+  static constexpr const char *name = "StoreEff";
+  static constexpr const char *hint = "percentage";
 
   static constexpr const char *description =
     "Ratio of requested global memory store throughput to required global "
@@ -119,9 +114,8 @@ template <>
 struct metric_traits<metric_id::l1_hit_rate>
 {
   static constexpr const char *metric_name = "l1tex__t_sector_hit_rate.pct";
-  static constexpr const char *summary     = "L1 Cache Hit Rate (HW)";
+  static constexpr const char *name        = "L1HitRate";
   static constexpr const char *hint        = "percentage";
-  static constexpr const char *short_name  = "L1HitRate";
   static constexpr const char *description = "Hit rate at L1 cache.";
   static constexpr double divider          = 100.0;
 
@@ -135,9 +129,8 @@ template <>
 struct metric_traits<metric_id::l2_hit_rate>
 {
   static constexpr const char *metric_name = "lts__t_sector_hit_rate.pct";
-  static constexpr const char *summary     = "L2 Cache Hit Rate (HW)";
+  static constexpr const char *name        = "L2HitRate";
   static constexpr const char *hint        = "percentage";
-  static constexpr const char *short_name  = "L2HitRate";
   static constexpr const char *description = "Hit rate at L2 cache.";
   static constexpr double divider          = 100.0;
 
@@ -219,9 +212,10 @@ void gen_summary(std::size_t result_id,
 
   if (metric::is_collected(m_state))
   {
-    auto &summ = m_state.add_summary(metric::summary);
+    auto &summ =
+      m_state.add_summary(fmt::format("nv/cupti/{}", metric::metric_name));
+    summ.set_string("name", metric::name);
     summ.set_string("hint", metric::hint);
-    summ.set_string("short_name", metric::short_name);
     summ.set_string("description", metric::description);
     summ.set_float64("value", result[result_id++] / metric::divider);
   }
diff --git a/nvbench/detail/measure_hot.cu b/nvbench/detail/measure_hot.cu
index 62efb59..ed7612a 100644
--- a/nvbench/detail/measure_hot.cu
+++ b/nvbench/detail/measure_hot.cu
@@ -47,7 +47,7 @@ measure_hot_base::measure_hot_base(state &exec_state)
   try
   {
     nvbench::int64_t cold_samples =
-      m_state.get_summary("Number of Samples (Cold)").get_int64("value");
+      m_state.get_summary("nv/cold/sample_size").get_int64("value");
     m_min_samples = std::max(m_min_samples, cold_samples);
 
     // If the cold measurement ran successfully, disable skip_time. It'd just
@@ -85,25 +85,24 @@ void measure_hot_base::check()
 
 void measure_hot_base::generate_summaries()
 {
-  const auto d_samples     = static_cast<double>(m_total_samples);
+  const auto d_samples = static_cast<double>(m_total_samples);
+  {
+    auto &summ = m_state.add_summary("nv/batch/sample_size");
+    summ.set_string("name", "Samples");
+    summ.set_string("hint", "sample_size");
+    summ.set_string("description", "Number of batch kernel executions");
+    summ.set_int64("value", m_total_samples);
+  }
+
   const auto avg_cuda_time = m_total_cuda_time / d_samples;
   {
-    auto &summ = m_state.add_summary("Average GPU Time (Batch)");
+    auto &summ = m_state.add_summary("nv/batch/time/gpu/mean");
+    summ.set_string("name", "Batch GPU");
     summ.set_string("hint", "duration");
-    summ.set_string("short_name", "Batch GPU");
     summ.set_string("description",
-                    "Average back-to-back kernel execution time as measured "
-                    "by CUDA events.");
+                    "Mean batch kernel execution time "
+                    "(measured by CUDA events)");
     summ.set_float64("value", avg_cuda_time);
   }
-
-  {
-    auto &summ = m_state.add_summary("Number of Samples (Batch)");
-    summ.set_string("hint", "sample_size");
-    summ.set_string("short_name", "Batch");
-    summ.set_string("description",
-                    "Number of kernel executions in hot time measurements.");
-    summ.set_int64("value", m_total_samples);
-  }
 
   // Log if a printer exists:
@@ -163,8 +162,7 @@ void measure_hot_base::check_skip_time(nvbench::float64_t warmup_time)
 
 void measure_hot_base::block_stream()
 {
-  m_blocker.block(m_launch.get_stream(),
-                  m_state.get_blocking_kernel_timeout());
+  m_blocker.block(m_launch.get_stream(), m_state.get_blocking_kernel_timeout());
 }
 
 } // namespace nvbench::detail
diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu
index c087d02..ae3c448 100644
--- a/nvbench/json_printer.cu
+++ b/nvbench/json_printer.cu
@@ -196,7 +196,7 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
         auto &summaries = st["summaries"];
         for (const auto &exec_summ : exec_state.get_summaries())
         {
-          auto &summ            = summaries[exec_summ.get_name()];
+          auto &summ            = summaries[exec_summ.get_tag()];
           ::write_named_values(summ, exec_summ);
         }
 
diff --git a/nvbench/markdown_printer.cu b/nvbench/markdown_printer.cu
index a23b360..276ca86 100644
--- a/nvbench/markdown_printer.cu
+++ b/nvbench/markdown_printer.cu
@@ -307,43 +307,43 @@ void markdown_printer::do_print_benchmark_results(
             {
               continue;
             }
-            const std::string &key    = summ.get_name();
-            const std::string &header = summ.has_value("short_name")
-                                          ? summ.get_string("short_name")
-                                          : key;
+            const std::string &tag    = summ.get_tag();
+            const std::string &header = summ.has_value("name")
+                                          ? summ.get_string("name")
+                                          : tag;
 
             std::string hint = summ.has_value("hint") ? summ.get_string("hint")
                                                       : std::string{};
             if (hint == "duration")
             {
-              table.add_cell(row, key, header, this->do_format_duration(summ));
+              table.add_cell(row, tag, header, this->do_format_duration(summ));
             }
             else if (hint == "item_rate")
             {
-              table.add_cell(row, key, header, this->do_format_item_rate(summ));
+              table.add_cell(row, tag, header, this->do_format_item_rate(summ));
             }
             else if (hint == "bytes")
             {
-              table.add_cell(row, key, header, this->do_format_bytes(summ));
+              table.add_cell(row, tag, header, this->do_format_bytes(summ));
             }
             else if (hint == "byte_rate")
             {
-              table.add_cell(row, key, header, this->do_format_byte_rate(summ));
+              table.add_cell(row, tag, header, this->do_format_byte_rate(summ));
             }
             else if (hint == "sample_size")
             {
               table.add_cell(row,
-                             key,
+                             tag,
                              header,
                              this->do_format_sample_size(summ));
             }
             else if (hint == "percentage")
             {
-              table.add_cell(row, key, header, this->do_format_percentage(summ));
+              table.add_cell(row, tag, header, this->do_format_percentage(summ));
             }
             else
             {
-              table.add_cell(row, key, header, this->do_format_default(summ));
+              table.add_cell(row, tag, header, this->do_format_default(summ));
             }
           }
           row++;
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index 8cf3e9c..0b0b4cc 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -119,7 +119,7 @@ struct state
                                 std::string column_name = {});
 
   void add_buffer_size(std::size_t num_bytes,
-                       std::string summary_name,
+                       std::string summary_tag,
                        std::string column_name = {},
                        std::string description = {});
 
@@ -266,10 +266,10 @@ struct state
         || is_dram_throughput_collected();
   }
 
-  summary &add_summary(std::string summary_name);
+  summary &add_summary(std::string summary_tag);
   summary &add_summary(summary s);
-  [[nodiscard]] const summary &get_summary(std::string_view name) const;
-  [[nodiscard]] summary &get_summary(std::string_view name);
+  [[nodiscard]] const summary &get_summary(std::string_view tag) const;
+  [[nodiscard]] summary &get_summary(std::string_view tag);
   [[nodiscard]] const std::vector<summary> &get_summaries() const;
   [[nodiscard]] std::vector<summary> &get_summaries();
 
diff --git a/nvbench/state.cxx b/nvbench/state.cxx
index 505722a..c7139ce 100644
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -109,9 +109,9 @@ catch (...)
   return default_value;
 }
 
-summary &state::add_summary(std::string summary_name)
+summary &state::add_summary(std::string summary_tag)
 {
-  return m_summaries.emplace_back(std::move(summary_name));
+  return m_summaries.emplace_back(std::move(summary_tag));
 }
 
 summary &state::add_summary(summary s)
@@ -120,29 +120,54 @@ summary &state::add_summary(summary s)
   return m_summaries.back();
 }
 
-const summary &state::get_summary(std::string_view name) const
+const summary &state::get_summary(std::string_view tag) const
 {
+  // Check tags first
   auto iter =
     std::find_if(m_summaries.cbegin(),
                  m_summaries.cend(),
-                 [&name](const auto &s) { return s.get_name() == name; });
-  if (iter == m_summaries.cend())
+                 [&tag](const auto &s) { return s.get_tag() == tag; });
+  if (iter != m_summaries.cend())
   {
-    NVBENCH_THROW(std::invalid_argument, "No summary named '{}'.", name);
+    return *iter;
   }
-  return *iter;
+
+  // Then names; "name" may be absent, so test for it before reading it:
+  const auto name_matches = [&tag](const auto &s) {
+    return s.has_value("name") && s.get_string("name") == tag;
+  };
+  iter = std::find_if(m_summaries.cbegin(), m_summaries.cend(), name_matches);
+  if (iter != m_summaries.cend())
+  {
+    return *iter;
+  }
+
+  NVBENCH_THROW(std::invalid_argument, "No summary tagged '{}'.", tag);
 }
 
-summary &state::get_summary(std::string_view name)
+summary &state::get_summary(std::string_view tag)
 {
-  auto iter = std::find_if(m_summaries.begin(),
-                           m_summaries.end(),
-                           [&name](auto &s) { return s.get_name() == name; });
-  if (iter == m_summaries.end())
+  // Check tags first
+  auto iter =
+    std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
+      return s.get_tag() == tag;
+    });
+  if (iter != m_summaries.end())
   {
-    NVBENCH_THROW(std::invalid_argument, "No summary named '{}'.", name);
+    return *iter;
   }
-  return *iter;
+
+  // Then names; "name" may be absent, so test for it before reading it:
+  const auto name_matches = [&tag](const auto &s) {
+    return s.has_value("name") && s.get_string("name") == tag;
+  };
+  iter = std::find_if(m_summaries.begin(), m_summaries.end(), name_matches);
+  if (iter != m_summaries.end())
+  {
+    return *iter;
+  }
+
+  NVBENCH_THROW(std::invalid_argument, "No summary tagged '{}'.", tag);
 }
 
 const std::vector<summary> &state::get_summaries() const { return m_summaries; }
@@ -226,8 +251,9 @@ void state::add_element_count(std::size_t elements, std::string column_name)
   m_element_count += elements;
   if (!column_name.empty())
   {
-    auto &summ = this->add_summary("Element count: " + column_name);
-    summ.set_string("short_name", std::move(column_name));
+    auto &summ = this->add_summary("nv/element_count/" + column_name);
+    summ.set_string("description", "Number of elements: " + column_name);
+    summ.set_string("name", std::move(column_name));
     summ.set_int64("value", static_cast<nvbench::int64_t>(elements));
   }
 }
@@ -237,9 +263,8 @@ void state::add_global_memory_reads(std::size_t bytes, std::string column_name)
   m_global_memory_rw_bytes += bytes;
   if (!column_name.empty())
   {
-    this->add_buffer_size(bytes,
-                          "Input Buffer Size: " + column_name,
-                          std::move(column_name));
+    std::string tag = fmt::format("nv/gmem/reads/{}", column_name);
+    this->add_buffer_size(bytes, std::move(tag), std::move(column_name));
   }
 }
 
@@ -248,29 +273,33 @@ void state::add_global_memory_writes(std::size_t bytes, std::string column_name)
   m_global_memory_rw_bytes += bytes;
   if (!column_name.empty())
   {
-    this->add_buffer_size(bytes,
-                          "Output Buffer Size: " + column_name,
-                          std::move(column_name));
+    std::string tag = fmt::format("nv/gmem/writes/{}", column_name);
+    this->add_buffer_size(bytes, std::move(tag), std::move(column_name));
   }
 }
 
 void state::add_buffer_size(std::size_t num_bytes,
-                            std::string summary_name,
+                            std::string summary_tag,
                             std::string column_name,
                             std::string description)
 {
-  auto &summ = this->add_summary(std::move(summary_name));
+  auto &summ = this->add_summary(std::move(summary_tag));
   summ.set_string("hint", "bytes");
+  summ.set_int64("value", static_cast<nvbench::int64_t>(num_bytes));
 
   if (!column_name.empty())
   {
-    summ.set_string("short_name", std::move(column_name));
+    summ.set_string("name", std::move(column_name));
+  }
+  else
+  {
+    summ.set_string("name", "None");
+    summ.set_string("hide", "No column name provided.");
   }
   if (!description.empty())
   {
     summ.set_string("description", std::move(description));
   }
-  summ.set_int64("value", static_cast<nvbench::int64_t>(num_bytes));
 }
 
 } // namespace nvbench
diff --git a/nvbench/summary.cuh b/nvbench/summary.cuh
index 7a6a2e6..a39c9e4 100644
--- a/nvbench/summary.cuh
+++ b/nvbench/summary.cuh
@@ -27,50 +27,62 @@ namespace nvbench
 {
 
 /**
- * A named set of key/value pairs associated with a benchmark result.
+ * @brief A single value associated with a benchmark state.
  *
- * The summary name is the unabbreviated name for the measurement.
- * An abbreviated name for column headings can be suggested in a "short_name"
- * entry (see below).
+ * Each summary object contains a single value with associated metadata, such
+ * as name, description, type, and formatting hints. Each summary object
+ * corresponds to a cell in an output markdown table, with summaries grouped
+ * into columns by their tag.
  *
- * Some keys have standard meanings that output formats may use to produce
- * more readable representations of the result:
+ * The summary tag provided at construction should be a unique identifier that
+ * will be convenient and unambiguous during lookups. For example, summaries
+ * produced by NVBench will begin with `nv/` and contain a hierarchical
+ * organization of descriptors, such as `nv/cold/time/gpu/mean`.
  *
- * - "hint": Formatting hints (see below)
- * - "short_name": Abbreviated name for table headings.
- * - "description": Longer description of result.
- * - "value": Actual value.
+ * The summary may contain an arbitrary number of key/value pairs. The keys
+ * are `std::string` and the values may be `std::string`, `int64_t`, or
+ * `float64_t`. These may be used to store arbitrary user data and will be
+ * written into the json output.
+ *
+ * Some keys are reserved and have special meaning. These may be used by tooling
+ * to help interpret data:
+ *
+ * - `"name": required [string]` Compact, used for table headings.
+ * - `"description": optional [string]` Longer description.
+ * - `"value": required [string|float64|int64]` Actual value.
+ * - `"hint": optional [string]` Formatting hints (see below)
+ * - `"hide": optional [string]` If present, the summary will not be included in
+ *                               markdown output tables.
+ *
+ * Additionally, keys beginning with `nv/` are reserved for NVBench.
+ *
+ * Hints indicate the type of data stored in "value", but may be omitted.
+ * NVBench uses the following hints:
  *
- * Hints:
- * - unset: Arbitrary value is stored in "value".
  * - "duration": "value" is a float64_t time duration in seconds.
  * - "item_rate": "value" is a float64_t item rate in elements / second.
  * - "bytes": "value" is an int64_t number of bytes.
  * - "byte_rate": "value" is a float64_t byte rate in bytes / second.
- * - "sample_size": "value" is an int64_t number of samples in a measurement.
- * - "percentage": "value" is a float64_t percentage (stored as a ratio, 1. =
- *    100%).
- *
- * The key/value pair functionality is implemented by the
- * `nvbench::named_values` base class.
+ * - "sample_size": "value" is an int64_t sample count.
+ * - "percentage": "value" is a float64_t percentage (100% stored as 1.0).
  *
  * Example: Adding a new summary to an nvbench::state object:
  *
  * ```
- * auto &summ = state.add_summary("Average GPU Time (Batch)");
+ * auto &summ = state.add_summary("nv/batch/time/gpu/mean");
+ * summ.set_string("name", "Batch GPU");
  * summ.set_string("hint", "duration");
- * summ.set_string("short_name", "Batch GPU");
  * summ.set_string("description",
- *                 "Average back-to-back kernel execution time as measured "
- *                 "by CUDA events.");
+ *                 "Average batch execution time measured by CUDA event "
+ *                 "timers.");
  * summ.set_float64("value", avg_batch_gpu_time);
  * ```
  */
 struct summary : public nvbench::named_values
 {
   summary() = default;
-  explicit summary(std::string name)
-      : m_name(std::move(name))
+  explicit summary(std::string tag)
+      : m_tag(std::move(tag))
   {}
 
   // move-only
@@ -79,11 +91,11 @@ struct summary : public nvbench::named_values
   summary &operator=(const summary &) = delete;
   summary &operator=(summary &&) = default;
 
-  void set_name(std::string name) { m_name = std::move(name); }
-  [[nodiscard]] const std::string &get_name() const { return m_name; }
+  void set_tag(std::string tag) { m_tag = std::move(tag); }
+  [[nodiscard]] const std::string &get_tag() const { return m_tag; }
 
 private:
-  std::string m_name;
+  std::string m_tag;
 };
 
 } // namespace nvbench
diff --git a/nvbench/type_axis.cxx b/nvbench/type_axis.cxx
index 2a4e628..af436ad 100644
--- a/nvbench/type_axis.cxx
+++ b/nvbench/type_axis.cxx
@@ -54,7 +54,8 @@ bool type_axis::get_is_active(std::size_t idx) const
 
 std::size_t type_axis::get_active_count() const
 {
-  return std::count(m_mask.cbegin(), m_mask.cend(), true);
+  return static_cast<std::size_t>(
+    std::count(m_mask.cbegin(), m_mask.cend(), true));
 }
 
 std::size_t type_axis::get_type_index(const std::string &input_string) const
diff --git a/scripts/nvbench_compare.py b/scripts/nvbench_compare.py
index e9ac8c1..b995bf1 100755
--- a/scripts/nvbench_compare.py
+++ b/scripts/nvbench_compare.py
@@ -154,13 +154,13 @@ def compare_benches(ref_benches, cmp_benches, threshold):
                 if not ref_summaries or not cmp_summaries:
                     continue
 
-                cmp_time_summary = cmp_summaries.get("Average GPU Time (Cold)")
-                ref_time_summary = ref_summaries.get("Average GPU Time (Cold)")
+                cmp_time_summary = cmp_summaries.get("nv/cold/time/gpu/mean")
+                ref_time_summary = ref_summaries.get("nv/cold/time/gpu/mean")
                 cmp_noise_summary = cmp_summaries.get(
-                    "GPU Relative Standard Deviation (Cold)"
+                    "nv/cold/time/gpu/stdev/relative"
                 )
                 ref_noise_summary = ref_summaries.get(
-                    "GPU Relative Standard Deviation (Cold)"
+                    "nv/cold/time/gpu/stdev/relative"
                 )
 
                 # TODO: Use other timings, too. Maybe multiple rows, with a
diff --git a/testing/device/noisy_bench.cu b/testing/device/noisy_bench.cu
index 686fbda..8cca3a1 100644
--- a/testing/device/noisy_bench.cu
+++ b/testing/device/noisy_bench.cu
@@ -48,12 +48,12 @@ void noisy_bench(nvbench::state &state)
   });
 
   const auto measured_mean = static_cast<nvbench::float32_t>(
-    state.get_summary("Average GPU Time (Cold)").get_float64("value"));
+    state.get_summary("nv/cold/time/gpu/mean").get_float64("value"));
   const auto measured_noise = [&]() {
     try
     {
       return static_cast<nvbench::float32_t>(
-        state.get_summary("GPU Relative Standard Deviation (Cold)")
+        state.get_summary("nv/cold/time/gpu/stdev/relative")
           .get_float64("value"));
     }
     catch (std::invalid_argument &)