mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-04-20 06:48:53 +00:00
Merge pull request #70 from allisonvacanti/walltime_reports
Python / JSON updates
This commit is contained in:
@@ -63,6 +63,11 @@ else()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# GCC-specific flags
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL GNU)
|
||||
target_link_libraries(nvbench.build_interface INTERFACE stdc++fs)
|
||||
endif()
|
||||
|
||||
# CUDA-specific flags
|
||||
target_compile_options(nvbench.build_interface INTERFACE
|
||||
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Xcudafe=--display_error_number>
|
||||
|
||||
@@ -32,3 +32,21 @@
|
||||
#define NVBENCH_MSVC_PUSH_DISABLE_WARNING(code)
|
||||
#define NVBENCH_MSVC_POP_WARNING()
|
||||
#endif
|
||||
|
||||
// MSVC does not define __cplusplus correctly. _MSVC_LANG is used instead.
|
||||
#ifdef _MSVC_LANG
|
||||
#define NVBENCH_CPLUSPLUS _MSVC_LANG
|
||||
#else
|
||||
#define NVBENCH_CPLUSPLUS __cplusplus
|
||||
#endif
|
||||
|
||||
// Detect current dialect:
|
||||
#if NVBENCH_CPLUSPLUS < 201703L
|
||||
#error "NVBench requires a C++17 compiler."
|
||||
#elif NVBENCH_CPLUSPLUS < 202002L
|
||||
#define NVBENCH_CPP_DIALECT 2017
|
||||
#elif NVBENCH_CPLUSPLUS == 202002L
|
||||
#define NVBENCH_CPP_DIALECT 2020
|
||||
#elif NVBENCH_CPLUSPLUS > 202002L // unknown, but is higher than C++20.
|
||||
#define NVBENCH_CPP_DIALECT 2023
|
||||
#endif
|
||||
|
||||
@@ -116,10 +116,10 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
const std::string &key = summ.get_name();
|
||||
const std::string &header = summ.has_value("short_name")
|
||||
? summ.get_string("short_name")
|
||||
: key;
|
||||
const std::string &tag = summ.get_tag();
|
||||
const std::string &header = summ.has_value("name")
|
||||
? summ.get_string("name")
|
||||
: tag;
|
||||
|
||||
const std::string hint = summ.has_value("hint")
|
||||
? summ.get_string("hint")
|
||||
@@ -127,31 +127,31 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
std::string value = std::visit(format_visitor, summ.get_value("value"));
|
||||
if (hint == "duration")
|
||||
{
|
||||
table.add_cell(row, key, header + " (sec)", std::move(value));
|
||||
table.add_cell(row, tag, header + " (sec)", std::move(value));
|
||||
}
|
||||
else if (hint == "item_rate")
|
||||
{
|
||||
table.add_cell(row, key, header + " (elem/sec)", std::move(value));
|
||||
table.add_cell(row, tag, header + " (elem/sec)", std::move(value));
|
||||
}
|
||||
else if (hint == "bytes")
|
||||
{
|
||||
table.add_cell(row, key, header + " (bytes)", std::move(value));
|
||||
table.add_cell(row, tag, header + " (bytes)", std::move(value));
|
||||
}
|
||||
else if (hint == "byte_rate")
|
||||
{
|
||||
table.add_cell(row, key, header + " (bytes/sec)", std::move(value));
|
||||
table.add_cell(row, tag, header + " (bytes/sec)", std::move(value));
|
||||
}
|
||||
else if (hint == "sample_size")
|
||||
{
|
||||
table.add_cell(row, key, header, std::move(value));
|
||||
table.add_cell(row, tag, header, std::move(value));
|
||||
}
|
||||
else if (hint == "percentage")
|
||||
{
|
||||
table.add_cell(row, key, header, std::move(value));
|
||||
table.add_cell(row, tag, header, std::move(value));
|
||||
}
|
||||
else
|
||||
{
|
||||
table.add_cell(row, key, header, std::move(value));
|
||||
table.add_cell(row, tag, header, std::move(value));
|
||||
}
|
||||
}
|
||||
row++;
|
||||
|
||||
@@ -175,53 +175,50 @@ void measure_cold_base::generate_summaries()
|
||||
{
|
||||
const auto d_samples = static_cast<double>(m_total_samples);
|
||||
{
|
||||
auto &summ = m_state.add_summary("Number of Samples (Cold)");
|
||||
auto &summ = m_state.add_summary("nv/cold/sample_size");
|
||||
summ.set_string("name", "Samples");
|
||||
summ.set_string("hint", "sample_size");
|
||||
summ.set_string("short_name", "Samples");
|
||||
summ.set_string("description",
|
||||
"Number of kernel executions in cold time measurements.");
|
||||
summ.set_string("description", "Number of isolated kernel executions");
|
||||
summ.set_int64("value", m_total_samples);
|
||||
}
|
||||
|
||||
const auto avg_cpu_time = m_total_cpu_time / d_samples;
|
||||
{
|
||||
auto &summ = m_state.add_summary("Average CPU Time (Cold)");
|
||||
auto &summ = m_state.add_summary("nv/cold/time/cpu/mean");
|
||||
summ.set_string("name", "CPU Time");
|
||||
summ.set_string("hint", "duration");
|
||||
summ.set_string("short_name", "CPU Time");
|
||||
summ.set_string("description",
|
||||
"Average isolated kernel execution time observed "
|
||||
"from host.");
|
||||
"Mean isolated kernel execution time "
|
||||
"(measured on host CPU)");
|
||||
summ.set_float64("value", avg_cpu_time);
|
||||
}
|
||||
|
||||
{
|
||||
auto &summ = m_state.add_summary("CPU Relative Standard Deviation (Cold)");
|
||||
auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/relative");
|
||||
summ.set_string("name", "Noise");
|
||||
summ.set_string("hint", "percentage");
|
||||
summ.set_string("short_name", "Noise");
|
||||
summ.set_string("description",
|
||||
"Relative standard deviation of the cold CPU execution "
|
||||
"time measurements.");
|
||||
"Relative standard deviation of isolated CPU times");
|
||||
summ.set_float64("value", m_cpu_noise);
|
||||
}
|
||||
|
||||
const auto avg_cuda_time = m_total_cuda_time / d_samples;
|
||||
{
|
||||
auto &summ = m_state.add_summary("Average GPU Time (Cold)");
|
||||
auto &summ = m_state.add_summary("nv/cold/time/gpu/mean");
|
||||
summ.set_string("name", "GPU Time");
|
||||
summ.set_string("hint", "duration");
|
||||
summ.set_string("short_name", "GPU Time");
|
||||
summ.set_string("description",
|
||||
"Average isolated kernel execution time as measured "
|
||||
"by CUDA events.");
|
||||
"Mean isolated kernel execution time "
|
||||
"(measured with CUDA events)");
|
||||
summ.set_float64("value", avg_cuda_time);
|
||||
}
|
||||
|
||||
{
|
||||
auto &summ = m_state.add_summary("GPU Relative Standard Deviation (Cold)");
|
||||
auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/relative");
|
||||
summ.set_string("name", "Noise");
|
||||
summ.set_string("hint", "percentage");
|
||||
summ.set_string("short_name", "Noise");
|
||||
summ.set_string("description",
|
||||
"Relative standard deviation of the cold GPU execution "
|
||||
"time measurements.");
|
||||
"Relative standard deviation of isolated GPU times");
|
||||
summ.set_float64("value",
|
||||
m_noise_tracker.empty()
|
||||
? std::numeric_limits<nvbench::float64_t>::infinity()
|
||||
@@ -230,11 +227,11 @@ void measure_cold_base::generate_summaries()
|
||||
|
||||
if (const auto items = m_state.get_element_count(); items != 0)
|
||||
{
|
||||
auto &summ = m_state.add_summary("Element Throughput");
|
||||
auto &summ = m_state.add_summary("nv/cold/bw/item_rate");
|
||||
summ.set_string("name", "Elem/s");
|
||||
summ.set_string("hint", "item_rate");
|
||||
summ.set_string("short_name", "Elem/s");
|
||||
summ.set_string("description",
|
||||
"Number of input elements handled per second.");
|
||||
"Number of input elements processed per second");
|
||||
summ.set_float64("value", static_cast<double>(items) / avg_cuda_time);
|
||||
}
|
||||
|
||||
@@ -242,12 +239,12 @@ void measure_cold_base::generate_summaries()
|
||||
{
|
||||
const auto avg_used_gmem_bw = static_cast<double>(bytes) / avg_cuda_time;
|
||||
{
|
||||
auto &summ = m_state.add_summary("Average Global Memory Throughput");
|
||||
auto &summ = m_state.add_summary("nv/cold/bw/global/bytes_per_second");
|
||||
summ.set_string("name", "GlobalMem BW");
|
||||
summ.set_string("hint", "byte_rate");
|
||||
summ.set_string("short_name", "GlobalMem BW");
|
||||
summ.set_string("description",
|
||||
"Number of bytes read/written per second to the CUDA "
|
||||
"device's global memory.");
|
||||
"device's global memory");
|
||||
summ.set_float64("value", avg_used_gmem_bw);
|
||||
}
|
||||
|
||||
@@ -255,14 +252,23 @@ void measure_cold_base::generate_summaries()
|
||||
const auto peak_gmem_bw = static_cast<double>(
|
||||
m_state.get_device()->get_global_memory_bus_bandwidth());
|
||||
|
||||
auto &summ = m_state.add_summary("Percent Peak Global Memory Throughput");
|
||||
auto &summ = m_state.add_summary("nv/cold/bw/global/utilization");
|
||||
summ.set_string("name", "BWUtil");
|
||||
summ.set_string("hint", "percentage");
|
||||
summ.set_string("short_name", "BWPeak");
|
||||
summ.set_string("description",
|
||||
"Global device memory throughput as a percentage of the "
|
||||
"device's peak bandwidth.");
|
||||
"Global device memory utilization as a percentage of the "
|
||||
"device's peak bandwidth");
|
||||
summ.set_float64("value", avg_used_gmem_bw / peak_gmem_bw);
|
||||
}
|
||||
} // bandwidth
|
||||
|
||||
{
|
||||
auto &summ = m_state.add_summary("nv/cold/walltime");
|
||||
summ.set_string("name", "Walltime");
|
||||
summ.set_string("hint", "duration");
|
||||
summ.set_string("description", "Walltime used for isolated measurements");
|
||||
summ.set_float64("value", m_walltime_timer.get_duration());
|
||||
summ.set_string("hide", "Hidden by default.");
|
||||
}
|
||||
|
||||
// Log if a printer exists:
|
||||
@@ -309,11 +315,17 @@ void measure_cold_base::generate_summaries()
|
||||
// Log to stdout:
|
||||
printer.log(nvbench::log_level::pass,
|
||||
fmt::format("Cold: {:0.6f}ms GPU, {:0.6f}ms CPU, {:0.2f}s "
|
||||
"total GPU, {}x",
|
||||
"total GPU, {:0.2f}s total wall, {}x ",
|
||||
avg_cuda_time * 1e3,
|
||||
avg_cpu_time * 1e3,
|
||||
m_total_cuda_time,
|
||||
m_walltime_timer.get_duration(),
|
||||
m_total_samples));
|
||||
|
||||
printer.process_bulk_data(m_state,
|
||||
"nv/cold/sample_times",
|
||||
"sample_times",
|
||||
m_cuda_times);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -53,11 +53,8 @@ struct metric_traits<metric_id::dram_peak_sustained_throughput>
|
||||
static constexpr const char *metric_name =
|
||||
"dram__throughput.avg.pct_of_peak_sustained_elapsed";
|
||||
|
||||
static constexpr const char *summary =
|
||||
"Peak Sustained Global Memory Throughput (HW)";
|
||||
|
||||
static constexpr const char *hint = "percentage";
|
||||
static constexpr const char *short_name = "HBWPeak";
|
||||
static constexpr const char *name = "HBWPeak";
|
||||
static constexpr const char *hint = "percentage";
|
||||
|
||||
static constexpr const char *description =
|
||||
"The utilization level of the device memory relative to the peak "
|
||||
@@ -77,9 +74,8 @@ struct metric_traits<metric_id::global_load_efficiency>
|
||||
static constexpr const char *metric_name =
|
||||
"smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct";
|
||||
|
||||
static constexpr const char *summary = "Global Load Efficiency (HW)";
|
||||
static constexpr const char *hint = "percentage";
|
||||
static constexpr const char *short_name = "LoadEff";
|
||||
static constexpr const char *name = "LoadEff";
|
||||
static constexpr const char *hint = "percentage";
|
||||
|
||||
static constexpr const char *description =
|
||||
"Ratio of requested global memory load throughput to required global "
|
||||
@@ -99,9 +95,8 @@ struct metric_traits<metric_id::global_store_efficiency>
|
||||
static constexpr const char *metric_name =
|
||||
"smsp__sass_average_data_bytes_per_sector_mem_global_op_st.pct";
|
||||
|
||||
static constexpr const char *summary = "Global Store Efficiency (HW)";
|
||||
static constexpr const char *hint = "percentage";
|
||||
static constexpr const char *short_name = "StoreEff";
|
||||
static constexpr const char *name = "StoreEff";
|
||||
static constexpr const char *hint = "percentage";
|
||||
|
||||
static constexpr const char *description =
|
||||
"Ratio of requested global memory store throughput to required global "
|
||||
@@ -119,9 +114,8 @@ template <>
|
||||
struct metric_traits<metric_id::l1_hit_rate>
|
||||
{
|
||||
static constexpr const char *metric_name = "l1tex__t_sector_hit_rate.pct";
|
||||
static constexpr const char *summary = "L1 Cache Hit Rate (HW)";
|
||||
static constexpr const char *name = "L1HitRate";
|
||||
static constexpr const char *hint = "percentage";
|
||||
static constexpr const char *short_name = "L1HitRate";
|
||||
static constexpr const char *description = "Hit rate at L1 cache.";
|
||||
static constexpr double divider = 100.0;
|
||||
|
||||
@@ -135,9 +129,8 @@ template <>
|
||||
struct metric_traits<metric_id::l2_hit_rate>
|
||||
{
|
||||
static constexpr const char *metric_name = "lts__t_sector_hit_rate.pct";
|
||||
static constexpr const char *summary = "L2 Cache Hit Rate (HW)";
|
||||
static constexpr const char *name = "L2HitRate";
|
||||
static constexpr const char *hint = "percentage";
|
||||
static constexpr const char *short_name = "L2HitRate";
|
||||
static constexpr const char *description = "Hit rate at L2 cache.";
|
||||
static constexpr double divider = 100.0;
|
||||
|
||||
@@ -219,9 +212,10 @@ void gen_summary(std::size_t result_id,
|
||||
|
||||
if (metric::is_collected(m_state))
|
||||
{
|
||||
auto &summ = m_state.add_summary(metric::summary);
|
||||
auto &summ =
|
||||
m_state.add_summary(fmt::format("nv/cupti/{}", metric::metric_name));
|
||||
summ.set_string("name", metric::name);
|
||||
summ.set_string("hint", metric::hint);
|
||||
summ.set_string("short_name", metric::short_name);
|
||||
summ.set_string("description", metric::description);
|
||||
summ.set_float64("value", result[result_id++] / metric::divider);
|
||||
}
|
||||
@@ -247,6 +241,34 @@ void measure_cupti_base::generate_summaries()
|
||||
try
|
||||
{
|
||||
gen_summaries(m_state, m_cupti.get_counter_values());
|
||||
|
||||
{
|
||||
auto &summ = m_state.add_summary("nv/cupti/sample_size");
|
||||
summ.set_string("name", "Samples");
|
||||
summ.set_string("hint", "sample_size");
|
||||
summ.set_string("description", "Number of CUPTI kernel executions");
|
||||
summ.set_int64("value", m_total_samples);
|
||||
}
|
||||
|
||||
{
|
||||
auto &summ = m_state.add_summary("nv/cupti/walltime");
|
||||
summ.set_string("name", "Walltime");
|
||||
summ.set_string("hint", "duration");
|
||||
summ.set_string("description", "Walltime used for CUPTI measurements");
|
||||
summ.set_float64("value", m_walltime_timer.get_duration());
|
||||
summ.set_string("hide", "Hidden by default.");
|
||||
}
|
||||
|
||||
// Log if a printer exists:
|
||||
if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
|
||||
printer_opt_ref.has_value())
|
||||
{
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
printer.log(nvbench::log_level::pass,
|
||||
fmt::format("CUPTI: {:0.2f}s total wall, {}x",
|
||||
m_walltime_timer.get_duration(),
|
||||
m_total_samples));
|
||||
}
|
||||
}
|
||||
catch (const std::exception &ex)
|
||||
{
|
||||
|
||||
@@ -75,8 +75,11 @@ protected:
|
||||
|
||||
nvbench::launch m_launch;
|
||||
nvbench::detail::l2flush m_l2flush;
|
||||
nvbench::cpu_timer m_walltime_timer;
|
||||
|
||||
cupti_profiler m_cupti;
|
||||
|
||||
nvbench::int64_t m_total_samples{};
|
||||
};
|
||||
|
||||
struct measure_cupti_base::kernel_launch_timer
|
||||
@@ -129,6 +132,9 @@ private:
|
||||
// Run the kernel as many times as CUPTI requires.
|
||||
void run()
|
||||
{
|
||||
m_walltime_timer.start();
|
||||
m_total_samples = 0;
|
||||
|
||||
kernel_launch_timer timer(*this);
|
||||
|
||||
m_cupti.prepare_user_loop();
|
||||
@@ -136,9 +142,12 @@ private:
|
||||
do
|
||||
{
|
||||
m_kernel_launcher(m_launch, timer);
|
||||
++m_total_samples;
|
||||
} while (m_cupti.is_replay_required());
|
||||
|
||||
m_cupti.process_user_loop();
|
||||
|
||||
m_walltime_timer.stop();
|
||||
}
|
||||
|
||||
KernelLauncher &m_kernel_launcher;
|
||||
|
||||
@@ -47,7 +47,7 @@ measure_hot_base::measure_hot_base(state &exec_state)
|
||||
try
|
||||
{
|
||||
nvbench::int64_t cold_samples =
|
||||
m_state.get_summary("Number of Samples (Cold)").get_int64("value");
|
||||
m_state.get_summary("nv/cold/sample_size").get_int64("value");
|
||||
m_min_samples = std::max(m_min_samples, cold_samples);
|
||||
|
||||
// If the cold measurement ran successfully, disable skip_time. It'd just
|
||||
@@ -85,25 +85,33 @@ void measure_hot_base::check()
|
||||
|
||||
void measure_hot_base::generate_summaries()
|
||||
{
|
||||
const auto d_samples = static_cast<double>(m_total_samples);
|
||||
const auto d_samples = static_cast<double>(m_total_samples);
|
||||
{
|
||||
auto &summ = m_state.add_summary("nv/batch/sample_size");
|
||||
summ.set_string("name", "Samples");
|
||||
summ.set_string("hint", "sample_size");
|
||||
summ.set_string("description", "Number of batch kernel executions");
|
||||
summ.set_int64("value", m_total_samples);
|
||||
}
|
||||
|
||||
const auto avg_cuda_time = m_total_cuda_time / d_samples;
|
||||
{
|
||||
auto &summ = m_state.add_summary("Average GPU Time (Batch)");
|
||||
auto &summ = m_state.add_summary("nv/batch/time/gpu/mean");
|
||||
summ.set_string("name", "Batch GPU");
|
||||
summ.set_string("hint", "duration");
|
||||
summ.set_string("short_name", "Batch GPU");
|
||||
summ.set_string("description",
|
||||
"Average back-to-back kernel execution time as measured "
|
||||
"by CUDA events.");
|
||||
"Mean batch kernel execution time "
|
||||
"(measured by CUDA events)");
|
||||
summ.set_float64("value", avg_cuda_time);
|
||||
}
|
||||
|
||||
{
|
||||
auto &summ = m_state.add_summary("Number of Samples (Batch)");
|
||||
summ.set_string("hint", "sample_size");
|
||||
summ.set_string("short_name", "Batch");
|
||||
summ.set_string("description",
|
||||
"Number of kernel executions in hot time measurements.");
|
||||
summ.set_int64("value", m_total_samples);
|
||||
auto &summ = m_state.add_summary("nv/batch/walltime");
|
||||
summ.set_string("name", "Walltime");
|
||||
summ.set_string("hint", "duration");
|
||||
summ.set_string("description", "Walltime used for batch measurements");
|
||||
summ.set_float64("value", m_walltime_timer.get_duration());
|
||||
summ.set_string("hide", "Hidden by default.");
|
||||
}
|
||||
|
||||
// Log if a printer exists:
|
||||
@@ -115,7 +123,7 @@ void measure_hot_base::generate_summaries()
|
||||
// Warn if timed out:
|
||||
if (m_max_time_exceeded)
|
||||
{
|
||||
const auto timeout = m_timeout_timer.get_duration();
|
||||
const auto timeout = m_walltime_timer.get_duration();
|
||||
|
||||
if (m_total_samples < m_min_samples)
|
||||
{
|
||||
@@ -140,9 +148,11 @@ void measure_hot_base::generate_summaries()
|
||||
|
||||
// Log to stdout:
|
||||
printer.log(nvbench::log_level::pass,
|
||||
fmt::format("Batch: {:0.6f}ms GPU, {:0.2f}s total GPU, {}x",
|
||||
fmt::format("Batch: {:0.6f}ms GPU, {:0.2f}s total GPU, "
|
||||
"{:0.2f}s total wall, {}x",
|
||||
avg_cuda_time * 1e3,
|
||||
m_total_cuda_time,
|
||||
m_walltime_timer.get_duration(),
|
||||
m_total_samples));
|
||||
}
|
||||
}
|
||||
@@ -163,8 +173,7 @@ void measure_hot_base::check_skip_time(nvbench::float64_t warmup_time)
|
||||
|
||||
void measure_hot_base::block_stream()
|
||||
{
|
||||
m_blocker.block(m_launch.get_stream(),
|
||||
m_state.get_blocking_kernel_timeout());
|
||||
m_blocker.block(m_launch.get_stream(), m_state.get_blocking_kernel_timeout());
|
||||
}
|
||||
|
||||
} // namespace nvbench::detail
|
||||
|
||||
@@ -68,7 +68,7 @@ protected:
|
||||
|
||||
nvbench::launch m_launch;
|
||||
nvbench::cuda_timer m_cuda_timer;
|
||||
nvbench::cpu_timer m_timeout_timer;
|
||||
nvbench::cpu_timer m_walltime_timer;
|
||||
nvbench::blocking_kernel m_blocker;
|
||||
|
||||
nvbench::int64_t m_min_samples{};
|
||||
@@ -125,7 +125,7 @@ private:
|
||||
|
||||
void run_trials()
|
||||
{
|
||||
m_timeout_timer.start();
|
||||
m_walltime_timer.start();
|
||||
|
||||
// Use warmup results to estimate the number of iterations to run.
|
||||
// The .95 factor here pads the batch_size a bit to avoid needing a second
|
||||
@@ -183,21 +183,22 @@ private:
|
||||
(m_min_time - m_total_cuda_time) /
|
||||
(m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples)));
|
||||
|
||||
m_timeout_timer.stop();
|
||||
const auto total_time = m_timeout_timer.get_duration();
|
||||
|
||||
if (m_total_cuda_time > m_min_time && // min time okay
|
||||
m_total_samples > m_min_samples) // min samples okay
|
||||
{
|
||||
break; // Stop iterating
|
||||
}
|
||||
|
||||
if (total_time > m_timeout)
|
||||
|
||||
m_walltime_timer.stop();
|
||||
if (m_walltime_timer.get_duration() > m_timeout)
|
||||
{
|
||||
m_max_time_exceeded = true;
|
||||
break;
|
||||
}
|
||||
} while (true);
|
||||
|
||||
m_walltime_timer.stop();
|
||||
}
|
||||
|
||||
__forceinline__ void launch_kernel() { m_kernel_launcher(m_launch); }
|
||||
|
||||
@@ -20,42 +20,75 @@
|
||||
|
||||
#include <nvbench/axes_metadata.cuh>
|
||||
#include <nvbench/benchmark_base.cuh>
|
||||
#include <nvbench/config.cuh>
|
||||
#include <nvbench/device_info.cuh>
|
||||
#include <nvbench/device_manager.cuh>
|
||||
#include <nvbench/git_revision.cuh>
|
||||
#include <nvbench/state.cuh>
|
||||
#include <nvbench/summary.cuh>
|
||||
#include <nvbench/version.cuh>
|
||||
|
||||
#include <nvbench/detail/throw.cuh>
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
#include <iterator>
|
||||
#include <ostream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#ifdef __GNUC__
|
||||
#include <experimental/filesystem>
|
||||
#else
|
||||
#include <filesystem>
|
||||
#endif
|
||||
|
||||
#if NVBENCH_CPP_DIALECT >= 2020
|
||||
#include <bit>
|
||||
#endif
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
bool is_little_endian()
|
||||
{
|
||||
#if NVBENCH_CPP_DIALECT >= 2020
|
||||
return std::endian::native == std::endian::little;
|
||||
#else
|
||||
const nvbench::uint32_t word = {0xBadDecaf};
|
||||
nvbench::uint8_t bytes[4];
|
||||
std::memcpy(bytes, &word, 4);
|
||||
return bytes[0] == 0xaf;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename JsonNode>
|
||||
void write_named_values(JsonNode &node, const nvbench::named_values &values)
|
||||
{
|
||||
const auto value_names = values.get_names();
|
||||
for (const auto &value_name : value_names)
|
||||
{
|
||||
auto &value = node[value_name];
|
||||
auto &value = node.emplace_back();
|
||||
value["name"] = value_name;
|
||||
|
||||
const auto type = values.get_type(value_name);
|
||||
switch (type)
|
||||
{
|
||||
case nvbench::named_values::type::int64:
|
||||
value["type"] = "int64";
|
||||
value["type"] = "int64";
|
||||
// Write as a string; JSON encodes all numbers as double-precision
|
||||
// floats, which would truncate int64s.
|
||||
value["value"] = fmt::to_string(values.get_int64(value_name));
|
||||
break;
|
||||
|
||||
case nvbench::named_values::type::float64:
|
||||
value["type"] = "float64";
|
||||
value["type"] = "float64";
|
||||
// Write as a string for consistency with int64.
|
||||
value["value"] = fmt::to_string(values.get_float64(value_name));
|
||||
break;
|
||||
@@ -64,6 +97,9 @@ void write_named_values(JsonNode &node, const nvbench::named_values &values)
|
||||
value["type"] = "string";
|
||||
value["value"] = values.get_string(value_name);
|
||||
break;
|
||||
|
||||
default:
|
||||
NVBENCH_THROW(std::runtime_error, "{}", "Unrecognized value type.");
|
||||
} // end switch (value type)
|
||||
} // end foreach value name
|
||||
}
|
||||
@@ -73,15 +109,187 @@ void write_named_values(JsonNode &node, const nvbench::named_values &values)
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
json_printer::version_t json_printer::get_json_file_version()
|
||||
{
|
||||
// This version number should stay in sync with `file_version` in
|
||||
// scripts/nvbench_json/version.py.
|
||||
//
|
||||
// Use semantic versioning:
|
||||
// Major version: backwards incompatible changes
|
||||
// Minor version: backwards compatible additions
|
||||
// Patch version: backwards compatible bugfixes/patches
|
||||
return {1, 0, 0};
|
||||
}
|
||||
|
||||
std::string json_printer::version_t::get_string() const
|
||||
{
|
||||
return fmt::format("{}.{}.{}", this->major, this->minor, this->patch);
|
||||
}
|
||||
|
||||
void json_printer::do_process_bulk_data_float64(
|
||||
state &state,
|
||||
const std::string &tag,
|
||||
const std::string &hint,
|
||||
const std::vector<nvbench::float64_t> &data)
|
||||
{
|
||||
printer_base::do_process_bulk_data_float64(state, tag, hint, data);
|
||||
|
||||
if (!m_enable_binary_output)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (hint == "sample_times")
|
||||
{
|
||||
#ifdef __GNUC__
|
||||
namespace fs = std::experimental::filesystem;
|
||||
#else
|
||||
namespace fs = std::filesystem;
|
||||
#endif
|
||||
|
||||
nvbench::cpu_timer timer;
|
||||
timer.start();
|
||||
|
||||
fs::path result_path{m_stream_name + "-bin/"};
|
||||
try
|
||||
{
|
||||
if (!fs::exists(result_path))
|
||||
{
|
||||
if (!fs::create_directory(result_path))
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"{}",
|
||||
"Failed to create result directory '{}'.");
|
||||
}
|
||||
}
|
||||
else if (!fs::is_directory(result_path))
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"{}",
|
||||
"'{}' exists and is not a directory.");
|
||||
}
|
||||
|
||||
const auto file_id = m_num_jsonbin_files++;
|
||||
result_path /= fmt::format("{:d}.bin", file_id);
|
||||
|
||||
std::ofstream out;
|
||||
out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit);
|
||||
out.open(result_path, std::ios::binary | std::ios::out);
|
||||
|
||||
// FIXME: SLOW -- Writing the binary file, 4 bytes at a time...
|
||||
// There are a lot of optimizations that could be done here if this ends
|
||||
// up being a noticeable bottleneck.
|
||||
for (auto value64 : data)
|
||||
{
|
||||
const auto value32 = static_cast<nvbench::float32_t>(value64);
|
||||
char buffer[4];
|
||||
std::memcpy(buffer, &value32, 4);
|
||||
// the c++17 implementation of is_little_endian isn't constexpr, but
|
||||
// all supported compilers optimize this branch as if it were.
|
||||
if (!is_little_endian())
|
||||
{
|
||||
using std::swap;
|
||||
swap(buffer[0], buffer[3]);
|
||||
swap(buffer[1], buffer[2]);
|
||||
}
|
||||
out.write(buffer, 4);
|
||||
}
|
||||
}
|
||||
catch (std::exception &e)
|
||||
{
|
||||
if (auto printer_opt_ref = state.get_benchmark().get_printer();
|
||||
printer_opt_ref.has_value())
|
||||
{
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
printer.log(nvbench::log_level::warn,
|
||||
fmt::format("Error writing {} ({}) to {}: {}",
|
||||
tag,
|
||||
hint,
|
||||
result_path.string(),
|
||||
e.what()));
|
||||
}
|
||||
} // end catch
|
||||
|
||||
auto &summ = state.add_summary(fmt::format("nv/json/bin:{}", tag));
|
||||
summ.set_string("name", "Samples Times File");
|
||||
summ.set_string("hint", "file/sample_times");
|
||||
summ.set_string("description",
|
||||
"Binary file containing sample times as little-endian "
|
||||
"float32.");
|
||||
summ.set_string("filename", result_path.string());
|
||||
summ.set_int64("size", static_cast<nvbench::int64_t>(data.size()));
|
||||
summ.set_string("hide", "Not needed in table.");
|
||||
|
||||
timer.stop();
|
||||
if (auto printer_opt_ref = state.get_benchmark().get_printer();
|
||||
printer_opt_ref.has_value())
|
||||
{
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
printer.log(nvbench::log_level::info,
|
||||
fmt::format("Wrote '{}' in {:>6.3f}ms",
|
||||
result_path.string(),
|
||||
timer.get_duration() * 1000));
|
||||
}
|
||||
} // end hint == sample_times
|
||||
}
|
||||
|
||||
void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
{
|
||||
nlohmann::ordered_json root;
|
||||
|
||||
{
|
||||
auto &metadata = root["meta"];
|
||||
|
||||
{
|
||||
auto &argv = metadata["argv"];
|
||||
for (const auto &arg : m_argv)
|
||||
{
|
||||
argv.push_back(arg);
|
||||
}
|
||||
} // "argv"
|
||||
|
||||
{
|
||||
auto &version = metadata["version"];
|
||||
|
||||
{
|
||||
const auto version_info = json_printer::get_json_file_version();
|
||||
auto &json_version = version["json"];
|
||||
|
||||
json_version["major"] = version_info.major;
|
||||
json_version["minor"] = version_info.minor;
|
||||
json_version["patch"] = version_info.patch;
|
||||
json_version["string"] = version_info.get_string();
|
||||
} // "json"
|
||||
|
||||
{
|
||||
auto &nvb_version = version["nvbench"];
|
||||
|
||||
nvb_version["major"] = NVBENCH_VERSION_MAJOR;
|
||||
nvb_version["minor"] = NVBENCH_VERSION_MINOR;
|
||||
nvb_version["patch"] = NVBENCH_VERSION_PATCH;
|
||||
nvb_version["string"] = fmt::format("{}.{}.{}",
|
||||
NVBENCH_VERSION_MAJOR,
|
||||
NVBENCH_VERSION_MINOR,
|
||||
NVBENCH_VERSION_PATCH);
|
||||
|
||||
nvb_version["git_branch"] = NVBENCH_GIT_BRANCH;
|
||||
nvb_version["git_sha"] = NVBENCH_GIT_SHA1;
|
||||
nvb_version["git_version"] = NVBENCH_GIT_VERSION;
|
||||
nvb_version["git_is_dirty"] =
|
||||
#ifdef NVBENCH_GIT_IS_DIRTY
|
||||
true;
|
||||
#else
|
||||
false;
|
||||
#endif
|
||||
} // "nvbench"
|
||||
} // "version"
|
||||
} // "meta"
|
||||
|
||||
{
|
||||
auto &devices = root["devices"];
|
||||
for (const auto &dev_info : nvbench::device_manager::get().get_devices())
|
||||
{
|
||||
auto &device = devices[devices.size()];
|
||||
auto &device = devices.emplace_back();
|
||||
device["id"] = dev_info.get_id();
|
||||
device["name"] = dev_info.get_name();
|
||||
device["sm_version"] = dev_info.get_sm_version();
|
||||
@@ -106,17 +314,17 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
dev_info.get_shared_memory_per_block();
|
||||
device["ecc_state"] = dev_info.get_ecc_state();
|
||||
}
|
||||
}
|
||||
} // "devices"
|
||||
|
||||
{
|
||||
auto &benchmarks = root["benchmarks"];
|
||||
for (const auto &bench_ptr : benches)
|
||||
{
|
||||
const auto bench_index = benchmarks.size();
|
||||
auto &bench = benchmarks[bench_index];
|
||||
auto &bench = benchmarks.emplace_back();
|
||||
|
||||
bench["index"] = bench_index;
|
||||
bench["name"] = bench_ptr->get_name();
|
||||
bench["index"] = bench_index;
|
||||
|
||||
bench["min_samples"] = bench_ptr->get_min_samples();
|
||||
bench["min_time"] = bench_ptr->get_min_time();
|
||||
@@ -133,8 +341,9 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
auto &axes = bench["axes"];
|
||||
for (const auto &axis_ptr : bench_ptr->get_axes().get_axes())
|
||||
{
|
||||
auto &axis = axes[axis_ptr->get_name()];
|
||||
auto &axis = axes.emplace_back();
|
||||
|
||||
axis["name"] = axis_ptr->get_name();
|
||||
axis["type"] = axis_ptr->get_type_as_string();
|
||||
axis["flags"] = axis_ptr->get_flags_as_string();
|
||||
|
||||
@@ -142,8 +351,7 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
const auto axis_size = axis_ptr->get_size();
|
||||
for (std::size_t i = 0; i < axis_size; ++i)
|
||||
{
|
||||
const auto value_idx = values.size();
|
||||
auto &value = values[value_idx];
|
||||
auto &value = values.emplace_back();
|
||||
value["input_string"] = axis_ptr->get_input_string(i);
|
||||
value["description"] = axis_ptr->get_description(i);
|
||||
|
||||
@@ -177,13 +385,9 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
auto &states = bench["states"];
|
||||
for (const auto &exec_state : bench_ptr->get_states())
|
||||
{
|
||||
auto &st = states[exec_state.get_axis_values_as_string()];
|
||||
auto &st = states.emplace_back();
|
||||
|
||||
// TODO: Determine if these need to be part of the state key as well
|
||||
// for uniqueness. The device already is, but the type config index is
|
||||
// not.
|
||||
st["device"] = exec_state.get_device()->get_id();
|
||||
st["type_config_index"] = exec_state.get_type_config_index();
|
||||
st["name"] = exec_state.get_axis_values_as_string();
|
||||
|
||||
st["min_samples"] = exec_state.get_min_samples();
|
||||
st["min_time"] = exec_state.get_min_time();
|
||||
@@ -191,13 +395,50 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
st["skip_time"] = exec_state.get_skip_time();
|
||||
st["timeout"] = exec_state.get_timeout();
|
||||
|
||||
st["device"] = exec_state.get_device()->get_id();
|
||||
st["type_config_index"] = exec_state.get_type_config_index();
|
||||
|
||||
// TODO I'd like to replace this with:
|
||||
// [ {"name" : <axis name>, "index": <value_index>}, ...]
|
||||
// but it would take some refactoring in the data structures to get
|
||||
// that information through.
|
||||
::write_named_values(st["axis_values"], exec_state.get_axis_values());
|
||||
|
||||
auto &summaries = st["summaries"];
|
||||
for (const auto &exec_summ : exec_state.get_summaries())
|
||||
{
|
||||
auto &summ = summaries[exec_summ.get_name()];
|
||||
::write_named_values(summ, exec_summ);
|
||||
auto &summ = summaries.emplace_back();
|
||||
summ["tag"] = exec_summ.get_tag();
|
||||
|
||||
// Write out the expected values as simple key/value pairs
|
||||
nvbench::named_values summary_values = exec_summ;
|
||||
if (summary_values.has_value("name"))
|
||||
{
|
||||
summ["name"] = summary_values.get_string("name");
|
||||
summary_values.remove_value("name");
|
||||
}
|
||||
if (summary_values.has_value("description"))
|
||||
{
|
||||
summ["description"] = summary_values.get_string("description");
|
||||
summary_values.remove_value("description");
|
||||
}
|
||||
if (summary_values.has_value("hint"))
|
||||
{
|
||||
summ["hint"] = summary_values.get_string("hint");
|
||||
summary_values.remove_value("hint");
|
||||
}
|
||||
if (summary_values.has_value("hide"))
|
||||
{
|
||||
summ["hide"] = summary_values.get_string("hide");
|
||||
summary_values.remove_value("hide");
|
||||
}
|
||||
|
||||
// Write any additional values generically in
|
||||
// ["data"] = [{name,type,value}, ...]:
|
||||
if (summary_values.get_size() != 0)
|
||||
{
|
||||
::write_named_values(summ["data"], summary_values);
|
||||
}
|
||||
}
|
||||
|
||||
st["is_skipped"] = exec_state.is_skipped();
|
||||
@@ -208,7 +449,7 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
}
|
||||
} // end foreach exec_state
|
||||
} // end foreach benchmark
|
||||
}
|
||||
} // "benchmarks"
|
||||
|
||||
m_ostream << root.dump(2) << "\n";
|
||||
}
|
||||
|
||||
@@ -20,19 +20,68 @@
|
||||
|
||||
#include <nvbench/printer_base.cuh>
|
||||
|
||||
#include <nvbench/types.cuh>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
/*!
|
||||
* JSON output format.
|
||||
*
|
||||
* All modifications to the output file should increment the semantic version
|
||||
* of the json files appropriately (see json_printer::get_json_file_version()).
|
||||
*/
|
||||
struct json_printer : nvbench::printer_base
|
||||
{
|
||||
using printer_base::printer_base;
|
||||
|
||||
json_printer(std::ostream &stream,
|
||||
std::string stream_name,
|
||||
bool enable_binary_output)
|
||||
: printer_base(stream, std::move(stream_name))
|
||||
, m_enable_binary_output{enable_binary_output}
|
||||
{}
|
||||
|
||||
/**
|
||||
* The json schema version. Follows semantic versioning.
|
||||
*/
|
||||
struct version_t
|
||||
{
|
||||
nvbench::uint16_t major;
|
||||
nvbench::uint16_t minor;
|
||||
nvbench::uint16_t patch;
|
||||
|
||||
[[nodiscard]] std::string get_string() const;
|
||||
};
|
||||
|
||||
[[nodiscard]] static version_t get_json_file_version();
|
||||
|
||||
[[nodiscard]] bool get_enable_binary_output() const
|
||||
{
|
||||
return m_enable_binary_output;
|
||||
}
|
||||
void set_enable_binary_output(bool b) { m_enable_binary_output = b; }
|
||||
|
||||
protected:
|
||||
// Virtual API from printer_base:
|
||||
void do_log_argv(const std::vector<std::string>& argv) override
|
||||
{
|
||||
m_argv = argv;
|
||||
}
|
||||
void do_process_bulk_data_float64(
|
||||
nvbench::state &state,
|
||||
const std::string &tag,
|
||||
const std::string &hint,
|
||||
const std::vector<nvbench::float64_t> &data) override;
|
||||
void do_print_benchmark_results(const benchmark_vector &benches) override;
|
||||
|
||||
bool m_enable_binary_output{false};
|
||||
std::size_t m_num_jsonbin_files{};
|
||||
|
||||
std::vector<std::string> m_argv;
|
||||
};
|
||||
|
||||
} // namespace nvbench
|
||||
|
||||
@@ -307,43 +307,43 @@ void markdown_printer::do_print_benchmark_results(
|
||||
{
|
||||
continue;
|
||||
}
|
||||
const std::string &key = summ.get_name();
|
||||
const std::string &header = summ.has_value("short_name")
|
||||
? summ.get_string("short_name")
|
||||
: key;
|
||||
const std::string &tag = summ.get_tag();
|
||||
const std::string &header = summ.has_value("name")
|
||||
? summ.get_string("name")
|
||||
: tag;
|
||||
|
||||
std::string hint = summ.has_value("hint") ? summ.get_string("hint")
|
||||
: std::string{};
|
||||
if (hint == "duration")
|
||||
{
|
||||
table.add_cell(row, key, header, this->do_format_duration(summ));
|
||||
table.add_cell(row, tag, header, this->do_format_duration(summ));
|
||||
}
|
||||
else if (hint == "item_rate")
|
||||
{
|
||||
table.add_cell(row, key, header, this->do_format_item_rate(summ));
|
||||
table.add_cell(row, tag, header, this->do_format_item_rate(summ));
|
||||
}
|
||||
else if (hint == "bytes")
|
||||
{
|
||||
table.add_cell(row, key, header, this->do_format_bytes(summ));
|
||||
table.add_cell(row, tag, header, this->do_format_bytes(summ));
|
||||
}
|
||||
else if (hint == "byte_rate")
|
||||
{
|
||||
table.add_cell(row, key, header, this->do_format_byte_rate(summ));
|
||||
table.add_cell(row, tag, header, this->do_format_byte_rate(summ));
|
||||
}
|
||||
else if (hint == "sample_size")
|
||||
{
|
||||
table.add_cell(row,
|
||||
key,
|
||||
tag,
|
||||
header,
|
||||
this->do_format_sample_size(summ));
|
||||
}
|
||||
else if (hint == "percentage")
|
||||
{
|
||||
table.add_cell(row, key, header, this->do_format_percentage(summ));
|
||||
table.add_cell(row, tag, header, this->do_format_percentage(summ));
|
||||
}
|
||||
else
|
||||
{
|
||||
table.add_cell(row, key, header, this->do_format_default(summ));
|
||||
table.add_cell(row, tag, header, this->do_format_default(summ));
|
||||
}
|
||||
}
|
||||
row++;
|
||||
|
||||
@@ -375,6 +375,8 @@ void option_parser::parse_impl()
|
||||
}
|
||||
|
||||
this->update_used_device_state();
|
||||
|
||||
m_printer.log_argv(m_args);
|
||||
}
|
||||
|
||||
void option_parser::parse_range(option_parser::arg_iterator_t first,
|
||||
@@ -468,7 +470,13 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
|
||||
else if (arg == "--json")
|
||||
{
|
||||
check_params(1);
|
||||
this->add_json_printer(first[1]);
|
||||
this->add_json_printer(first[1], false);
|
||||
first += 2;
|
||||
}
|
||||
else if (arg == "--jsonbin")
|
||||
{
|
||||
check_params(1);
|
||||
this->add_json_printer(first[1], true);
|
||||
first += 2;
|
||||
}
|
||||
else if (arg == "--benchmark" || arg == "-b")
|
||||
@@ -515,7 +523,7 @@ void option_parser::add_markdown_printer(const std::string &spec)
|
||||
try
|
||||
{
|
||||
std::ostream &stream = this->printer_spec_to_ostream(spec);
|
||||
auto &printer = m_printer.emplace<nvbench::markdown_printer>(stream);
|
||||
auto &printer = m_printer.emplace<nvbench::markdown_printer>(stream, spec);
|
||||
if (spec == "stdout")
|
||||
{
|
||||
printer.set_color(m_color_md_stdout_printer);
|
||||
@@ -533,7 +541,7 @@ void option_parser::add_csv_printer(const std::string &spec)
|
||||
try
|
||||
{
|
||||
std::ostream &stream = this->printer_spec_to_ostream(spec);
|
||||
m_printer.emplace<nvbench::csv_printer>(stream);
|
||||
m_printer.emplace<nvbench::csv_printer>(stream, spec);
|
||||
}
|
||||
catch (std::exception &e)
|
||||
{
|
||||
@@ -543,16 +551,18 @@ catch (std::exception &e)
|
||||
e.what());
|
||||
}
|
||||
|
||||
void option_parser::add_json_printer(const std::string &spec)
|
||||
void option_parser::add_json_printer(const std::string &spec,
|
||||
bool enable_binary)
|
||||
try
|
||||
{
|
||||
std::ostream &stream = this->printer_spec_to_ostream(spec);
|
||||
m_printer.emplace<nvbench::json_printer>(stream);
|
||||
m_printer.emplace<nvbench::json_printer>(stream, spec, enable_binary);
|
||||
}
|
||||
catch (std::exception &e)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Error while adding json output for `{}`:\n{}",
|
||||
"Error while adding {} output for `{}`:\n{}",
|
||||
enable_binary ? "jsonbin" : "json",
|
||||
spec,
|
||||
e.what());
|
||||
}
|
||||
|
||||
@@ -81,7 +81,7 @@ private:
|
||||
|
||||
void add_markdown_printer(const std::string &spec);
|
||||
void add_csv_printer(const std::string &spec);
|
||||
void add_json_printer(const std::string &spec);
|
||||
void add_json_printer(const std::string &spec, bool enable_binary);
|
||||
|
||||
std::ostream &printer_spec_to_ostream(const std::string &spec);
|
||||
|
||||
@@ -121,7 +121,7 @@ private:
|
||||
|
||||
void update_used_device_state() const;
|
||||
|
||||
// less gross argv:
|
||||
// Command line args
|
||||
std::vector<std::string> m_args;
|
||||
|
||||
// Store benchmark modifiers passed in before any benchmarks are requested as
|
||||
|
||||
@@ -18,6 +18,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <nvbench/types.cuh>
|
||||
|
||||
#include <iosfwd>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
@@ -55,7 +57,22 @@ struct printer_base
|
||||
/*!
|
||||
* Construct a new printer_base that will write to ostream.
|
||||
*/
|
||||
explicit printer_base(std::ostream &ostream);
|
||||
explicit printer_base(std::ostream &ostream)
|
||||
: printer_base(ostream, {})
|
||||
{}
|
||||
|
||||
/*!
|
||||
* Construct a new print_base that will write to an ostream, described by
|
||||
* stream_name.
|
||||
*
|
||||
* `stream_name` is used to open any additional files needed by the printer.
|
||||
* If `ostream` is a file stream, use the filename. Stream name may be
|
||||
* "stdout" / "stderr" or empty.
|
||||
* @param ostream
|
||||
* @param stream_name
|
||||
*/
|
||||
explicit printer_base(std::ostream &ostream, std::string stream_name);
|
||||
|
||||
virtual ~printer_base();
|
||||
|
||||
// move-only
|
||||
@@ -64,6 +81,15 @@ struct printer_base
|
||||
printer_base &operator=(const printer_base &) = delete;
|
||||
printer_base &operator=(printer_base &&) = default;
|
||||
|
||||
/*!
|
||||
* Called once with the command line arguments used to invoke the current
|
||||
* executable.
|
||||
*/
|
||||
void log_argv(const std::vector<std::string> &argv)
|
||||
{
|
||||
this->do_log_argv(argv);
|
||||
}
|
||||
|
||||
/*!
|
||||
* Print a summary of all detected devices, if supported.
|
||||
*
|
||||
@@ -96,6 +122,31 @@ struct printer_base
|
||||
this->do_log_run_state(exec_state);
|
||||
}
|
||||
|
||||
/*!
|
||||
* Measurements may call this to allow a printer to perform extra processing
|
||||
* on large sets of data.
|
||||
*
|
||||
* @param state The `nvbench::state` associated with this measurement.
|
||||
*
|
||||
* @param tag A tag identifying the data. Tags must be unique within a state,
|
||||
* but the same tag may be reused in multiple states. Data produced
|
||||
* by NVBench will be prefixed with "nv/", for example, isolated
|
||||
* sample time measurements are tagged "nv/cold/sample_times".
|
||||
*
|
||||
* @param hint A hint describing the type of data. Subclasses may use these
|
||||
* to determine how to handle the data, and should ignore any
|
||||
* hints they don't understand. Common hints are:
|
||||
* - "sample_times": `data` contains all sample times for a
|
||||
* measurement (in seconds).
|
||||
*/
|
||||
void process_bulk_data(nvbench::state &state,
|
||||
const std::string &tag,
|
||||
const std::string &hint,
|
||||
const std::vector<nvbench::float64_t> &data)
|
||||
{
|
||||
this->do_process_bulk_data_float64(state, tag, hint, data);
|
||||
}
|
||||
|
||||
/*!
|
||||
* Print details of the unexecuted benchmarks in `benches`. This is used for
|
||||
* `--list`.
|
||||
@@ -142,11 +193,17 @@ struct printer_base
|
||||
|
||||
protected:
|
||||
// Implementation hooks for subclasses:
|
||||
virtual void do_log_argv(const std::vector<std::string>&) {}
|
||||
virtual void do_print_device_info() {}
|
||||
virtual void do_print_log_preamble() {}
|
||||
virtual void do_print_log_epilogue() {}
|
||||
virtual void do_log(nvbench::log_level, const std::string &) {}
|
||||
virtual void do_log_run_state(const nvbench::state &) {}
|
||||
virtual void
|
||||
do_process_bulk_data_float64(nvbench::state &,
|
||||
const std::string &,
|
||||
const std::string &,
|
||||
const std::vector<nvbench::float64_t> &){};
|
||||
virtual void do_print_benchmark_list(const benchmark_vector &) {}
|
||||
virtual void do_print_benchmark_results(const benchmark_vector &) {}
|
||||
|
||||
@@ -159,6 +216,10 @@ protected:
|
||||
|
||||
std::ostream &m_ostream;
|
||||
|
||||
// May be empty, a filename, or "stdout" / "stderr" depending on the type of
|
||||
// stream in m_stream.
|
||||
std::string m_stream_name;
|
||||
|
||||
std::size_t m_completed_state_count{};
|
||||
std::size_t m_total_state_count{};
|
||||
};
|
||||
|
||||
@@ -23,8 +23,9 @@
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
printer_base::printer_base(std::ostream &ostream)
|
||||
printer_base::printer_base(std::ostream &ostream, std::string stream_name)
|
||||
: m_ostream{ostream}
|
||||
, m_stream_name{std::move(stream_name)}
|
||||
{}
|
||||
|
||||
// Defined here to keep <ostream> out of the header
|
||||
|
||||
@@ -46,11 +46,17 @@ struct printer_multiplex : nvbench::printer_base
|
||||
}
|
||||
|
||||
protected:
|
||||
void do_log_argv(const std::vector<std::string> &argv) override;
|
||||
void do_print_device_info() override;
|
||||
void do_print_log_preamble() override;
|
||||
void do_print_log_epilogue() override;
|
||||
void do_log(nvbench::log_level, const std::string &) override;
|
||||
void do_log_run_state(const nvbench::state &) override;
|
||||
void do_process_bulk_data_float64(
|
||||
nvbench::state &,
|
||||
const std::string &,
|
||||
const std::string &,
|
||||
const std::vector<nvbench::float64_t> &) override;
|
||||
void do_print_benchmark_list(const benchmark_vector &benches) override;
|
||||
void do_print_benchmark_results(const benchmark_vector &benches) override;
|
||||
void do_set_completed_state_count(std::size_t states) override;
|
||||
|
||||
@@ -67,6 +67,18 @@ void printer_multiplex::do_log_run_state(const nvbench::state &exec_state)
|
||||
}
|
||||
}
|
||||
|
||||
void printer_multiplex::do_process_bulk_data_float64(
|
||||
state &state,
|
||||
const std::string &tag,
|
||||
const std::string &hint,
|
||||
const std::vector<nvbench::float64_t> &data)
|
||||
{
|
||||
for (auto &format_ptr : m_printers)
|
||||
{
|
||||
format_ptr->process_bulk_data(state, tag, hint, data);
|
||||
}
|
||||
}
|
||||
|
||||
void printer_multiplex::do_print_benchmark_list(const benchmark_vector &benches)
|
||||
{
|
||||
for (auto &format_ptr : m_printers)
|
||||
@@ -109,5 +121,13 @@ void printer_multiplex::do_set_total_state_count(std::size_t states)
|
||||
format_ptr->set_total_state_count(states);
|
||||
}
|
||||
}
|
||||
void printer_multiplex::do_log_argv(const std::vector<std::string> &argv)
|
||||
{
|
||||
printer_base::do_log_argv(argv);
|
||||
for (auto &format_ptr : m_printers)
|
||||
{
|
||||
format_ptr->log_argv(argv);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace nvbench
|
||||
|
||||
@@ -119,7 +119,7 @@ struct state
|
||||
std::string column_name = {});
|
||||
|
||||
void add_buffer_size(std::size_t num_bytes,
|
||||
std::string summary_name,
|
||||
std::string summary_tag,
|
||||
std::string column_name = {},
|
||||
std::string description = {});
|
||||
|
||||
@@ -266,10 +266,10 @@ struct state
|
||||
|| is_dram_throughput_collected();
|
||||
}
|
||||
|
||||
summary &add_summary(std::string summary_name);
|
||||
summary &add_summary(std::string summary_tag);
|
||||
summary &add_summary(summary s);
|
||||
[[nodiscard]] const summary &get_summary(std::string_view name) const;
|
||||
[[nodiscard]] summary &get_summary(std::string_view name);
|
||||
[[nodiscard]] const summary &get_summary(std::string_view tag) const;
|
||||
[[nodiscard]] summary &get_summary(std::string_view tag);
|
||||
[[nodiscard]] const std::vector<summary> &get_summaries() const;
|
||||
[[nodiscard]] std::vector<summary> &get_summaries();
|
||||
|
||||
|
||||
@@ -109,9 +109,9 @@ catch (...)
|
||||
return default_value;
|
||||
}
|
||||
|
||||
summary &state::add_summary(std::string summary_name)
|
||||
summary &state::add_summary(std::string summary_tag)
|
||||
{
|
||||
return m_summaries.emplace_back(std::move(summary_name));
|
||||
return m_summaries.emplace_back(std::move(summary_tag));
|
||||
}
|
||||
|
||||
summary &state::add_summary(summary s)
|
||||
@@ -120,29 +120,54 @@ summary &state::add_summary(summary s)
|
||||
return m_summaries.back();
|
||||
}
|
||||
|
||||
const summary &state::get_summary(std::string_view name) const
|
||||
const summary &state::get_summary(std::string_view tag) const
|
||||
{
|
||||
// Check tags first
|
||||
auto iter =
|
||||
std::find_if(m_summaries.cbegin(),
|
||||
m_summaries.cend(),
|
||||
[&name](const auto &s) { return s.get_name() == name; });
|
||||
if (iter == m_summaries.cend())
|
||||
[&tag](const auto &s) { return s.get_tag() == tag; });
|
||||
if (iter != m_summaries.cend())
|
||||
{
|
||||
NVBENCH_THROW(std::invalid_argument, "No summary named '{}'.", name);
|
||||
return *iter;
|
||||
}
|
||||
return *iter;
|
||||
|
||||
// Then names:
|
||||
iter =
|
||||
std::find_if(m_summaries.cbegin(),
|
||||
m_summaries.cend(),
|
||||
[&tag](const auto &s) { return s.get_string("name") == tag; });
|
||||
if (iter != m_summaries.cend())
|
||||
{
|
||||
return *iter;
|
||||
}
|
||||
|
||||
NVBENCH_THROW(std::invalid_argument, "No summary tagged '{}'.", tag);
|
||||
}
|
||||
|
||||
summary &state::get_summary(std::string_view name)
|
||||
summary &state::get_summary(std::string_view tag)
|
||||
{
|
||||
auto iter = std::find_if(m_summaries.begin(),
|
||||
m_summaries.end(),
|
||||
[&name](auto &s) { return s.get_name() == name; });
|
||||
if (iter == m_summaries.end())
|
||||
// Check tags first
|
||||
auto iter =
|
||||
std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
|
||||
return s.get_tag() == tag;
|
||||
});
|
||||
if (iter != m_summaries.end())
|
||||
{
|
||||
NVBENCH_THROW(std::invalid_argument, "No summary named '{}'.", name);
|
||||
return *iter;
|
||||
}
|
||||
return *iter;
|
||||
|
||||
// Then names:
|
||||
iter =
|
||||
std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
|
||||
return s.get_string("name") == tag;
|
||||
});
|
||||
if (iter != m_summaries.end())
|
||||
{
|
||||
return *iter;
|
||||
}
|
||||
|
||||
NVBENCH_THROW(std::invalid_argument, "No summary tagged '{}'.", tag);
|
||||
}
|
||||
|
||||
const std::vector<summary> &state::get_summaries() const { return m_summaries; }
|
||||
@@ -226,8 +251,9 @@ void state::add_element_count(std::size_t elements, std::string column_name)
|
||||
m_element_count += elements;
|
||||
if (!column_name.empty())
|
||||
{
|
||||
auto &summ = this->add_summary("Element count: " + column_name);
|
||||
summ.set_string("short_name", std::move(column_name));
|
||||
auto &summ = this->add_summary("nv/element_count/" + column_name);
|
||||
summ.set_string("description", "Number of elements: " + column_name);
|
||||
summ.set_string("name", std::move(column_name));
|
||||
summ.set_int64("value", static_cast<nvbench::int64_t>(elements));
|
||||
}
|
||||
}
|
||||
@@ -237,9 +263,8 @@ void state::add_global_memory_reads(std::size_t bytes, std::string column_name)
|
||||
m_global_memory_rw_bytes += bytes;
|
||||
if (!column_name.empty())
|
||||
{
|
||||
this->add_buffer_size(bytes,
|
||||
"Input Buffer Size: " + column_name,
|
||||
std::move(column_name));
|
||||
std::string tag = fmt::format("nv/gmem/reads/{}", column_name);
|
||||
this->add_buffer_size(bytes, std::move(tag), std::move(column_name));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -248,29 +273,33 @@ void state::add_global_memory_writes(std::size_t bytes, std::string column_name)
|
||||
m_global_memory_rw_bytes += bytes;
|
||||
if (!column_name.empty())
|
||||
{
|
||||
this->add_buffer_size(bytes,
|
||||
"Output Buffer Size: " + column_name,
|
||||
std::move(column_name));
|
||||
const std::string tag = fmt::format("nv/gmem/writes/{}", column_name);
|
||||
this->add_buffer_size(bytes, std::move(tag), std::move(column_name));
|
||||
}
|
||||
}
|
||||
|
||||
void state::add_buffer_size(std::size_t num_bytes,
|
||||
std::string summary_name,
|
||||
std::string summary_tag,
|
||||
std::string column_name,
|
||||
std::string description)
|
||||
{
|
||||
auto &summ = this->add_summary(std::move(summary_name));
|
||||
auto &summ = this->add_summary(std::move(summary_tag));
|
||||
summ.set_string("hint", "bytes");
|
||||
summ.set_int64("value", static_cast<nvbench::int64_t>(num_bytes));
|
||||
|
||||
if (!column_name.empty())
|
||||
{
|
||||
summ.set_string("short_name", std::move(column_name));
|
||||
summ.set_string("name", std::move(column_name));
|
||||
}
|
||||
else
|
||||
{
|
||||
summ.set_string("name", ("None"));
|
||||
summ.set_string("hide", "No column name provided.");
|
||||
}
|
||||
if (!description.empty())
|
||||
{
|
||||
summ.set_string("description", std::move(description));
|
||||
}
|
||||
summ.set_int64("value", static_cast<nvbench::int64_t>(num_bytes));
|
||||
}
|
||||
|
||||
} // namespace nvbench
|
||||
|
||||
@@ -27,50 +27,68 @@ namespace nvbench
|
||||
{
|
||||
|
||||
/**
|
||||
* A named set of key/value pairs associated with a benchmark result.
|
||||
* @brief A single value associated with a benchmark state.
|
||||
*
|
||||
* The summary name is the unabbreviated name for the measurement.
|
||||
* An abbreviated name for column headings can be suggested in a "short_name"
|
||||
* entry (see below).
|
||||
* Each summary object contains a single value with associated metadata, such
|
||||
* as name, description, type, and formatting hints. Each summary object
|
||||
* corresponds to a cell in an output markdown table, with summaries grouped
|
||||
* into columns by their tag.
|
||||
*
|
||||
* Some keys have standard meanings that output formats may use to produce
|
||||
* more readable representations of the result:
|
||||
* The summary tag provided at construction should be a unique identifier that
|
||||
* will be convenient and unambiguous during lookups. For example, summaries
|
||||
* produced by NVBench will begin with `nv/` and contain a hierarchical
|
||||
* organization of descriptors, such as `nv/cold/time/gpu/mean`.
|
||||
*
|
||||
* - "hint": Formatting hints (see below)
|
||||
* - "short_name": Abbreviated name for table headings.
|
||||
* - "description": Longer description of result.
|
||||
* - "value": Actual value.
|
||||
* The summary may contain an arbitrary number of key/value pairs. The keys
|
||||
* are `std::string` and the values may be `std::string`, `int64_t`, or
|
||||
* `float64_t`. These may be used to store arbitrary user data and will be
|
||||
* written into the json output.
|
||||
*
|
||||
* Some keys are reserved and have special meaning. These may be used by tooling
|
||||
* to help interpret data:
|
||||
*
|
||||
* - `"name": required [string]` Compact, used for table headings.
|
||||
* - `"description": optional [string]` Longer description.
|
||||
* - `"value": required [string|float64|int64]` Actual value.
|
||||
* - `"hint": optional [string]` Formatting hints (see below)
|
||||
* - `"hide": optional [string]` If present, the summary will not be included in
|
||||
* markdown output tables.
|
||||
*
|
||||
* Additionally, keys beginning with `nv/` are reserved for NVBench.
|
||||
*
|
||||
* Hints indicate the type of data stored in "value", but may be omitted.
|
||||
* NVBench uses the following hints:
|
||||
*
|
||||
* Hints:
|
||||
* - unset: Arbitrary value is stored in "value".
|
||||
* - "duration": "value" is a float64_t time duration in seconds.
|
||||
* - "item_rate": "value" is a float64_t item rate in elements / second.
|
||||
* - "bytes": "value" is an int64_t number of bytes.
|
||||
* - "byte_rate": "value" is a float64_t byte rate in bytes / second.
|
||||
* - "sample_size": "value" is an int64_t number of samples in a measurement.
|
||||
* - "percentage": "value" is a float64_t percentage (stored as a ratio, 1. =
|
||||
* 100%).
|
||||
* - "sample_size": "value" is an int64_t samples count.
|
||||
* - "percentage": "value" is a float64_t percentage (100% stored as 1.0).
|
||||
* - "file/sample_times":
|
||||
* - "filename" is the path to a binary file that encodes all sample
|
||||
* times (in seconds) as float32_t values.
|
||||
* - "size" is an int64_t containing the number of float32_t values stored in
|
||||
* the binary file.
|
||||
*
|
||||
* The key/value pair functionality is implemented by the
|
||||
* `nvbench::named_values` base class.
|
||||
*
|
||||
* Example: Adding a new summary to an nvbench::state object:
|
||||
*
|
||||
* ```
|
||||
* auto &summ = state.add_summary("Average GPU Time (Batch)");
|
||||
* auto &summ = state.add_summary("nv/batch/gpu/time/mean");
|
||||
* summ.set_string("name", "Batch GPU");
|
||||
* summ.set_string("hint", "duration");
|
||||
* summ.set_string("short_name", "Batch GPU");
|
||||
* summ.set_string("description",
|
||||
* "Average back-to-back kernel execution time as measured "
|
||||
* "by CUDA events.");
|
||||
* "Average batch execution time measured by CUDA event
|
||||
* timers.");
|
||||
* summ.set_float64("value", avg_batch_gpu_time);
|
||||
* ```
|
||||
*/
|
||||
struct summary : public nvbench::named_values
|
||||
{
|
||||
summary() = default;
|
||||
explicit summary(std::string name)
|
||||
: m_name(std::move(name))
|
||||
explicit summary(std::string tag)
|
||||
: m_tag(std::move(tag))
|
||||
{}
|
||||
|
||||
// move-only
|
||||
@@ -79,11 +97,11 @@ struct summary : public nvbench::named_values
|
||||
summary &operator=(const summary &) = delete;
|
||||
summary &operator=(summary &&) = default;
|
||||
|
||||
void set_name(std::string name) { m_name = std::move(name); }
|
||||
[[nodiscard]] const std::string &get_name() const { return m_name; }
|
||||
void set_tag(std::string tag) { m_tag = std::move(tag); }
|
||||
[[nodiscard]] const std::string &get_tag() const { return m_tag; }
|
||||
|
||||
private:
|
||||
std::string m_name;
|
||||
std::string m_tag;
|
||||
};
|
||||
|
||||
} // namespace nvbench
|
||||
|
||||
@@ -54,7 +54,8 @@ bool type_axis::get_is_active(std::size_t idx) const
|
||||
|
||||
std::size_t type_axis::get_active_count() const
|
||||
{
|
||||
return std::count(m_mask.cbegin(), m_mask.cend(), true);
|
||||
return static_cast<std::size_t>(
|
||||
std::count(m_mask.cbegin(), m_mask.cend(), true));
|
||||
}
|
||||
|
||||
std::size_t type_axis::get_type_index(const std::string &input_string) const
|
||||
|
||||
@@ -1,2 +1,6 @@
|
||||
tabulate
|
||||
colorama
|
||||
matplotlib
|
||||
numpy
|
||||
pandas
|
||||
seaborn
|
||||
tabulate
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
@@ -10,10 +9,13 @@ from colorama import Fore
|
||||
|
||||
import tabulate
|
||||
|
||||
from nvbench_json import reader
|
||||
|
||||
# Parse version string into tuple, "x.y.z" -> (x, y, z)
|
||||
def version_tuple(v):
|
||||
return tuple(map(int, (v.split("."))))
|
||||
|
||||
|
||||
tabulate_version = version_tuple(tabulate.__version__)
|
||||
|
||||
all_devices = []
|
||||
@@ -38,8 +40,8 @@ def find_device_by_id(device_id):
|
||||
|
||||
|
||||
def format_int64_axis_value(axis_name, axis_value, axes):
|
||||
axis_def = axes[axis_name]
|
||||
axis_flags = axis_def["flags"]
|
||||
axis = next(filter(lambda ax: ax["name"] == axis_name, axes))
|
||||
axis_flags = axis["flags"]
|
||||
value = int(axis_value["value"])
|
||||
if axis_flags == "pow2":
|
||||
value = math.log2(value)
|
||||
@@ -60,8 +62,8 @@ def format_string_axis_value(axis_name, axis_value, axes):
|
||||
|
||||
|
||||
def format_axis_value(axis_name, axis_value, axes):
|
||||
axis_def = axes[axis_name]
|
||||
axis_type = axis_def["type"]
|
||||
axis = next(filter(lambda ax: ax["name"] == axis_name, axes))
|
||||
axis_type = axis["type"]
|
||||
if axis_type == "int64":
|
||||
return format_int64_axis_value(axis_name, axis_value, axes)
|
||||
elif axis_type == "float64":
|
||||
@@ -92,7 +94,7 @@ def format_percentage(percentage):
|
||||
# When there aren't enough samples for a meaningful noise measurement,
|
||||
# the noise is recorded as infinity. Unfortunately, JSON spec doesn't
|
||||
# allow for inf, so these get turned into null.
|
||||
if not percentage:
|
||||
if percentage is None:
|
||||
return "inf"
|
||||
return "%0.2f%%" % (percentage * 100.0)
|
||||
|
||||
@@ -110,7 +112,9 @@ def compare_benches(ref_benches, cmp_benches, threshold):
|
||||
ref_states = ref_bench["states"]
|
||||
cmp_states = cmp_bench["states"]
|
||||
|
||||
headers = list(axes.keys()) if axes else []
|
||||
axes = axes if axes else []
|
||||
|
||||
headers = [x["name"] for x in axes]
|
||||
colalign = ["center"] * len(headers)
|
||||
|
||||
headers.append("Ref Time")
|
||||
@@ -131,9 +135,11 @@ def compare_benches(ref_benches, cmp_benches, threshold):
|
||||
for device_id in device_ids:
|
||||
|
||||
rows = []
|
||||
for cmp_state_name in cmp_states:
|
||||
cmp_state = cmp_states[cmp_state_name]
|
||||
ref_state = ref_states[cmp_state_name]
|
||||
for cmp_state in cmp_states:
|
||||
cmp_state_name = cmp_state["name"]
|
||||
ref_state = next(filter(lambda st: st["name"] == cmp_state_name,
|
||||
ref_states),
|
||||
None)
|
||||
if not ref_state:
|
||||
continue
|
||||
|
||||
@@ -142,8 +148,8 @@ def compare_benches(ref_benches, cmp_benches, threshold):
|
||||
axis_values = []
|
||||
|
||||
row = []
|
||||
for axis_value_name in axis_values:
|
||||
axis_value = axis_values[axis_value_name]
|
||||
for axis_value in axis_values:
|
||||
axis_value_name = axis_value["name"]
|
||||
row.append(format_axis_value(axis_value_name,
|
||||
axis_value,
|
||||
axes))
|
||||
@@ -154,14 +160,13 @@ def compare_benches(ref_benches, cmp_benches, threshold):
|
||||
if not ref_summaries or not cmp_summaries:
|
||||
continue
|
||||
|
||||
cmp_time_summary = cmp_summaries.get("Average GPU Time (Cold)")
|
||||
ref_time_summary = ref_summaries.get("Average GPU Time (Cold)")
|
||||
cmp_noise_summary = cmp_summaries.get(
|
||||
"GPU Relative Standard Deviation (Cold)"
|
||||
)
|
||||
ref_noise_summary = ref_summaries.get(
|
||||
"GPU Relative Standard Deviation (Cold)"
|
||||
)
|
||||
def lookup_summary(summaries, tag):
|
||||
return next(filter(lambda s: s["tag"] == tag, summaries), None)
|
||||
|
||||
cmp_time_summary = lookup_summary(cmp_summaries, "nv/cold/time/gpu/mean")
|
||||
ref_time_summary = lookup_summary(ref_summaries, "nv/cold/time/gpu/mean")
|
||||
cmp_noise_summary = lookup_summary(cmp_summaries, "nv/cold/time/gpu/stdev/relative")
|
||||
ref_noise_summary = lookup_summary(ref_summaries, "nv/cold/time/gpu/stdev/relative")
|
||||
|
||||
# TODO: Use other timings, too. Maybe multiple rows, with a
|
||||
# "Timing" column + values "CPU/GPU/Batch"?
|
||||
@@ -171,10 +176,16 @@ def compare_benches(ref_benches, cmp_benches, threshold):
|
||||
ref_noise_summary]):
|
||||
continue
|
||||
|
||||
cmp_time = cmp_time_summary["value"]["value"]
|
||||
ref_time = ref_time_summary["value"]["value"]
|
||||
cmp_noise = cmp_noise_summary["value"]["value"]
|
||||
ref_noise = ref_noise_summary["value"]["value"]
|
||||
def extract_value(summary):
|
||||
summary_data = summary["data"]
|
||||
value_data = next(filter(lambda v: v["name"] == "value", summary_data))
|
||||
assert(value_data["type"] == "float64")
|
||||
return value_data["value"]
|
||||
|
||||
cmp_time = extract_value(cmp_time_summary)
|
||||
ref_time = extract_value(ref_time_summary)
|
||||
cmp_noise = extract_value(cmp_noise_summary)
|
||||
ref_noise = extract_value(ref_noise_summary)
|
||||
|
||||
# Convert string encoding to expected numerics:
|
||||
cmp_time = float(cmp_time)
|
||||
@@ -223,7 +234,6 @@ def compare_benches(ref_benches, cmp_benches, threshold):
|
||||
|
||||
rows.append(row)
|
||||
|
||||
|
||||
if len(rows) == 0:
|
||||
continue
|
||||
|
||||
@@ -244,13 +254,12 @@ def compare_benches(ref_benches, cmp_benches, threshold):
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
help_text = "%(prog)s [reference.json compare.json | reference_dir/ compare_dir/]"
|
||||
parser = argparse.ArgumentParser(prog='nvbench_compare', usage=help_text)
|
||||
parser.add_argument('--threshold-diff',type=float, dest='threshold', default=0.0,
|
||||
parser.add_argument('--threshold-diff', type=float, dest='threshold', default=0.0,
|
||||
help='only show benchmarks where percentage diff is >= THRESHOLD')
|
||||
|
||||
args,files_or_dirs = parser.parse_known_args()
|
||||
args, files_or_dirs = parser.parse_known_args()
|
||||
print(files_or_dirs)
|
||||
|
||||
if len(files_or_dirs) != 2:
|
||||
@@ -270,14 +279,12 @@ def main():
|
||||
os.path.getsize(r) > 0 and os.path.getsize(c) > 0:
|
||||
to_compare.append((r, c))
|
||||
else:
|
||||
to_compare = [(files_or_dirs[0],files_or_dirs[1])]
|
||||
to_compare = [(files_or_dirs[0], files_or_dirs[1])]
|
||||
|
||||
for ref,comp in to_compare:
|
||||
for ref, comp in to_compare:
|
||||
|
||||
with open(ref, "r") as ref_file:
|
||||
ref_root = json.load(ref_file)
|
||||
with open(comp, "r") as cmp_file:
|
||||
cmp_root = json.load(cmp_file)
|
||||
ref_root = reader.read_file(ref)
|
||||
cmp_root = reader.read_file(comp)
|
||||
|
||||
global all_devices
|
||||
all_devices = cmp_root["devices"]
|
||||
|
||||
108
scripts/nvbench_histogram.py
Normal file
108
scripts/nvbench_histogram.py
Normal file
@@ -0,0 +1,108 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
from nvbench_json import reader
|
||||
|
||||
def parse_files():
    """Collect the JSON result files named on the command line.

    Directory arguments are expanded to every non-empty ``*.json`` file
    they contain; plain arguments are taken as-is. Returns the sorted
    list, or prints help and exits when nothing usable was given.
    """
    usage = "%(prog)s [nvbench.out.json | dir/] ..."
    arg_parser = argparse.ArgumentParser(prog='nvbench_histogram', usage=usage)

    _, positional = arg_parser.parse_known_args()

    collected = []
    for entry in positional:
        if not os.path.isdir(entry):
            collected.append(entry)
            continue
        # Expand a directory into its non-empty .json files.
        for candidate in os.listdir(entry):
            if os.path.splitext(candidate)[1] != ".json":
                continue
            path = os.path.join(entry, candidate)
            if os.path.isfile(path) and os.path.getsize(path) > 0:
                collected.append(path)

    collected.sort()

    if not collected:
        arg_parser.print_help()
        exit(0)

    return collected
|
||||
|
||||
|
||||
def parse_samples_meta(filename, state):
    """Locate the cold-run sample-times sidecar file for one state.

    Returns ``(sample_count, sample_filename)``, or ``(None, None)`` when
    the state has no "nv/json/bin:nv/cold/sample_times" summary.
    """
    summaries = state["summaries"]
    if not summaries:
        return None, None

    summary = None
    for candidate in summaries:
        if candidate["tag"] == "nv/json/bin:nv/cold/sample_times":
            summary = candidate
            break
    if not summary:
        return None, None

    sample_filename = summary["filename"]["value"]

    # If not absolute, the path is relative to the associated .json file:
    if not os.path.isabs(sample_filename):
        sample_filename = os.path.join(os.path.dirname(filename), sample_filename)

    return int(summary["size"]["value"]), sample_filename
|
||||
|
||||
|
||||
def parse_samples(filename, state):
    """Load the raw sample times recorded for one benchmark state.

    Returns an empty list when the state has no sample-times sidecar file.
    """
    count, samples_path = parse_samples_meta(filename, state)
    if not count or not samples_path:
        return []

    with open(samples_path, "rb") as f:
        # Sidecar holds little-endian float32 values.
        samples = np.fromfile(f, "<f4")

    assert (count == len(samples))
    return samples
|
||||
|
||||
|
||||
def to_df(data):
    """Build a DataFrame from a mapping of column name -> sample sequence.

    Each value is wrapped in a Series first so ragged (unequal-length)
    columns are padded with NaN instead of raising.
    """
    columns = {name: pd.Series(values) for name, values in data.items()}
    return pd.DataFrame.from_dict(columns)
|
||||
|
||||
|
||||
def parse_json(filename):
    """Read one NVBench JSON file and gather every state's sample times.

    Returns a DataFrame with one column per "<benchmark> <state>" pair;
    states without recorded samples are skipped.
    """
    json_root = reader.read_file(filename)

    samples_data = {}
    for bench in json_root["benchmarks"]:
        bench_name = bench["name"]
        print("Benchmark: {}".format(bench_name))
        for state in bench["states"]:
            state_name = state["name"]
            print("State: {}".format(state_name))

            samples = parse_samples(filename, state)
            if len(samples) == 0:
                continue

            key = "{} {}".format(bench_name, state_name)
            samples_data[key] = samples

    return to_df(samples_data)
|
||||
|
||||
|
||||
def main():
    """Entry point: plot a KDE of sample times from every input file."""
    frames = []
    for filename in parse_files():
        frames.append(parse_json(filename))
    combined = pd.concat(frames, ignore_index=True)

    # One KDE curve per benchmark/state column, with a rug of raw samples.
    sns.displot(combined, rug=True, kind="kde", fill=True)
    plt.show()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
1
scripts/nvbench_json/.gitignore
vendored
Normal file
1
scripts/nvbench_json/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
__pycache__/*
|
||||
2
scripts/nvbench_json/__init__.py
Normal file
2
scripts/nvbench_json/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
from . import reader
|
||||
from . import version
|
||||
10
scripts/nvbench_json/reader.py
Normal file
10
scripts/nvbench_json/reader.py
Normal file
@@ -0,0 +1,10 @@
|
||||
import json
|
||||
|
||||
from . import version
|
||||
|
||||
|
||||
def read_file(filename):
    """Parse ``filename`` as NVBench JSON and return its root node.

    Emits a stdout warning via ``version.check_file_version`` when the
    file was written with a different JSON schema version.
    """
    with open(filename, "r") as json_file:
        root = json.load(json_file)
    version.check_file_version(filename, root)
    return root
|
||||
26
scripts/nvbench_json/version.py
Normal file
26
scripts/nvbench_json/version.py
Normal file
@@ -0,0 +1,26 @@
|
||||
# JSON schema version this reader understands.
file_version = (1, 0, 0)

file_version_string = "{}.{}.{}".format(*file_version)


def check_file_version(filename, root_node):
    """Warn on stdout when ``root_node`` was written by a different
    NVBench JSON schema version than this reader expects.

    Returns nothing; warnings are purely informational.
    """
    try:
        version_node = root_node["meta"]["version"]["json"]
    except KeyError:
        # Files predating versioning carry no meta/version/json node.
        print("WARNING:")
        print("  {} is written in an older, unversioned format. ".format(filename))
        print("  It may not read correctly.")
        print("  Reader expects JSON file version {}.".format(file_version_string))
        return

    # TODO We could do something fancy here using semantic versioning, but
    # for now just warn on mismatch.
    if version_node["string"] != file_version_string:
        print("WARNING:")
        print("  {} was written using a different NVBench JSON file version."
              .format(filename))
        print("  It may not read correctly.")
        print("  (file version: {} reader version: {})"
              .format(version_node["string"], file_version_string))
|
||||
357
scripts/nvbench_walltime.py
Normal file
357
scripts/nvbench_walltime.py
Normal file
@@ -0,0 +1,357 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import argparse
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
|
||||
from nvbench_json import reader
|
||||
|
||||
import tabulate
|
||||
|
||||
|
||||
# Parse version string into tuple, "x.y.z" -> (x, y, z)
|
||||
def version_tuple(v):
    """Parse a dotted version string into a tuple, "x.y.z" -> (x, y, z)."""
    return tuple(int(part) for part in v.split("."))
|
||||
|
||||
|
||||
# Installed tabulate version; gates optional features below
# (colalign / "github" tablefmt need >= 0.8.3).
tabulate_version = version_tuple(tabulate.__version__)

# NOTE(review): declared module-global but not referenced anywhere visible
# in this script — confirm whether it can be removed.
all_devices = []
|
||||
|
||||
|
||||
def format_axis_value(axis_value, axis_type):
    """Render an axis value for display according to its declared type.

    int64 values print as plain integers, float64 with 5 significant
    digits; anything else is returned unchanged.
    """
    if axis_type == "int64":
        return "{:d}".format(int(axis_value))
    if axis_type == "float64":
        return "{:.5g}".format(float(axis_value))
    return axis_value
|
||||
|
||||
|
||||
def format_walltime(seconds_in):
    """Format a duration in seconds as ``[hh:][mm:]ss.mmm`` (or ``s.mmm``).

    Hour and minute fields are omitted while zero; seconds drop their
    zero-padding when no larger field is shown.
    """
    h = math.floor(seconds_in / (60 * 60))
    m = math.floor((seconds_in / 60) % 60)
    s = math.floor(seconds_in % 60)
    ms = math.floor((seconds_in * 1000) % 1000)

    show_h = h > 1e-9
    show_m = show_h or m > 1e-9

    pieces = []
    if show_h:
        pieces.append("{:0>2d}:".format(h))
    if show_m:
        pieces.append("{:0>2d}:".format(m))
    pieces.append("{:0>2d}.".format(s) if show_m else "{:d}.".format(s))
    pieces.append("{:0>3d}".format(ms))
    return "".join(pieces)
|
||||
|
||||
|
||||
def format_percentage(percentage):
    """Render a fraction (0..1) as a two-decimal percentage string.

    When there aren't enough samples for a meaningful noise measurement,
    the noise is recorded as infinity. Unfortunately, JSON spec doesn't
    allow for inf, so these get turned into null / None -> "inf" here.
    """
    if percentage is None:
        return "inf"
    return "{:0.2f}%".format(percentage * 100.0)
|
||||
|
||||
|
||||
# Measurement kinds whose "nv/<name>/walltime" summaries this script
# aggregates, and the table column header shown for each.
measure_names = ["cold", "batch", "cupti"]
measure_column_names = {"cold": "Isolated", "batch": "Batch", "cupti": "CUPTI"}
|
||||
|
||||
|
||||
def init_measures():
    """Return a fresh walltime accumulator: 0.0 seconds per measurement."""
    return {name: 0. for name in measure_names}
|
||||
|
||||
|
||||
def get_measures(state):
    """Extract per-measurement walltimes (seconds) from a state's summaries.

    Only measurements that produced an "nv/<name>/walltime" summary appear
    in the returned dict.
    """
    summaries = state["summaries"]
    times = {}
    for name in measure_names:
        tag = "nv/{}/walltime".format(name)
        summary = None
        for candidate in summaries:
            if candidate["tag"] == tag:
                summary = candidate
                break
        if not summary:
            continue

        # The summary's "value" datum carries the walltime as a float64.
        walltime_data = next(d for d in summary["data"] if d["name"] == "value")
        assert(walltime_data["type"] == "float64")
        walltime = float(walltime_data["value"])
        # Coerce falsy (zero) readings to a plain 0.
        times[name] = walltime if walltime else 0.
    return times
|
||||
|
||||
|
||||
def merge_measures(target, src):
    """Accumulate each measurement time from ``src`` into ``target`` in place."""
    for key in src:
        target[key] += src[key]
|
||||
|
||||
|
||||
def sum_measures(measures):
    """Total walltime (seconds) across all measurements."""
    return sum(measures.values(), 0.)
|
||||
|
||||
|
||||
def get_active_measure_names(measures):
    """Names of measurements with a nonzero (> 1 ns) accumulated walltime."""
    return [name for name, time in measures.items() if time > 1e-9]
|
||||
|
||||
|
||||
def append_measure_headers(headers, active=measure_names):
    """Append a display column header for each active measurement, in place."""
    headers.extend(measure_column_names[name] for name in active)
|
||||
|
||||
|
||||
def append_measure_values(row, measures, active=measure_names):
    """Append the formatted walltime of each active measurement to ``row``."""
    row.extend(format_walltime(measures[name]) for name in active)
|
||||
|
||||
|
||||
def consume_file(filename):
    """Summarize one NVBench JSON file.

    Returns ``{"benches": {name: bench_data}, "measures": totals}`` where
    the totals accumulate every benchmark's walltimes.
    """
    file_root = reader.read_file(filename)

    file_measures = init_measures()
    benches = {}
    for bench in file_root["benchmarks"]:
        bench_data = consume_benchmark(bench, file_root)
        merge_measures(file_measures, bench_data["measures"])
        benches[bench["name"]] = bench_data

    return {"benches": benches, "measures": file_measures}
|
||||
|
||||
|
||||
def consume_benchmark(bench, file_root):
    # Summarize one benchmark: accumulate per-state walltimes into totals
    # for the benchmark as a whole and for each axis value.
    # Returns {"axes": ..., "measures": ..., "states": ...}.
    bench_out = {}

    # Initialize axis map
    axes_out = {}
    axes = bench["axes"]
    if axes:
        for axis in axes:
            values_out = {}
            axis_name = axis["name"]
            axis_type = axis["type"]
            for value in axis["values"]:
                if axis_type == "type":
                    # Type axes are keyed by their input string.
                    # NOTE(review): states below key with
                    # format_axis_value(value, "type") instead — confirm
                    # these always match in the JSON schema.
                    value = value["input_string"]
                else:
                    value = format_axis_value(value["value"], axis_type)
                values_out[value] = {"measures": init_measures()}
            axes_out[axis_name] = values_out

    states_out = {}
    bench_measures = init_measures()

    for state in bench["states"]:
        state_name = state["name"]
        # Get walltimes for each measurement:
        state_measures = get_measures(state)
        state_out = {}
        state_out["measures"] = state_measures
        states_out[state_name] = state_out

        # Update the benchmark measures walltimes
        merge_measures(bench_measures, state_measures)

        # Update the axis measurements:
        axis_values = state["axis_values"]
        if axis_values:
            for axis_value in axis_values:
                axis_name = axis_value["name"]
                # Keys must be formatted exactly as in the axis map above.
                value = format_axis_value(axis_value["value"], axis_value["type"])
                merge_measures(axes_out[axis_name][value]["measures"], state_measures)

    bench_out["axes"] = axes_out
    bench_out["measures"] = bench_measures
    bench_out["states"] = states_out
    return bench_out
|
||||
|
||||
|
||||
def print_overview_section(data):
    # Print the top-level "# Walltime Overview" markdown table: the total
    # walltime plus one column per measurement that recorded any time.
    print("# Walltime Overview\n")

    measures = data["measures"]
    active_measures = get_active_measure_names(measures)

    headers = ["Walltime"]
    append_measure_headers(headers, active_measures)

    colalign = ["right"] * len(headers)

    rows = []

    # Single data row: grand total followed by per-measure walltimes.
    row = [format_walltime(sum_measures(measures))]
    append_measure_values(row, measures, active_measures)
    rows.append(row)

    # colalign and github format require tabulate 0.8.3
    if tabulate_version >= (0, 8, 3):
        print(tabulate.tabulate(rows,
                                headers=headers,
                                colalign=colalign,
                                tablefmt="github"))
    else:
        print(tabulate.tabulate(rows,
                                headers=headers,
                                tablefmt="markdown"))

    print()
|
||||
|
||||
|
||||
# append_data_row_lambda args: (row_list, name, items[name])
def print_measures_table(headers, colalign, items, total_measures, append_item_row_lambda):
    # Shared table printer: one row per item plus a trailing "Total" row.
    # `headers` and `colalign` arrive holding only the caller's leading
    # columns and are extended IN PLACE with "%", "Walltime", and one
    # column per active measurement.
    total_time = sum_measures(total_measures)
    active_measures = get_active_measure_names(total_measures)
    num_user_columns = len(headers)

    headers.append("%")
    headers.append("Walltime")
    append_measure_headers(headers, active_measures)

    # All generated columns are right-aligned.
    while len(colalign) < len(headers):
        colalign.append("right")

    rows = []

    for name, item in items.items():
        item_measures = item["measures"]
        item_time = sum_measures(item_measures)

        row = []
        append_item_row_lambda(row, name, item)
        if total_time > 1e-9:
            row.append(format_percentage(item_time / total_time))
        else:
            # Avoid dividing by a ~zero total.
            row.append(format_percentage(0))
        row.append(format_walltime(item_time))
        append_measure_values(row, item_measures, active_measures)
        rows.append(row)

    # Totals:
    row = []
    if num_user_columns != 0:
        row.append("Total")
    while len(row) < num_user_columns:
        row.append("")
    row.append(format_percentage(1))
    row.append(format_walltime(total_time))
    append_measure_values(row, total_measures, active_measures)
    rows.append(row)

    # colalign and github format require tabulate 0.8.3
    if tabulate_version >= (0, 8, 3):
        print(tabulate.tabulate(rows,
                                headers=headers,
                                colalign=colalign,
                                tablefmt="github"))
    else:
        print(tabulate.tabulate(rows,
                                headers=headers,
                                tablefmt="markdown"))
|
||||
|
||||
|
||||
def print_files_section(data):
    """Print the "# Files" summary table, then a section per input file."""
    print("# Files\n")

    def first_column(row, name, item):
        # Leading column is the filename.
        row.append(name)

    print_measures_table(["Filename"], ["left"], data["files"],
                         data["measures"], first_column)
    print()

    for filename, file in data["files"].items():
        print_file_section(filename, file)
|
||||
|
||||
|
||||
def print_file_section(filename, file):
    """Print one file's "## File:" table, then a section per benchmark."""
    print("## File: {}\n".format(filename))

    def first_column(row, name, item):
        # Leading column is the benchmark name.
        row.append(name)

    print_measures_table(["Benchmark"], ["left"], file["benches"],
                         file["measures"], first_column)
    print()

    for bench_name, bench in file["benches"].items():
        print_bench_section(bench_name, bench)
|
||||
|
||||
|
||||
def print_bench_section(bench_name, bench):
    """Print one benchmark's per-configuration table and per-axis tables."""
    print("### Benchmark: {}\n".format(bench_name))

    def first_column(row, name, item):
        # Leading column is the configuration / axis-value name.
        row.append(name)

    # TODO split this up so each axis is a column
    print_measures_table(["Configuration"], ["left"], bench["states"],
                         bench["measures"], first_column)
    print()

    for axis_name, axis in bench["axes"].items():
        print_measures_table(["Axis: " + axis_name], ["left"], axis,
                             bench["measures"], first_column)
        print()
|
||||
|
||||
|
||||
def main():
    """Entry point: aggregate walltimes from every input file or directory
    named on the command line, then print the markdown report."""
    help_text = "%(prog)s [nvbench.out.json | dir/]..."
    parser = argparse.ArgumentParser(prog='nvbench_walltime', usage=help_text)

    _, files_or_dirs = parser.parse_known_args()

    filenames = []
    for entry in files_or_dirs:
        if not os.path.isdir(entry):
            filenames.append(entry)
            continue
        # Expand a directory into its non-empty .json files.
        for f in os.listdir(entry):
            if os.path.splitext(f)[1] != ".json":
                continue
            path = os.path.join(entry, f)
            if os.path.isfile(path) and os.path.getsize(path) > 0:
                filenames.append(path)

    filenames.sort()

    files_out = {}
    measures = init_measures()
    for filename in filenames:
        file_data = consume_file(filename)
        merge_measures(measures, file_data["measures"])
        files_out[filename] = file_data

    data = {"files": files_out, "measures": measures}

    print_overview_section(data)
    print_files_section(data)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
32381
scripts/test_cmp.json
32381
scripts/test_cmp.json
File diff suppressed because it is too large
Load Diff
32381
scripts/test_ref.json
32381
scripts/test_ref.json
File diff suppressed because it is too large
Load Diff
@@ -48,12 +48,12 @@ void noisy_bench(nvbench::state &state)
|
||||
});
|
||||
|
||||
const auto measured_mean = static_cast<nvbench::float32_t>(
|
||||
state.get_summary("Average GPU Time (Cold)").get_float64("value"));
|
||||
state.get_summary("nv/cold/time/gpu/mean").get_float64("value"));
|
||||
const auto measured_noise = [&]() {
|
||||
try
|
||||
{
|
||||
return static_cast<nvbench::float32_t>(
|
||||
state.get_summary("GPU Relative Standard Deviation (Cold)")
|
||||
state.get_summary("nv/cold/time/gpu/stdev/relative")
|
||||
.get_float64("value"));
|
||||
}
|
||||
catch (std::invalid_argument &)
|
||||
|
||||
Reference in New Issue
Block a user