From 92cc3b1189f90c0bbd37492727395a0b5898d476 Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Fri, 12 Feb 2021 20:02:20 -0500
Subject: [PATCH] Execute benchmarks on all devices.

---
 nvbench/benchmark_base.cu          |  21 +++
 nvbench/benchmark_base.cuh         |  33 +++--
 nvbench/detail/markdown_format.cu  | 211 ++++++++++++++++-------------
 nvbench/detail/measure_cold.cu     |  27 +++-
 nvbench/detail/measure_cold.cuh    |  11 +-
 nvbench/detail/measure_hot.cu      |  50 ++++---
 nvbench/detail/measure_hot.cuh     |   8 +-
 nvbench/detail/state_generator.cu  |  46 +++++--
 nvbench/detail/state_generator.cuh |  10 +-
 nvbench/device_info.cuh            |  33 ++++-
 nvbench/runner.cuh                 |  53 ++++++--
 nvbench/state.cuh                  |  28 +++-
 testing/create.cu                  |  13 +-
 testing/option_parser.cu           |  55 +++-----
 testing/runner.cu                  |  58 +++-----
 testing/state.cu                   |   9 +-
 testing/state_generator.cu         | 191 ++++++++++++++++----------
 17 files changed, 534 insertions(+), 323 deletions(-)
diff --git a/nvbench/benchmark_base.cu b/nvbench/benchmark_base.cu
index e5d1e64..ff7110f 100644
--- a/nvbench/benchmark_base.cu
+++ b/nvbench/benchmark_base.cu
@@ -1,8 +1,14 @@
 #include <nvbench/benchmark_base.cuh>
 
+#include <nvbench/device_manager.cuh>
+
 namespace nvbench
 {
 
+benchmark_base::benchmark_base()
+    : m_devices(nvbench::device_manager::get().get_devices())
+{}
+
 benchmark_base::~benchmark_base() = default;
 
 std::unique_ptr<benchmark_base> benchmark_base::clone() const
@@ -16,5 +22,20 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
   return std::move(result);
 }
 
+void benchmark_base::set_devices(std::vector<int> device_ids)
+{
+  std::vector<device_info> devices;
+  devices.reserve(device_ids.size());
+  for (int dev_id : device_ids)
+  {
+    devices.emplace_back(dev_id);
+  }
+  this->set_devices(std::move(devices));
+}
+
+void benchmark_base::add_device(int device_id)
+{
+  this->add_device(device_info{device_id});
+}
 
 } // namespace nvbench
diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh
index 7be67d5..a1239ac 100644
--- a/nvbench/benchmark_base.cuh
+++ b/nvbench/benchmark_base.cuh
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <nvbench/axes_metadata.cuh>
+#include <nvbench/device_info.cuh>
 #include <nvbench/state.cuh>
 
 #include <memory>
@@ -21,6 +22,7 @@ struct runner;
  */
 struct benchmark_base
 {
+  benchmark_base();
   virtual ~benchmark_base();
 
   /**
@@ -77,25 +79,37 @@ struct benchmark_base
     return *this;
   }
 
-  [[nodiscard]] nvbench::axes_metadata &get_axes()
+  void set_devices(std::vector<int> device_ids);
+
+  void set_devices(std::vector<nvbench::device_info> devices)
   {
-    return m_axes;
+    m_devices = std::move(devices);
   }
 
+  void add_device(int device_id);
+
+  void add_device(nvbench::device_info device)
+  {
+    m_devices.push_back(std::move(device));
+  }
+
+  [[nodiscard]] const std::vector<nvbench::device_info> &get_devices() const
+  {
+    return m_devices;
+  }
+
+  [[nodiscard]] nvbench::axes_metadata &get_axes() { return m_axes; }
+
   [[nodiscard]] const nvbench::axes_metadata &get_axes() const
   {
     return m_axes;
   }
 
-  [[nodiscard]] const std::vector<std::vector<nvbench::state>> &
-  get_states() const
-  {
-    return m_states;
-  }
-  [[nodiscard]] std::vector<std::vector<nvbench::state>> &get_states()
+  [[nodiscard]] const std::vector<nvbench::state> &get_states() const
   {
     return m_states;
   }
+  [[nodiscard]] std::vector<nvbench::state> &get_states() { return m_states; }
 
   void run() { this->do_run(); }
 
@@ -105,7 +119,8 @@ protected:
 
   std::string m_name;
   nvbench::axes_metadata m_axes;
-  std::vector<std::vector<nvbench::state>> m_states;
+  std::vector<nvbench::device_info> m_devices;
+  std::vector<nvbench::state> m_states;
 
 private:
   // route these through virtuals so the templated subclass can inject type info
diff --git a/nvbench/detail/markdown_format.cu b/nvbench/detail/markdown_format.cu
index 6097876..ad4544e 100644
--- a/nvbench/detail/markdown_format.cu
+++ b/nvbench/detail/markdown_format.cu
@@ -218,6 +218,8 @@ void markdown_format::print_benchmark_summaries(
 
 void markdown_format::print_benchmark_results(const benchmark_vector &benchmarks)
 {
+  // TODO: Refactor and clean this up. For now, the lambdas below handle the
+  // various string formatting tasks:
   auto format_visitor = [](const auto &v) {
     using T = std::decay_t<decltype(v)>;
     if constexpr (std::is_same_v<T, nvbench::float64_t>)
@@ -312,110 +314,129 @@ void markdown_format::print_benchmark_results(const benchmark_vector &benchmarks
     return fmt::format("{:.2f}%", percentage);
   };
 
-  fmt::print("# Benchmark Summaries\n");
+  // Start printing benchmarks
+  fmt::print("# Benchmark Results\n");
 
   for (const auto &bench_ptr : benchmarks)
   {
-    const benchmark_base &bench = *bench_ptr;
-    const axes_metadata &axes   = bench.get_axes();
+    const auto &bench   = *bench_ptr;
+    const auto &devices = bench.get_devices();
+    const auto &axes    = bench.get_axes();
 
-    fmt::print("\n## {}\n\n", bench.get_name());
+    fmt::print("\n## {}\n", bench.get_name());
 
-    std::size_t row = 0;
-    table_builder table;
-
-    for (const auto &inner_states : bench.get_states())
+    // Do a single pass when no devices are specified. This happens for
+    // benchmarks with `cpu` exec_tags.
+    const std::size_t num_device_passes = devices.empty() ? 1 : devices.size();
+    for (std::size_t device_pass = 0; device_pass < num_device_passes;
+         ++device_pass)
     {
-      for (const nvbench::state &state : inner_states)
+      std::optional<nvbench::device_info> device =
+        devices.empty() ? std::nullopt
+                        : std::make_optional(devices[device_pass]);
+
+      if (device)
       {
-        const auto &axis_values = state.get_axis_values();
-        for (const auto &name : axis_values.get_names())
-        {
-          // Handle power-of-two int64 axes differently:
-          if (axis_values.get_type(name) == named_values::type::int64 &&
-              axes.get_int64_axis(name).is_power_of_two())
-          {
-            const nvbench::uint64_t value    = axis_values.get_int64(name);
-            const nvbench::uint64_t exponent = int64_axis::compute_log2(value);
-            table.add_cell(row,
-                           name + "_axis_pretty",
-                           name,
-                           fmt::format("2^{}", exponent));
-            table.add_cell(row,
-                           name + "_axis_descriptive",
-                           fmt::format("({})", name),
-                           fmt::to_string(value));
-          }
-          else
-          {
-            std::string value = std::visit(format_visitor,
-                                           axis_values.get_value(name));
-            table.add_cell(row, name + "_axis", name, std::move(value));
-          }
-        }
-
-        for (const auto &summ : state.get_summaries())
-        {
-          if (summ.has_value("hide"))
-          {
-            continue;
-          }
-          const std::string &key    = summ.get_name();
-          const std::string &header = summ.has_value("short_name")
-                                        ? summ.get_string("short_name")
-                                        : key;
-
-          std::string hint = summ.has_value("hint") ? summ.get_string("hint")
-                                                    : std::string{};
-          if (hint == "duration")
-          {
-            table.add_cell(row,
-                           key,
-                           header,
-                           format_duration(summ.get_float64("value")));
-          }
-          else if (hint == "item_rate")
-          {
-            table.add_cell(row,
-                           key,
-                           header,
-                           format_item_rate(summ.get_float64("value")));
-          }
-          else if (hint == "bytes")
-          {
-            table.add_cell(row,
-                           key,
-                           header,
-                           format_bytes(summ.get_int64("value")));
-          }
-          else if (hint == "byte_rate")
-          {
-            table.add_cell(row,
-                           key,
-                           header,
-                           format_byte_rate(summ.get_float64("value")));
-          }
-          else if (hint == "percentage")
-          {
-            table.add_cell(row,
-                           key,
-                           header,
-                           format_percentage(summ.get_float64("value")));
-          }
-          else
-          {
-            table.add_cell(row,
-                           key,
-                           header,
-                           std::visit(format_visitor, summ.get_value("value")));
-          }
-        }
-        row++;
+        fmt::print("\n### [{}] {}\n\n", device->get_id(), device->get_name());
       }
-    }
 
-    fmt::print("{}", table.to_string());
-  } // end foreach benchmark
+      std::size_t row = 0;
+      table_builder table;
+
+      for (const auto &cur_state : bench.get_states())
+      {
+        if (cur_state.get_device() == device)
+        {
+          const auto &axis_values = cur_state.get_axis_values();
+          for (const auto &name : axis_values.get_names())
+          {
+            // Handle power-of-two int64 axes differently:
+            if (axis_values.get_type(name) == named_values::type::int64 &&
+                axes.get_int64_axis(name).is_power_of_two())
+            {
+              const nvbench::int64_t value    = axis_values.get_int64(name);
+              const nvbench::int64_t exponent = int64_axis::compute_log2(value);
+              table.add_cell(row,
+                             name + "_axis_pretty",
+                             name,
+                             fmt::format("2^{}", exponent));
+              table.add_cell(row,
+                             name + "_axis_descriptive",
+                             fmt::format("({})", name),
+                             fmt::to_string(value));
+            }
+            else
+            {
+              std::string value = std::visit(format_visitor,
+                                             axis_values.get_value(name));
+              table.add_cell(row, name + "_axis", name, std::move(value));
+            }
+          }
+
+          for (const auto &summ : cur_state.get_summaries())
+          {
+            if (summ.has_value("hide"))
+            {
+              continue;
+            }
+            const std::string &key    = summ.get_name();
+            const std::string &header = summ.has_value("short_name")
+                                          ? summ.get_string("short_name")
+                                          : key;
+
+            std::string hint = summ.has_value("hint") ? summ.get_string("hint")
+                                                      : std::string{};
+            if (hint == "duration")
+            {
+              table.add_cell(row,
+                             key,
+                             header,
+                             format_duration(summ.get_float64("value")));
+            }
+            else if (hint == "item_rate")
+            {
+              table.add_cell(row,
+                             key,
+                             header,
+                             format_item_rate(summ.get_float64("value")));
+            }
+            else if (hint == "bytes")
+            {
+              table.add_cell(row,
+                             key,
+                             header,
+                             format_bytes(summ.get_int64("value")));
+            }
+            else if (hint == "byte_rate")
+            {
+              table.add_cell(row,
+                             key,
+                             header,
+                             format_byte_rate(summ.get_float64("value")));
+            }
+            else if (hint == "percentage")
+            {
+              table.add_cell(row,
+                             key,
+                             header,
+                             format_percentage(summ.get_float64("value")));
+            }
+            else
+            {
+              table.add_cell(row,
+                             key,
+                             header,
+                             std::visit(format_visitor,
+                                        summ.get_value("value")));
+            }
+          }
+          row++;
+        }
+      }
+
+      fmt::print("{}", table.to_string());
+    } // end foreach device_pass
+  }
 }
 
 } // namespace detail
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index b3def99..5e24208 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -8,6 +8,7 @@
 
 #include <algorithm>
 #include <cstdio>
+#include <stdexcept>
 #include <variant>
 
 namespace nvbench
@@ -16,9 +17,29 @@ namespace nvbench
 namespace detail
 {
 
+void measure_cold_base::check()
+{
+  const auto device = m_state.get_device();
+  if (!device)
+  {
+    throw std::runtime_error(fmt::format("{}:{}: Device required for `cold` "
+                                         "measurement.",
+                                         __FILE__,
+                                         __LINE__));
+  }
+  if (!device->is_active())
+  { // This means something went wrong higher up. Throw an error.
+    throw std::runtime_error(fmt::format("{}:{}: Internal error: Current "
+                                         "device is not active.",
+                                         __FILE__,
+                                         __LINE__));
+  }
+}
+
 void measure_cold_base::generate_summaries()
 {
-  const auto avg_cuda_time = m_total_cuda_time / m_total_iters;
+  const auto d_iters = static_cast<double>(m_total_iters);
+  const auto avg_cuda_time = m_total_cuda_time / d_iters;
   {
     auto &summ = m_state.add_summary("Average GPU Time (Cold)");
     summ.set_string("hint", "duration");
@@ -39,7 +60,7 @@ void measure_cold_base::generate_summaries()
     summ.set_float64("value", m_cuda_noise);
   }
 
-  const auto avg_cpu_time = m_total_cpu_time / m_total_iters;
+  const auto avg_cpu_time = m_total_cpu_time / d_iters;
   {
     auto &summ = m_state.add_summary("Average CPU Time (Cold)");
     summ.set_string("hint", "duration");
@@ -70,7 +91,7 @@ void measure_cold_base::generate_summaries()
 
   // Log to stdout:
   fmt::memory_buffer param_buffer;
-  fmt::format_to(param_buffer, "");
+  fmt::format_to(param_buffer, "Device={}", m_state.get_device()->get_id());
   const axes_metadata &axes = m_state.get_benchmark().get_axes();
   const auto &axis_values   = m_state.get_axis_values();
   for (const auto &name : axis_values.get_names())
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index c15ad29..fcbe1ed 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -3,6 +3,7 @@
 #include <nvbench/cpu_timer.cuh>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_timer.cuh>
+#include <nvbench/device_info.cuh>
 #include <nvbench/launch.cuh>
 #include <nvbench/state.cuh>
 
@@ -33,6 +34,9 @@ struct measure_cold_base
   measure_cold_base &operator=(measure_cold_base &&) = delete;
 
 protected:
+
+  void check();
+
   void initialize()
   {
     m_total_cuda_time = 0.;
@@ -54,15 +58,15 @@ protected:
   nvbench::cpu_timer m_cpu_timer;
   nvbench::detail::l2flush m_l2flush;
 
-  nvbench::int64_t m_min_iters{100};
+  nvbench::int64_t m_min_iters{10};
   nvbench::int64_t m_total_iters{};
 
-  nvbench::float64_t m_max_noise{1.0}; // % rel stdev
+  nvbench::float64_t m_max_noise{0.5}; // % rel stdev
   nvbench::float64_t m_cuda_noise{};   // % rel stdev
   nvbench::float64_t m_cpu_noise{};    // % rel stdev
 
   nvbench::float64_t m_min_time{0.5};
-  nvbench::float64_t m_max_time{1.0};
+  nvbench::float64_t m_max_time{3.0};
 
   nvbench::float64_t m_total_cuda_time{};
   nvbench::float64_t m_total_cpu_time{};
@@ -83,6 +87,7 @@ struct measure_cold : public measure_cold_base
 
   void operator()()
   {
+    this->check();
     this->initialize();
     this->run_warmup();
     this->run_trials();
diff --git a/nvbench/detail/measure_hot.cu b/nvbench/detail/measure_hot.cu
index 35923f0..eed0fef 100644
--- a/nvbench/detail/measure_hot.cu
+++ b/nvbench/detail/measure_hot.cu
@@ -10,16 +10,32 @@
 #include <cstdio>
+#include <stdexcept>
 #include <variant>
 
-// TODO these can be removed once there's a device_manager or some such:
-#include <cuda_runtime_api.h>
-#include <nvbench/cuda_call.cuh>
-
 namespace nvbench
 {
 
 namespace detail
 {
 
+void measure_hot_base::check()
+{
+  const auto device = m_state.get_device();
+  if (!device)
+  {
+    throw std::runtime_error(fmt::format("{}:{}: Device required for `hot` "
+                                         "measurement.",
+                                         __FILE__,
+                                         __LINE__));
+  }
+  if (!device->is_active())
+  { // This means something went wrong higher up. Throw an error.
+    throw std::runtime_error(fmt::format("{}:{}: Internal error: Current "
+                                         "device is not active.",
+                                         __FILE__,
+                                         __LINE__));
+  }
+}
+
 measure_hot_base::measure_hot_base(state &exec_state)
     : m_state(exec_state)
 {
@@ -48,7 +63,8 @@ measure_hot_base::measure_hot_base(state &exec_state)
 
 void measure_hot_base::generate_summaries()
 {
-  const auto avg_cuda_time = m_total_cuda_time / m_total_iters;
+  const auto d_iters       = static_cast<double>(m_total_iters);
+  const auto avg_cuda_time = m_total_cuda_time / d_iters;
   {
     auto &summ = m_state.add_summary("Average GPU Time (Hot)");
     summ.set_string("hint", "duration");
@@ -59,7 +75,7 @@ void measure_hot_base::generate_summaries()
     summ.set_float64("value", avg_cuda_time);
   }
 
-  const auto avg_cpu_time = m_total_cpu_time / m_total_iters;
+  const auto avg_cpu_time = m_total_cpu_time / d_iters;
   {
     auto &summ = m_state.add_summary("Average CPU Time (Hot)");
     summ.set_string("hide",
@@ -86,13 +102,13 @@ void measure_hot_base::generate_summaries()
     summ.set_string("hint", "item_rate");
     summ.set_string("short_name", "Item Rate");
     summ.set_string("description", "Number of input items handled per second.");
-    summ.set_float64("value", items / avg_cuda_time);
+    summ.set_float64("value", static_cast<double>(items) / avg_cuda_time);
   }
 
   if (const auto bytes = m_state.get_global_bytes_accessed_per_launch();
       bytes != 0)
   {
-    const auto avg_used_gmem_bw = bytes / avg_cuda_time;
+    const auto avg_used_gmem_bw = static_cast<double>(bytes) / avg_cuda_time;
     {
       auto &summ = m_state.add_summary("Average Global Memory Throughput");
       summ.set_string("hint", "byte_rate");
@@ -103,16 +119,10 @@ void measure_hot_base::generate_summaries()
       summ.set_float64("value", avg_used_gmem_bw);
     }
 
-    // TODO cache this in a singleton somewhere.
-    int dev_id{};
-    cudaDeviceProp prop{};
-    NVBENCH_CUDA_CALL(cudaGetDevice(&dev_id));
-    NVBENCH_CUDA_CALL(cudaGetDeviceProperties(&prop, dev_id));
-    // clock rate in khz, width in bits. Result in bytes/sec.
-    const auto peak_gmem_bw = 2 * 1000. * prop.memoryClockRate * // (sec^-1)
-                              prop.memoryBusWidth / CHAR_BIT;    // bytes
-
     {
+      const auto peak_gmem_bw = static_cast<double>(
+        m_state.get_device()->get_global_memory_bus_bandwidth());
+
       auto &summ = m_state.add_summary("Percent Peak Global Memory Throughput");
       summ.set_string("hint", "percentage");
       summ.set_string("short_name", "PeakGMem");
@@ -125,7 +135,7 @@ void measure_hot_base::generate_summaries()
 
   // Log to stdout:
   fmt::memory_buffer param_buffer;
-  fmt::format_to(param_buffer, "");
+  fmt::format_to(param_buffer, "Device={}", m_state.get_device()->get_id());
   const axes_metadata &axes = m_state.get_benchmark().get_axes();
   const auto &axis_values   = m_state.get_axis_values();
   for (const auto &name : axis_values.get_names())
@@ -140,8 +150,8 @@ void measure_hot_base::generate_summaries()
     if (axis_values.get_type(name) == named_values::type::int64 &&
         axes.get_int64_axis(name).is_power_of_two())
     {
-      const nvbench::uint64_t value    = axis_values.get_int64(name);
-      const nvbench::uint64_t exponent = int64_axis::compute_log2(value);
+      const nvbench::int64_t value    = axis_values.get_int64(name);
+      const nvbench::int64_t exponent = int64_axis::compute_log2(value);
       fmt::format_to(param_buffer, "2^{}", exponent);
     }
     else
diff --git a/nvbench/detail/measure_hot.cuh b/nvbench/detail/measure_hot.cuh
index 69c4c5f..6d3c0f3 100644
--- a/nvbench/detail/measure_hot.cuh
+++ b/nvbench/detail/measure_hot.cuh
@@ -26,6 +26,9 @@ struct measure_hot_base
   measure_hot_base &operator=(measure_hot_base &&) = delete;
 
 protected:
+
+  void check();
+
   void initialize()
   {
     m_total_cpu_time    = 0.;
@@ -43,10 +46,10 @@ protected:
   nvbench::cpu_timer m_cpu_timer;
 
   nvbench::int64_t m_total_iters{};
-  nvbench::int64_t m_min_iters{100};
+  nvbench::int64_t m_min_iters{10};
 
   nvbench::float64_t m_min_time{0.5};
-  nvbench::float64_t m_max_time{1.0};
+  nvbench::float64_t m_max_time{3.0};
 
   nvbench::float64_t m_total_cuda_time{};
   nvbench::float64_t m_total_cpu_time{};
@@ -64,6 +67,7 @@ struct measure_hot : public measure_hot_base
 
   void operator()()
   {
+    this->check();
     this->initialize();
     this->run_warmup();
     this->run_trials();
diff --git a/nvbench/detail/state_generator.cu b/nvbench/detail/state_generator.cu
index 7b2b583..27c19d6 100644
--- a/nvbench/detail/state_generator.cu
+++ b/nvbench/detail/state_generator.cu
@@ -1,6 +1,7 @@
 #include <nvbench/detail/state_generator.cuh>
 
 #include <nvbench/benchmark_base.cuh>
+#include <nvbench/device_info.cuh>
 #include <nvbench/named_values.cuh>
 #include <nvbench/type_axis.cuh>
 
@@ -207,36 +208,53 @@ void state_generator::build_axis_configs()
 
 void state_generator::build_states()
 {
-  // Assemble states into a std::vector<std::vector<nvbench::state>>, where the
-  // outer vector has one inner vector per type_config, and all configs in an
-  // inner vector use the same type config. This should probably be wrapped up
-  // into a nicer data structure, but organizing states in this way makes
-  // matching up states to kernel_generator instantiations much easier during
-  // dispatch.
-
   m_states.clear();
-  m_states.reserve(m_type_axis_configs.size());
-  for (const auto &[type_config, axis_mask] : m_type_axis_configs)
+
+  const auto &devices = m_benchmark.get_devices();
+  if (devices.empty())
   {
-    auto &inner_states = m_states.emplace_back();
+    this->add_states_for_device(std::nullopt);
+  }
+  else
+  {
+    for (const auto &device : devices)
+    {
+      this->add_states_for_device(device);
+    }
+  }
+}
+
+void state_generator::add_states_for_device(
+  const std::optional<device_info> &device)
+{
+  const auto num_type_configs = m_type_axis_configs.size();
+  for (std::size_t type_config_index = 0; type_config_index < num_type_configs;
+       ++type_config_index)
+  {
+    const auto &[type_config,
+                 axis_mask] = m_type_axis_configs[type_config_index];
 
     if (!axis_mask)
-    { // Don't generate inner vector if the type config is masked out.
+    { // Don't generate states if the type config is masked out.
       continue;
     }
 
-    inner_states.reserve(m_non_type_axis_configs.size());
     for (const auto &non_type_config : m_non_type_axis_configs)
     {
+      // Concatenate the type + non_type configurations:
       nvbench::named_values config = type_config;
       config.append(non_type_config);
-      inner_states.push_back(nvbench::state{m_benchmark, config});
+
+      // Create the state for this configuration:
+      m_states.push_back(nvbench::state{m_benchmark,
+                                        std::move(config),
+                                        device,
+                                        type_config_index});
     }
   }
 }
 
-std::vector<std::vector<nvbench::state>>
-state_generator::create(const benchmark_base &bench)
+std::vector<nvbench::state> state_generator::create(const benchmark_base &bench)
 {
   state_generator sg{bench};
   sg.build_axis_configs();
diff --git a/nvbench/detail/state_generator.cuh b/nvbench/detail/state_generator.cuh
index 90c4e43..b092b57 100644
--- a/nvbench/detail/state_generator.cuh
+++ b/nvbench/detail/state_generator.cuh
@@ -4,6 +4,7 @@
 #include <nvbench/axis_base.cuh>
 #include <nvbench/state.cuh>
 
+#include <optional>
 #include <string>
 #include <utility>
 #include <vector>
@@ -11,25 +12,27 @@
 namespace nvbench
 {
 struct benchmark_base;
+struct device_info;
+
 namespace detail
 {
 
 struct state_generator
 {
-  static std::vector<std::vector<nvbench::state>>
-  create(const benchmark_base &bench);
+  static std::vector<nvbench::state> create(const benchmark_base &bench);
 
 private:
   explicit state_generator(const benchmark_base &bench);
 
   void build_axis_configs();
   void build_states();
+  void add_states_for_device(const std::optional<nvbench::device_info> &device);
 
   const benchmark_base &m_benchmark;
   // bool is a mask value; true if the config is used.
   std::vector<std::pair<nvbench::named_values, bool>> m_type_axis_configs;
   std::vector<nvbench::named_values> m_non_type_axis_configs;
-  std::vector<std::vector<nvbench::state>> m_states;
+  std::vector<nvbench::state> m_states;
 };
 
 // Detail class; Generates a cartesian product of axis indices.
@@ -73,6 +76,5 @@ struct state_iterator
   std::size_t m_total{};
 };
 
-
 } // namespace detail
 } // namespace nvbench
diff --git a/nvbench/device_info.cuh b/nvbench/device_info.cuh
index b4959d3..e3dc345 100644
--- a/nvbench/device_info.cuh
+++ b/nvbench/device_info.cuh
@@ -15,11 +15,17 @@ namespace nvbench
 namespace detail
 {
 int get_ptx_version(int);
-}
+} // namespace detail
 
 struct device_info
 {
-  explicit device_info(int id);
+  explicit device_info(int device_id);
+
+  // Mainly used by unit tests:
+  device_info(int device_id, cudaDeviceProp prop)
+      : m_id{device_id}
+      , m_prop{prop}
+  {}
 
   /// @return The device's id on the current system.
   [[nodiscard]] int get_id() const { return m_id; }
@@ -30,6 +36,18 @@ struct device_info
     return std::string_view(m_prop.name);
   }
 
+  [[nodiscard]] bool is_active() const
+  {
+    int id{-1};
+    NVBENCH_CUDA_CALL(cudaGetDevice(&id));
+    return id == m_id;
+  }
+
+  void set_active() const
+  {
+    NVBENCH_CUDA_CALL(cudaSetDevice(m_id));
+  }
+
   /// @return The SM version of the current device as (major*100) + (minor*10).
   [[nodiscard]] int get_sm_version() const
   {
@@ -145,6 +163,15 @@ struct device_info
     return m_prop;
   }
 
+  [[nodiscard]] bool operator==(const device_info &o) const
+  {
+    return m_id == o.m_id;
+  }
+  [[nodiscard]] bool operator!=(const device_info &o) const
+  {
+    return m_id != o.m_id;
+  }
+
 private:
   int m_id;
   cudaDeviceProp m_prop;
@@ -152,6 +179,8 @@ private:
 
 // get_ptx_version implementation; this needs to stay in the header so it will
 // pick up the downstream project's compilation settings.
+// TODO this is fragile and will break when called from any library
+// translation unit.
 namespace detail
 {
 // Templated to workaround ODR issues since __global__functions cannot be marked
diff --git a/nvbench/runner.cuh b/nvbench/runner.cuh
index 58aa0d1..ee5683c 100644
--- a/nvbench/runner.cuh
+++ b/nvbench/runner.cuh
@@ -29,24 +29,51 @@ struct runner
 
   void run()
   {
-    auto states_iter = m_benchmark.m_states.begin();
-    if (states_iter + num_type_configs != m_benchmark.m_states.end())
+    if (m_benchmark.m_devices.empty())
     {
-      throw std::runtime_error("State vector doesn't match type_configs.");
+      this->run_device(std::nullopt);
+    }
+    else
+    {
+      for (const auto &device : m_benchmark.m_devices)
+      {
+        this->run_device(device);
+      }
     }
-
-    nvbench::tl::foreach<type_configs>(
-      [&states_iter](auto type_config_wrapper) {
-        using type_config = typename decltype(type_config_wrapper)::type;
-        for (nvbench::state &cur_state : *states_iter)
-        {
-          kernel_generator{}(cur_state, type_config{});
-        }
-        states_iter++;
-      });
   }
 
 private:
+
+  void run_device(const std::optional<nvbench::device_info> &device)
+  {
+    if (device)
+    {
+      device->set_active();
+    }
+
+    // Iterate through type_configs:
+    std::size_t type_config_index = 0;
+    nvbench::tl::foreach<type_configs>([&states = m_benchmark.m_states,
+                                        &type_config_index,
+                                        &device](auto type_config_wrapper) {
+
+      // Get current type_config:
+      using type_config = typename decltype(type_config_wrapper)::type;
+
+      // Find states with the current device / type_config
+      for (nvbench::state &cur_state : states)
+      {
+        if (cur_state.get_device() == device &&
+            cur_state.get_type_config_index() == type_config_index)
+        {
+          kernel_generator{}(cur_state, type_config{});
+        }
+      }
+
+      ++type_config_index;
+    });
+  }
+
   benchmark_type &m_benchmark;
 };
 
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index edbad6b..c4150cc 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -1,10 +1,12 @@
 #pragma once
 
+#include <nvbench/device_info.cuh>
 #include <nvbench/named_values.cuh>
 #include <nvbench/summary.cuh>
 #include <nvbench/types.cuh>
 
 #include <functional>
+#include <optional>
 #include <string>
 #include <vector>
 
@@ -17,7 +19,7 @@ namespace detail
 {
 struct state_generator;
 struct state_tester;
-}
+} // namespace detail
 
 /**
  * Stores all information about a particular benchmark configuration.
@@ -41,6 +43,20 @@ struct state
   state &operator=(const state &) = delete;
   state &operator=(state &&) = default;
 
+  /// The CUDA device associated with this benchmark state. May be
+  /// nullopt for CPU-only benchmarks.
+  [[nodiscard]] const std::optional<nvbench::device_info> &get_device() const
+  {
+    return m_device;
+  }
+
+  /// An index into a benchmark::type_configs type_list. Returns 0 if the
+  /// associated benchmark has no type axes.
+  [[nodiscard]] std::size_t get_type_config_index() const
+  {
+    return m_type_config_index;
+  }
+
   [[nodiscard]] nvbench::int64_t get_int64(const std::string &axis_name) const;
 
   [[nodiscard]] nvbench::float64_t
@@ -99,13 +115,21 @@ private:
       : m_benchmark{bench}
   {}
 
-  state(const benchmark_base &bench, nvbench::named_values values)
+  state(const benchmark_base &bench,
+        nvbench::named_values values,
+        std::optional<nvbench::device_info> device,
+        std::size_t type_config_index)
       : m_benchmark{bench}
       , m_axis_values{std::move(values)}
+      , m_device{std::move(device)}
+      , m_type_config_index{type_config_index}
   {}
 
   std::reference_wrapper<const nvbench::benchmark_base> m_benchmark;
   nvbench::named_values m_axis_values;
+  std::optional<nvbench::device_info> m_device;
+  std::size_t m_type_config_index{};
+
   std::vector<nvbench::summary> m_summaries;
   std::string m_skip_reason;
   nvbench::int64_t m_items_processed_per_launch{};
diff --git a/testing/create.cu b/testing/create.cu
index 5d6ab4e..4708007 100644
--- a/testing/create.cu
+++ b/testing/create.cu
@@ -83,18 +83,15 @@ std::string run_and_get_state_string(nvbench::benchmark_base &bench,
                                      std::size_t num_type_configs,
                                      std::size_t states_per_type_config)
 {
+  bench.set_devices(std::vector<int>{});
   bench.run();
   fmt::memory_buffer buffer;
   const auto &states = bench.get_states();
-  ASSERT(states.size() == num_type_configs);
-  for (const auto &inner_states : states)
+  ASSERT(states.size() == num_type_configs * states_per_type_config);
+  for (const auto &state : states)
   {
-    ASSERT(inner_states.size() == states_per_type_config);
-    for (const auto &state : inner_states)
-    {
-      ASSERT(state.is_skipped());
-      fmt::format_to(buffer, "{}\n", state.get_skip_reason());
-    }
+    ASSERT(state.is_skipped());
+    fmt::format_to(buffer, "{}\n", state.get_skip_reason());
   }
   return fmt::to_string(buffer);
 }
diff --git a/testing/option_parser.cu b/testing/option_parser.cu
index 0b9a958..7b0c825 100644
--- a/testing/option_parser.cu
+++ b/testing/option_parser.cu
@@ -32,7 +32,7 @@ namespace
 {
 
 [[nodiscard]] std::string
-states_to_string(const std::vector<std::vector<nvbench::state>> &states)
+states_to_string(const std::vector<nvbench::state> &states)
 {
   fmt::memory_buffer buffer;
   std::string table_format = "| {:^5} | {:^10} | {:^4} | {:^4} | {:^4} "
@@ -50,24 +50,19 @@ states_to_string(const std::vector<std::vector<nvbench::state>> &states)
                  "Floats",
                  "Strings");
 
-  std::size_t type_config = 0;
-  std::size_t config      = 0;
-  for (const auto &inner_states : states)
+  std::size_t config = 0;
+  for (const auto &state : states)
   {
-    for (const nvbench::state &state : inner_states)
-    {
-      fmt::format_to(buffer,
-                     table_format,
-                     config++,
-                     type_config,
-                     state.get_string("T"),
-                     state.get_string("U"),
-                     state.get_int64("Ints"),
-                     state.get_int64("PO2s"),
-                     state.get_float64("Floats"),
-                     std::string{"\'"} + state.get_string("Strings") + "'");
-    }
-    type_config++;
+    fmt::format_to(buffer,
+                   table_format,
+                   config++,
+                   state.get_type_config_index(),
+                   state.get_string("T"),
+                   state.get_string("U"),
+                   state.get_int64("Ints"),
+                   state.get_int64("PO2s"),
+                   state.get_float64("Floats"),
+                   std::string{"\'"} + state.get_string("Strings") + "'");
   }
   return fmt::to_string(buffer);
 }
@@ -333,8 +328,7 @@ void test_int64_axis_pow2_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = 7 "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = 7 "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -451,8 +445,7 @@ void test_int64_axis_none_to_pow2_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = 7 "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = 7 "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -569,8 +562,7 @@ void test_int64_axis_pow2_to_none_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " PO2s [ ] = 2 "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ ] = 2 "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -584,8 +576,7 @@ void test_int64_axis_pow2_to_none_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -687,8 +678,7 @@ void test_float64_axis_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " Floats [ ] = 3.5 "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Floats [ ] = 3.5 "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -727,8 +717,7 @@ void test_float64_axis_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", "Floats=[3.5:3.6]"});
+    parser.parse({"--benchmark", "TestBench", "--axis", "Floats=[3.5:3.6]"});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -893,8 +882,7 @@ void test_type_axis_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " T [ ] = U8 "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " T [ ] = U8 "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -908,8 +896,7 @@ void test_type_axis_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " T [ ] = [ U8 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " T [ ] = [ U8 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
diff --git a/testing/runner.cu b/testing/runner.cu
index ebfc800..e61d0a3 100644
--- a/testing/runner.cu
+++ b/testing/runner.cu
@@ -73,12 +73,10 @@ void test_empty()
 
   runner.generate_states();
   ASSERT(bench.get_states().size() == 1);
-  ASSERT(bench.get_states().front().size() == 1);
-  ASSERT(bench.get_states().front().front().is_skipped() == false);
+  ASSERT(bench.get_states().front().is_skipped() == false);
   runner.run();
   ASSERT(bench.get_states().size() == 1);
-  ASSERT(bench.get_states().front().size() == 1);
-  ASSERT(bench.get_states().front().front().is_skipped() == true);
+  ASSERT(bench.get_states().front().is_skipped() == true);
 }
 
 void test_non_types()
@@ -94,18 +92,16 @@ void test_non_types()
   runner_type runner{bench};
 
   runner.generate_states();
-  ASSERT(bench.get_states().size() == 1);
-  ASSERT(bench.get_states().front().size() == 27);
-  for (const auto &state : bench.get_states().front())
+  ASSERT(bench.get_states().size() == 27);
+  for (const auto &state : bench.get_states())
   {
     ASSERT(state.is_skipped() == false);
   }
 
   fmt::memory_buffer buffer;
   runner.run();
-  ASSERT(bench.get_states().size() == 1);
-  ASSERT(bench.get_states().front().size() == 27);
-  for (const auto &state : bench.get_states().front())
+  ASSERT(bench.get_states().size() == 27);
+  for (const auto &state : bench.get_states())
   {
     ASSERT(state.is_skipped() == true);
     fmt::format_to(buffer, "{}\n", state.get_skip_reason());
@@ -150,32 +146,25 @@ void test_types()
   using runner_type    = nvbench::runner<benchmark_type>;
 
   benchmark_type bench;
+  bench.set_devices(std::vector<int>{});
   bench.set_type_axes_names({"FloatT", "IntT", "MiscT"});
 
   runner_type runner{bench};
 
   runner.generate_states();
   ASSERT(bench.get_states().size() == 8);
-  for (const auto &inner_states : bench.get_states())
+  for (const auto &state : bench.get_states())
   {
-    ASSERT(inner_states.size() == 1);
-    for (const auto &state : inner_states)
-    {
-      ASSERT(state.is_skipped() == false);
-    }
+    ASSERT(state.is_skipped() == false);
   }
 
   fmt::memory_buffer buffer;
   runner.run();
   ASSERT(bench.get_states().size() == 8);
-  for (const auto &inner_states : bench.get_states())
+  for (const auto &state : bench.get_states())
   {
-    ASSERT(inner_states.size() == 1);
-    for (const auto &state : inner_states)
-    {
-      ASSERT(state.is_skipped() == true);
-      fmt::format_to(buffer, "{}\n", state.get_skip_reason());
-    }
+    ASSERT(state.is_skipped() == true);
+    fmt::format_to(buffer, "{}\n", state.get_skip_reason());
   }
 
   const std::string ref = R"expected(Params: FloatT: F32 IntT: I32 MiscT: bool
@@ -198,6 +187,7 @@ void test_both()
   using runner_type    = nvbench::runner<benchmark_type>;
 
   benchmark_type bench;
+  bench.set_devices(std::vector<int>{});
   bench.set_type_axes_names({"FloatT", "IntT", "MiscT"});
   bench.add_int64_axis("Int", {1, 2, 3});
   bench.add_float64_axis("Float", {11.0, 12.0, 13.0});
@@ -206,27 +196,19 @@ void test_both()
   runner_type runner{bench};
 
   runner.generate_states();
-  ASSERT(bench.get_states().size() == 8);
-  for (const auto &inner_states : bench.get_states())
+  ASSERT(bench.get_states().size() == 8 * 27);
+  for (const auto &state : bench.get_states())
   {
-    ASSERT(inner_states.size() == 27);
-    for (const auto &state : inner_states)
-    {
-      ASSERT(state.is_skipped() == false);
-    }
+    ASSERT(state.is_skipped() == false);
   }
 
   fmt::memory_buffer buffer;
   runner.run();
-  ASSERT(bench.get_states().size() == 8);
-  for (const auto &inner_states : bench.get_states())
+  ASSERT(bench.get_states().size() == 8 * 27);
+  for (const auto &state : bench.get_states())
   {
-    ASSERT(inner_states.size() == 27);
-    for (const auto &state : inner_states)
-    {
-      ASSERT(state.is_skipped() == true);
-      fmt::format_to(buffer, "{}\n", state.get_skip_reason());
-    }
+    ASSERT(state.is_skipped() == true);
+    fmt::format_to(buffer, "{}\n", state.get_skip_reason());
   }
 
   const std::string ref =
diff --git a/testing/state.cu b/testing/state.cu
index 05861c3..18e0fc0 100644
--- a/testing/state.cu
+++ b/testing/state.cu
@@ -13,9 +13,11 @@ NVBENCH_DEFINE_CALLABLE(dummy_generator, dummy_callable);
 using dummy_bench = nvbench::benchmark<dummy_callable>;
 
 // Subclass to gain access to protected members for testing:
+namespace nvbench::detail
+{
 struct state_tester : public nvbench::state
 {
-  state_tester(const nvbench::benchmark_base& bench)
+  state_tester(const nvbench::benchmark_base &bench)
       : nvbench::state{bench}
   {}
 
@@ -27,6 +29,9 @@ struct state_tester : public nvbench::state
                                            std::forward<T>(value)});
   }
 };
+} // namespace nvbench::detail
+
+using nvbench::detail::state_tester;
 
 void test_params()
 {
@@ -50,7 +55,7 @@ void test_summaries()
   ASSERT(state.get_summaries().size() == 0);
 
   {
-    nvbench::summary& summary = state.add_summary("Test Summary1");
+    nvbench::summary &summary = state.add_summary("Test Summary1");
     summary.set_float64("Float", 3.14);
     summary.set_int64("Int", 128);
     summary.set_string("String", "str");
diff --git a/testing/state_generator.cu b/testing/state_generator.cu
index 805fba0..b3a2099 100644
--- a/testing/state_generator.cu
+++ b/testing/state_generator.cu
@@ -130,6 +130,7 @@ void test_basic()
 void test_create()
 {
   dummy_bench bench;
+  bench.set_devices(std::vector<int>{});
   bench.add_float64_axis("Radians", {3.14, 6.28});
   bench.add_int64_axis("VecSize", {2, 3, 4}, nvbench::int64_axis_flags::none);
   bench.add_int64_axis("NumInputs",
@@ -137,22 +138,14 @@ void test_create()
                        nvbench::int64_axis_flags::power_of_two);
   bench.add_string_axis("Strategy", {"Recursive", "Iterative"});
 
-  const std::vector<std::vector<nvbench::state>> states =
+  const std::vector<nvbench::state> states =
     nvbench::detail::state_generator::create(bench);
 
-  // Outer vector has one entry per type_config. There are no type axes, so
-  // there's only one type_config:
-  ASSERT(states.size() == 1);
-
-  // Inner vectors have one entry per non-type config:
   // 2 (Radians) * 3 (VecSize) * 3 (NumInputs) * 2 (Strategy) = 36
-  for (const auto &inner_states : states)
-  {
-    ASSERT(inner_states.size() == 36);
-  }
+  ASSERT(states.size() == 36);
 
   fmt::memory_buffer buffer;
-  std::string table_format =
+  const std::string table_format =
     "| {:^5} | {:^10} | {:^7} | {:^7} | {:^9} | {:^9} |\n";
 
   fmt::format_to(buffer, "\n");
@@ -165,22 +158,17 @@ void test_create()
                  "NumInputs",
                  "Strategy");
 
-  std::size_t type_config = 0;
-  std::size_t config      = 0;
-  for (const auto &inner_states : states)
+  std::size_t config = 0;
+  for (const auto &state : states)
   {
-    for (const nvbench::state &state : inner_states)
-    {
-      fmt::format_to(buffer,
-                     table_format,
-                     config++,
-                     type_config,
-                     state.get_float64("Radians"),
-                     state.get_int64("VecSize"),
-                     state.get_int64("NumInputs"),
-                     state.get_string("Strategy"));
-    }
-    type_config++;
+    fmt::format_to(buffer,
+                   table_format,
+                   config++,
+                   state.get_type_config_index(),
+                   state.get_float64("Radians"),
+                   state.get_int64("VecSize"),
+                   state.get_int64("NumInputs"),
+                   state.get_string("Strategy"));
   }
 
   const std::string ref =
@@ -231,6 +219,7 @@ void test_create()
 void test_create_with_types()
 {
   template_bench bench;
+  bench.set_devices(std::vector<int>{});
   bench.set_type_axes_names({"Floats", "Ints", "Misc"});
   bench.add_float64_axis("Radians", {3.14, 6.28});
   bench.add_int64_axis("VecSize", {2, 3, 4}, nvbench::int64_axis_flags::none);
@@ -239,19 +228,13 @@ void test_create_with_types()
                        nvbench::int64_axis_flags::power_of_two);
   bench.add_string_axis("Strategy", {"Recursive", "Iterative"});
 
-  const std::vector<std::vector<nvbench::state>> states =
+  const std::vector<nvbench::state> states =
     nvbench::detail::state_generator::create(bench);
 
-  // Outer vector has one entry per type_config
-  // 2 (Floats) * 2 (Ints) * 2 (Misc) = 8 total type_configs
-  ASSERT(states.size() == 8);
-
-  // Inner vectors have one entry per non-type config:
-  // 2 (Radians) * 3 (VecSize) * 3 (NumInputs) * 2 (Strategy) = 36
-  for (const auto &inner_states : states)
-  {
-    ASSERT(inner_states.size() == 36);
-  }
+  // - 2 (Floats) * 2 (Ints) * 2 (Misc) = 8 total type_configs
+  // - 2 (Radians) * 3 (VecSize) * 3 (NumInputs) * 2 (Strategy) = 36 non_type
+  //   configs
+  ASSERT(states.size() == 8 * 36);
 
   fmt::memory_buffer buffer;
   std::string table_format = "| {:^5} | {:^10} | {:^6} | {:^4} | {:^4} | {:^7} "
@@ -270,25 +253,20 @@ void test_create_with_types()
                  "NumInputs",
                  "Strategy");
 
-  std::size_t type_config = 0;
-  std::size_t config      = 0;
-  for (const auto &inner_states : states)
+  std::size_t config = 0;
+  for (const auto &state : states)
   {
-    for (const nvbench::state &state : inner_states)
-    {
-      fmt::format_to(buffer,
-                     table_format,
-                     config++,
-                     type_config,
-                     state.get_string("Floats"),
-                     state.get_string("Ints"),
-                     state.get_string("Misc"),
-                     state.get_float64("Radians"),
-                     state.get_int64("VecSize"),
-                     state.get_int64("NumInputs"),
-                     state.get_string("Strategy"));
-    }
-    type_config++;
+    fmt::format_to(buffer,
+                   table_format,
+                   config++,
+                   state.get_type_config_index(),
+                   state.get_string("Floats"),
+                   state.get_string("Ints"),
+                   state.get_string("Misc"),
+                   state.get_float64("Radians"),
+                   state.get_int64("VecSize"),
+                   state.get_int64("NumInputs"),
+                   state.get_string("Strategy"));
   }
 
   const std::string ref =
@@ -591,6 +569,7 @@ void test_create_with_types()
 void test_create_with_masked_types()
 {
   template_bench bench;
+  bench.set_devices(std::vector<int>{});
   bench.set_type_axes_names({"Floats", "Ints", "Misc"});
   bench.add_float64_axis("Radians", {3.14, 6.28});
   bench.add_int64_axis("VecSize", {2, 3, 4}, nvbench::int64_axis_flags::none);
@@ -603,7 +582,7 @@ void test_create_with_masked_types()
   bench.get_axes().get_type_axis("Floats").set_active_inputs({"F32"});
   bench.get_axes().get_type_axis("Ints").set_active_inputs({"I64"});
 
-  const std::vector<std::vector<nvbench::state>> states =
+  const std::vector<nvbench::state> states =
     nvbench::detail::state_generator::create(bench);
 
   fmt::memory_buffer buffer;
@@ -623,25 +602,20 @@ void test_create_with_masked_types()
                  "NumInputs",
                  "Strategy");
 
-  std::size_t type_config = 0;
-  std::size_t config      = 0;
-  for (const auto &inner_states : states)
+  std::size_t config = 0;
+  for (const auto &state : states)
   {
-    for (const nvbench::state &state : inner_states)
-    {
-      fmt::format_to(buffer,
-                     table_format,
-                     config++,
-                     type_config,
-                     state.get_string("Floats"),
-                     state.get_string("Ints"),
-                     state.get_string("Misc"),
-                     state.get_float64("Radians"),
-                     state.get_int64("VecSize"),
-                     state.get_int64("NumInputs"),
-                     state.get_string("Strategy"));
-    }
-    type_config++;
+    fmt::format_to(buffer,
+                   table_format,
+                   config++,
+                   state.get_type_config_index(),
+                   state.get_string("Floats"),
+                   state.get_string("Ints"),
+                   state.get_string("Misc"),
+                   state.get_float64("Radians"),
+                   state.get_int64("VecSize"),
+                   state.get_int64("NumInputs"),
+                   state.get_string("Strategy"));
   }
 
   const std::string ref =
@@ -725,7 +699,69 @@ void test_create_with_masked_types()
   ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
 }
 
+void test_devices()
+{
+  const auto device_0 = nvbench::device_info{0, {}};
+  const auto device_1 = nvbench::device_info{1, {}};
+  const auto device_2 = nvbench::device_info{2, {}};
+
+  dummy_bench bench;
+  bench.set_devices({device_0, device_1, device_2});
+  bench.add_string_axis("S", {"foo", "bar"});
+  bench.add_int64_axis("I", {2, 4});
+
+  const std::vector<nvbench::state> states =
+    nvbench::detail::state_generator::create(bench);
+
+  // 3 devices * 4 axis configs = 12 total states
+  ASSERT(states.size() == 12);
+
+  fmt::memory_buffer buffer;
+  const std::string table_format =
+    "| {:^5} | {:^6} | {:^5} | {:^3} |\n";
+
+  fmt::format_to(buffer, "\n");
+  fmt::format_to(buffer,
+                 table_format,
+                 "State",
+                 "Device",
+                 "S",
+                 "I");
+
+  std::size_t config = 0;
+  for (const auto &state : states)
+  {
+    fmt::format_to(buffer,
+                   table_format,
+                   config++,
+                   state.get_device()->get_id(),
+                   state.get_string("S"),
+                   state.get_int64("I"));
+  }
+
+  const std::string ref =
+    R"expected(
+| State | Device |   S   |  I  |
+|   0   |   0    |  foo  |  2  |
+|   1   |   0    |  bar  |  2  |
+|   2   |   0    |  foo  |  4  |
+|   3   |   0    |  bar  |  4  |
+|   4   |   1    |  foo  |  2  |
+|   5   |   1    |  bar  |  2  |
+|   6   |   1    |  foo  |  4  |
+|   7   |   1    |  bar  |  4  |
+|   8   |   2    |  foo  |  2  |
+|   9   |   2    |  bar  |  2  |
+|  10   |   2    |  foo  |  4  |
+|  11   |   2    |  bar  |  4  |
+)expected";
+
+  const std::string test = fmt::to_string(buffer);
+  ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
+}
+
 int main()
+try
 {
   test_empty();
   test_single_state();
@@ -733,4 +769,11 @@ int main()
   test_create();
   test_create_with_types();
   test_create_with_masked_types();
+  test_devices();
+  return 0;
+}
+catch (const std::exception &e) // Report the error and exit non-zero.
+{
+  fmt::print("{}\n", e.what());
+  return 1;
 }