diff --git a/docs/cli_help.md b/docs/cli_help.md
index 9a6e6fc..80f82f7 100644
--- a/docs/cli_help.md
+++ b/docs/cli_help.md
@@ -94,3 +94,10 @@
     noise).
   * Applies to the most recent `--benchmark`, or all benchmarks if specified
     before any `--benchmark` arguments.
+
+* `--run-once`
+  * Only run the benchmark once, skipping any warmup runs and batched
+    measurements.
+  * Intended for use with external profiling tools.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh
index 092e38b..c77c496 100644
--- a/nvbench/benchmark_base.cuh
+++ b/nvbench/benchmark_base.cuh
@@ -177,6 +177,17 @@ struct benchmark_base
   }
   /// @}
 
+  /// When true, the benchmark runs exactly once: warmup runs are skipped and a
+  /// single non-batched (cold) measurement is taken. Intended for use with
+  /// external profiling tools; set by the `--run-once` CLI option. @{
+  [[nodiscard]] bool get_run_once() const { return m_run_once; }
+  benchmark_base &set_run_once(bool v)
+  {
+    m_run_once = v;
+    return *this; // Returning *this lets setter calls be chained.
+  }
+  /// @}
+
   /// Accumulate at least this many seconds of timing data per measurement. @{
   [[nodiscard]] nvbench::float64_t get_min_time() const { return m_min_time; }
   benchmark_base &set_min_time(nvbench::float64_t min_time)
@@ -239,6 +250,8 @@ protected:
 
   optional_ref<nvbench::printer_base> m_printer;
 
+  bool m_run_once{false};
+
   nvbench::int64_t m_min_samples{10};
   nvbench::float64_t m_min_time{0.5};
   nvbench::float64_t m_max_noise{0.005}; // 0.5% relative standard deviation
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index 2ac39e7..f80c2d2 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -36,6 +36,7 @@ namespace nvbench::detail
 
 measure_cold_base::measure_cold_base(state &exec_state)
     : m_state{exec_state}
+    , m_run_once{exec_state.get_run_once()}
     , m_min_samples{exec_state.get_min_samples()}
     , m_max_noise{exec_state.get_max_noise()}
     , m_min_time{exec_state.get_min_time()}
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index ed1c2f9..b1ea953 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -98,6 +98,8 @@ protected:
   nvbench::detail::l2flush m_l2flush;
   nvbench::blocking_kernel m_blocker;
 
+  bool m_run_once{false};
+
   nvbench::int64_t m_min_samples{};
   nvbench::float64_t m_max_noise{}; // rel stdev
   nvbench::float64_t m_min_time{};
@@ -177,6 +179,11 @@ private:
   // measurement.
   void run_warmup()
   {
+    if (m_run_once)
+    { // Skip warmups
+      return;
+    }
+
     kernel_launch_timer<use_blocking_kernel> timer(*this);
     this->launch_kernel(timer);
     this->check_skip_time(m_cuda_timer.get_duration());
@@ -206,6 +213,11 @@ private:
       m_timeout_timer.stop();
       const auto total_time = m_timeout_timer.get_duration();
 
+      if (m_run_once)
+      {
+        break;
+      }
+
       if (m_total_cuda_time > m_min_time &&  // Min time okay
           m_total_samples > m_min_samples && // Min samples okay
           m_cuda_noise < m_max_noise)        // Noise okay
diff --git a/nvbench/detail/state_exec.cuh b/nvbench/detail/state_exec.cuh
index 4a71360..7e6f209 100644
--- a/nvbench/detail/state_exec.cuh
+++ b/nvbench/detail/state_exec.cuh
@@ -44,11 +44,20 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
                 "`ExecTags` argument must be a member (or combination of "
                 "members) from nvbench::exec_tag.");
 
-  // If no measurements selected, pick some defaults based on the modifiers:
   constexpr auto measure_tags = tags & measure_mask;
+  constexpr auto modifier_tags = tags & modifier_mask;
+
+  // "run once" is handled by the cold measurement:
+  if (!(modifier_tags & run_once) && this->get_run_once())
+  {
+    constexpr auto run_once_tags = modifier_tags | cold | run_once;
+    this->exec(run_once_tags, std::forward<KernelLauncher>(kernel_launcher));
+    return;
+  }
+
+  // If no measurements selected, pick some defaults based on the modifiers:
   if constexpr (!measure_tags)
   {
-    constexpr auto modifier_tags = tags & modifier_mask;
     if constexpr (modifier_tags & (timer | sync))
     { // Can't do hot timings with manual timer or sync; whole point is to not
       // sync in between executions.
diff --git a/nvbench/exec_tag.cuh b/nvbench/exec_tag.cuh
index 07e520f..b49ed36 100644
--- a/nvbench/exec_tag.cuh
+++ b/nvbench/exec_tag.cuh
@@ -34,7 +34,8 @@ enum class exec_flag
   timer    = 0x01, // KernelLauncher uses manual timing
   no_block = 0x02, // Disables use of `blocking_kernel`.
   sync     = 0x04, // KernelLauncher has indicated that it will sync
-  modifier_mask = timer | no_block | sync,
+  run_once = 0x08, // Only run the benchmark once (for profiling).
+  modifier_mask = timer | no_block | sync | run_once,
 
   // Measurement types:
   cold = 0x0100, // measure_hot
@@ -93,6 +94,7 @@ using none_t          = tag<nvbench::detail::exec_flag::none>;
 using timer_t         = tag<nvbench::detail::exec_flag::timer>;
 using no_block_t      = tag<nvbench::detail::exec_flag::no_block>;
 using sync_t          = tag<nvbench::detail::exec_flag::sync>;
+using run_once_t      = tag<nvbench::detail::exec_flag::run_once>;
 using hot_t           = tag<nvbench::detail::exec_flag::hot>;
 using cold_t          = tag<nvbench::detail::exec_flag::cold>;
 using modifier_mask_t = tag<nvbench::detail::exec_flag::modifier_mask>;
@@ -102,6 +104,7 @@ constexpr inline none_t none;
 constexpr inline timer_t timer;
 constexpr inline no_block_t no_block;
 constexpr inline sync_t sync;
+constexpr inline run_once_t run_once;
 constexpr inline cold_t cold;
 constexpr inline hot_t hot;
 constexpr inline modifier_mask_t modifier_mask;
diff --git a/nvbench/option_parser.cuh b/nvbench/option_parser.cuh
index a6b0a99..3d663fb 100644
--- a/nvbench/option_parser.cuh
+++ b/nvbench/option_parser.cuh
@@ -88,6 +88,8 @@ private:
   void print_help() const;
   void print_help_axis() const;
 
+  void enable_run_once();
+
   void add_benchmark(const std::string &name);
   void replay_global_args();
 
diff --git a/nvbench/option_parser.cxx b/nvbench/option_parser.cxx
index 4f402d6..c45948e 100644
--- a/nvbench/option_parser.cxx
+++ b/nvbench/option_parser.cxx
@@ -385,6 +385,11 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
       this->print_list();
       std::exit(0);
     }
+    else if (arg == "--run-once")
+    {
+      this->enable_run_once();
+      first += 1;
+    }
     else if (arg == "--quiet" | arg == "-q")
     {
       // Setting this flag prevents the default stdout printer from being
@@ -542,6 +547,19 @@ void option_parser::print_help_axis() const
   fmt::print("{}\n", ::cli_help_axis_text);
 }
 
+void option_parser::enable_run_once()
+{
+  // The flag targets the most recently added benchmark, when there is one.
+  if (!m_benchmarks.empty())
+  {
+    benchmark_base &latest = *m_benchmarks.back();
+    latest.set_run_once(true);
+    return;
+  }
+  // No benchmark yet: store the flag with the global benchmark arguments.
+  m_global_benchmark_args.push_back("--run-once");
+}
+
 void option_parser::add_benchmark(const std::string &name)
 try
 {
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index 5497d35..ebabab6 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -150,6 +150,13 @@ struct state
   }
   /// @}
 
+  /// When true, this state runs exactly once: warmup runs are skipped and a
+  /// single non-batched measurement is taken (for external profiling tools).
+  /// Initialized from the owning benchmark's `get_run_once()` value. @{
+  [[nodiscard]] bool get_run_once() const { return m_run_once; }
+  void set_run_once(bool v) { m_run_once = v; }
+  /// @}
+
   /// Accumulate at least this many seconds of timing data per measurement. @{
   [[nodiscard]] nvbench::float64_t get_min_time() const { return m_min_time; }
   void set_min_time(nvbench::float64_t min_time) { m_min_time = min_time; }
@@ -256,6 +263,8 @@ private:
   std::optional<nvbench::device_info> m_device;
   std::size_t m_type_config_index{};
 
+  bool m_run_once{false};
+
   nvbench::int64_t m_min_samples;
   nvbench::float64_t m_min_time;
   nvbench::float64_t m_max_noise;
diff --git a/nvbench/state.cxx b/nvbench/state.cxx
index 07501d4..db2ab6d 100644
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -33,6 +33,7 @@ namespace nvbench
 
 state::state(const benchmark_base &bench)
     : m_benchmark{bench}
+    , m_run_once{bench.get_run_once()}
     , m_min_samples{bench.get_min_samples()}
     , m_min_time{bench.get_min_time()}
     , m_max_noise{bench.get_max_noise()}
@@ -48,6 +49,7 @@ state::state(const benchmark_base &bench,
     , m_axis_values{std::move(values)}
     , m_device{std::move(device)}
     , m_type_config_index{type_config_index}
+    , m_run_once{bench.get_run_once()}
     , m_min_samples{bench.get_min_samples()}
     , m_min_time{bench.get_min_time()}
     , m_max_noise{bench.get_max_noise()}