diff --git a/docs/cli_help.md b/docs/cli_help.md index 9a6e6fc..80f82f7 100644 --- a/docs/cli_help.md +++ b/docs/cli_help.md @@ -94,3 +94,10 @@ noise). * Applies to the most recent `--benchmark`, or all benchmarks if specified before any `--benchmark` arguments. + +* `--run-once` + * Only run the benchmark once, skipping any warmup runs and batched + measurements. + * Intended for use with external profiling tools. + * Applies to the most recent `--benchmark`, or all benchmarks if specified + before any `--benchmark` arguments. diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh index 092e38b..c77c496 100644 --- a/nvbench/benchmark_base.cuh +++ b/nvbench/benchmark_base.cuh @@ -177,6 +177,17 @@ struct benchmark_base } /// @} + /// If true, the benchmark is only run once, skipping all warmup runs and only + /// executing a single non-batched measurement. This is intended for use with + /// external profiling tools. @{ + [[nodiscard]] bool get_run_once() const { return m_run_once; } + benchmark_base &set_run_once(bool v) + { + m_run_once = v; + return *this; + } + /// @} + /// Accumulate at least this many seconds of timing data per measurement. @{ [[nodiscard]] nvbench::float64_t get_min_time() const { return m_min_time; } benchmark_base &set_min_time(nvbench::float64_t min_time) @@ -239,6 +250,8 @@ protected: optional_ref m_printer; + bool m_run_once{false}; + nvbench::int64_t m_min_samples{10}; nvbench::float64_t m_min_time{0.5}; nvbench::float64_t m_max_noise{0.005}; // 0.5% relative standard deviation diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index 2ac39e7..f80c2d2 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -36,6 +36,7 @@ namespace nvbench::detail measure_cold_base::measure_cold_base(state &exec_state) : m_state{exec_state} + , m_run_once{exec_state.get_run_once()} , m_min_samples{exec_state.get_min_samples()} , m_max_noise{exec_state.get_max_noise()} , m_min_time{exec_state.get_min_time()} diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index ed1c2f9..b1ea953 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -98,6 +98,8 @@ protected: nvbench::detail::l2flush m_l2flush; nvbench::blocking_kernel m_blocker; + bool m_run_once{false}; + nvbench::int64_t m_min_samples{}; nvbench::float64_t m_max_noise{}; // rel stdev nvbench::float64_t m_min_time{}; @@ -177,6 +179,11 @@ private: // measurement. void run_warmup() { + if (m_run_once) + { // Skip warmups + return; + } + kernel_launch_timer timer(*this); this->launch_kernel(timer); this->check_skip_time(m_cuda_timer.get_duration()); @@ -206,6 +213,11 @@ private: m_timeout_timer.stop(); const auto total_time = m_timeout_timer.get_duration(); + if (m_run_once) + { + break; + } + if (m_total_cuda_time > m_min_time && // Min time okay m_total_samples > m_min_samples && // Min samples okay m_cuda_noise < m_max_noise) // Noise okay diff --git a/nvbench/detail/state_exec.cuh b/nvbench/detail/state_exec.cuh index 4a71360..7e6f209 100644 --- a/nvbench/detail/state_exec.cuh +++ b/nvbench/detail/state_exec.cuh @@ -44,11 +44,20 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher) "`ExecTags` argument must be a member (or combination of " "members) from nvbench::exec_tag."); - // If no measurements selected, pick some defaults based on the modifiers: constexpr auto measure_tags = tags & measure_mask; + constexpr auto modifier_tags = tags & modifier_mask; + + // "run once" is handled by the cold measurement: + if (!(modifier_tags & run_once) && this->get_run_once()) + { + constexpr auto run_once_tags = modifier_tags | cold | run_once; + this->exec(run_once_tags, std::forward(kernel_launcher)); + return; + } + + // If no measurements selected, pick some defaults based on the modifiers: if constexpr (!measure_tags) { - constexpr auto modifier_tags = tags & modifier_mask; if constexpr (modifier_tags & (timer | sync)) { // Can't do hot timings with manual timer or sync; whole point is to not // sync in between executions. diff --git a/nvbench/exec_tag.cuh b/nvbench/exec_tag.cuh index 07e520f..b49ed36 100644 --- a/nvbench/exec_tag.cuh +++ b/nvbench/exec_tag.cuh @@ -34,7 +34,8 @@ enum class exec_flag timer = 0x01, // KernelLauncher uses manual timing no_block = 0x02, // Disables use of `blocking_kernel`. sync = 0x04, // KernelLauncher has indicated that it will sync - modifier_mask = timer | no_block | sync, + run_once = 0x08, // Only run the benchmark once (for profiling). + modifier_mask = timer | no_block | sync | run_once, // Measurement types: cold = 0x0100, // measure_hot @@ -93,6 +94,7 @@ using none_t = tag; using timer_t = tag; using no_block_t = tag; using sync_t = tag; +using run_once_t = tag; using hot_t = tag; using cold_t = tag; using modifier_mask_t = tag; @@ -102,6 +104,7 @@ constexpr inline none_t none; constexpr inline timer_t timer; constexpr inline no_block_t no_block; constexpr inline sync_t sync; +constexpr inline run_once_t run_once; constexpr inline cold_t cold; constexpr inline hot_t hot; constexpr inline modifier_mask_t modifier_mask; diff --git a/nvbench/option_parser.cuh b/nvbench/option_parser.cuh index a6b0a99..3d663fb 100644 --- a/nvbench/option_parser.cuh +++ b/nvbench/option_parser.cuh @@ -88,6 +88,8 @@ private: void print_help() const; void print_help_axis() const; + void enable_run_once(); + void add_benchmark(const std::string &name); void replay_global_args(); diff --git a/nvbench/option_parser.cxx b/nvbench/option_parser.cxx index 4f402d6..c45948e 100644 --- a/nvbench/option_parser.cxx +++ b/nvbench/option_parser.cxx @@ -385,6 +385,11 @@ void option_parser::parse_range(option_parser::arg_iterator_t first, this->print_list(); std::exit(0); } + else if (arg == "--run-once") + { + this->enable_run_once(); + first += 1; + } else if (arg == "--quiet" | arg == "-q") { // Setting this flag prevents the default stdout printer from being @@ -542,6 +547,19 @@ void option_parser::print_help_axis() const fmt::print("{}\n", ::cli_help_axis_text); } +void option_parser::enable_run_once() +{ + // If no active benchmark, save args as global. + if (m_benchmarks.empty()) + { + m_global_benchmark_args.push_back("--run-once"); + return; + } + + benchmark_base &bench = *m_benchmarks.back(); + bench.set_run_once(true); +} + void option_parser::add_benchmark(const std::string &name) try { diff --git a/nvbench/state.cuh b/nvbench/state.cuh index 5497d35..ebabab6 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -150,6 +150,13 @@ struct state } /// @} + /// If true, the benchmark is only run once, skipping all warmup runs and only + /// executing a single non-batched measurement. This is intended for use with + /// external profiling tools. @{ + [[nodiscard]] bool get_run_once() const { return m_run_once; } + void set_run_once(bool v) { m_run_once = v; } + /// @} + /// Accumulate at least this many seconds of timing data per measurement. @{ [[nodiscard]] nvbench::float64_t get_min_time() const { return m_min_time; } void set_min_time(nvbench::float64_t min_time) { m_min_time = min_time; } @@ -256,6 +263,8 @@ private: std::optional m_device; std::size_t m_type_config_index{}; + bool m_run_once{false}; + nvbench::int64_t m_min_samples; nvbench::float64_t m_min_time; nvbench::float64_t m_max_noise; diff --git a/nvbench/state.cxx b/nvbench/state.cxx index 07501d4..db2ab6d 100644 --- a/nvbench/state.cxx +++ b/nvbench/state.cxx @@ -33,6 +33,7 @@ namespace nvbench state::state(const benchmark_base &bench) : m_benchmark{bench} + , m_run_once{bench.get_run_once()} , m_min_samples{bench.get_min_samples()} , m_min_time{bench.get_min_time()} , m_max_noise{bench.get_max_noise()} @@ -48,6 +49,7 @@ state::state(const benchmark_base &bench, , m_axis_values{std::move(values)} , m_device{std::move(device)} , m_type_config_index{type_config_index} + , m_run_once{bench.get_run_once()} , m_min_samples{bench.get_min_samples()} , m_min_time{bench.get_min_time()} , m_max_noise{bench.get_max_noise()}