diff --git a/docs/cli_help.md b/docs/cli_help.md index 22b033b..8585409 100644 --- a/docs/cli_help.md +++ b/docs/cli_help.md @@ -93,13 +93,21 @@ * Applies to the most recent `--benchmark`, or all benchmarks if specified before any `--benchmark` arguments. -* `--warmup-runs ` - * Execute `` warmup runs before collecting cold measurement samples. +* `--cold-warmup-runs ` + * Execute up to `` warmup runs before collecting cold measurement samples. * The minimum is 1 warmup run. * Default is 1 warmup run. * Applies to the most recent `--benchmark`, or all benchmarks if specified before any `--benchmark` arguments. +* `--cold-max-warmup-walltime ` + * Stop cold warmup after the total warmup walltime exceeds ``. + * The limit is checked after each warmup run, so actual warmup time may exceed + this value by one warmup run. + * Default is -1 seconds (disabled). + * Applies to the most recent `--benchmark`, or all benchmarks if specified + before any `--benchmark` arguments. + * `--throttle-threshold ` * Set the GPU throttle threshold as percentage of the device's default clock rate. * Default is 75. diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh index 3e194c6..6926f80 100644 --- a/nvbench/benchmark_base.cuh +++ b/nvbench/benchmark_base.cuh @@ -167,10 +167,23 @@ struct benchmark_base /// @} /// Execute this many warmup runs before collecting cold measurement samples. @{ - [[nodiscard]] nvbench::int64_t get_warmup_runs() const { return m_warmup_runs; } - benchmark_base &set_warmup_runs(nvbench::int64_t warmup_runs) + [[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; } + benchmark_base &set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs) { - m_warmup_runs = warmup_runs > nvbench::int64_t{0} ? warmup_runs : nvbench::int64_t{1}; + m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs + : nvbench::int64_t{1}; + return *this; + } + /// @} + + /// Stop cold warmups after this many seconds of walltime. Negative values disable the limit. @{ + [[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const + { + return m_cold_max_warmup_walltime; + } + benchmark_base &set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime) + { + m_cold_max_warmup_walltime = cold_max_warmup_walltime; return *this; } /// @} @@ -330,8 +343,9 @@ protected: bool m_skip_batched{false}; nvbench::int64_t m_min_samples{10}; - nvbench::int64_t m_warmup_runs{1}; + nvbench::int64_t m_cold_warmup_runs{1}; + nvbench::float64_t m_cold_max_warmup_walltime{-1.}; nvbench::float64_t m_skip_time{-1.}; nvbench::float64_t m_timeout{15.}; diff --git a/nvbench/benchmark_base.cxx b/nvbench/benchmark_base.cxx index 2fe4dab..6f5d331 100644 --- a/nvbench/benchmark_base.cxx +++ b/nvbench/benchmark_base.cxx @@ -43,8 +43,9 @@ std::unique_ptr benchmark_base::clone() const result->m_run_once = m_run_once; result->m_disable_blocking_kernel = m_disable_blocking_kernel; - result->m_min_samples = m_min_samples; - result->m_warmup_runs = m_warmup_runs; + result->m_min_samples = m_min_samples; + result->m_cold_warmup_runs = m_cold_warmup_runs; + result->m_cold_max_warmup_walltime = m_cold_max_warmup_walltime; result->m_skip_time = m_skip_time; result->m_timeout = m_timeout; diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index c3885d8..76eb941 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -46,7 +46,8 @@ measure_cold_base::measure_cold_base(state &exec_state) , m_run_once{exec_state.get_run_once()} , m_check_throttling(!exec_state.get_run_once()) , m_min_samples{exec_state.get_min_samples()} - , m_warmup_runs{exec_state.get_warmup_runs()} + , m_cold_warmup_runs{exec_state.get_cold_warmup_runs()} + , m_cold_max_warmup_walltime{exec_state.get_cold_max_warmup_walltime()} , m_skip_time{exec_state.get_skip_time()} , m_timeout{exec_state.get_timeout()} , m_throttle_threshold(exec_state.get_throttle_threshold()) diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index f37d207..3f228e8 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -110,8 +110,9 @@ protected: bool m_check_throttling{true}; nvbench::int64_t m_min_samples{}; - nvbench::int64_t m_warmup_runs{1}; + nvbench::int64_t m_cold_warmup_runs{1}; + nvbench::float64_t m_cold_max_warmup_walltime{}; nvbench::float64_t m_skip_time{}; nvbench::float64_t m_timeout{}; @@ -240,7 +241,7 @@ struct measure_cold : public measure_cold_base } private: - // Run the kernel m_warmup_runs times, measuring the GPU time of the last run. + // Run the kernel m_cold_warmup_runs times, measuring the GPU time of the last run. // If under skip_time, skip the measurement. void run_warmup() { @@ -258,10 +259,19 @@ private: // see https://github.com/NVIDIA/nvbench/issues/240 constexpr bool disable_blocking_kernel = true; kernel_launch_timer timer(*this, disable_blocking_kernel); + nvbench::cpu_timer warmup_walltime_timer; - for (nvbench::int64_t warmup_run = 0; warmup_run < m_warmup_runs; ++warmup_run) + warmup_walltime_timer.start(); + for (nvbench::int64_t warmup_run = 0; warmup_run < m_cold_warmup_runs; ++warmup_run) { this->launch_kernel(timer); + warmup_walltime_timer.stop(); + + if (m_cold_max_warmup_walltime > 0. && + warmup_walltime_timer.get_duration() > m_cold_max_warmup_walltime) + { + break; + } } this->check_skip_time(m_cuda_timer.get_duration()); } diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu index 850bf0f..b1143de 100644 --- a/nvbench/json_printer.cu +++ b/nvbench/json_printer.cu @@ -429,10 +429,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches) bench["name"] = bench_ptr->get_name(); bench["index"] = bench_index; - bench["min_samples"] = bench_ptr->get_min_samples(); - bench["warmup_runs"] = bench_ptr->get_warmup_runs(); - bench["skip_time"] = bench_ptr->get_skip_time(); - bench["timeout"] = bench_ptr->get_timeout(); + bench["min_samples"] = bench_ptr->get_min_samples(); + bench["cold_warmup_runs"] = bench_ptr->get_cold_warmup_runs(); + bench["cold_max_warmup_walltime"] = bench_ptr->get_cold_max_warmup_walltime(); + bench["skip_time"] = bench_ptr->get_skip_time(); + bench["timeout"] = bench_ptr->get_timeout(); auto &devices = bench["devices"]; for (const auto &dev_info : bench_ptr->get_devices()) @@ -487,10 +488,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches) st["name"] = exec_state.get_axis_values_as_string(); - st["min_samples"] = exec_state.get_min_samples(); - st["warmup_runs"] = exec_state.get_warmup_runs(); - st["skip_time"] = exec_state.get_skip_time(); - st["timeout"] = exec_state.get_timeout(); + st["min_samples"] = exec_state.get_min_samples(); + st["cold_warmup_runs"] = exec_state.get_cold_warmup_runs(); + st["cold_max_warmup_walltime"] = exec_state.get_cold_max_warmup_walltime(); + st["skip_time"] = exec_state.get_skip_time(); + st["timeout"] = exec_state.get_timeout(); st["device"] = exec_state.get_device()->get_id(); st["type_config_index"] = exec_state.get_type_config_index(); diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu index d64cb1c..6890709 100644 --- a/nvbench/option_parser.cu +++ b/nvbench/option_parser.cu @@ -526,14 +526,14 @@ void option_parser::parse_range(option_parser::arg_iterator_t first, this->update_axis(first[1]); first += 2; } - else if (arg == "--min-samples" || arg == "--warmup-runs") + else if (arg == "--min-samples" || arg == "--cold-warmup-runs") { check_params(1); this->update_int64_prop(first[0], first[1]); first += 2; } - else if (arg == "--skip-time" || arg == "--timeout" || arg == "--throttle-threshold" || - arg == "--throttle-recovery-delay") + else if (arg == "--skip-time" || arg == "--timeout" || arg == "--cold-max-warmup-walltime" || + arg == "--throttle-threshold" || arg == "--throttle-recovery-delay") { check_params(1); this->update_float64_prop(first[0], first[1]); @@ -990,9 +990,9 @@ try { bench.set_min_samples(value); } - else if (prop_arg == "--warmup-runs") + else if (prop_arg == "--cold-warmup-runs") { - bench.set_warmup_runs(value); + bench.set_cold_warmup_runs(value); } else { @@ -1107,6 +1107,10 @@ try { bench.set_timeout(value); } + else if (prop_arg == "--cold-max-warmup-walltime") + { + bench.set_cold_max_warmup_walltime(value); + } else if (prop_arg == "--throttle-threshold") { bench.set_throttle_threshold(static_cast(value) / 100.0f); diff --git a/nvbench/state.cuh b/nvbench/state.cuh index ee36087..506d1df 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -153,10 +153,22 @@ struct state /// @} /// Execute this many warmup runs before collecting cold measurement samples. @{ - [[nodiscard]] nvbench::int64_t get_warmup_runs() const { return m_warmup_runs; } - void set_warmup_runs(nvbench::int64_t warmup_runs) + [[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; } + void set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs) { - m_warmup_runs = warmup_runs > nvbench::int64_t{0} ? warmup_runs : nvbench::int64_t{1}; + m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs + : nvbench::int64_t{1}; + } + /// @} + + /// Stop cold warmups after this many seconds of walltime. Negative values disable the limit. @{ + [[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const + { + return m_cold_max_warmup_walltime; + } + void set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime) + { + m_cold_max_warmup_walltime = cold_max_warmup_walltime; } /// @} @@ -340,8 +352,9 @@ private: std::string m_stopping_criterion; nvbench::int64_t m_min_samples; - nvbench::int64_t m_warmup_runs; + nvbench::int64_t m_cold_warmup_runs; + nvbench::float64_t m_cold_max_warmup_walltime; nvbench::float64_t m_skip_time; nvbench::float64_t m_timeout; diff --git a/nvbench/state.cxx b/nvbench/state.cxx index 15c17c8..6aaa496 100644 --- a/nvbench/state.cxx +++ b/nvbench/state.cxx @@ -39,7 +39,8 @@ state::state(const benchmark_base &bench) , m_criterion_params{bench.get_criterion_params()} , m_stopping_criterion(bench.get_stopping_criterion()) , m_min_samples{bench.get_min_samples()} - , m_warmup_runs{bench.get_warmup_runs()} + , m_cold_warmup_runs{bench.get_cold_warmup_runs()} + , m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()} , m_skip_time{bench.get_skip_time()} , m_timeout{bench.get_timeout()} , m_throttle_threshold{bench.get_throttle_threshold()} @@ -62,7 +63,8 @@ state::state(const benchmark_base &bench, , m_criterion_params{bench.get_criterion_params()} , m_stopping_criterion(bench.get_stopping_criterion()) , m_min_samples{bench.get_min_samples()} - , m_warmup_runs{bench.get_warmup_runs()} + , m_cold_warmup_runs{bench.get_cold_warmup_runs()} + , m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()} , m_skip_time{bench.get_skip_time()} , m_timeout{bench.get_timeout()} , m_throttle_threshold{bench.get_throttle_threshold()} diff --git a/testing/option_parser.cu b/testing/option_parser.cu index c2c0bd9..2aaf0f5 100644 --- a/testing/option_parser.cu +++ b/testing/option_parser.cu @@ -1155,33 +1155,33 @@ void test_min_samples() ASSERT(states[0].get_min_samples() == 12345); } -void test_warmup_runs() +void test_cold_warmup_runs() { { nvbench::option_parser parser; - parser.parse({"--benchmark", "DummyBench", "--warmup-runs", "12345"}); + parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "12345"}); const auto &states = parser_to_states(parser); ASSERT(states.size() == 1); - ASSERT(states[0].get_warmup_runs() == 12345); + ASSERT(states[0].get_cold_warmup_runs() == 12345); } { nvbench::option_parser parser; - parser.parse({"--benchmark", "DummyBench", "--warmup-runs", "0"}); + parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "0"}); const auto &states = parser_to_states(parser); ASSERT(states.size() == 1); - ASSERT(states[0].get_warmup_runs() == 1); + ASSERT(states[0].get_cold_warmup_runs() == 1); } { nvbench::option_parser parser; - parser.parse({"--benchmark", "DummyBench", "--warmup-runs", "-12345"}); + parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "-12345"}); const auto &states = parser_to_states(parser); ASSERT(states.size() == 1); - ASSERT(states[0].get_warmup_runs() == 1); + ASSERT(states[0].get_cold_warmup_runs() == 1); } } @@ -1195,6 +1195,16 @@ void test_skip_time() ASSERT(std::abs(states[0].get_skip_time() - 12345e2) < 1.); } +void test_cold_max_warmup_walltime() +{ + nvbench::option_parser parser; + parser.parse({"--benchmark", "DummyBench", "--cold-max-warmup-walltime", "12345e2"}); + const auto &states = parser_to_states(parser); + + ASSERT(states.size() == 1); + ASSERT(std::abs(states[0].get_cold_max_warmup_walltime() - 12345e2) < 1.); +} + void test_timeout() { nvbench::option_parser parser; @@ -1496,8 +1506,9 @@ try test_axis_before_benchmark(); test_min_samples(); - test_warmup_runs(); + test_cold_warmup_runs(); test_skip_time(); + test_cold_max_warmup_walltime(); test_timeout(); test_stopping_criterion(); diff --git a/testing/state_generator.cu b/testing/state_generator.cu index 3299300..383c8db 100644 --- a/testing/state_generator.cu +++ b/testing/state_generator.cu @@ -762,10 +762,11 @@ void test_devices() void test_termination_criteria() { - const nvbench::int64_t min_samples = 1000; - const nvbench::int64_t warmup_runs = 7; - const nvbench::float64_t skip_time = 4000; - const nvbench::float64_t timeout = 5000; + const nvbench::int64_t min_samples = 1000; + const nvbench::int64_t cold_warmup_runs = 7; + const nvbench::float64_t cold_max_warmup_walltime = 3000; + const nvbench::float64_t skip_time = 4000; + const nvbench::float64_t timeout = 5000; // for comparing floats auto within_one = [](auto a, auto b) { return std::abs(a - b) < 1.; }; @@ -773,7 +774,8 @@ void test_termination_criteria() dummy_bench bench; bench.set_devices(std::vector{}); bench.set_min_samples(min_samples); - bench.set_warmup_runs(warmup_runs); + bench.set_cold_warmup_runs(cold_warmup_runs); + bench.set_cold_max_warmup_walltime(cold_max_warmup_walltime); bench.set_skip_time(skip_time); bench.set_timeout(timeout); @@ -781,7 +783,8 @@ void test_termination_criteria() ASSERT(states.size() == 1); ASSERT(min_samples == states[0].get_min_samples()); - ASSERT(warmup_runs == states[0].get_warmup_runs()); + ASSERT(cold_warmup_runs == states[0].get_cold_warmup_runs()); + ASSERT(within_one(cold_max_warmup_walltime, states[0].get_cold_max_warmup_walltime())); ASSERT(within_one(skip_time, states[0].get_skip_time())); ASSERT(within_one(timeout, states[0].get_timeout())); }