diff --git a/docs/cli_help.md b/docs/cli_help.md index 15a89a7..8585409 100644 --- a/docs/cli_help.md +++ b/docs/cli_help.md @@ -93,6 +93,21 @@ * Applies to the most recent `--benchmark`, or all benchmarks if specified before any `--benchmark` arguments. +* `--cold-warmup-runs <count>` + * Execute up to `<count>` warmup runs before collecting cold measurement samples. + * The minimum is 1 warmup run. + * Default is 1 warmup run. + * Applies to the most recent `--benchmark`, or all benchmarks if specified + before any `--benchmark` arguments. + +* `--cold-max-warmup-walltime <seconds>` + * Stop cold warmup after the total warmup walltime exceeds `<seconds>`. + * The limit is checked after each warmup run, so actual warmup time may exceed + this value by one warmup run. + * Default is -1 seconds (disabled). + * Applies to the most recent `--benchmark`, or all benchmarks if specified + before any `--benchmark` arguments. + * `--throttle-threshold <value>` * Set the GPU throttle threshold as percentage of the device's default clock rate. * Default is 75. diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh index 939bbeb..6926f80 100644 --- a/nvbench/benchmark_base.cuh +++ b/nvbench/benchmark_base.cuh @@ -166,6 +166,28 @@ struct benchmark_base } /// @} + /// Execute this many warmup runs before collecting cold measurement samples. @{ + [[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; } + benchmark_base &set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs) + { + m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs + : nvbench::int64_t{1}; + return *this; + } + /// @} + + /// Stop cold warmups after this many seconds of walltime. Negative values disable the limit. 
@{ + [[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const + { + return m_cold_max_warmup_walltime; + } + benchmark_base &set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime) + { + m_cold_max_warmup_walltime = cold_max_warmup_walltime; + return *this; + } + /// @} + /// If true, the benchmark measurements only record CPU time and assume no GPU work is performed. /// @{ [[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; } @@ -321,7 +343,9 @@ protected: bool m_skip_batched{false}; nvbench::int64_t m_min_samples{10}; + nvbench::int64_t m_cold_warmup_runs{1}; + nvbench::float64_t m_cold_max_warmup_walltime{-1.}; nvbench::float64_t m_skip_time{-1.}; nvbench::float64_t m_timeout{15.}; diff --git a/nvbench/benchmark_base.cxx b/nvbench/benchmark_base.cxx index a237e5c..6f5d331 100644 --- a/nvbench/benchmark_base.cxx +++ b/nvbench/benchmark_base.cxx @@ -43,7 +43,9 @@ std::unique_ptr benchmark_base::clone() const result->m_run_once = m_run_once; result->m_disable_blocking_kernel = m_disable_blocking_kernel; - result->m_min_samples = m_min_samples; + result->m_min_samples = m_min_samples; + result->m_cold_warmup_runs = m_cold_warmup_runs; + result->m_cold_max_warmup_walltime = m_cold_max_warmup_walltime; result->m_skip_time = m_skip_time; result->m_timeout = m_timeout; diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index 38fb6f7..76eb941 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -46,6 +46,8 @@ measure_cold_base::measure_cold_base(state &exec_state) , m_run_once{exec_state.get_run_once()} , m_check_throttling(!exec_state.get_run_once()) , m_min_samples{exec_state.get_min_samples()} + , m_cold_warmup_runs{exec_state.get_cold_warmup_runs()} + , m_cold_max_warmup_walltime{exec_state.get_cold_max_warmup_walltime()} , m_skip_time{exec_state.get_skip_time()} , m_timeout{exec_state.get_timeout()} , m_throttle_threshold(exec_state.get_throttle_threshold()) diff 
--git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index 89b4201..3f228e8 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -110,7 +110,9 @@ protected: bool m_check_throttling{true}; nvbench::int64_t m_min_samples{}; + nvbench::int64_t m_cold_warmup_runs{1}; + nvbench::float64_t m_cold_max_warmup_walltime{}; nvbench::float64_t m_skip_time{}; nvbench::float64_t m_timeout{}; @@ -239,8 +241,8 @@ struct measure_cold : public measure_cold_base } private: - // Run the kernel once, measuring the GPU time. If under skip_time, skip the - // measurement. + // Run the kernel m_cold_warmup_runs times, measuring the GPU time of the last run. + // If under skip_time, skip the measurement. void run_warmup() { if (m_run_once) @@ -248,12 +250,29 @@ private: return; } + // Ensure blocking kernel is loaded during the warmup + // Ref: https://github.com/NVIDIA/nvbench/issues/339 + this->block_stream(); + this->unblock_stream(); + // disable use of blocking kernel for warm-up run // see https://github.com/NVIDIA/nvbench/issues/240 constexpr bool disable_blocking_kernel = true; kernel_launch_timer timer(*this, disable_blocking_kernel); + nvbench::cpu_timer warmup_walltime_timer; - this->launch_kernel(timer); + warmup_walltime_timer.start(); + for (nvbench::int64_t warmup_run = 0; warmup_run < m_cold_warmup_runs; ++warmup_run) + { + this->launch_kernel(timer); + warmup_walltime_timer.stop(); + + if (m_cold_max_warmup_walltime > 0. 
&& + warmup_walltime_timer.get_duration() > m_cold_max_warmup_walltime) + { + break; + } + } this->check_skip_time(m_cuda_timer.get_duration()); } diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu index b5f41c3..9ae1719 100644 --- a/nvbench/json_printer.cu +++ b/nvbench/json_printer.cu @@ -429,9 +429,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches) bench["name"] = bench_ptr->get_name(); bench["index"] = bench_index; - bench["min_samples"] = bench_ptr->get_min_samples(); - bench["skip_time"] = bench_ptr->get_skip_time(); - bench["timeout"] = bench_ptr->get_timeout(); + bench["min_samples"] = bench_ptr->get_min_samples(); + bench["cold_warmup_runs"] = bench_ptr->get_cold_warmup_runs(); + bench["cold_max_warmup_walltime"] = bench_ptr->get_cold_max_warmup_walltime(); + bench["skip_time"] = bench_ptr->get_skip_time(); + bench["timeout"] = bench_ptr->get_timeout(); auto &devices = bench["devices"]; for (const auto &dev_info : bench_ptr->get_devices()) @@ -486,9 +488,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches) st["name"] = exec_state.get_axis_values_as_string(); - st["min_samples"] = exec_state.get_min_samples(); - st["skip_time"] = exec_state.get_skip_time(); - st["timeout"] = exec_state.get_timeout(); + st["min_samples"] = exec_state.get_min_samples(); + st["cold_warmup_runs"] = exec_state.get_cold_warmup_runs(); + st["cold_max_warmup_walltime"] = exec_state.get_cold_max_warmup_walltime(); + st["skip_time"] = exec_state.get_skip_time(); + st["timeout"] = exec_state.get_timeout(); st["device"] = exec_state.get_device()->get_id(); st["type_config_index"] = exec_state.get_type_config_index(); diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu index f0570bb..8d3e1b3 100644 --- a/nvbench/option_parser.cu +++ b/nvbench/option_parser.cu @@ -549,14 +549,14 @@ void option_parser::parse_range(option_parser::arg_iterator_t first, this->update_axis(first[1]); first += 2; } - else 
if (arg == "--min-samples") + else if (arg == "--min-samples" || arg == "--cold-warmup-runs") { check_params(1); this->update_int64_prop(first[0], first[1]); first += 2; } - else if (arg == "--skip-time" || arg == "--timeout" || arg == "--throttle-threshold" || - arg == "--throttle-recovery-delay") + else if (arg == "--skip-time" || arg == "--timeout" || arg == "--cold-max-warmup-walltime" || + arg == "--throttle-threshold" || arg == "--throttle-recovery-delay") { check_params(1); this->update_float64_prop(first[0], first[1]); @@ -1015,6 +1015,10 @@ try { bench.set_min_samples(value); } + else if (prop_arg == "--cold-warmup-runs") + { + bench.set_cold_warmup_runs(value); + } else { NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg); @@ -1128,6 +1132,10 @@ try { bench.set_timeout(value); } + else if (prop_arg == "--cold-max-warmup-walltime") + { + bench.set_cold_max_warmup_walltime(value); + } else if (prop_arg == "--throttle-threshold") { bench.set_throttle_threshold(static_cast(value) / 100.0f); diff --git a/nvbench/state.cuh b/nvbench/state.cuh index aad5a2c..506d1df 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -152,6 +152,26 @@ struct state void set_min_samples(nvbench::int64_t min_samples) { m_min_samples = min_samples; } /// @} + /// Execute this many warmup runs before collecting cold measurement samples. @{ + [[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; } + void set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs) + { + m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs + : nvbench::int64_t{1}; + } + /// @} + + /// Stop cold warmups after this many seconds of walltime. Negative values disable the limit. 
@{ + [[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const + { + return m_cold_max_warmup_walltime; + } + void set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime) + { + m_cold_max_warmup_walltime = cold_max_warmup_walltime; + } + /// @} + [[nodiscard]] const nvbench::criterion_params &get_criterion_params() const { return m_criterion_params; @@ -332,7 +352,9 @@ private: std::string m_stopping_criterion; nvbench::int64_t m_min_samples; + nvbench::int64_t m_cold_warmup_runs; + nvbench::float64_t m_cold_max_warmup_walltime; nvbench::float64_t m_skip_time; nvbench::float64_t m_timeout; diff --git a/nvbench/state.cxx b/nvbench/state.cxx index af53502..6aaa496 100644 --- a/nvbench/state.cxx +++ b/nvbench/state.cxx @@ -39,6 +39,8 @@ state::state(const benchmark_base &bench) , m_criterion_params{bench.get_criterion_params()} , m_stopping_criterion(bench.get_stopping_criterion()) , m_min_samples{bench.get_min_samples()} + , m_cold_warmup_runs{bench.get_cold_warmup_runs()} + , m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()} , m_skip_time{bench.get_skip_time()} , m_timeout{bench.get_timeout()} , m_throttle_threshold{bench.get_throttle_threshold()} @@ -61,6 +63,8 @@ state::state(const benchmark_base &bench, , m_criterion_params{bench.get_criterion_params()} , m_stopping_criterion(bench.get_stopping_criterion()) , m_min_samples{bench.get_min_samples()} + , m_cold_warmup_runs{bench.get_cold_warmup_runs()} + , m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()} , m_skip_time{bench.get_skip_time()} , m_timeout{bench.get_timeout()} , m_throttle_threshold{bench.get_throttle_threshold()} diff --git a/testing/option_parser.cu b/testing/option_parser.cu index 3cd98a3..e665673 100644 --- a/testing/option_parser.cu +++ b/testing/option_parser.cu @@ -1204,6 +1204,36 @@ void test_min_samples() ASSERT(states[0].get_min_samples() == 12345); } +void test_cold_warmup_runs() +{ + { + nvbench::option_parser parser; + 
parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "12345"}); + const auto &states = parser_to_states(parser); + + ASSERT(states.size() == 1); + ASSERT(states[0].get_cold_warmup_runs() == 12345); + } + + { + nvbench::option_parser parser; + parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "0"}); + const auto &states = parser_to_states(parser); + + ASSERT(states.size() == 1); + ASSERT(states[0].get_cold_warmup_runs() == 1); + } + + { + nvbench::option_parser parser; + parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "-12345"}); + const auto &states = parser_to_states(parser); + + ASSERT(states.size() == 1); + ASSERT(states[0].get_cold_warmup_runs() == 1); + } +} + void test_skip_time() { nvbench::option_parser parser; @@ -1214,6 +1244,16 @@ void test_skip_time() ASSERT(std::abs(states[0].get_skip_time() - 12345e2) < 1.); } +void test_cold_max_warmup_walltime() +{ + nvbench::option_parser parser; + parser.parse({"--benchmark", "DummyBench", "--cold-max-warmup-walltime", "12345e2"}); + const auto &states = parser_to_states(parser); + + ASSERT(states.size() == 1); + ASSERT(std::abs(states[0].get_cold_max_warmup_walltime() - 12345e2) < 1.); +} + void test_timeout() { nvbench::option_parser parser; @@ -1531,7 +1571,9 @@ try test_axis_before_benchmark(); test_min_samples(); + test_cold_warmup_runs(); test_skip_time(); + test_cold_max_warmup_walltime(); test_timeout(); test_output_parent_directories_created(); diff --git a/testing/state_generator.cu b/testing/state_generator.cu index 14eceb5..383c8db 100644 --- a/testing/state_generator.cu +++ b/testing/state_generator.cu @@ -762,9 +762,11 @@ void test_devices() void test_termination_criteria() { - const nvbench::int64_t min_samples = 1000; - const nvbench::float64_t skip_time = 4000; - const nvbench::float64_t timeout = 5000; + const nvbench::int64_t min_samples = 1000; + const nvbench::int64_t cold_warmup_runs = 7; + const nvbench::float64_t cold_max_warmup_walltime = 3000; + 
const nvbench::float64_t skip_time = 4000; + const nvbench::float64_t timeout = 5000; // for comparing floats auto within_one = [](auto a, auto b) { return std::abs(a - b) < 1.; }; @@ -772,6 +774,8 @@ void test_termination_criteria() dummy_bench bench; bench.set_devices(std::vector{}); bench.set_min_samples(min_samples); + bench.set_cold_warmup_runs(cold_warmup_runs); + bench.set_cold_max_warmup_walltime(cold_max_warmup_walltime); bench.set_skip_time(skip_time); bench.set_timeout(timeout); @@ -779,6 +783,8 @@ void test_termination_criteria() ASSERT(states.size() == 1); ASSERT(min_samples == states[0].get_min_samples()); + ASSERT(cold_warmup_runs == states[0].get_cold_warmup_runs()); + ASSERT(within_one(cold_max_warmup_walltime, states[0].get_cold_max_warmup_walltime())); ASSERT(within_one(skip_time, states[0].get_skip_time())); ASSERT(within_one(timeout, states[0].get_timeout())); }