diff --git a/.coderabbit.yaml b/.coderabbit.yaml new file mode 100644 index 0000000..7336b51 --- /dev/null +++ b/.coderabbit.yaml @@ -0,0 +1,137 @@ +# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +language: en-US +tone_instructions: | + Be direct, technical, brief. No praise, emojis, headings, or collapsible menus. + Start each comment with one prefix: + - suggestion: optional improvement; + - important: must-fix/high-impact risk; + - critical: blocking correctness/security/data-loss. + +reviews: + profile: chill + + high_level_summary: true + high_level_summary_in_walkthrough: true + poem: false + in_progress_fortune: false + sequence_diagrams: false + estimate_code_review_effort: false + collapse_walkthrough: true + + # Reduce noisy status/details sections. + request_changes_workflow: false + review_status: false + review_details: false + enable_prompt_for_ai_agents: false + + auto_review: + enabled: false + drafts: false + base_branches: + - "^main$" + - "^branch/[0-9]+\\.[0-9]+\\.x$" + ignore_usernames: ["copy-pr-bot", "dependabot[bot]", "github-actions[bot]", "nv-automation-bot"] + + tools: + gitleaks: + enabled: true + markdownlint: + enabled: true + shellcheck: + enabled: true + + # Keep Autofix available, but disable the other finishing touch actions. + finishing_touches: + docstrings: + enabled: false + unit_tests: + enabled: false + simplify: + enabled: false + + pre_merge_checks: + docstrings: + mode: "off" + title: + mode: "off" + description: + mode: "off" + issue_assessment: + mode: "off" + custom_checks: [] + + path_instructions: + - path: "nvbench/**/*" + instructions: | + Focus on benchmark correctness, CUDA stream/event ordering, synchronization behavior, error handling, + resource ownership, exception safety, public API compatibility, measurement semantics, statistical + summaries, and test coverage. Prefer comments that catch correctness, API, compile-time, runtime, or + measurement-regression risks. + + - path: "python/**/*" + instructions: | + Focus on Python API stability, pybind11/C++ exception boundaries, GIL behavior, CUDA interoperability, + object lifetime, package metadata, type stubs, JSON/result parsing compatibility, and tests. Avoid + style-only comments already covered by Ruff, clang-format, or pre-commit. + + - path: "testing/**/*" + instructions: | + Focus on whether tests cover observable behavior, remain deterministic, handle GPU availability and CUDA + version differences correctly, avoid excessive runtime, and exercise install/export/package boundaries + where relevant. + + - path: "examples/**/*" + instructions: | + Check that examples are minimal, buildable, technically correct, use NVBench APIs idiomatically, avoid + excessive benchmark runtime, and demonstrate behavior that is useful to users. + + - path: "docs/**/*" + instructions: | + For documentation changes, focus on technical accuracy, buildable examples, CLI/API consistency, + version compatibility, and whether behavior changes have matching documentation updates. + + - path: "ci/**/*" + instructions: | + For CI and build scripts, focus on matrix correctness, targeted build/test behavior, cache/artifact + handling, environment setup, GPU availability assumptions, clear failures, and avoiding unnecessary + expensive jobs. + + - path: ".github/**/*" + instructions: | + For GitHub workflows and repository automation, focus on permissions, event triggers, matrix generation, + status/check behavior, security boundaries, and avoiding unnecessary CI fanout. + + - path: "cmake/**/*" + instructions: | + Focus on package exports, install-tree and build-tree compatibility, target usage requirements, static + and shared library behavior, CUDA architecture handling, and compatibility across supported CMake/CUDA + versions. + + - path: "**/CMakeLists.txt" + instructions: | + Focus on target dependencies, exported usage requirements, option behavior, install rules, tests, + examples, Python package integration, and compatibility across supported CMake/CUDA versions. + + - path: "CMakePresets.json" + instructions: | + Focus on preset inheritance, CI parity, cache variable correctness, CUDA compiler/toolchain assumptions, + and whether presets remain useful for local and automated builds. + +knowledge_base: + opt_out: false + code_guidelines: + filePatterns: + - ".clang-format" + - ".pre-commit-config.yaml" + - "README.md" + - "CMakeLists.txt" + - "CMakePresets.json" + - "pyproject.toml" + - "python/README.md" + - "python/pyproject.toml" + - "docs/benchmarks.md" + - "docs/cli_help.md" + - "docs/cli_help_axis.md" diff --git a/docs/cli_help.md b/docs/cli_help.md index 15a89a7..8585409 100644 --- a/docs/cli_help.md +++ b/docs/cli_help.md @@ -93,6 +93,21 @@ * Applies to the most recent `--benchmark`, or all benchmarks if specified before any `--benchmark` arguments. +* `--cold-warmup-runs ` + * Execute up to `` warmup runs before collecting cold measurement samples. + * The minimum is 1 warmup run. + * Default is 1 warmup run. + * Applies to the most recent `--benchmark`, or all benchmarks if specified + before any `--benchmark` arguments. + +* `--cold-max-warmup-walltime ` + * Stop cold warmup after the total warmup walltime exceeds ``. + * The limit is checked after each warmup run, so actual warmup time may exceed + this value by one warmup run. + * Default is -1 seconds (disabled). + * Applies to the most recent `--benchmark`, or all benchmarks if specified + before any `--benchmark` arguments. + * `--throttle-threshold ` * Set the GPU throttle threshold as percentage of the device's default clock rate. * Default is 75. diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh index 939bbeb..6926f80 100644 --- a/nvbench/benchmark_base.cuh +++ b/nvbench/benchmark_base.cuh @@ -166,6 +166,28 @@ struct benchmark_base } /// @} + /// Execute this many warmup runs before collecting cold measurement samples. @{ + [[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; } + benchmark_base &set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs) + { + m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs + : nvbench::int64_t{1}; + return *this; + } + /// @} + + /// Stop cold warmups after this many seconds of walltime. Negative values disable the limit. @{ + [[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const + { + return m_cold_max_warmup_walltime; + } + benchmark_base &set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime) + { + m_cold_max_warmup_walltime = cold_max_warmup_walltime; + return *this; + } + /// @} + /// If true, the benchmark measurements only record CPU time and assume no GPU work is performed. /// @{ [[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; } @@ -321,7 +343,9 @@ protected: bool m_skip_batched{false}; nvbench::int64_t m_min_samples{10}; + nvbench::int64_t m_cold_warmup_runs{1}; + nvbench::float64_t m_cold_max_warmup_walltime{-1.}; nvbench::float64_t m_skip_time{-1.}; nvbench::float64_t m_timeout{15.}; diff --git a/nvbench/benchmark_base.cxx b/nvbench/benchmark_base.cxx index a237e5c..6f5d331 100644 --- a/nvbench/benchmark_base.cxx +++ b/nvbench/benchmark_base.cxx @@ -43,7 +43,9 @@ std::unique_ptr benchmark_base::clone() const result->m_run_once = m_run_once; result->m_disable_blocking_kernel = m_disable_blocking_kernel; - result->m_min_samples = m_min_samples; + result->m_min_samples = m_min_samples; + result->m_cold_warmup_runs = m_cold_warmup_runs; + result->m_cold_max_warmup_walltime = m_cold_max_warmup_walltime; result->m_skip_time = m_skip_time; result->m_timeout = m_timeout; diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index 38fb6f7..76eb941 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -46,6 +46,8 @@ measure_cold_base::measure_cold_base(state &exec_state) , m_run_once{exec_state.get_run_once()} , m_check_throttling(!exec_state.get_run_once()) , m_min_samples{exec_state.get_min_samples()} + , m_cold_warmup_runs{exec_state.get_cold_warmup_runs()} + , m_cold_max_warmup_walltime{exec_state.get_cold_max_warmup_walltime()} , m_skip_time{exec_state.get_skip_time()} , m_timeout{exec_state.get_timeout()} , m_throttle_threshold(exec_state.get_throttle_threshold()) diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index 89b4201..3f228e8 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -110,7 +110,9 @@ protected: bool m_check_throttling{true}; nvbench::int64_t m_min_samples{}; + nvbench::int64_t m_cold_warmup_runs{1}; + nvbench::float64_t m_cold_max_warmup_walltime{}; nvbench::float64_t m_skip_time{}; nvbench::float64_t m_timeout{}; @@ -239,8 +241,8 @@ struct measure_cold : public measure_cold_base } private: - // Run the kernel once, measuring the GPU time. If under skip_time, skip the - // measurement. + // Run the kernel m_cold_warmup_runs times, measuring the GPU time of the last run. + // If under skip_time, skip the measurement. void run_warmup() { if (m_run_once) @@ -248,12 +250,29 @@ private: return; } + // Ensure blocking kernel is loaded during the warmup + // Ref: https://github.com/NVIDIA/nvbench/issues/339 + this->block_stream(); + this->unblock_stream(); + // disable use of blocking kernel for warm-up run // see https://github.com/NVIDIA/nvbench/issues/240 constexpr bool disable_blocking_kernel = true; kernel_launch_timer timer(*this, disable_blocking_kernel); + nvbench::cpu_timer warmup_walltime_timer; - this->launch_kernel(timer); + warmup_walltime_timer.start(); + for (nvbench::int64_t warmup_run = 0; warmup_run < m_cold_warmup_runs; ++warmup_run) + { + this->launch_kernel(timer); + warmup_walltime_timer.stop(); + + if (m_cold_max_warmup_walltime > 0. && + warmup_walltime_timer.get_duration() > m_cold_max_warmup_walltime) + { + break; + } + } this->check_skip_time(m_cuda_timer.get_duration()); } diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu index e363a45..9ae1719 100644 --- a/nvbench/json_printer.cu +++ b/nvbench/json_printer.cu @@ -49,7 +49,7 @@ namespace fs = std::filesystem; #include namespace fs = std::experimental::filesystem; #else -static_assert(false, "No or found."); +#error "No or found." #endif #if NVBENCH_CPP_DIALECT >= 2020 @@ -429,9 +429,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches) bench["name"] = bench_ptr->get_name(); bench["index"] = bench_index; - bench["min_samples"] = bench_ptr->get_min_samples(); - bench["skip_time"] = bench_ptr->get_skip_time(); - bench["timeout"] = bench_ptr->get_timeout(); + bench["min_samples"] = bench_ptr->get_min_samples(); + bench["cold_warmup_runs"] = bench_ptr->get_cold_warmup_runs(); + bench["cold_max_warmup_walltime"] = bench_ptr->get_cold_max_warmup_walltime(); + bench["skip_time"] = bench_ptr->get_skip_time(); + bench["timeout"] = bench_ptr->get_timeout(); auto &devices = bench["devices"]; for (const auto &dev_info : bench_ptr->get_devices()) @@ -486,9 +488,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches) st["name"] = exec_state.get_axis_values_as_string(); - st["min_samples"] = exec_state.get_min_samples(); - st["skip_time"] = exec_state.get_skip_time(); - st["timeout"] = exec_state.get_timeout(); + st["min_samples"] = exec_state.get_min_samples(); + st["cold_warmup_runs"] = exec_state.get_cold_warmup_runs(); + st["cold_max_warmup_walltime"] = exec_state.get_cold_max_warmup_walltime(); + st["skip_time"] = exec_state.get_skip_time(); + st["timeout"] = exec_state.get_timeout(); st["device"] = exec_state.get_device()->get_id(); st["type_config_index"] = exec_state.get_type_config_index(); diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu index 530ae94..8d3e1b3 100644 --- a/nvbench/option_parser.cu +++ b/nvbench/option_parser.cu @@ -48,9 +48,20 @@ #include #include #include +#include #include #include +#if __has_include() +#include +namespace fs = std::filesystem; +#elif __has_include() +#include +namespace fs = std::experimental::filesystem; +#else +#error "No or found." +#endif + namespace { @@ -115,6 +126,18 @@ catch (const std::exception &) void parse(std::string_view input, std::string &val) { val = input; } +void create_output_parent_directories(const std::string &spec) +{ + const fs::path output_path{spec}; + const fs::path parent_path = output_path.parent_path(); + if (parent_path.empty()) + { + return; + } + + fs::create_directories(parent_path); +} + // Parses a list of values ", , , ..." into a vector: template std::vector parse_list_values(std::string_view list_spec) @@ -526,14 +549,14 @@ void option_parser::parse_range(option_parser::arg_iterator_t first, this->update_axis(first[1]); first += 2; } - else if (arg == "--min-samples") + else if (arg == "--min-samples" || arg == "--cold-warmup-runs") { check_params(1); this->update_int64_prop(first[0], first[1]); first += 2; } - else if (arg == "--skip-time" || arg == "--timeout" || arg == "--throttle-threshold" || - arg == "--throttle-recovery-delay") + else if (arg == "--skip-time" || arg == "--timeout" || arg == "--cold-max-warmup-walltime" || + arg == "--throttle-threshold" || arg == "--throttle-recovery-delay") { check_params(1); this->update_float64_prop(first[0], first[1]); @@ -622,6 +645,8 @@ std::ostream &option_parser::printer_spec_to_ostream(const std::string &spec) } else // spec is a filename: { + ::create_output_parent_directories(spec); + auto file_stream = std::make_unique(); // Throw if file can't open file_stream->exceptions(file_stream->exceptions() | std::ios::failbit); @@ -990,6 +1015,10 @@ try { bench.set_min_samples(value); } + else if (prop_arg == "--cold-warmup-runs") + { + bench.set_cold_warmup_runs(value); + } else { NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg); @@ -1103,6 +1132,10 @@ try { bench.set_timeout(value); } + else if (prop_arg == "--cold-max-warmup-walltime") + { + bench.set_cold_max_warmup_walltime(value); + } else if (prop_arg == "--throttle-threshold") { bench.set_throttle_threshold(static_cast(value) / 100.0f); diff --git a/nvbench/state.cuh b/nvbench/state.cuh index aad5a2c..506d1df 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -152,6 +152,26 @@ struct state void set_min_samples(nvbench::int64_t min_samples) { m_min_samples = min_samples; } /// @} + /// Execute this many warmup runs before collecting cold measurement samples. @{ + [[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; } + void set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs) + { + m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs + : nvbench::int64_t{1}; + } + /// @} + + /// Stop cold warmups after this many seconds of walltime. Negative values disable the limit. @{ + [[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const + { + return m_cold_max_warmup_walltime; + } + void set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime) + { + m_cold_max_warmup_walltime = cold_max_warmup_walltime; + } + /// @} + [[nodiscard]] const nvbench::criterion_params &get_criterion_params() const { return m_criterion_params; @@ -332,7 +352,9 @@ private: std::string m_stopping_criterion; nvbench::int64_t m_min_samples; + nvbench::int64_t m_cold_warmup_runs; + nvbench::float64_t m_cold_max_warmup_walltime; nvbench::float64_t m_skip_time; nvbench::float64_t m_timeout; diff --git a/nvbench/state.cxx b/nvbench/state.cxx index af53502..6aaa496 100644 --- a/nvbench/state.cxx +++ b/nvbench/state.cxx @@ -39,6 +39,8 @@ state::state(const benchmark_base &bench) , m_criterion_params{bench.get_criterion_params()} , m_stopping_criterion(bench.get_stopping_criterion()) , m_min_samples{bench.get_min_samples()} + , m_cold_warmup_runs{bench.get_cold_warmup_runs()} + , m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()} , m_skip_time{bench.get_skip_time()} , m_timeout{bench.get_timeout()} , m_throttle_threshold{bench.get_throttle_threshold()} @@ -61,6 +63,8 @@ state::state(const benchmark_base &bench, , m_criterion_params{bench.get_criterion_params()} , m_stopping_criterion(bench.get_stopping_criterion()) , m_min_samples{bench.get_min_samples()} + , m_cold_warmup_runs{bench.get_cold_warmup_runs()} + , m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()} , m_skip_time{bench.get_skip_time()} , m_timeout{bench.get_timeout()} , m_throttle_threshold{bench.get_throttle_threshold()} diff --git a/testing/option_parser.cu b/testing/option_parser.cu index 6104e5e..e665673 100644 --- a/testing/option_parser.cu +++ b/testing/option_parser.cu @@ -22,6 +22,21 @@ #include +#include +#include +#include +#include + +#if __has_include() +#include +namespace fs = std::filesystem; +#elif __has_include() +#include +namespace fs = std::experimental::filesystem; +#else +#error "No or found." +#endif + #include "test_asserts.cuh" //============================================================================== @@ -49,6 +64,40 @@ NVBENCH_BENCH_TYPES(TestBench, NVBENCH_TYPE_AXES(Ts, Us)) namespace { +struct temp_tree +{ + explicit temp_tree(fs::path root) + : root_path{std::move(root)} + { + std::error_code ec; + fs::remove_all(root_path, ec); + if (ec) + { + throw std::runtime_error{fmt::format("Failed to remove temporary directory `{}`: {}", + root_path.string(), + ec.message())}; + } + } + + ~temp_tree() + { + std::error_code ec; + fs::remove_all(root_path, ec); + if (ec) + { + std::cerr << "Failed to remove temporary directory `" << root_path.string() + << "`: " << ec.message() << "\n"; + } + } + + temp_tree(const temp_tree &) = delete; + temp_tree(temp_tree &&) = delete; + temp_tree &operator=(const temp_tree &) = delete; + temp_tree &operator=(temp_tree &&) = delete; + + fs::path root_path; +}; + [[nodiscard]] std::string states_to_string(const std::vector &states) { fmt::memory_buffer buffer; @@ -1155,6 +1204,36 @@ void test_min_samples() ASSERT(states[0].get_min_samples() == 12345); } +void test_cold_warmup_runs() +{ + { + nvbench::option_parser parser; + parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "12345"}); + const auto &states = parser_to_states(parser); + + ASSERT(states.size() == 1); + ASSERT(states[0].get_cold_warmup_runs() == 12345); + } + + { + nvbench::option_parser parser; + parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "0"}); + const auto &states = parser_to_states(parser); + + ASSERT(states.size() == 1); + ASSERT(states[0].get_cold_warmup_runs() == 1); + } + + { + nvbench::option_parser parser; + parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "-12345"}); + const auto &states = parser_to_states(parser); + + ASSERT(states.size() == 1); + ASSERT(states[0].get_cold_warmup_runs() == 1); + } +} + void test_skip_time() { nvbench::option_parser parser; @@ -1165,6 +1244,16 @@ void test_skip_time() ASSERT(std::abs(states[0].get_skip_time() - 12345e2) < 1.); } +void test_cold_max_warmup_walltime() +{ + nvbench::option_parser parser; + parser.parse({"--benchmark", "DummyBench", "--cold-max-warmup-walltime", "12345e2"}); + const auto &states = parser_to_states(parser); + + ASSERT(states.size() == 1); + ASSERT(std::abs(states[0].get_cold_max_warmup_walltime() - 12345e2) < 1.); +} + void test_timeout() { nvbench::option_parser parser; @@ -1175,6 +1264,22 @@ void test_timeout() ASSERT(std::abs(states[0].get_timeout() - 12345e2) < 1.); } +void test_output_parent_directories_created() +{ + const auto unique_suffix = std::chrono::steady_clock::now().time_since_epoch().count(); + const temp_tree temp{fs::temp_directory_path() / + fmt::format("nvbench_option_parser_test_{}", unique_suffix)}; + const auto output_path = temp.root_path / "nested" / "results.json"; + + { + nvbench::option_parser parser; + parser.parse({"--json", output_path.string()}); + } + + ASSERT(fs::is_directory(output_path.parent_path())); + ASSERT(fs::exists(output_path)); +} + void test_stopping_criterion() { { // Per benchmark criterion @@ -1466,8 +1571,11 @@ try test_axis_before_benchmark(); test_min_samples(); + test_cold_warmup_runs(); test_skip_time(); + test_cold_max_warmup_walltime(); test_timeout(); + test_output_parent_directories_created(); test_stopping_criterion(); diff --git a/testing/state_generator.cu b/testing/state_generator.cu index 14eceb5..383c8db 100644 --- a/testing/state_generator.cu +++ b/testing/state_generator.cu @@ -762,9 +762,11 @@ void test_devices() void test_termination_criteria() { - const nvbench::int64_t min_samples = 1000; - const nvbench::float64_t skip_time = 4000; - const nvbench::float64_t timeout = 5000; + const nvbench::int64_t min_samples = 1000; + const nvbench::int64_t cold_warmup_runs = 7; + const nvbench::float64_t cold_max_warmup_walltime = 3000; + const nvbench::float64_t skip_time = 4000; + const nvbench::float64_t timeout = 5000; // for comparing floats auto within_one = [](auto a, auto b) { return std::abs(a - b) < 1.; }; @@ -772,6 +774,8 @@ void test_termination_criteria() dummy_bench bench; bench.set_devices(std::vector{}); bench.set_min_samples(min_samples); + bench.set_cold_warmup_runs(cold_warmup_runs); + bench.set_cold_max_warmup_walltime(cold_max_warmup_walltime); bench.set_skip_time(skip_time); bench.set_timeout(timeout); @@ -779,6 +783,8 @@ void test_termination_criteria() ASSERT(states.size() == 1); ASSERT(min_samples == states[0].get_min_samples()); + ASSERT(cold_warmup_runs == states[0].get_cold_warmup_runs()); + ASSERT(within_one(cold_max_warmup_walltime, states[0].get_cold_max_warmup_walltime())); ASSERT(within_one(skip_time, states[0].get_skip_time())); ASSERT(within_one(timeout, states[0].get_timeout())); }