Merge remote-tracking branch 'upstream/main' into add-bench-result

This commit is contained in:
Oleksandr Pavlyk
2026-05-12 15:23:39 -05:00
12 changed files with 393 additions and 17 deletions

137
.coderabbit.yaml Normal file
View File

@@ -0,0 +1,137 @@
# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
language: en-US
tone_instructions: |
Be direct, technical, brief. No praise, emojis, headings, or collapsible menus.
Start each comment with one prefix:
- suggestion: optional improvement;
- important: must-fix/high-impact risk;
- critical: blocking correctness/security/data-loss.
reviews:
profile: chill
high_level_summary: true
high_level_summary_in_walkthrough: true
poem: false
in_progress_fortune: false
sequence_diagrams: false
estimate_code_review_effort: false
collapse_walkthrough: true
# Reduce noisy status/details sections.
request_changes_workflow: false
review_status: false
review_details: false
enable_prompt_for_ai_agents: false
auto_review:
enabled: false
drafts: false
base_branches:
- "^main$"
- "^branch/[0-9]+\\.[0-9]+\\.x$"
ignore_usernames: ["copy-pr-bot", "dependabot[bot]", "github-actions[bot]", "nv-automation-bot"]
tools:
gitleaks:
enabled: true
markdownlint:
enabled: true
shellcheck:
enabled: true
# Keep Autofix available, but disable the other finishing touch actions.
finishing_touches:
docstrings:
enabled: false
unit_tests:
enabled: false
simplify:
enabled: false
pre_merge_checks:
docstrings:
mode: "off"
title:
mode: "off"
description:
mode: "off"
issue_assessment:
mode: "off"
custom_checks: []
path_instructions:
- path: "nvbench/**/*"
instructions: |
Focus on benchmark correctness, CUDA stream/event ordering, synchronization behavior, error handling,
resource ownership, exception safety, public API compatibility, measurement semantics, statistical
summaries, and test coverage. Prefer comments that catch correctness, API, compile-time, runtime, or
measurement-regression risks.
- path: "python/**/*"
instructions: |
Focus on Python API stability, pybind11/C++ exception boundaries, GIL behavior, CUDA interoperability,
object lifetime, package metadata, type stubs, JSON/result parsing compatibility, and tests. Avoid
style-only comments already covered by Ruff, clang-format, or pre-commit.
- path: "testing/**/*"
instructions: |
Focus on whether tests cover observable behavior, remain deterministic, handle GPU availability and CUDA
version differences correctly, avoid excessive runtime, and exercise install/export/package boundaries
where relevant.
- path: "examples/**/*"
instructions: |
Check that examples are minimal, buildable, technically correct, use NVBench APIs idiomatically, avoid
excessive benchmark runtime, and demonstrate behavior that is useful to users.
- path: "docs/**/*"
instructions: |
For documentation changes, focus on technical accuracy, buildable examples, CLI/API consistency,
version compatibility, and whether behavior changes have matching documentation updates.
- path: "ci/**/*"
instructions: |
For CI and build scripts, focus on matrix correctness, targeted build/test behavior, cache/artifact
handling, environment setup, GPU availability assumptions, clear failures, and avoiding unnecessary
expensive jobs.
- path: ".github/**/*"
instructions: |
For GitHub workflows and repository automation, focus on permissions, event triggers, matrix generation,
status/check behavior, security boundaries, and avoiding unnecessary CI fanout.
- path: "cmake/**/*"
instructions: |
Focus on package exports, install-tree and build-tree compatibility, target usage requirements, static
and shared library behavior, CUDA architecture handling, and compatibility across supported CMake/CUDA
versions.
- path: "**/CMakeLists.txt"
instructions: |
Focus on target dependencies, exported usage requirements, option behavior, install rules, tests,
examples, Python package integration, and compatibility across supported CMake/CUDA versions.
- path: "CMakePresets.json"
instructions: |
Focus on preset inheritance, CI parity, cache variable correctness, CUDA compiler/toolchain assumptions,
and whether presets remain useful for local and automated builds.
knowledge_base:
opt_out: false
code_guidelines:
filePatterns:
- ".clang-format"
- ".pre-commit-config.yaml"
- "README.md"
- "CMakeLists.txt"
- "CMakePresets.json"
- "pyproject.toml"
- "python/README.md"
- "python/pyproject.toml"
- "docs/benchmarks.md"
- "docs/cli_help.md"
- "docs/cli_help_axis.md"

View File

@@ -93,6 +93,21 @@
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--cold-warmup-runs <count>`
* Execute up to `<count>` warmup runs before collecting cold measurement samples.
* The minimum is 1 warmup run.
* Default is 1 warmup run.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--cold-max-warmup-walltime <seconds>`
* Stop cold warmup after the total warmup walltime exceeds `<seconds>`.
* The limit is checked after each warmup run, so actual warmup time may exceed
this value by one warmup run.
* Default is -1 seconds (disabled).
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--throttle-threshold <value>`
* Set the GPU throttle threshold as percentage of the device's default clock rate.
* Default is 75.

View File

@@ -166,6 +166,28 @@ struct benchmark_base
}
/// @}
/// Execute this many warmup runs before collecting cold measurement samples. @{
[[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; }
benchmark_base &set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs)
{
m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs
: nvbench::int64_t{1};
return *this;
}
/// @}
/// Stop cold warmups after this many seconds of walltime. Negative values disable the limit. @{
[[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const
{
return m_cold_max_warmup_walltime;
}
benchmark_base &set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime)
{
m_cold_max_warmup_walltime = cold_max_warmup_walltime;
return *this;
}
/// @}
/// If true, the benchmark measurements only record CPU time and assume no GPU work is performed.
/// @{
[[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; }
@@ -321,7 +343,9 @@ protected:
bool m_skip_batched{false};
nvbench::int64_t m_min_samples{10};
nvbench::int64_t m_cold_warmup_runs{1};
nvbench::float64_t m_cold_max_warmup_walltime{-1.};
nvbench::float64_t m_skip_time{-1.};
nvbench::float64_t m_timeout{15.};

View File

@@ -43,7 +43,9 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
result->m_run_once = m_run_once;
result->m_disable_blocking_kernel = m_disable_blocking_kernel;
result->m_min_samples = m_min_samples;
result->m_min_samples = m_min_samples;
result->m_cold_warmup_runs = m_cold_warmup_runs;
result->m_cold_max_warmup_walltime = m_cold_max_warmup_walltime;
result->m_skip_time = m_skip_time;
result->m_timeout = m_timeout;

View File

@@ -46,6 +46,8 @@ measure_cold_base::measure_cold_base(state &exec_state)
, m_run_once{exec_state.get_run_once()}
, m_check_throttling(!exec_state.get_run_once())
, m_min_samples{exec_state.get_min_samples()}
, m_cold_warmup_runs{exec_state.get_cold_warmup_runs()}
, m_cold_max_warmup_walltime{exec_state.get_cold_max_warmup_walltime()}
, m_skip_time{exec_state.get_skip_time()}
, m_timeout{exec_state.get_timeout()}
, m_throttle_threshold(exec_state.get_throttle_threshold())

View File

@@ -110,7 +110,9 @@ protected:
bool m_check_throttling{true};
nvbench::int64_t m_min_samples{};
nvbench::int64_t m_cold_warmup_runs{1};
nvbench::float64_t m_cold_max_warmup_walltime{};
nvbench::float64_t m_skip_time{};
nvbench::float64_t m_timeout{};
@@ -239,8 +241,8 @@ struct measure_cold : public measure_cold_base
}
private:
// Run the kernel once, measuring the GPU time. If under skip_time, skip the
// measurement.
// Run the kernel m_cold_warmup_runs times, measuring the GPU time of the last run.
// If under skip_time, skip the measurement.
void run_warmup()
{
if (m_run_once)
@@ -248,12 +250,29 @@ private:
return;
}
// Ensure blocking kernel is loaded during the warmup
// Ref: https://github.com/NVIDIA/nvbench/issues/339
this->block_stream();
this->unblock_stream();
// disable use of blocking kernel for warm-up run
// see https://github.com/NVIDIA/nvbench/issues/240
constexpr bool disable_blocking_kernel = true;
kernel_launch_timer timer(*this, disable_blocking_kernel);
nvbench::cpu_timer warmup_walltime_timer;
this->launch_kernel(timer);
warmup_walltime_timer.start();
for (nvbench::int64_t warmup_run = 0; warmup_run < m_cold_warmup_runs; ++warmup_run)
{
this->launch_kernel(timer);
warmup_walltime_timer.stop();
if (m_cold_max_warmup_walltime > 0. &&
warmup_walltime_timer.get_duration() > m_cold_max_warmup_walltime)
{
break;
}
}
this->check_skip_time(m_cuda_timer.get_duration());
}

View File

@@ -49,7 +49,7 @@ namespace fs = std::filesystem;
#include <experimental/filesystem>
namespace fs = std::experimental::filesystem;
#else
static_assert(false, "No <filesystem> or <experimental/filesystem> found.");
#error "No <filesystem> or <experimental/filesystem> found."
#endif
#if NVBENCH_CPP_DIALECT >= 2020
@@ -429,9 +429,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
bench["name"] = bench_ptr->get_name();
bench["index"] = bench_index;
bench["min_samples"] = bench_ptr->get_min_samples();
bench["skip_time"] = bench_ptr->get_skip_time();
bench["timeout"] = bench_ptr->get_timeout();
bench["min_samples"] = bench_ptr->get_min_samples();
bench["cold_warmup_runs"] = bench_ptr->get_cold_warmup_runs();
bench["cold_max_warmup_walltime"] = bench_ptr->get_cold_max_warmup_walltime();
bench["skip_time"] = bench_ptr->get_skip_time();
bench["timeout"] = bench_ptr->get_timeout();
auto &devices = bench["devices"];
for (const auto &dev_info : bench_ptr->get_devices())
@@ -486,9 +488,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
st["name"] = exec_state.get_axis_values_as_string();
st["min_samples"] = exec_state.get_min_samples();
st["skip_time"] = exec_state.get_skip_time();
st["timeout"] = exec_state.get_timeout();
st["min_samples"] = exec_state.get_min_samples();
st["cold_warmup_runs"] = exec_state.get_cold_warmup_runs();
st["cold_max_warmup_walltime"] = exec_state.get_cold_max_warmup_walltime();
st["skip_time"] = exec_state.get_skip_time();
st["timeout"] = exec_state.get_timeout();
st["device"] = exec_state.get_device()->get_id();
st["type_config_index"] = exec_state.get_type_config_index();

View File

@@ -48,9 +48,20 @@
#include <stdexcept>
#include <string>
#include <string_view>
#include <system_error>
#include <tuple>
#include <vector>
#if __has_include(<filesystem>)
#include <filesystem>
namespace fs = std::filesystem;
#elif __has_include(<experimental/filesystem>)
#include <experimental/filesystem>
namespace fs = std::experimental::filesystem;
#else
#error "No <filesystem> or <experimental/filesystem> found."
#endif
namespace
{
@@ -115,6 +126,18 @@ catch (const std::exception &)
void parse(std::string_view input, std::string &val) { val = input; }
void create_output_parent_directories(const std::string &spec)
{
const fs::path output_path{spec};
const fs::path parent_path = output_path.parent_path();
if (parent_path.empty())
{
return;
}
fs::create_directories(parent_path);
}
// Parses a list of values "<val1>, <val2>, <val3>, ..." into a vector:
template <typename T>
std::vector<T> parse_list_values(std::string_view list_spec)
@@ -526,14 +549,14 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
this->update_axis(first[1]);
first += 2;
}
else if (arg == "--min-samples")
else if (arg == "--min-samples" || arg == "--cold-warmup-runs")
{
check_params(1);
this->update_int64_prop(first[0], first[1]);
first += 2;
}
else if (arg == "--skip-time" || arg == "--timeout" || arg == "--throttle-threshold" ||
arg == "--throttle-recovery-delay")
else if (arg == "--skip-time" || arg == "--timeout" || arg == "--cold-max-warmup-walltime" ||
arg == "--throttle-threshold" || arg == "--throttle-recovery-delay")
{
check_params(1);
this->update_float64_prop(first[0], first[1]);
@@ -622,6 +645,8 @@ std::ostream &option_parser::printer_spec_to_ostream(const std::string &spec)
}
else // spec is a filename:
{
::create_output_parent_directories(spec);
auto file_stream = std::make_unique<std::ofstream>();
// Throw if file can't open
file_stream->exceptions(file_stream->exceptions() | std::ios::failbit);
@@ -990,6 +1015,10 @@ try
{
bench.set_min_samples(value);
}
else if (prop_arg == "--cold-warmup-runs")
{
bench.set_cold_warmup_runs(value);
}
else
{
NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg);
@@ -1103,6 +1132,10 @@ try
{
bench.set_timeout(value);
}
else if (prop_arg == "--cold-max-warmup-walltime")
{
bench.set_cold_max_warmup_walltime(value);
}
else if (prop_arg == "--throttle-threshold")
{
bench.set_throttle_threshold(static_cast<nvbench::float32_t>(value) / 100.0f);

View File

@@ -152,6 +152,26 @@ struct state
void set_min_samples(nvbench::int64_t min_samples) { m_min_samples = min_samples; }
/// @}
/// Execute this many warmup runs before collecting cold measurement samples. @{
[[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; }
void set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs)
{
m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs
: nvbench::int64_t{1};
}
/// @}
/// Stop cold warmups after this many seconds of walltime. Negative values disable the limit. @{
[[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const
{
return m_cold_max_warmup_walltime;
}
void set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime)
{
m_cold_max_warmup_walltime = cold_max_warmup_walltime;
}
/// @}
[[nodiscard]] const nvbench::criterion_params &get_criterion_params() const
{
return m_criterion_params;
@@ -332,7 +352,9 @@ private:
std::string m_stopping_criterion;
nvbench::int64_t m_min_samples;
nvbench::int64_t m_cold_warmup_runs;
nvbench::float64_t m_cold_max_warmup_walltime;
nvbench::float64_t m_skip_time;
nvbench::float64_t m_timeout;

View File

@@ -39,6 +39,8 @@ state::state(const benchmark_base &bench)
, m_criterion_params{bench.get_criterion_params()}
, m_stopping_criterion(bench.get_stopping_criterion())
, m_min_samples{bench.get_min_samples()}
, m_cold_warmup_runs{bench.get_cold_warmup_runs()}
, m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()}
, m_skip_time{bench.get_skip_time()}
, m_timeout{bench.get_timeout()}
, m_throttle_threshold{bench.get_throttle_threshold()}
@@ -61,6 +63,8 @@ state::state(const benchmark_base &bench,
, m_criterion_params{bench.get_criterion_params()}
, m_stopping_criterion(bench.get_stopping_criterion())
, m_min_samples{bench.get_min_samples()}
, m_cold_warmup_runs{bench.get_cold_warmup_runs()}
, m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()}
, m_skip_time{bench.get_skip_time()}
, m_timeout{bench.get_timeout()}
, m_throttle_threshold{bench.get_throttle_threshold()}

View File

@@ -22,6 +22,21 @@
#include <fmt/format.h>
#include <chrono>
#include <iostream>
#include <stdexcept>
#include <system_error>
#if __has_include(<filesystem>)
#include <filesystem>
namespace fs = std::filesystem;
#elif __has_include(<experimental/filesystem>)
#include <experimental/filesystem>
namespace fs = std::experimental::filesystem;
#else
#error "No <filesystem> or <experimental/filesystem> found."
#endif
#include "test_asserts.cuh"
//==============================================================================
@@ -49,6 +64,40 @@ NVBENCH_BENCH_TYPES(TestBench, NVBENCH_TYPE_AXES(Ts, Us))
namespace
{
struct temp_tree
{
explicit temp_tree(fs::path root)
: root_path{std::move(root)}
{
std::error_code ec;
fs::remove_all(root_path, ec);
if (ec)
{
throw std::runtime_error{fmt::format("Failed to remove temporary directory `{}`: {}",
root_path.string(),
ec.message())};
}
}
~temp_tree()
{
std::error_code ec;
fs::remove_all(root_path, ec);
if (ec)
{
std::cerr << "Failed to remove temporary directory `" << root_path.string()
<< "`: " << ec.message() << "\n";
}
}
temp_tree(const temp_tree &) = delete;
temp_tree(temp_tree &&) = delete;
temp_tree &operator=(const temp_tree &) = delete;
temp_tree &operator=(temp_tree &&) = delete;
fs::path root_path;
};
[[nodiscard]] std::string states_to_string(const std::vector<nvbench::state> &states)
{
fmt::memory_buffer buffer;
@@ -1155,6 +1204,36 @@ void test_min_samples()
ASSERT(states[0].get_min_samples() == 12345);
}
void test_cold_warmup_runs()
{
{
nvbench::option_parser parser;
parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "12345"});
const auto &states = parser_to_states(parser);
ASSERT(states.size() == 1);
ASSERT(states[0].get_cold_warmup_runs() == 12345);
}
{
nvbench::option_parser parser;
parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "0"});
const auto &states = parser_to_states(parser);
ASSERT(states.size() == 1);
ASSERT(states[0].get_cold_warmup_runs() == 1);
}
{
nvbench::option_parser parser;
parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "-12345"});
const auto &states = parser_to_states(parser);
ASSERT(states.size() == 1);
ASSERT(states[0].get_cold_warmup_runs() == 1);
}
}
void test_skip_time()
{
nvbench::option_parser parser;
@@ -1165,6 +1244,16 @@ void test_skip_time()
ASSERT(std::abs(states[0].get_skip_time() - 12345e2) < 1.);
}
void test_cold_max_warmup_walltime()
{
nvbench::option_parser parser;
parser.parse({"--benchmark", "DummyBench", "--cold-max-warmup-walltime", "12345e2"});
const auto &states = parser_to_states(parser);
ASSERT(states.size() == 1);
ASSERT(std::abs(states[0].get_cold_max_warmup_walltime() - 12345e2) < 1.);
}
void test_timeout()
{
nvbench::option_parser parser;
@@ -1175,6 +1264,22 @@ void test_timeout()
ASSERT(std::abs(states[0].get_timeout() - 12345e2) < 1.);
}
void test_output_parent_directories_created()
{
const auto unique_suffix = std::chrono::steady_clock::now().time_since_epoch().count();
const temp_tree temp{fs::temp_directory_path() /
fmt::format("nvbench_option_parser_test_{}", unique_suffix)};
const auto output_path = temp.root_path / "nested" / "results.json";
{
nvbench::option_parser parser;
parser.parse({"--json", output_path.string()});
}
ASSERT(fs::is_directory(output_path.parent_path()));
ASSERT(fs::exists(output_path));
}
void test_stopping_criterion()
{
{ // Per benchmark criterion
@@ -1466,8 +1571,11 @@ try
test_axis_before_benchmark();
test_min_samples();
test_cold_warmup_runs();
test_skip_time();
test_cold_max_warmup_walltime();
test_timeout();
test_output_parent_directories_created();
test_stopping_criterion();

View File

@@ -762,9 +762,11 @@ void test_devices()
void test_termination_criteria()
{
const nvbench::int64_t min_samples = 1000;
const nvbench::float64_t skip_time = 4000;
const nvbench::float64_t timeout = 5000;
const nvbench::int64_t min_samples = 1000;
const nvbench::int64_t cold_warmup_runs = 7;
const nvbench::float64_t cold_max_warmup_walltime = 3000;
const nvbench::float64_t skip_time = 4000;
const nvbench::float64_t timeout = 5000;
// for comparing floats
auto within_one = [](auto a, auto b) { return std::abs(a - b) < 1.; };
@@ -772,6 +774,8 @@ void test_termination_criteria()
dummy_bench bench;
bench.set_devices(std::vector<int>{});
bench.set_min_samples(min_samples);
bench.set_cold_warmup_runs(cold_warmup_runs);
bench.set_cold_max_warmup_walltime(cold_max_warmup_walltime);
bench.set_skip_time(skip_time);
bench.set_timeout(timeout);
@@ -779,6 +783,8 @@ void test_termination_criteria()
ASSERT(states.size() == 1);
ASSERT(min_samples == states[0].get_min_samples());
ASSERT(cold_warmup_runs == states[0].get_cold_warmup_runs());
ASSERT(within_one(cold_max_warmup_walltime, states[0].get_cold_max_warmup_walltime()));
ASSERT(within_one(skip_time, states[0].get_skip_time()));
ASSERT(within_one(timeout, states[0].get_timeout()));
}