Auto format

This commit is contained in:
clang-format
2022-08-18 22:09:24 +04:00
committed by Georgy Evtushenko
parent 87ce9ee576
commit 8f1152d4a2
74 changed files with 835 additions and 1479 deletions

View File

@@ -41,8 +41,8 @@ struct axes_metadata
template <typename... TypeAxes>
explicit axes_metadata(nvbench::type_list<TypeAxes...>);
axes_metadata() = default;
axes_metadata(axes_metadata &&) = default;
axes_metadata() = default;
axes_metadata(axes_metadata &&) = default;
axes_metadata &operator=(axes_metadata &&) = default;
axes_metadata(const axes_metadata &);
@@ -58,20 +58,16 @@ struct axes_metadata
void add_string_axis(std::string name, std::vector<std::string> data);
[[nodiscard]] const nvbench::int64_axis &
get_int64_axis(std::string_view name) const;
[[nodiscard]] const nvbench::int64_axis &get_int64_axis(std::string_view name) const;
[[nodiscard]] nvbench::int64_axis &get_int64_axis(std::string_view name);
[[nodiscard]] const nvbench::float64_axis &
get_float64_axis(std::string_view name) const;
[[nodiscard]] const nvbench::float64_axis &get_float64_axis(std::string_view name) const;
[[nodiscard]] nvbench::float64_axis &get_float64_axis(std::string_view name);
[[nodiscard]] const nvbench::string_axis &
get_string_axis(std::string_view name) const;
[[nodiscard]] const nvbench::string_axis &get_string_axis(std::string_view name) const;
[[nodiscard]] nvbench::string_axis &get_string_axis(std::string_view name);
[[nodiscard]] const nvbench::type_axis &
get_type_axis(std::string_view name) const;
[[nodiscard]] const nvbench::type_axis &get_type_axis(std::string_view name) const;
[[nodiscard]] nvbench::type_axis &get_type_axis(std::string_view name);
[[nodiscard]] const nvbench::type_axis &get_type_axis(std::size_t index) const;
@@ -83,10 +79,9 @@ struct axes_metadata
[[nodiscard]] const nvbench::axis_base &get_axis(std::string_view name) const;
[[nodiscard]] nvbench::axis_base &get_axis(std::string_view name);
[[nodiscard]] const nvbench::axis_base &
get_axis(std::string_view name, nvbench::axis_type type) const;
[[nodiscard]] nvbench::axis_base &get_axis(std::string_view name,
nvbench::axis_type type);
[[nodiscard]] const nvbench::axis_base &get_axis(std::string_view name,
nvbench::axis_type type) const;
[[nodiscard]] nvbench::axis_base &get_axis(std::string_view name, nvbench::axis_type type);
[[nodiscard]] static std::vector<std::string>
generate_default_type_axis_names(std::size_t num_type_axes);
@@ -101,7 +96,7 @@ axes_metadata::axes_metadata(nvbench::type_list<TypeAxes...>)
{
using type_axes_list = nvbench::type_list<TypeAxes...>;
constexpr auto num_type_axes = nvbench::tl::size<type_axes_list>::value;
auto names = axes_metadata::generate_default_type_axis_names(num_type_axes);
auto names = axes_metadata::generate_default_type_axis_names(num_type_axes);
auto names_iter = names.begin(); // contents will be moved from
nvbench::tl::foreach<type_axes_list>(
@@ -114,8 +109,7 @@ axes_metadata::axes_metadata(nvbench::type_list<TypeAxes...>)
// The word "type" appears 6 times in the next line.
// Every. Single. Token.
typedef typename decltype(wrapped_type)::type type_list;
auto axis = std::make_unique<nvbench::type_axis>(std::move(*names_iter++),
type_axis_index);
auto axis = std::make_unique<nvbench::type_axis>(std::move(*names_iter++), type_axis_index);
axis->template set_inputs<type_list>();
axes.push_back(std::move(axis));
});

View File

@@ -64,9 +64,7 @@ try
auto &axis = *m_axes[i];
if (axis.get_type() != nvbench::axis_type::type)
{
NVBENCH_THROW(std::runtime_error,
"Number of names exceeds number of type axes ({})",
i);
NVBENCH_THROW(std::runtime_error, "Number of names exceeds number of type axes ({})", i);
}
axis.set_name(std::move(names[i]));
@@ -81,8 +79,7 @@ catch (std::exception &e)
names);
}
void axes_metadata::add_float64_axis(std::string name,
std::vector<nvbench::float64_t> data)
void axes_metadata::add_float64_axis(std::string name, std::vector<nvbench::float64_t> data)
{
auto axis = std::make_unique<nvbench::float64_axis>(std::move(name));
axis->set_inputs(std::move(data));
@@ -98,8 +95,7 @@ void axes_metadata::add_int64_axis(std::string name,
m_axes.push_back(std::move(axis));
}
void axes_metadata::add_string_axis(std::string name,
std::vector<std::string> data)
void axes_metadata::add_string_axis(std::string name, std::vector<std::string> data)
{
auto axis = std::make_unique<nvbench::string_axis>(std::move(name));
axis->set_inputs(std::move(data));
@@ -188,10 +184,9 @@ nvbench::type_axis &axes_metadata::get_type_axis(std::size_t index)
const axis_base &axes_metadata::get_axis(std::string_view name) const
{
auto iter =
std::find_if(m_axes.cbegin(), m_axes.cend(), [&name](const auto &axis) {
return axis->get_name() == name;
});
auto iter = std::find_if(m_axes.cbegin(), m_axes.cend(), [&name](const auto &axis) {
return axis->get_name() == name;
});
if (iter == m_axes.cend())
{
@@ -203,10 +198,9 @@ const axis_base &axes_metadata::get_axis(std::string_view name) const
axis_base &axes_metadata::get_axis(std::string_view name)
{
auto iter =
std::find_if(m_axes.begin(), m_axes.end(), [&name](const auto &axis) {
return axis->get_name() == name;
});
auto iter = std::find_if(m_axes.begin(), m_axes.end(), [&name](const auto &axis) {
return axis->get_name() == name;
});
if (iter == m_axes.end())
{
@@ -216,8 +210,7 @@ axis_base &axes_metadata::get_axis(std::string_view name)
return **iter;
}
const axis_base &axes_metadata::get_axis(std::string_view name,
nvbench::axis_type type) const
const axis_base &axes_metadata::get_axis(std::string_view name, nvbench::axis_type type) const
{
const auto &axis = this->get_axis(name);
if (axis.get_type() != type)
@@ -231,8 +224,7 @@ const axis_base &axes_metadata::get_axis(std::string_view name,
return axis;
}
axis_base &axes_metadata::get_axis(std::string_view name,
nvbench::axis_type type)
axis_base &axes_metadata::get_axis(std::string_view name, nvbench::axis_type type)
{
auto &axis = this->get_axis(name);
if (axis.get_type() != type)
@@ -246,8 +238,7 @@ axis_base &axes_metadata::get_axis(std::string_view name,
return axis;
}
std::vector<std::string>
axes_metadata::generate_default_type_axis_names(std::size_t num_type_axes)
std::vector<std::string> axes_metadata::generate_default_type_axis_names(std::size_t num_type_axes)
{
switch (num_type_axes)
{

View File

@@ -47,10 +47,7 @@ struct axis_base
[[nodiscard]] axis_type get_type() const { return m_type; }
/// Returns the textual form of this axis' type, obtained by passing the stored
/// `m_type` to `axis_type_to_string`.
[[nodiscard]] std::string_view get_type_as_string() const
{
return axis_type_to_string(m_type);
}
[[nodiscard]] std::string_view get_type_as_string() const { return axis_type_to_string(m_type); }
[[nodiscard]] std::string_view get_flags_as_string() const
{

View File

@@ -23,9 +23,6 @@ namespace nvbench
axis_base::~axis_base() = default;
// Public cloning entry point: forwards to `do_clone()` and returns its result
// unchanged as an owning `std::unique_ptr<axis_base>`.
std::unique_ptr<axis_base> axis_base::clone() const
{
return this->do_clone();
}
std::unique_ptr<axis_base> axis_base::clone() const { return this->do_clone(); }
} // namespace nvbench

View File

@@ -57,18 +57,14 @@ struct benchmark final : public benchmark_base
using type_axes = TypeAxes;
using type_configs = nvbench::tl::cartesian_product<type_axes>;
static constexpr std::size_t num_type_configs =
nvbench::tl::size<type_configs>{};
static constexpr std::size_t num_type_configs = nvbench::tl::size<type_configs>{};
benchmark()
: benchmark_base(type_axes{})
{}
private:
// Clone hook (marked `final`): returns a freshly default-constructed
// `benchmark` of this same instantiation. Nothing is read from `*this` in this
// function; any copying of settings presumably happens in the caller -- confirm
// against benchmark_base.
std::unique_ptr<benchmark_base> do_clone() const final
{
return std::make_unique<benchmark>();
}
std::unique_ptr<benchmark_base> do_clone() const final { return std::make_unique<benchmark>(); }
void do_set_type_axes_names(std::vector<std::string> names) final
{

View File

@@ -80,32 +80,28 @@ struct benchmark_base
return *this;
}
/// Adds a float64 axis by forwarding to `m_axes.add_float64_axis`.
/// @param name Axis name; taken by value and moved into the axes metadata.
/// @param data Axis values; taken by value and moved into the axes metadata.
/// @return `*this`, so calls can be chained.
benchmark_base &add_float64_axis(std::string name,
std::vector<nvbench::float64_t> data)
benchmark_base &add_float64_axis(std::string name, std::vector<nvbench::float64_t> data)
{
m_axes.add_float64_axis(std::move(name), std::move(data));
return *this;
}
/// Adds an int64 axis by forwarding to `m_axes.add_int64_axis`.
/// @param name  Axis name; taken by value and moved into the axes metadata.
/// @param data  Axis values; taken by value and moved into the axes metadata.
/// @param flags Passed through unchanged; defaults to
///              `nvbench::int64_axis_flags::none`.
/// @return `*this`, so calls can be chained.
benchmark_base &add_int64_axis(
std::string name,
std::vector<nvbench::int64_t> data,
nvbench::int64_axis_flags flags = nvbench::int64_axis_flags::none)
benchmark_base &add_int64_axis(std::string name,
std::vector<nvbench::int64_t> data,
nvbench::int64_axis_flags flags = nvbench::int64_axis_flags::none)
{
m_axes.add_int64_axis(std::move(name), std::move(data), flags);
return *this;
}
/// Convenience wrapper: calls `add_int64_axis` with the flag fixed to
/// `nvbench::int64_axis_flags::power_of_two`.
/// @param name Axis name; moved into the forwarded call.
/// @param data Axis inputs; moved into the forwarded call. How the flag changes
///             their interpretation is decided by the axis implementation, not
///             here.
/// @return Whatever `add_int64_axis` returns (a reference to this benchmark).
benchmark_base &add_int64_power_of_two_axis(std::string name,
std::vector<nvbench::int64_t> data)
benchmark_base &add_int64_power_of_two_axis(std::string name, std::vector<nvbench::int64_t> data)
{
return this->add_int64_axis(std::move(name),
std::move(data),
nvbench::int64_axis_flags::power_of_two);
}
benchmark_base &add_string_axis(std::string name,
std::vector<std::string> data)
benchmark_base &add_string_axis(std::string name, std::vector<std::string> data)
{
m_axes.add_string_axis(std::move(name), std::move(data));
return *this;
@@ -133,48 +129,30 @@ struct benchmark_base
return *this;
}
/// Read-only access to the stored device list (`m_devices`). Returns a
/// reference to the member, not a copy.
[[nodiscard]] const std::vector<nvbench::device_info> &get_devices() const
{
return m_devices;
}
[[nodiscard]] const std::vector<nvbench::device_info> &get_devices() const { return m_devices; }
[[nodiscard]] nvbench::axes_metadata &get_axes() { return m_axes; }
/// Const overload: read-only reference to the axes metadata member (`m_axes`).
[[nodiscard]] const nvbench::axes_metadata &get_axes() const
{
return m_axes;
}
[[nodiscard]] const nvbench::axes_metadata &get_axes() const { return m_axes; }
// Computes the number of configs in the benchmark.
// Unlike get_states().size(), this method may be used prior to calling run().
[[nodiscard]] std::size_t get_config_count() const;
// Is empty until run() is called.
// Const overload: returns a read-only reference to the `m_states` member.
[[nodiscard]] const std::vector<nvbench::state> &get_states() const
{
return m_states;
}
[[nodiscard]] const std::vector<nvbench::state> &get_states() const { return m_states; }
[[nodiscard]] std::vector<nvbench::state> &get_states() { return m_states; }
void run() { this->do_run(); }
/// Stores a reference to `printer` in `m_printer` via `std::ref`. The printer
/// object itself is not copied.
/// NOTE(review): the caller presumably has to keep `printer` alive while it is
/// set here -- confirm.
void set_printer(nvbench::printer_base &printer)
{
m_printer = std::ref(printer);
}
void set_printer(nvbench::printer_base &printer) { m_printer = std::ref(printer); }
void clear_printer() { m_printer = std::nullopt; }
/// Returns the current value of `m_printer` as an
/// `optional_ref<nvbench::printer_base>`. `clear_printer()` resets the member to
/// `std::nullopt`, so the result may hold no printer.
[[nodiscard]] optional_ref<nvbench::printer_base> get_printer() const
{
return m_printer;
}
[[nodiscard]] optional_ref<nvbench::printer_base> get_printer() const { return m_printer; }
/// Execute at least this many trials per measurement. @{
/// Getter for the value stored by `set_min_samples` (`m_min_samples`).
[[nodiscard]] nvbench::int64_t get_min_samples() const
{
return m_min_samples;
}
[[nodiscard]] nvbench::int64_t get_min_samples() const { return m_min_samples; }
benchmark_base &set_min_samples(nvbench::int64_t min_samples)
{
m_min_samples = min_samples;
@@ -193,7 +171,7 @@ struct benchmark_base
}
/// @}
/// If true, the benchmark does not use the blocking_kernel. This is intended
/// If true, the benchmark does not use the blocking_kernel. This is intended
/// for use with external profiling tools. @{
[[nodiscard]] bool get_disable_blocking_kernel() const { return m_disable_blocking_kernel; }
benchmark_base &set_disable_blocking_kernel(bool v)

View File

@@ -68,8 +68,7 @@ std::size_t benchmark_base::get_config_count() const
std::size_t{1},
std::multiplies<>{},
[](const auto &axis_ptr) {
if (const auto *type_axis_ptr =
dynamic_cast<const nvbench::type_axis *>(axis_ptr.get());
if (const auto *type_axis_ptr = dynamic_cast<const nvbench::type_axis *>(axis_ptr.get());
type_axis_ptr != nullptr)
{
return type_axis_ptr->get_active_count();

View File

@@ -31,8 +31,7 @@ namespace nvbench
*/
struct benchmark_manager
{
using benchmark_vector =
std::vector<std::unique_ptr<nvbench::benchmark_base>>;
using benchmark_vector = std::vector<std::unique_ptr<nvbench::benchmark_base>>;
/**
* @return The singleton benchmark_manager instance.
@@ -53,25 +52,21 @@ struct benchmark_manager
* Get a non-mutable reference to benchmark with the specified name/index.
* @{
*/
[[nodiscard]] const benchmark_base &
get_benchmark(const std::string &name) const;
[[nodiscard]] const benchmark_base &get_benchmark(const std::string &name) const;
[[nodiscard]] const benchmark_base &get_benchmark(std::size_t idx) const
{
return *m_benchmarks.at(idx);
}
/**@}*/
/// Read-only access to the vector of owned benchmarks (`m_benchmarks`).
/// NOTE(review): the `;` after the function body is a redundant empty
/// declaration; it is harmless but can be dropped (some compilers warn about
/// it under pedantic settings).
[[nodiscard]] const benchmark_vector &get_benchmarks() const
{
return m_benchmarks;
};
[[nodiscard]] const benchmark_vector &get_benchmarks() const { return m_benchmarks; };
private:
benchmark_manager() = default;
benchmark_manager(const benchmark_manager &) = delete;
benchmark_manager(benchmark_manager &&) = delete;
benchmark_manager() = default;
benchmark_manager(const benchmark_manager &) = delete;
benchmark_manager(benchmark_manager &&) = delete;
benchmark_manager &operator=(const benchmark_manager &) = delete;
benchmark_manager &operator=(benchmark_manager &&) = delete;
benchmark_manager &operator=(benchmark_manager &&) = delete;
benchmark_vector m_benchmarks;
};

View File

@@ -43,21 +43,18 @@ benchmark_base &benchmark_manager::add(std::unique_ptr<benchmark_base> bench)
// Returns a new vector holding one clone per stored benchmark, in the same
// order as `m_benchmarks`.
//
// `result` is pre-sized to `m_benchmarks.size()` (so it starts out holding null
// pointers) and `std::transform` then overwrites each slot with
// `bench->clone()`.
benchmark_manager::benchmark_vector benchmark_manager::clone_benchmarks() const
{
benchmark_vector result(m_benchmarks.size());
std::transform(m_benchmarks.cbegin(),
m_benchmarks.cend(),
result.begin(),
[](const auto &bench) { return bench->clone(); });
std::transform(m_benchmarks.cbegin(), m_benchmarks.cend(), result.begin(), [](const auto &bench) {
return bench->clone();
});
return result;
}
const benchmark_base &
benchmark_manager::get_benchmark(const std::string &name) const
const benchmark_base &benchmark_manager::get_benchmark(const std::string &name) const
{
auto iter = std::find_if(m_benchmarks.cbegin(),
m_benchmarks.cend(),
[&name](const auto &bench_ptr) {
return bench_ptr->get_name() == name;
});
auto iter =
std::find_if(m_benchmarks.cbegin(), m_benchmarks.cend(), [&name](const auto &bench_ptr) {
return bench_ptr->get_name() == name;
});
if (iter == m_benchmarks.cend())
{
NVBENCH_THROW(std::out_of_range, "No benchmark named '{}'.", name);

View File

@@ -42,8 +42,8 @@ __global__ void block_stream(const volatile nvbench::int32_t *flag,
nvbench::float64_t timeout)
{
const auto start_point = cuda::std::chrono::high_resolution_clock::now();
const auto timeout_ns = cuda::std::chrono::nanoseconds(
static_cast<nvbench::int64_t>(timeout * 1e9));
const auto timeout_ns =
cuda::std::chrono::nanoseconds(static_cast<nvbench::int64_t>(timeout * 1e9));
const auto timeout_point = start_point + timeout_ns;
const bool use_timeout = timeout >= 0.;
@@ -57,41 +57,40 @@ __global__ void block_stream(const volatile nvbench::int32_t *flag,
{
*timeout_flag = 1;
__threadfence_system(); // Ensure timeout flag visibility on host.
printf(
"\n"
"######################################################################\n"
"##################### Possible Deadlock Detected #####################\n"
"######################################################################\n"
"\n"
"Forcing unblock: The current measurement appears to have deadlocked\n"
"and the results cannot be trusted.\n"
"\n"
"This happens when the KernelLauncher synchronizes the CUDA device.\n"
"If this is the case, pass the `sync` exec_tag to the `exec` call:\n"
"\n"
" state.exec(<KernelLauncher>); // Deadlock\n"
" state.exec(nvbench::exec_tag::sync, <KernelLauncher>); // Safe\n"
"\n"
"This tells NVBench about the sync so it can run the benchmark safely.\n"
"\n"
"If the KernelLauncher does not synchronize but has a very long \n"
"execution time, this may be a false positive. If so, disable this\n"
"check with:\n"
"\n"
" state.set_blocking_kernel_timeout(-1);\n"
"\n"
"The current timeout is set to %0.5g seconds.\n"
"\n"
"For more information, see the 'Benchmarks that sync' section of the\n"
"NVBench documentation.\n"
"\n"
"If this happens while profiling with an external tool,\n"
"pass the `--disable-blocking-kernel` flag or the `--profile` flag\n"
"(to also only run the benchmark once) to the executable.\n"
"\n"
"For more information, see the 'Benchmark Properties' section of the\n"
"NVBench documentation.\n\n",
timeout);
printf("\n"
"######################################################################\n"
"##################### Possible Deadlock Detected #####################\n"
"######################################################################\n"
"\n"
"Forcing unblock: The current measurement appears to have deadlocked\n"
"and the results cannot be trusted.\n"
"\n"
"This happens when the KernelLauncher synchronizes the CUDA device.\n"
"If this is the case, pass the `sync` exec_tag to the `exec` call:\n"
"\n"
" state.exec(<KernelLauncher>); // Deadlock\n"
" state.exec(nvbench::exec_tag::sync, <KernelLauncher>); // Safe\n"
"\n"
"This tells NVBench about the sync so it can run the benchmark safely.\n"
"\n"
"If the KernelLauncher does not synchronize but has a very long \n"
"execution time, this may be a false positive. If so, disable this\n"
"check with:\n"
"\n"
" state.set_blocking_kernel_timeout(-1);\n"
"\n"
"The current timeout is set to %0.5g seconds.\n"
"\n"
"For more information, see the 'Benchmarks that sync' section of the\n"
"NVBench documentation.\n"
"\n"
"If this happens while profiling with an external tool,\n"
"pass the `--disable-blocking-kernel` flag or the `--profile` flag\n"
"(to also only run the benchmark once) to the executable.\n"
"\n"
"For more information, see the 'Benchmark Properties' section of the\n"
"NVBench documentation.\n\n",
timeout);
}
}
@@ -102,15 +101,11 @@ namespace nvbench
// Sets up the two host-side flags for use from the device:
//   1. `m_host_flag` is registered with `cudaHostRegister(...,
//      cudaHostRegisterMapped)` and its device-side address is fetched into
//      `m_device_flag` with `cudaHostGetDevicePointer`.
//   2. The same two steps are repeated for `m_host_timeout_flag` /
//      `m_device_timeout_flag`.
// Every CUDA call is wrapped in `NVBENCH_CUDA_CALL`. The destructor issues the
// matching `cudaHostUnregister` for the timeout flag (and presumably for
// `m_host_flag` as well -- that line is not visible here).
blocking_kernel::blocking_kernel()
{
NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_flag,
sizeof(m_host_flag),
cudaHostRegisterMapped));
NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_flag, sizeof(m_host_flag), cudaHostRegisterMapped));
NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_flag, &m_host_flag, 0));
NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_timeout_flag,
sizeof(m_host_timeout_flag),
cudaHostRegisterMapped));
NVBENCH_CUDA_CALL(
cudaHostGetDevicePointer(&m_device_timeout_flag, &m_host_timeout_flag, 0));
cudaHostRegister(&m_host_timeout_flag, sizeof(m_host_timeout_flag), cudaHostRegisterMapped));
NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_timeout_flag, &m_host_timeout_flag, 0));
}
blocking_kernel::~blocking_kernel()
@@ -119,14 +114,11 @@ blocking_kernel::~blocking_kernel()
NVBENCH_CUDA_CALL_NOEXCEPT(cudaHostUnregister(&m_host_timeout_flag));
}
// Resets both host flags to 0, then launches the `block_stream` kernel on
// `stream` with a <<<1, 1, 0, stream>>> configuration (one block, one thread,
// no dynamic shared memory), passing the device-side flag pointers and
// `timeout`.
//
// `timeout` is forwarded unchanged; inside the kernel it is scaled by 1e9 to
// nanoseconds and only honoured when `timeout >= 0`.
void blocking_kernel::block(const nvbench::cuda_stream &stream,
nvbench::float64_t timeout)
void blocking_kernel::block(const nvbench::cuda_stream &stream, nvbench::float64_t timeout)
{
m_host_flag = 0;
m_host_timeout_flag = 0;
block_stream<<<1, 1, 0, stream>>>(m_device_flag,
m_device_timeout_flag,
timeout);
block_stream<<<1, 1, 0, stream>>>(m_device_flag, m_device_timeout_flag, timeout);
}
void blocking_kernel::timeout_detected()

View File

@@ -97,10 +97,10 @@ struct blocking_kernel
}
// move-only
blocking_kernel(const blocking_kernel &) = delete;
blocking_kernel(blocking_kernel &&) = default;
blocking_kernel(const blocking_kernel &) = delete;
blocking_kernel(blocking_kernel &&) = default;
blocking_kernel &operator=(const blocking_kernel &) = delete;
blocking_kernel &operator=(blocking_kernel &&) = default;
blocking_kernel &operator=(blocking_kernel &&) = default;
private:
nvbench::int32_t m_host_flag{};

View File

@@ -30,35 +30,29 @@ struct state;
// Define a simple callable wrapper around a function. This allows the function
// to be used as a class template parameter. Intended for use with kernel
// generators and `NVBENCH_BENCH` macros.
#define NVBENCH_DEFINE_UNIQUE_CALLABLE(function) \
#define NVBENCH_DEFINE_UNIQUE_CALLABLE(function) \
NVBENCH_DEFINE_CALLABLE(function, NVBENCH_UNIQUE_IDENTIFIER(function))
#define NVBENCH_DEFINE_CALLABLE(function, callable_name) \
struct callable_name \
{ \
void operator()(nvbench::state &state, nvbench::type_list<>) \
{ \
function(state); \
} \
#define NVBENCH_DEFINE_CALLABLE(function, callable_name) \
struct callable_name \
{ \
void operator()(nvbench::state &state, nvbench::type_list<>) { function(state); } \
}
#define NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(function) \
NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, \
NVBENCH_UNIQUE_IDENTIFIER(function))
#define NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(function) \
NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, NVBENCH_UNIQUE_IDENTIFIER(function))
#define NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, callable_name) \
struct callable_name \
{ \
template <typename... Ts> \
void operator()(nvbench::state &state, nvbench::type_list<Ts...>) \
{ \
function(state, nvbench::type_list<Ts...>{}); \
} \
#define NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, callable_name) \
struct callable_name \
{ \
template <typename... Ts> \
void operator()(nvbench::state &state, nvbench::type_list<Ts...>) \
{ \
function(state, nvbench::type_list<Ts...>{}); \
} \
}
#define NVBENCH_UNIQUE_IDENTIFIER(prefix) \
NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, __LINE__)
#define NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, unique_id) \
#define NVBENCH_UNIQUE_IDENTIFIER(prefix) NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, __LINE__)
#define NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, unique_id) \
NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id)
#define NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id) \
prefix##_line_##unique_id
#define NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id) prefix##_line_##unique_id

View File

@@ -30,27 +30,20 @@ struct cpu_timer
__forceinline__ cpu_timer() = default;
// move-only
cpu_timer(const cpu_timer &) = delete;
cpu_timer(cpu_timer &&) = default;
cpu_timer(const cpu_timer &) = delete;
cpu_timer(cpu_timer &&) = default;
cpu_timer &operator=(const cpu_timer &) = delete;
cpu_timer &operator=(cpu_timer &&) = default;
cpu_timer &operator=(cpu_timer &&) = default;
// Records the current `std::chrono::high_resolution_clock` time in `m_start`.
__forceinline__ void start()
{
m_start = std::chrono::high_resolution_clock::now();
}
__forceinline__ void start() { m_start = std::chrono::high_resolution_clock::now(); }
// Records the current `std::chrono::high_resolution_clock` time in `m_stop`.
__forceinline__ void stop()
{
m_stop = std::chrono::high_resolution_clock::now();
}
__forceinline__ void stop() { m_stop = std::chrono::high_resolution_clock::now(); }
// In seconds:
// Computes `m_stop - m_start`, converts it to an integral nanosecond count with
// `duration_cast` (sub-nanosecond remainder is truncated), then scales by 1e-9
// to return seconds as `nvbench::float64_t`.
// NOTE(review): only reads `m_start` / `m_stop`, so this could be declared
// `const` -- confirm no caller depends on the non-const signature.
[[nodiscard]] __forceinline__ nvbench::float64_t get_duration()
{
const auto duration = m_stop - m_start;
const auto ns =
std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
const auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
return static_cast<nvbench::float64_t>(ns) * (1e-9);
}

View File

@@ -27,19 +27,17 @@
#define NVBENCH_TYPE_AXES(...) nvbench::type_list<__VA_ARGS__>
#define NVBENCH_BENCH(KernelGenerator) \
NVBENCH_DEFINE_UNIQUE_CALLABLE(KernelGenerator); \
nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) = \
nvbench::benchmark_manager::get() \
.add(std::make_unique< \
nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator)>>()) \
#define NVBENCH_BENCH(KernelGenerator) \
NVBENCH_DEFINE_UNIQUE_CALLABLE(KernelGenerator); \
nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) = \
nvbench::benchmark_manager::get() \
.add(std::make_unique<nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator)>>()) \
.set_name(#KernelGenerator)
#define NVBENCH_BENCH_TYPES(KernelGenerator, TypeAxes) \
NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(KernelGenerator); \
nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) = \
nvbench::benchmark_manager::get() \
.add(std::make_unique< \
nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator), \
TypeAxes>>()) \
#define NVBENCH_BENCH_TYPES(KernelGenerator, TypeAxes) \
NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(KernelGenerator); \
nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) = \
nvbench::benchmark_manager::get() \
.add(std::make_unique< \
nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator), TypeAxes>>()) \
.set_name(#KernelGenerator)

View File

@@ -66,10 +66,8 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
{
std::optional<nvbench::device_info> device = cur_state.get_device();
std::string device_id = device ? fmt::to_string(device->get_id())
: std::string{};
std::string device_name = device ? std::string{device->get_name()}
: std::string{};
std::string device_id = device ? fmt::to_string(device->get_id()) : std::string{};
std::string device_name = device ? std::string{device->get_name()} : std::string{};
table.add_cell(row, "_bench_name", "Benchmark", bench_name);
table.add_cell(row, "_device_id", "Device", std::move(device_id));
@@ -88,15 +86,11 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
name + "_axis_pow2_pretty",
name + " (pow2)",
fmt::format("2^{}", exponent));
table.add_cell(row,
name + "_axis_plain",
fmt::format("{}", name),
fmt::to_string(value));
table.add_cell(row, name + "_axis_plain", fmt::format("{}", name), fmt::to_string(value));
}
else
{
std::string value = std::visit(format_visitor,
axis_values.get_value(name));
std::string value = std::visit(format_visitor, axis_values.get_value(name));
table.add_cell(row, name + "_axis", name, std::move(value));
}
}
@@ -117,14 +111,10 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
continue;
}
const std::string &tag = summ.get_tag();
const std::string &header = summ.has_value("name")
? summ.get_string("name")
: tag;
const std::string &header = summ.has_value("name") ? summ.get_string("name") : tag;
const std::string hint = summ.has_value("hint")
? summ.get_string("hint")
: std::string{};
std::string value = std::visit(format_visitor, summ.get_value("value"));
const std::string hint = summ.has_value("hint") ? summ.get_string("hint") : std::string{};
std::string value = std::visit(format_visitor, summ.get_value("value"));
if (hint == "duration")
{
table.add_cell(row, tag, header + " (sec)", std::move(value));
@@ -182,10 +172,7 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
std::size_t remaining = table.m_columns.size();
for (const auto &col : table.m_columns)
{
fmt::format_to(buffer,
"{}{}",
col.rows[i],
(--remaining == 0) ? "" : ",");
fmt::format_to(buffer, "{}{}", col.rows[i], (--remaining == 0) ? "" : ",");
}
fmt::format_to(buffer, "\n");
}

View File

@@ -16,8 +16,8 @@
* limitations under the License.
*/
#include <nvbench/cuda_call.cuh>
#include <nvbench/config.cuh>
#include <nvbench/cuda_call.cuh>
#include <fmt/format.h>
@@ -66,11 +66,7 @@ void throw_error(const std::string &filename,
command));
}
#else
void throw_error(const std::string &,
std::size_t,
const std::string &,
CUresult)
{}
void throw_error(const std::string &, std::size_t, const std::string &, CUresult) {}
#endif
void exit_error(const std::string &filename,

View File

@@ -18,52 +18,43 @@
#pragma once
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <string>
/// Throws a std::runtime_error if `call` doesn't return `cudaSuccess`.
#define NVBENCH_CUDA_CALL(call) \
do \
{ \
const cudaError_t nvbench_cuda_call_error = call; \
if (nvbench_cuda_call_error != cudaSuccess) \
{ \
nvbench::cuda_call::throw_error(__FILE__, \
__LINE__, \
#call, \
nvbench_cuda_call_error); \
} \
#define NVBENCH_CUDA_CALL(call) \
do \
{ \
const cudaError_t nvbench_cuda_call_error = call; \
if (nvbench_cuda_call_error != cudaSuccess) \
{ \
nvbench::cuda_call::throw_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error); \
} \
} while (false)
/// Throws a std::runtime_error if `call` doesn't return `CUDA_SUCCESS`.
#define NVBENCH_DRIVER_API_CALL(call) \
do \
{ \
const CUresult nvbench_cuda_call_error = call; \
if (nvbench_cuda_call_error != CUDA_SUCCESS) \
{ \
nvbench::cuda_call::throw_error(__FILE__, \
__LINE__, \
#call, \
nvbench_cuda_call_error); \
} \
#define NVBENCH_DRIVER_API_CALL(call) \
do \
{ \
const CUresult nvbench_cuda_call_error = call; \
if (nvbench_cuda_call_error != CUDA_SUCCESS) \
{ \
nvbench::cuda_call::throw_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error); \
} \
} while (false)
/// Terminates process with failure status if `call` doesn't return
/// `cudaSuccess`.
#define NVBENCH_CUDA_CALL_NOEXCEPT(call) \
do \
{ \
const cudaError_t nvbench_cuda_call_error = call; \
if (nvbench_cuda_call_error != cudaSuccess) \
{ \
nvbench::cuda_call::exit_error(__FILE__, \
__LINE__, \
#call, \
nvbench_cuda_call_error); \
} \
#define NVBENCH_CUDA_CALL_NOEXCEPT(call) \
do \
{ \
const cudaError_t nvbench_cuda_call_error = call; \
if (nvbench_cuda_call_error != cudaSuccess) \
{ \
nvbench::cuda_call::exit_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error); \
} \
} while (false)
namespace nvbench::cuda_call

View File

@@ -66,10 +66,10 @@ struct cuda_stream
~cuda_stream() = default;
// move-only
cuda_stream(const cuda_stream &) = delete;
cuda_stream(const cuda_stream &) = delete;
cuda_stream &operator=(const cuda_stream &) = delete;
cuda_stream(cuda_stream &&) = default;
cuda_stream &operator=(cuda_stream &&) = default;
cuda_stream &operator=(cuda_stream &&) = default;
/**
* @return The `cudaStream_t` managed by this `cuda_stream`.

View File

@@ -42,10 +42,10 @@ struct cuda_timer
}
// move-only
cuda_timer(const cuda_timer &) = delete;
cuda_timer(cuda_timer &&) = default;
cuda_timer(const cuda_timer &) = delete;
cuda_timer(cuda_timer &&) = default;
cuda_timer &operator=(const cuda_timer &) = delete;
cuda_timer &operator=(cuda_timer &&) = default;
cuda_timer &operator=(cuda_timer &&) = default;
__forceinline__ void start(cudaStream_t stream)
{

View File

@@ -21,14 +21,13 @@
#include <nvbench/config.cuh>
#include <nvbench/device_info.cuh>
#include <optional>
#include <string>
#include <vector>
#include <optional>
namespace nvbench::detail
{
#ifdef NVBENCH_HAS_CUPTI
/**
* Pass required metrics in the constructor and organize your code as follows
@@ -62,7 +61,7 @@ namespace nvbench::detail
*/
class cupti_profiler
{
bool m_available {};
bool m_available{};
std::string m_chip_name;
// Counter data
@@ -87,11 +86,10 @@ public:
cupti_profiler(cupti_profiler &&) noexcept;
cupti_profiler &operator=(cupti_profiler &&) noexcept;
cupti_profiler(const cupti_profiler &) = delete;
cupti_profiler(const cupti_profiler &) = delete;
cupti_profiler &operator=(const cupti_profiler &) = delete;
cupti_profiler(nvbench::device_info device,
std::vector<std::string> &&metric_names);
cupti_profiler(nvbench::device_info device, std::vector<std::string> &&metric_names);
~cupti_profiler();
[[nodiscard]] bool is_initialized() const;
@@ -125,5 +123,4 @@ private:
};
#endif
} // namespace nvbench::detail

View File

@@ -59,8 +59,7 @@ void nvpw_call(const NVPA_Status status)
} // namespace
cupti_profiler::cupti_profiler(nvbench::device_info device,
std::vector<std::string> &&metric_names)
cupti_profiler::cupti_profiler(nvbench::device_info device, std::vector<std::string> &&metric_names)
: m_metric_names(metric_names)
, m_device(device)
{
@@ -154,12 +153,10 @@ class eval_request
NVPW_MetricsEvaluator *evaluator_ptr;
public:
eval_request(NVPW_MetricsEvaluator *evaluator_ptr,
const std::string &metric_name)
eval_request(NVPW_MetricsEvaluator *evaluator_ptr, const std::string &metric_name)
: evaluator_ptr(evaluator_ptr)
{
NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params params =
{};
NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params params = {};
params.structSize =
NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params_STRUCT_SIZE;
@@ -168,8 +165,7 @@ public:
params.pMetricEvalRequest = &request;
params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
nvpw_call(
NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(&params));
nvpw_call(NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(&params));
}
[[nodiscard]] std::vector<const char *> get_raw_dependencies()
@@ -178,10 +174,9 @@ public:
NVPW_MetricsEvaluator_GetMetricRawDependencies_Params params{};
params.structSize =
NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE;
params.pMetricsEvaluator = evaluator_ptr;
params.pMetricEvalRequests = &request;
params.structSize = NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE;
params.pMetricsEvaluator = evaluator_ptr;
params.pMetricEvalRequests = &request;
params.numMetricEvalRequests = 1;
params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest);
@@ -211,26 +206,23 @@ public:
const std::uint8_t *counter_data_image = nullptr,
const std::size_t counter_data_image_size = 0)
{
NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params
scratch_buffer_param{};
NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params scratch_buffer_param{};
scratch_buffer_param.structSize =
NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE;
scratch_buffer_param.pChipName = chip_name.c_str();
scratch_buffer_param.pCounterAvailabilityImage = counter_availability_image;
nvpw_call(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(
&scratch_buffer_param));
nvpw_call(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(&scratch_buffer_param));
scratch_buffer.resize(scratch_buffer_param.scratchBufferSize);
NVPW_CUDA_MetricsEvaluator_Initialize_Params evaluator_params{};
evaluator_params.structSize =
NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE;
evaluator_params.scratchBufferSize = scratch_buffer.size();
evaluator_params.pScratchBuffer = scratch_buffer.data();
evaluator_params.pChipName = chip_name.c_str();
evaluator_params.structSize = NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE;
evaluator_params.scratchBufferSize = scratch_buffer.size();
evaluator_params.pScratchBuffer = scratch_buffer.data();
evaluator_params.pChipName = chip_name.c_str();
evaluator_params.pCounterAvailabilityImage = counter_availability_image;
evaluator_params.pCounterDataImage = counter_data_image;
evaluator_params.counterDataImageSize = counter_data_image_size;
@@ -247,7 +239,7 @@ public:
{
NVPW_MetricsEvaluator_Destroy_Params params{};
params.structSize = NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE;
params.structSize = NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE;
params.pMetricsEvaluator = evaluator_ptr;
nvpw_call(NVPW_MetricsEvaluator_Destroy(&params));
@@ -259,10 +251,7 @@ public:
return {evaluator_ptr, metric_name};
}
// Implicit conversion to the raw evaluator handle: returns evaluator_ptr
// unchanged, so this object can be passed where an NVPW_MetricsEvaluator* is
// expected.
[[nodiscard]] operator NVPW_MetricsEvaluator *() const
{
return evaluator_ptr;
}
[[nodiscard]] operator NVPW_MetricsEvaluator *() const { return evaluator_ptr; }
};
} // namespace
@@ -270,10 +259,10 @@ public:
namespace
{
[[nodiscard]] std::vector<NVPA_RawMetricRequest> get_raw_metric_requests(
const std::string &chip_name,
const std::vector<std::string> &metric_names,
const std::uint8_t *counter_availability_image = nullptr)
[[nodiscard]] std::vector<NVPA_RawMetricRequest>
get_raw_metric_requests(const std::string &chip_name,
const std::vector<std::string> &metric_names,
const std::uint8_t *counter_availability_image = nullptr)
{
metric_evaluator evaluator(chip_name, counter_availability_image);
@@ -282,8 +271,7 @@ namespace
for (auto &metric_name : metric_names)
{
for (auto &raw_dependency :
evaluator.create_request(metric_name).get_raw_dependencies())
for (auto &raw_dependency : evaluator.create_request(metric_name).get_raw_dependencies())
{
raw_metric_names.push_back(raw_dependency);
}
@@ -295,10 +283,10 @@ namespace
for (auto &raw_name : raw_metric_names)
{
NVPA_RawMetricRequest metricRequest{};
metricRequest.structSize = NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE;
metricRequest.pMetricName = raw_name;
metricRequest.isolated = true;
metricRequest.keepInstances = true;
metricRequest.structSize = NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE;
metricRequest.pMetricName = raw_name;
metricRequest.isolated = true;
metricRequest.keepInstances = true;
raw_requests.push_back(metricRequest);
}
@@ -309,12 +297,11 @@ class metrics_config
{
bool initialized{};
void create(const std::string &chip_name,
const std::uint8_t *availability_image)
void create(const std::string &chip_name, const std::uint8_t *availability_image)
{
NVPW_CUDA_RawMetricsConfig_Create_V2_Params params{};
params.structSize = NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE;
params.structSize = NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE;
params.activityKind = NVPA_ACTIVITY_KIND_PROFILER;
params.pChipName = chip_name.c_str();
params.pCounterAvailabilityImage = availability_image;
@@ -329,9 +316,8 @@ class metrics_config
{
NVPW_RawMetricsConfig_SetCounterAvailability_Params params{};
params.structSize =
NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE;
params.pRawMetricsConfig = raw_metrics_config;
params.structSize = NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE;
params.pRawMetricsConfig = raw_metrics_config;
params.pCounterAvailabilityImage = availability_image;
nvpw_call(NVPW_RawMetricsConfig_SetCounterAvailability(&params));
@@ -341,7 +327,7 @@ class metrics_config
{
NVPW_RawMetricsConfig_BeginPassGroup_Params params{};
params.structSize = NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE;
params.structSize = NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE;
params.pRawMetricsConfig = raw_metrics_config;
nvpw_call(NVPW_RawMetricsConfig_BeginPassGroup(&params));
@@ -351,7 +337,7 @@ class metrics_config
{
NVPW_RawMetricsConfig_AddMetrics_Params params{};
params.structSize = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE;
params.structSize = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE;
params.pRawMetricsConfig = raw_metrics_config;
params.pRawMetricRequests = raw_metric_requests.data();
params.numMetricRequests = raw_metric_requests.size();
@@ -363,7 +349,7 @@ class metrics_config
{
NVPW_RawMetricsConfig_EndPassGroup_Params params{};
params.structSize = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE;
params.structSize = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE;
params.pRawMetricsConfig = raw_metrics_config;
nvpw_call(NVPW_RawMetricsConfig_EndPassGroup(&params));
@@ -373,8 +359,7 @@ class metrics_config
{
NVPW_RawMetricsConfig_GenerateConfigImage_Params params{};
params.structSize =
NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE;
params.structSize = NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE;
params.pRawMetricsConfig = raw_metrics_config;
nvpw_call(NVPW_RawMetricsConfig_GenerateConfigImage(&params));
@@ -398,7 +383,7 @@ public:
{
NVPW_RawMetricsConfig_GetConfigImage_Params params{};
params.structSize = NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE;
params.structSize = NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE;
params.pRawMetricsConfig = raw_metrics_config;
params.bytesAllocated = 0;
params.pBuffer = nullptr;
@@ -419,7 +404,7 @@ public:
{
NVPW_RawMetricsConfig_Destroy_Params params{};
params.structSize = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE;
params.structSize = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE;
params.pRawMetricsConfig = raw_metrics_config;
NVPW_RawMetricsConfig_Destroy(&params);
@@ -433,13 +418,12 @@ public:
// Builds the config image and stores it in m_config_image.
// A temporary metrics_config is constructed from the chip name, the raw metric
// requests derived from m_metric_names, and the counter availability image;
// get_config_image() is then called on that temporary.
// NOTE(review): the assignment appears twice below -- it is the same statement
// before and after re-wrapping; only the line breaks differ.
void cupti_profiler::initialize_config_image()
{
m_config_image =
metrics_config(m_chip_name,
get_raw_metric_requests(m_chip_name,
m_metric_names,
m_availability_image.data()),
m_availability_image.data())
.get_config_image();
m_config_image = metrics_config(m_chip_name,
get_raw_metric_requests(m_chip_name,
m_metric_names,
m_availability_image.data()),
m_availability_image.data())
.get_config_image();
}
namespace
@@ -450,12 +434,11 @@ class counter_data_builder
bool initialized{};
public:
counter_data_builder(const std::string &chip_name,
const std::uint8_t *pCounterAvailabilityImage)
counter_data_builder(const std::string &chip_name, const std::uint8_t *pCounterAvailabilityImage)
{
NVPW_CUDA_CounterDataBuilder_Create_Params params{};
params.structSize = NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE;
params.structSize = NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE;
params.pChipName = chip_name.c_str();
params.pCounterAvailabilityImage = pCounterAvailabilityImage;
@@ -471,7 +454,7 @@ public:
{
NVPW_CounterDataBuilder_Destroy_Params params{};
params.structSize = NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE;
params.structSize = NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE;
params.pCounterDataBuilder = builder;
NVPW_CounterDataBuilder_Destroy(&params);
@@ -488,16 +471,14 @@ void cupti_profiler::initialize_counter_data_prefix_image()
const std::uint8_t *counter_availability_image = nullptr;
std::vector<NVPA_RawMetricRequest> raw_metric_requests =
get_raw_metric_requests(m_chip_name,
m_metric_names,
counter_availability_image);
get_raw_metric_requests(m_chip_name, m_metric_names, counter_availability_image);
counter_data_builder data_builder(m_chip_name, counter_availability_image);
{
NVPW_CounterDataBuilder_AddMetrics_Params params{};
params.structSize = NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE;
params.structSize = NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE;
params.pCounterDataBuilder = data_builder.builder;
params.pRawMetricRequests = raw_metric_requests.data();
params.numMetricRequests = raw_metric_requests.size();
@@ -508,8 +489,7 @@ void cupti_profiler::initialize_counter_data_prefix_image()
{
NVPW_CounterDataBuilder_GetCounterDataPrefix_Params params{};
params.structSize =
NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE;
params.structSize = NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE;
params.pCounterDataBuilder = data_builder.builder;
params.bytesAllocated = 0;
params.pBuffer = nullptr;
@@ -532,11 +512,9 @@ get_counter_data_image_size(CUpti_Profiler_CounterDataImageOptions *options)
{
CUpti_Profiler_CounterDataImage_CalculateSize_Params params{};
params.structSize =
CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE;
params.pOptions = options;
params.sizeofCounterDataImageOptions =
CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
params.structSize = CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE;
params.pOptions = options;
params.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
cupti_call(cuptiProfilerCounterDataImageCalculateSize(&params));
return params.counterDataImageSize;
@@ -559,12 +537,10 @@ void cupti_profiler::initialize_counter_data_image()
{
CUpti_Profiler_CounterDataImage_Initialize_Params params{};
params.structSize =
CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE;
params.sizeofCounterDataImageOptions =
CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
params.pOptions = &counter_data_image_options;
params.counterDataImageSize = m_data_image.size();
params.structSize = CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE;
params.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
params.pOptions = &counter_data_image_options;
params.counterDataImageSize = m_data_image.size();
params.pCounterDataImage = &m_data_image[0];
cupti_call(cuptiProfilerCounterDataImageInitialize(&params));
@@ -578,8 +554,7 @@ void cupti_profiler::initialize_counter_data_image()
params.counterDataImageSize = m_data_image.size();
params.pCounterDataImage = &m_data_image[0];
cupti_call(
cuptiProfilerCounterDataImageCalculateScratchBufferSize(&params));
cupti_call(cuptiProfilerCounterDataImageCalculateScratchBufferSize(&params));
m_data_scratch_buffer.resize(params.counterDataScratchBufferSize);
}
@@ -587,8 +562,7 @@ void cupti_profiler::initialize_counter_data_image()
{
CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params params{};
params.structSize =
CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE;
params.structSize = CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE;
params.counterDataImageSize = m_data_image.size();
params.pCounterDataImage = &m_data_image[0];
params.counterDataScratchBufferSize = m_data_scratch_buffer.size();
@@ -608,17 +582,14 @@ cupti_profiler::~cupti_profiler()
}
}
// Reports the current value of the m_available flag.
bool cupti_profiler::is_initialized() const
{
return m_available;
}
bool cupti_profiler::is_initialized() const { return m_available; }
void cupti_profiler::prepare_user_loop()
{
{
CUpti_Profiler_BeginSession_Params params{};
params.structSize = CUpti_Profiler_BeginSession_Params_STRUCT_SIZE;
params.structSize = CUpti_Profiler_BeginSession_Params_STRUCT_SIZE;
params.ctx = nullptr;
params.counterDataImageSize = m_data_image.size();
params.pCounterDataImage = &m_data_image[0];
@@ -735,9 +706,7 @@ std::vector<double> cupti_profiler::get_counter_values()
if (params.numRanges != 1)
{
NVBENCH_THROW(std::runtime_error,
"{}",
"Something's gone wrong, one range is expected");
NVBENCH_THROW(std::runtime_error, "{}", "Something's gone wrong, one range is expected");
}
}
@@ -752,8 +721,7 @@ std::vector<double> cupti_profiler::get_counter_values()
{
NVPW_MetricsEvaluator_SetDeviceAttributes_Params params{};
params.structSize =
NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE;
params.structSize = NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE;
params.pMetricsEvaluator = evaluator;
params.pCounterDataImage = m_data_image.data();
params.counterDataImageSize = m_data_image.size();
@@ -764,11 +732,10 @@ std::vector<double> cupti_profiler::get_counter_values()
{
NVPW_MetricsEvaluator_EvaluateToGpuValues_Params params{};
params.structSize =
NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE;
params.pMetricsEvaluator = evaluator;
params.pMetricEvalRequests = &request.request;
params.numMetricEvalRequests = 1;
params.structSize = NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE;
params.pMetricsEvaluator = evaluator;
params.pMetricEvalRequests = &request.request;
params.numMetricEvalRequests = 1;
params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest);
params.pCounterDataImage = m_data_image.data();

View File

@@ -39,9 +39,9 @@ struct [[maybe_unused]] device_scope
// On destruction, calls cudaSetDevice(m_old_device_id); the call is wrapped in
// NVBENCH_CUDA_CALL.
// NOTE(review): presumably m_old_device_id is the device that was current when
// the scope was entered -- confirm against the constructor.
~device_scope() { NVBENCH_CUDA_CALL(cudaSetDevice(m_old_device_id)); }
// move-only
device_scope(device_scope &&) = default;
device_scope &operator=(device_scope &&) = default;
device_scope(const device_scope &) = delete;
device_scope(device_scope &&) = default;
device_scope &operator=(device_scope &&) = default;
device_scope(const device_scope &) = delete;
device_scope &operator=(const device_scope &) = delete;
private:

View File

@@ -33,7 +33,7 @@ namespace detail
template <typename KernelLauncher>
struct kernel_launch_timer_wrapper
{
// Initializes m_kernel_launcher from `launcher`. Declared explicit, so a
// KernelLauncher is never implicitly converted into a wrapper.
explicit kernel_launch_timer_wrapper(KernelLauncher &launcher)
explicit kernel_launch_timer_wrapper(KernelLauncher &launcher)
: m_kernel_launcher{launcher}
{}

View File

@@ -31,13 +31,12 @@ struct l2flush
{
int dev_id{};
NVBENCH_CUDA_CALL(cudaGetDevice(&dev_id));
NVBENCH_CUDA_CALL(
cudaDeviceGetAttribute(&m_l2_size, cudaDevAttrL2CacheSize, dev_id));
NVBENCH_CUDA_CALL(cudaDeviceGetAttribute(&m_l2_size, cudaDevAttrL2CacheSize, dev_id));
if (m_l2_size > 0)
{
void* buffer = m_l2_buffer;
void *buffer = m_l2_buffer;
NVBENCH_CUDA_CALL(cudaMalloc(&buffer, m_l2_size));
m_l2_buffer = reinterpret_cast<int*>(buffer);
m_l2_buffer = reinterpret_cast<int *>(buffer);
}
}

View File

@@ -54,15 +54,11 @@ void measure_cold_base::check()
const auto device = m_state.get_device();
if (!device)
{
NVBENCH_THROW(std::runtime_error,
"{}",
"Device required for `cold` measurement.");
NVBENCH_THROW(std::runtime_error, "{}", "Device required for `cold` measurement.");
}
if (!device->is_active())
{ // This means something went wrong higher up. Throw an error.
NVBENCH_THROW(std::runtime_error,
"{}",
"Internal error: Current device is not active.");
NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
}
}
@@ -92,13 +88,11 @@ void measure_cold_base::record_measurements()
++m_total_samples;
// Compute convergence statistics using CUDA timings:
const auto mean_cuda_time = m_total_cuda_time /
static_cast<nvbench::float64_t>(m_total_samples);
const auto cuda_stdev =
nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
m_cuda_times.cend(),
mean_cuda_time);
auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
const auto mean_cuda_time = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
const auto cuda_stdev = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
m_cuda_times.cend(),
mean_cuda_time);
auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
if (std::isfinite(cuda_rel_stdev))
{
m_noise_tracker.push_back(cuda_rel_stdev);
@@ -132,10 +126,10 @@ bool measure_cold_base::is_finished()
{
// Use the current noise as the stdev reference.
const auto current_noise = m_noise_tracker.back();
const auto noise_stdev = nvbench::detail::statistics::standard_deviation(
m_noise_tracker.cbegin(),
m_noise_tracker.cend(),
current_noise);
const auto noise_stdev =
nvbench::detail::statistics::standard_deviation(m_noise_tracker.cbegin(),
m_noise_tracker.cend(),
current_noise);
const auto noise_rel_stdev = noise_stdev / current_noise;
// If the rel stdev of the last N cuda noise measurements is less than
@@ -162,13 +156,11 @@ bool measure_cold_base::is_finished()
// Finalizes a run: computes m_cpu_noise as the standard deviation of
// m_cpu_times divided by the mean, then stops the walltime timer.
void measure_cold_base::run_trials_epilogue()
{
// Only need to compute this at the end, not per iteration.
// NOTE(review): cpu_mean is derived from m_total_cuda_time even though the
// deviation below is taken over m_cpu_times -- confirm whether a CPU-time total
// was intended here.
const auto cpu_mean = m_total_cuda_time /
static_cast<nvbench::float64_t>(m_total_samples);
const auto cpu_stdev =
nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
m_cpu_times.cend(),
cpu_mean);
// Relative noise: standard deviation normalized by the mean.
m_cpu_noise = cpu_stdev / cpu_mean;
const auto cpu_mean = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
const auto cpu_stdev = nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
m_cpu_times.cend(),
cpu_mean);
m_cpu_noise = cpu_stdev / cpu_mean;
m_walltime_timer.stop();
}
@@ -199,8 +191,7 @@ void measure_cold_base::generate_summaries()
auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/relative");
summ.set_string("name", "Noise");
summ.set_string("hint", "percentage");
summ.set_string("description",
"Relative standard deviation of isolated CPU times");
summ.set_string("description", "Relative standard deviation of isolated CPU times");
summ.set_float64("value", m_cpu_noise);
}
@@ -219,12 +210,10 @@ void measure_cold_base::generate_summaries()
auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/relative");
summ.set_string("name", "Noise");
summ.set_string("hint", "percentage");
summ.set_string("description",
"Relative standard deviation of isolated GPU times");
summ.set_string("description", "Relative standard deviation of isolated GPU times");
summ.set_float64("value",
m_noise_tracker.empty()
? std::numeric_limits<nvbench::float64_t>::infinity()
: m_noise_tracker.back());
m_noise_tracker.empty() ? std::numeric_limits<nvbench::float64_t>::infinity()
: m_noise_tracker.back());
}
if (const auto items = m_state.get_element_count(); items != 0)
@@ -232,8 +221,7 @@ void measure_cold_base::generate_summaries()
auto &summ = m_state.add_summary("nv/cold/bw/item_rate");
summ.set_string("name", "Elem/s");
summ.set_string("hint", "item_rate");
summ.set_string("description",
"Number of input elements processed per second");
summ.set_string("description", "Number of input elements processed per second");
summ.set_float64("value", static_cast<double>(items) / avg_cuda_time);
}
@@ -251,8 +239,8 @@ void measure_cold_base::generate_summaries()
}
{
const auto peak_gmem_bw = static_cast<double>(
m_state.get_device()->get_global_memory_bus_bandwidth());
const auto peak_gmem_bw =
static_cast<double>(m_state.get_device()->get_global_memory_bus_bandwidth());
auto &summ = m_state.add_summary("nv/cold/bw/global/utilization");
summ.set_string("name", "BWUtil");
@@ -274,8 +262,7 @@ void measure_cold_base::generate_summaries()
}
// Log if a printer exists:
if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
printer_opt_ref.has_value())
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
{
auto &printer = printer_opt_ref.value().get();
@@ -324,10 +311,7 @@ void measure_cold_base::generate_summaries()
m_walltime_timer.get_duration(),
m_total_samples));
printer.process_bulk_data(m_state,
"nv/cold/sample_times",
"sample_times",
m_cuda_times);
printer.process_bulk_data(m_state, "nv/cold/sample_times", "sample_times", m_cuda_times);
}
}

View File

@@ -49,10 +49,10 @@ namespace detail
struct measure_cold_base
{
explicit measure_cold_base(nvbench::state &exec_state);
measure_cold_base(const measure_cold_base &) = delete;
measure_cold_base(measure_cold_base &&) = delete;
measure_cold_base(const measure_cold_base &) = delete;
measure_cold_base(measure_cold_base &&) = delete;
measure_cold_base &operator=(const measure_cold_base &) = delete;
measure_cold_base &operator=(measure_cold_base &&) = delete;
measure_cold_base &operator=(measure_cold_base &&) = delete;
protected:
template <bool use_blocking_kernel>
@@ -68,10 +68,7 @@ protected:
void check_skip_time(nvbench::float64_t warmup_time);
// Invokes m_l2flush.flush() on the stream returned by m_launch.get_stream().
__forceinline__ void flush_device_l2()
{
m_l2flush.flush(m_launch.get_stream());
}
__forceinline__ void flush_device_l2() { m_l2flush.flush(m_launch.get_stream()); }
__forceinline__ void sync_stream() const
{

View File

@@ -50,8 +50,7 @@ struct metric_traits;
template <>
struct metric_traits<metric_id::dram_peak_sustained_throughput>
{
static constexpr const char *metric_name =
"dram__throughput.avg.pct_of_peak_sustained_elapsed";
static constexpr const char *metric_name = "dram__throughput.avg.pct_of_peak_sustained_elapsed";
static constexpr const char *name = "HBWPeak";
static constexpr const char *hint = "percentage";
@@ -119,10 +118,7 @@ struct metric_traits<metric_id::l1_hit_rate>
static constexpr const char *description = "Hit rate at L1 cache.";
static constexpr double divider = 100.0;
// Tells whether this metric is collected for the given state by forwarding to
// m_state.is_l1_hit_rate_collected().
// The `;` after the closing brace is a redundant empty declaration.
static bool is_collected(nvbench::state &m_state)
{
return m_state.is_l1_hit_rate_collected();
};
static bool is_collected(nvbench::state &m_state) { return m_state.is_l1_hit_rate_collected(); };
};
template <>
@@ -134,10 +130,7 @@ struct metric_traits<metric_id::l2_hit_rate>
static constexpr const char *description = "Hit rate at L2 cache.";
static constexpr double divider = 100.0;
// Tells whether this metric is collected for the given state by forwarding to
// m_state.is_l2_hit_rate_collected().
// The `;` after the closing brace is a redundant empty declaration.
static bool is_collected(nvbench::state &m_state)
{
return m_state.is_l2_hit_rate_collected();
};
static bool is_collected(nvbench::state &m_state) { return m_state.is_l2_hit_rate_collected(); };
};
template <metric_id id = metric_id::dram_peak_sustained_throughput>
@@ -153,8 +146,7 @@ void add_metrics_impl(nvbench::state &state, std::vector<std::string> &metrics)
}
// Explicit specialization for metric_id::count: both parameters are unnamed
// and the body is empty, so it does nothing.
// NOTE(review): presumably this is the terminating case when stepping through
// the metric ids -- confirm against the primary template.
template <>
void add_metrics_impl<metric_id::count>(nvbench::state &,
std::vector<std::string> &)
void add_metrics_impl<metric_id::count>(nvbench::state &, std::vector<std::string> &)
{}
std::vector<std::string> add_metrics(nvbench::state &state)
@@ -179,13 +171,11 @@ try
// clang-format on
catch (const std::exception &ex)
{
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
printer_opt_ref)
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref)
{
auto &printer = printer_opt_ref.value().get();
printer.log(nvbench::log_level::warn,
fmt::format("CUPTI failed to construct profiler: {}",
ex.what()));
fmt::format("CUPTI failed to construct profiler: {}", ex.what()));
}
}
@@ -194,15 +184,11 @@ void measure_cupti_base::check()
const auto device = m_state.get_device();
if (!device)
{
NVBENCH_THROW(std::runtime_error,
"{}",
"Device required for `cupti` measurement.");
NVBENCH_THROW(std::runtime_error, "{}", "Device required for `cupti` measurement.");
}
if (!device->is_active())
{ // This means something went wrong higher up. Throw an error.
NVBENCH_THROW(std::runtime_error,
"{}",
"Internal error: Current device is not active.");
NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
}
}
@@ -210,16 +196,13 @@ namespace
{
template <metric_id id = metric_id::dram_peak_sustained_throughput>
void gen_summary(std::size_t result_id,
nvbench::state &m_state,
const std::vector<double> &result)
void gen_summary(std::size_t result_id, nvbench::state &m_state, const std::vector<double> &result)
{
using metric = metric_traits<id>;
if (metric::is_collected(m_state))
{
auto &summ =
m_state.add_summary(fmt::format("nv/cupti/{}", metric::metric_name));
auto &summ = m_state.add_summary(fmt::format("nv/cupti/{}", metric::metric_name));
summ.set_string("name", metric::name);
summ.set_string("hint", metric::hint);
summ.set_string("description", metric::description);
@@ -231,9 +214,7 @@ void gen_summary(std::size_t result_id,
}
// Explicit specialization for metric_id::count: all parameters are unnamed and
// the body is empty, so no summary is produced.
// NOTE(review): presumably this is the terminating case when stepping through
// the metric ids -- confirm against the primary template.
template <>
void gen_summary<metric_id::count>(std::size_t,
nvbench::state &,
const std::vector<double> &)
void gen_summary<metric_id::count>(std::size_t, nvbench::state &, const std::vector<double> &)
{}
void gen_summaries(nvbench::state &state, const std::vector<double> &result)
@@ -266,8 +247,7 @@ try
}
// Log if a printer exists:
if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
printer_opt_ref.has_value())
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
{
auto &printer = printer_opt_ref.value().get();
printer.log(nvbench::log_level::pass,
@@ -278,13 +258,11 @@ try
}
catch (const std::exception &ex)
{
if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
printer_opt_ref)
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref)
{
auto &printer = printer_opt_ref.value().get();
printer.log(nvbench::log_level::warn,
fmt::format("CUPTI failed to generate the summary: {}",
ex.what()));
fmt::format("CUPTI failed to generate the summary: {}", ex.what()));
}
}

View File

@@ -50,10 +50,10 @@ namespace detail
struct measure_cupti_base
{
explicit measure_cupti_base(nvbench::state &exec_state);
measure_cupti_base(const measure_cupti_base &) = delete;
measure_cupti_base(measure_cupti_base &&) = delete;
measure_cupti_base(const measure_cupti_base &) = delete;
measure_cupti_base(measure_cupti_base &&) = delete;
measure_cupti_base &operator=(const measure_cupti_base &) = delete;
measure_cupti_base &operator=(measure_cupti_base &&) = delete;
measure_cupti_base &operator=(measure_cupti_base &&) = delete;
protected:
struct kernel_launch_timer;
@@ -61,10 +61,7 @@ protected:
void check();
void generate_summaries();
// Invokes m_l2flush.flush() on the stream returned by m_launch.get_stream().
__forceinline__ void flush_device_l2()
{
m_l2flush.flush(m_launch.get_stream());
}
__forceinline__ void flush_device_l2() { m_l2flush.flush(m_launch.get_stream()); }
__forceinline__ void sync_stream() const
{

View File

@@ -47,9 +47,8 @@ measure_hot_base::measure_hot_base(state &exec_state)
// to match the cold result if available.
try
{
nvbench::int64_t cold_samples =
m_state.get_summary("nv/cold/sample_size").get_int64("value");
m_min_samples = std::max(m_min_samples, cold_samples);
nvbench::int64_t cold_samples = m_state.get_summary("nv/cold/sample_size").get_int64("value");
m_min_samples = std::max(m_min_samples, cold_samples);
// If the cold measurement ran successfully, disable skip_time. It'd just
// be annoying to skip now.
@@ -72,15 +71,11 @@ void measure_hot_base::check()
const auto device = m_state.get_device();
if (!device)
{
NVBENCH_THROW(std::runtime_error,
"{}",
"Device required for `hot` measurement.");
NVBENCH_THROW(std::runtime_error, "{}", "Device required for `hot` measurement.");
}
if (!device->is_active())
{ // This means something went wrong higher up. Throw an error.
NVBENCH_THROW(std::runtime_error,
"{}",
"Internal error: Current device is not active.");
NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
}
}
@@ -116,8 +111,7 @@ void measure_hot_base::generate_summaries()
}
// Log if a printer exists:
if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
printer_opt_ref.has_value())
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
{
auto &printer = printer_opt_ref.value().get();

View File

@@ -41,10 +41,10 @@ namespace detail
struct measure_hot_base
{
explicit measure_hot_base(nvbench::state &exec_state);
measure_hot_base(const measure_hot_base &) = delete;
measure_hot_base(measure_hot_base &&) = delete;
measure_hot_base(const measure_hot_base &) = delete;
measure_hot_base(measure_hot_base &&) = delete;
measure_hot_base &operator=(const measure_hot_base &) = delete;
measure_hot_base &operator=(measure_hot_base &&) = delete;
measure_hot_base &operator=(measure_hot_base &&) = delete;
protected:
void check();
@@ -131,7 +131,7 @@ private:
// The .95 factor here pads the batch_size a bit to avoid needing a second
// batch due to noise.
const auto time_estimate = m_cuda_timer.get_duration() * 0.95;
auto batch_size = static_cast<nvbench::int64_t>(m_min_time / time_estimate);
auto batch_size = static_cast<nvbench::int64_t>(m_min_time / time_estimate);
do
{
@@ -142,7 +142,7 @@ private:
// Block stream until some work is queued.
// Limit the number of kernel executions while blocked to prevent
// deadlocks. See warnings on blocking_kernel.
const auto blocked_launches = std::min(batch_size, nvbench::int64_t{2});
const auto blocked_launches = std::min(batch_size, nvbench::int64_t{2});
const auto unblocked_launches = batch_size - blocked_launches;
this->block_stream();
@@ -189,7 +189,6 @@ private:
break; // Stop iterating
}
m_walltime_timer.stop();
if (m_walltime_timer.get_duration() > m_timeout)
{

View File

@@ -58,18 +58,12 @@ struct ring_buffer
/**
* The number of valid values in the ring buffer. Always <= capacity().
*
* @return m_buffer.size() when m_full is set, otherwise m_index.
*/
[[nodiscard]] std::size_t size() const
{
return m_full ? m_buffer.size() : m_index;
}
[[nodiscard]] std::size_t size() const { return m_full ? m_buffer.size() : m_index; }
/**
* The maximum size of the ring buffer.
*
* @return The size of the underlying storage, m_buffer.size().
*/
[[nodiscard]] std::size_t capacity() const
{
return m_buffer.size();
}
[[nodiscard]] std::size_t capacity() const { return m_buffer.size(); }
/**
* @return True if the ring buffer is empty.

View File

@@ -53,7 +53,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
"`ExecTags` argument must be a member (or combination of "
"members) from nvbench::exec_tag.");
constexpr auto measure_tags = tags & measure_mask;
constexpr auto measure_tags = tags & measure_mask;
constexpr auto modifier_tags = tags & modifier_mask;
// "run once" is handled by the cold measurement:
@@ -81,8 +81,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
}
else
{
this->exec(cold | hot | tags,
std::forward<KernelLauncher>(kernel_launcher));
this->exec(cold | hot | tags, std::forward<KernelLauncher>(kernel_launcher));
}
return;
}
@@ -99,8 +98,8 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
constexpr bool use_blocking_kernel = !(tags & no_block);
if constexpr (tags & timer)
{
// Estimate bandwidth here
#ifdef NVBENCH_HAS_CUPTI
// Estimate bandwidth here
#ifdef NVBENCH_HAS_CUPTI
if constexpr (!(modifier_tags & run_once))
{
if (this->is_cupti_required())
@@ -110,7 +109,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
measure();
}
}
#endif
#endif
using measure_t = nvbench::detail::measure_cold<KL, use_blocking_kernel>;
measure_t measure{*this, kernel_launcher};
@@ -121,8 +120,8 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
wrapper_t wrapper{kernel_launcher};
// Estimate bandwidth here
#ifdef NVBENCH_HAS_CUPTI
// Estimate bandwidth here
#ifdef NVBENCH_HAS_CUPTI
if constexpr (!(modifier_tags & run_once))
{
if (this->is_cupti_required())
@@ -132,10 +131,9 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
measure();
}
}
#endif
#endif
using measure_t =
nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
using measure_t = nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
measure_t measure(*this, wrapper);
measure();
}
@@ -143,12 +141,10 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
if constexpr (tags & hot)
{
static_assert(!(tags & sync),
"Hot measurement doesn't support the `sync` exec_tag.");
static_assert(!(tags & timer),
"Hot measurement doesn't support the `timer` exec_tag.");
static_assert(!(tags & sync), "Hot measurement doesn't support the `sync` exec_tag.");
static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag.");
constexpr bool use_blocking_kernel = !(tags & no_block);
using measure_t = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
using measure_t = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
measure_t measure{*this, kernel_launcher};
measure();
}

View File

@@ -40,9 +40,7 @@ void state_iterator::add_axis(const nvbench::axis_base &axis)
this->add_axis(axis.get_name(), axis.get_type(), axis.get_size());
}
void state_iterator::add_axis(std::string axis,
nvbench::axis_type type,
std::size_t size)
void state_iterator::add_axis(std::string axis, nvbench::axis_type type, std::size_t size)
{
m_indices.push_back({std::move(axis), type, std::size_t{0}, size});
}
@@ -74,10 +72,7 @@ state_iterator::get_current_indices() const
return m_indices;
}
[[nodiscard]] bool state_iterator::iter_valid() const
{
return m_current < m_total;
}
[[nodiscard]] bool state_iterator::iter_valid() const { return m_current < m_total; }
void state_iterator::next()
{
@@ -102,7 +97,7 @@ state_generator::state_generator(const benchmark_base &bench)
void state_generator::build_axis_configs()
{
const axes_metadata &axes = m_benchmark.get_axes();
const axes_metadata &axes = m_benchmark.get_axes();
const std::vector<std::unique_ptr<axis_base>> &axes_vec = axes.get_axes();
// Construct two state_generators:
@@ -118,35 +113,29 @@ void state_generator::build_axis_configs()
type_axes.reserve(axes_vec.size());
// Filter all axes into type and non-type:
std::for_each(axes_vec.cbegin(),
axes_vec.cend(),
[&non_type_si, &type_axes](const auto &axis) {
if (axis->get_type() == nvbench::axis_type::type)
{
type_axes.push_back(
std::cref(static_cast<const type_axis &>(*axis)));
}
else
{
non_type_si.add_axis(*axis);
}
});
std::for_each(axes_vec.cbegin(), axes_vec.cend(), [&non_type_si, &type_axes](const auto &axis) {
if (axis->get_type() == nvbench::axis_type::type)
{
type_axes.push_back(std::cref(static_cast<const type_axis &>(*axis)));
}
else
{
non_type_si.add_axis(*axis);
}
});
// Reverse sort type axes by index. This way the state_generator's cartesian
// product of the type axes values will be enumerated in the same order as
// nvbench::tl::cartesian_product<type_axes>. This is necessary to ensure
// that the correct states are passed to the corresponding benchmark
// instantiations.
std::sort(type_axes.begin(),
type_axes.end(),
[](const auto &axis_1, const auto &axis_2) {
return axis_1.get().get_axis_index() >
axis_2.get().get_axis_index();
});
std::sort(type_axes.begin(), type_axes.end(), [](const auto &axis_1, const auto &axis_2) {
return axis_1.get().get_axis_index() > axis_2.get().get_axis_index();
});
std::for_each(type_axes.cbegin(),
type_axes.cend(),
[&type_si](const auto &axis) { type_si.add_axis(axis); });
std::for_each(type_axes.cbegin(), type_axes.cend(), [&type_si](const auto &axis) {
type_si.add_axis(axis);
});
}
// type_axis_configs generation:
@@ -157,8 +146,8 @@ void state_generator::build_axis_configs()
// Build type_axis_configs
for (type_si.init(); type_si.iter_valid(); type_si.next())
{
auto &[config, active_mask] = m_type_axis_configs.emplace_back(
std::make_pair(nvbench::named_values{}, true));
auto &[config, active_mask] =
m_type_axis_configs.emplace_back(std::make_pair(nvbench::named_values{}, true));
// Reverse the indices so they're once again in the same order as
// specified:
@@ -173,8 +162,7 @@ void state_generator::build_axis_configs()
active_mask = false;
}
config.set_string(axis_info.axis,
axis.get_input_string(axis_info.index));
config.set_string(axis_info.axis, axis.get_input_string(axis_info.index));
}
} // type_si
} // type_axis_config generation
@@ -199,21 +187,18 @@ void state_generator::build_axis_configs()
break;
case axis_type::int64:
config.set_int64(
axis_info.axis,
axes.get_int64_axis(axis_info.axis).get_value(axis_info.index));
config.set_int64(axis_info.axis,
axes.get_int64_axis(axis_info.axis).get_value(axis_info.index));
break;
case axis_type::float64:
config.set_float64(
axis_info.axis,
axes.get_float64_axis(axis_info.axis).get_value(axis_info.index));
config.set_float64(axis_info.axis,
axes.get_float64_axis(axis_info.axis).get_value(axis_info.index));
break;
case axis_type::string:
config.set_string(
axis_info.axis,
axes.get_string_axis(axis_info.axis).get_value(axis_info.index));
config.set_string(axis_info.axis,
axes.get_string_axis(axis_info.axis).get_value(axis_info.index));
break;
} // switch (type)
} // for (axis_info : current_indices)
@@ -239,15 +224,12 @@ void state_generator::build_states()
}
}
void state_generator::add_states_for_device(
const std::optional<device_info> &device)
void state_generator::add_states_for_device(const std::optional<device_info> &device)
{
const auto num_type_configs = m_type_axis_configs.size();
for (std::size_t type_config_index = 0; type_config_index < num_type_configs;
++type_config_index)
for (std::size_t type_config_index = 0; type_config_index < num_type_configs; ++type_config_index)
{
const auto &[type_config,
axis_mask] = m_type_axis_configs[type_config_index];
const auto &[type_config, axis_mask] = m_type_axis_configs[type_config_index];
if (!axis_mask)
{ // Don't generate inner vector if the type config is masked out.
@@ -261,10 +243,7 @@ void state_generator::add_states_for_device(
config.append(non_type_config);
// Create state:
m_states.push_back(nvbench::state{m_benchmark,
std::move(config),
device,
type_config_index});
m_states.push_back(nvbench::state{m_benchmark, std::move(config), device, type_config_index});
}
}
}

View File

@@ -36,8 +36,7 @@ namespace nvbench::detail::statistics
*
* If the input has fewer than 5 samples, infinity is returned.
*/
template <typename Iter,
typename ValueType = typename std::iterator_traits<Iter>::value_type>
template <typename Iter, typename ValueType = typename std::iterator_traits<Iter>::value_type>
ValueType standard_deviation(Iter first, Iter last, ValueType mean)
{
static_assert(std::is_floating_point_v<ValueType>);

View File

@@ -21,17 +21,15 @@
#include <fmt/format.h>
#include <stdexcept>
#define NVBENCH_THROW(exception_type, format_str, ...) \
throw exception_type(fmt::format("{}:{}: {}", \
__FILE__, \
__LINE__, \
fmt::format(format_str, __VA_ARGS__)))
#define NVBENCH_THROW(exception_type, format_str, ...) \
throw exception_type( \
fmt::format("{}:{}: {}", __FILE__, __LINE__, fmt::format(format_str, __VA_ARGS__)))
#define NVBENCH_THROW_IF(condition, exception_type, format_str, ...) \
do \
{ \
if (condition) \
{ \
NVBENCH_THROW(exception_type, format_str, __VA_ARGS__); \
} \
#define NVBENCH_THROW_IF(condition, exception_type, format_str, ...) \
do \
{ \
if (condition) \
{ \
NVBENCH_THROW(exception_type, format_str, __VA_ARGS__); \
} \
} while (false)

View File

@@ -27,10 +27,7 @@
namespace nvbench::detail
{
template <typename InIterT,
typename InitValueT,
typename ReduceOp,
typename TransformOp>
template <typename InIterT, typename InitValueT, typename ReduceOp, typename TransformOp>
InitValueT transform_reduce(InIterT first,
InIterT last,
InitValueT init,

View File

@@ -20,12 +20,10 @@ namespace tl::detail
{
template <typename... Ts>
auto size(nvbench::type_list<Ts...>)
-> std::integral_constant<std::size_t, sizeof...(Ts)>;
auto size(nvbench::type_list<Ts...>) -> std::integral_constant<std::size_t, sizeof...(Ts)>;
template <std::size_t I, typename... Ts>
auto get(nvbench::type_list<Ts...>)
-> std::tuple_element_t<I, std::tuple<Ts...>>;
auto get(nvbench::type_list<Ts...>) -> std::tuple_element_t<I, std::tuple<Ts...>>;
template <typename... Ts, typename... Us>
auto concat(nvbench::type_list<Ts...>, nvbench::type_list<Us...>)
@@ -44,9 +42,8 @@ struct prepend_each<T, nvbench::type_list<>>
template <typename T, typename TL, typename... TLTail>
struct prepend_each<T, nvbench::type_list<TL, TLTail...>>
{
using cur = decltype(detail::concat(nvbench::type_list<T>{}, TL{}));
using next =
typename detail::prepend_each<T, nvbench::type_list<TLTail...>>::type;
using cur = decltype(detail::concat(nvbench::type_list<T>{}, TL{}));
using next = typename detail::prepend_each<T, nvbench::type_list<TLTail...>>::type;
using type = decltype(detail::concat(nvbench::type_list<cur>{}, next{}));
};
@@ -71,23 +68,20 @@ struct cartesian_product<nvbench::type_list<nvbench::type_list<>, TLTail...>>
template <typename T, typename... Ts>
struct cartesian_product<nvbench::type_list<nvbench::type_list<T, Ts...>>>
{
using cur = nvbench::type_list<nvbench::type_list<T>>;
using next =
std::conditional_t<sizeof...(Ts) != 0,
typename detail::cartesian_product<
nvbench::type_list<nvbench::type_list<Ts...>>>::type,
nvbench::type_list<>>;
using cur = nvbench::type_list<nvbench::type_list<T>>;
using next = std::conditional_t<
sizeof...(Ts) != 0,
typename detail::cartesian_product<nvbench::type_list<nvbench::type_list<Ts...>>>::type,
nvbench::type_list<>>;
using type = decltype(detail::concat(cur{}, next{}));
};
template <typename T, typename... Tail, typename TL, typename... TLTail>
struct cartesian_product<
nvbench::type_list<nvbench::type_list<T, Tail...>, TL, TLTail...>>
struct cartesian_product<nvbench::type_list<nvbench::type_list<T, Tail...>, TL, TLTail...>>
{
using tail_prod =
typename detail::cartesian_product<nvbench::type_list<TL, TLTail...>>::type;
using cur = typename detail::prepend_each<T, tail_prod>::type;
using next = typename detail::cartesian_product<
using tail_prod = typename detail::cartesian_product<nvbench::type_list<TL, TLTail...>>::type;
using cur = typename detail::prepend_each<T, tail_prod>::type;
using next = typename detail::cartesian_product<
nvbench::type_list<nvbench::type_list<Tail...>, TL, TLTail...>>::type;
using type = decltype(detail::concat(cur{}, next{}));
};

View File

@@ -65,17 +65,15 @@ void device_info::set_persistence_mode(bool state)
#else // NVBENCH_HAS_NVML
try
{
NVBENCH_NVML_CALL(nvmlDeviceSetPersistenceMode(
m_nvml_device,
state ? NVML_FEATURE_ENABLED : NVML_FEATURE_DISABLED));
NVBENCH_NVML_CALL(
nvmlDeviceSetPersistenceMode(m_nvml_device,
state ? NVML_FEATURE_ENABLED : NVML_FEATURE_DISABLED));
}
catch (nvml::call_failed &e)
{
if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED)
{
NVBENCH_THROW(std::runtime_error,
"{}",
"Persistence mode is only supported on Linux.");
NVBENCH_THROW(std::runtime_error, "{}", "Persistence mode is only supported on Linux.");
}
else if (e.get_error_code() == NVML_ERROR_NO_PERMISSION)
{
@@ -104,30 +102,26 @@ try
break;
case clock_rate::base:
NVBENCH_NVML_CALL(nvmlDeviceSetGpuLockedClocks(
m_nvml_device,
static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP),
static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP)));
NVBENCH_NVML_CALL(
nvmlDeviceSetGpuLockedClocks(m_nvml_device,
static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP),
static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP)));
break;
case clock_rate::maximum: {
const auto max_mhz = static_cast<unsigned int>(
this->get_sm_default_clock_rate() / (1000 * 1000));
NVBENCH_NVML_CALL(
nvmlDeviceSetGpuLockedClocks(m_nvml_device, max_mhz, max_mhz));
const auto max_mhz =
static_cast<unsigned int>(this->get_sm_default_clock_rate() / (1000 * 1000));
NVBENCH_NVML_CALL(nvmlDeviceSetGpuLockedClocks(m_nvml_device, max_mhz, max_mhz));
break;
}
default:
NVBENCH_THROW(std::runtime_error,
"Unrecognized clock rate: {}",
static_cast<int>(rate));
NVBENCH_THROW(std::runtime_error, "Unrecognized clock rate: {}", static_cast<int>(rate));
}
}
catch (nvml::call_failed &e)
{
if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED &&
this->get_sm_version() < 700)
if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED && this->get_sm_version() < 700)
{
NVBENCH_THROW(std::runtime_error,
"GPU clock rates can only be modified for Volta and later. "
@@ -156,9 +150,7 @@ catch (nvml::call_failed &e)
{
if (!is_active())
{
NVBENCH_THROW(std::runtime_error,
"{}",
"get_context is called for inactive device");
NVBENCH_THROW(std::runtime_error, "{}", "get_context is called for inactive device");
}
CUcontext cu_context;

View File

@@ -54,10 +54,7 @@ struct device_info
[[nodiscard]] int get_id() const { return m_id; }
/// @return The name of the device.
[[nodiscard]] std::string_view get_name() const
{
return std::string_view(m_prop.name);
}
[[nodiscard]] std::string_view get_name() const { return std::string_view(m_prop.name); }
[[nodiscard]] bool is_active() const
{
@@ -83,7 +80,6 @@ struct device_info
/// @note Requires root / admin privileges.
void set_persistence_mode(bool state);
/// Symbolic values for special clock rates
enum class clock_rate
{
@@ -101,10 +97,7 @@ struct device_info
void lock_gpu_clocks(clock_rate rate);
/// @return The SM version of the current device as (major*100) + (minor*10).
[[nodiscard]] int get_sm_version() const
{
return m_prop.major * 100 + m_prop.minor * 10;
}
[[nodiscard]] int get_sm_version() const { return m_prop.major * 100 + m_prop.minor * 10; }
/// @return The PTX version of the current device, e.g. sm_80 returns 800.
[[nodiscard]] __forceinline__ int get_ptx_version() const
@@ -119,46 +112,25 @@ struct device_info
}
/// @return The number of physical streaming multiprocessors on this device.
[[nodiscard]] int get_number_of_sms() const
{
return m_prop.multiProcessorCount;
}
[[nodiscard]] int get_number_of_sms() const { return m_prop.multiProcessorCount; }
/// @return The maximum number of resident blocks per SM.
[[nodiscard]] int get_max_blocks_per_sm() const
{
return m_prop.maxBlocksPerMultiProcessor;
}
[[nodiscard]] int get_max_blocks_per_sm() const { return m_prop.maxBlocksPerMultiProcessor; }
/// @return The maximum number of resident threads per SM.
[[nodiscard]] int get_max_threads_per_sm() const
{
return m_prop.maxThreadsPerMultiProcessor;
}
[[nodiscard]] int get_max_threads_per_sm() const { return m_prop.maxThreadsPerMultiProcessor; }
/// @return The maximum number of threads per block.
[[nodiscard]] int get_max_threads_per_block() const
{
return m_prop.maxThreadsPerBlock;
}
[[nodiscard]] int get_max_threads_per_block() const { return m_prop.maxThreadsPerBlock; }
/// @return The number of registers per SM.
[[nodiscard]] int get_registers_per_sm() const
{
return m_prop.regsPerMultiprocessor;
}
[[nodiscard]] int get_registers_per_sm() const { return m_prop.regsPerMultiprocessor; }
/// @return The number of registers per block.
[[nodiscard]] int get_registers_per_block() const
{
return m_prop.regsPerBlock;
}
[[nodiscard]] int get_registers_per_block() const { return m_prop.regsPerBlock; }
/// @return The total number of bytes available in global memory.
[[nodiscard]] std::size_t get_global_memory_size() const
{
return m_prop.totalGlobalMem;
}
[[nodiscard]] std::size_t get_global_memory_size() const { return m_prop.totalGlobalMem; }
struct memory_info
{
@@ -176,10 +148,7 @@ struct device_info
}
/// @return The width of the global memory bus in bits.
[[nodiscard]] int get_global_memory_bus_width() const
{
return m_prop.memoryBusWidth;
}
[[nodiscard]] int get_global_memory_bus_width() const { return m_prop.memoryBusWidth; }
/// @return The global memory bus bandwidth in bytes/sec.
[[nodiscard]] std::size_t get_global_memory_bus_bandwidth() const
@@ -201,10 +170,7 @@ struct device_info
}
/// @return The available amount of shared memory in bytes per block.
[[nodiscard]] std::size_t get_shared_memory_per_block() const
{
return m_prop.sharedMemPerBlock;
}
[[nodiscard]] std::size_t get_shared_memory_per_block() const { return m_prop.sharedMemPerBlock; }
/// @return True if ECC is enabled on this device.
[[nodiscard]] bool get_ecc_state() const { return m_prop.ECCEnabled; }
@@ -224,23 +190,11 @@ struct device_info
#endif
/// @return A cached copy of the device's cudaDeviceProp.
[[nodiscard]] const cudaDeviceProp &get_cuda_device_prop() const
{
return m_prop;
}
[[nodiscard]] const cudaDeviceProp &get_cuda_device_prop() const { return m_prop; }
[[nodiscard]] bool operator<(const device_info &o) const
{
return m_id < o.m_id;
}
[[nodiscard]] bool operator==(const device_info &o) const
{
return m_id == o.m_id;
}
[[nodiscard]] bool operator!=(const device_info &o) const
{
return m_id != o.m_id;
}
[[nodiscard]] bool operator<(const device_info &o) const { return m_id < o.m_id; }
[[nodiscard]] bool operator==(const device_info &o) const { return m_id == o.m_id; }
[[nodiscard]] bool operator!=(const device_info &o) const { return m_id != o.m_id; }
private:
int m_id;
@@ -267,11 +221,10 @@ try
{
nvbench::detail::device_scope _{dev_id};
cudaFuncAttributes attr{};
NVBENCH_CUDA_CALL(
cudaFuncGetAttributes(&attr, ((const void*)nvbench::detail::noop_kernel_ptr) ));
NVBENCH_CUDA_CALL(cudaFuncGetAttributes(&attr, ((const void *)nvbench::detail::noop_kernel_ptr)));
return attr.ptxVersion * 10;
}
catch(...)
catch (...)
{ // Fail gracefully when no appropriate PTX is found for this device.
return -1;
}

View File

@@ -40,10 +40,7 @@ struct device_manager
/**
* @return The total number of detected CUDA devices.
*/
[[nodiscard]] int get_number_of_devices() const
{
return static_cast<int>(m_devices.size());
}
[[nodiscard]] int get_number_of_devices() const { return static_cast<int>(m_devices.size()); }
/**
* @return The number of devices actually used by all benchmarks.
@@ -57,39 +54,27 @@ struct device_manager
/**
* @return The device_info object corresponding to `id`.
*/
[[nodiscard]] const nvbench::device_info &get_device(int id)
{
return m_devices.at(id);
}
[[nodiscard]] const nvbench::device_info &get_device(int id) { return m_devices.at(id); }
/**
* @return A vector containing device_info objects for all detected CUDA
* devices.
*/
[[nodiscard]] const device_info_vector &get_devices() const
{
return m_devices;
}
[[nodiscard]] const device_info_vector &get_devices() const { return m_devices; }
/**
* @return A vector containing device_info objects for devices that are
* actively used by all benchmarks.
* @note This is only valid after nvbench::option_parser::parse executes.
*/
[[nodiscard]] const device_info_vector &get_used_devices() const
{
return m_used_devices;
}
[[nodiscard]] const device_info_vector &get_used_devices() const { return m_used_devices; }
private:
device_manager();
friend struct option_parser;
void set_used_devices(device_info_vector devices)
{
m_used_devices = std::move(devices);
}
void set_used_devices(device_info_vector devices) { m_used_devices = std::move(devices); }
device_info_vector m_devices;
device_info_vector m_used_devices;

View File

@@ -64,10 +64,7 @@ struct type_strings<nvbench::enum_type<Value, T>>
return std::to_string(Value);
}
static std::string description()
{
return nvbench::demangle<nvbench::enum_type<Value, T>>();
}
static std::string description() { return nvbench::demangle<nvbench::enum_type<Value, T>>(); }
};
} // namespace nvbench
@@ -86,15 +83,13 @@ struct type_strings<nvbench::enum_type<Value, T>>
* \relatesalso enum_type_list
* \relatesalso nvbench::enum_type_list
*/
#define NVBENCH_DECLARE_ENUM_TYPE_STRINGS(T, \
input_generator, \
description_generator) \
namespace nvbench \
{ \
template <T Value> \
struct type_strings<enum_type<Value, T>> \
{ \
static std::string input_string() { return input_generator(Value); } \
static std::string description() { return description_generator(Value); } \
}; \
#define NVBENCH_DECLARE_ENUM_TYPE_STRINGS(T, input_generator, description_generator) \
namespace nvbench \
{ \
template <T Value> \
struct type_strings<enum_type<Value, T>> \
{ \
static std::string input_string() { return input_generator(Value); } \
static std::string description() { return description_generator(Value); } \
}; \
}

View File

@@ -31,16 +31,16 @@ enum class exec_flag
none = 0x0,
// Modifiers:
timer = 0x01, // KernelLauncher uses manual timing
no_block = 0x02, // Disables use of `blocking_kernel`.
sync = 0x04, // KernelLauncher has indicated that it will sync
run_once = 0x08, // Only run the benchmark once (for profiling).
timer = 0x01, // KernelLauncher uses manual timing
no_block = 0x02, // Disables use of `blocking_kernel`.
sync = 0x04, // KernelLauncher has indicated that it will sync
run_once = 0x08, // Only run the benchmark once (for profiling).
modifier_mask = timer | no_block | sync | run_once,
// Measurement types:
cold = 0x0100, // measure_cold
hot = 0x0200, // measure_hot
measure_mask = cold | hot
cold = 0x0100, // measure_cold
hot = 0x0200, // measure_hot
measure_mask = cold | hot
};
} // namespace nvbench::detail
@@ -120,7 +120,6 @@ constexpr inline auto timer = nvbench::exec_tag::impl::timer;
/// Modifier used to indicate that the KernelLauncher will perform CUDA
/// synchronizations. Without this flag such benchmarks will deadlock.
constexpr inline auto sync = nvbench::exec_tag::impl::no_block |
nvbench::exec_tag::impl::sync;
constexpr inline auto sync = nvbench::exec_tag::impl::no_block | nvbench::exec_tag::impl::sync;
} // namespace nvbench::exec_tag

View File

@@ -20,24 +20,24 @@
#include <type_traits>
#define NVBENCH_DECLARE_FLAGS(T) \
constexpr inline T operator|(T v1, T v2) \
{ \
using UT = std::underlying_type_t<T>; \
return static_cast<T>(static_cast<UT>(v1) | static_cast<UT>(v2)); \
} \
constexpr inline T operator&(T v1, T v2) \
{ \
using UT = std::underlying_type_t<T>; \
return static_cast<T>(static_cast<UT>(v1) & static_cast<UT>(v2)); \
} \
constexpr inline T operator^(T v1, T v2) \
{ \
using UT = std::underlying_type_t<T>; \
return static_cast<T>(static_cast<UT>(v1) ^ static_cast<UT>(v2)); \
} \
constexpr inline T operator~(T v1) \
{ \
using UT = std::underlying_type_t<T>; \
return static_cast<T>(~static_cast<UT>(v1)); \
#define NVBENCH_DECLARE_FLAGS(T) \
constexpr inline T operator|(T v1, T v2) \
{ \
using UT = std::underlying_type_t<T>; \
return static_cast<T>(static_cast<UT>(v1) | static_cast<UT>(v2)); \
} \
constexpr inline T operator&(T v1, T v2) \
{ \
using UT = std::underlying_type_t<T>; \
return static_cast<T>(static_cast<UT>(v1) & static_cast<UT>(v2)); \
} \
constexpr inline T operator^(T v1, T v2) \
{ \
using UT = std::underlying_type_t<T>; \
return static_cast<T>(static_cast<UT>(v1) ^ static_cast<UT>(v2)); \
} \
constexpr inline T operator~(T v1) \
{ \
using UT = std::underlying_type_t<T>; \
return static_cast<T>(~static_cast<UT>(v1)); \
}

View File

@@ -36,20 +36,11 @@ struct float64_axis final : public axis_base
~float64_axis() final;
void set_inputs(std::vector<nvbench::float64_t> inputs)
{
m_values = std::move(inputs);
}
[[nodiscard]] nvbench::float64_t get_value(std::size_t i) const
{
return m_values[i];
}
void set_inputs(std::vector<nvbench::float64_t> inputs) { m_values = std::move(inputs); }
[[nodiscard]] nvbench::float64_t get_value(std::size_t i) const { return m_values[i]; }
private:
std::unique_ptr<axis_base> do_clone() const
{
return std::make_unique<float64_axis>(*this);
}
std::unique_ptr<axis_base> do_clone() const { return std::make_unique<float64_axis>(*this); }
std::size_t do_get_size() const final { return m_values.size(); }
std::string do_get_input_string(std::size_t i) const final;
std::string do_get_description(std::size_t i) const final;

View File

@@ -1,20 +1,20 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
@@ -25,7 +25,5 @@
#define NVBENCH_GIT_SHA1 NVBench_GIT_SHA1
#define NVBENCH_GIT_VERSION NVBench_GIT_VERSION
#ifdef NVBench_GIT_IS_DIRTY
# define NVBENCH_GIT_IS_DIRTY
#define NVBENCH_GIT_IS_DIRTY
#endif

View File

@@ -58,28 +58,18 @@ struct int64_axis final : public axis_base
return static_cast<bool>(m_flags & int64_axis_flags::power_of_two);
}
void set_inputs(std::vector<int64_t> inputs,
int64_axis_flags flags = int64_axis_flags::none);
void set_inputs(std::vector<int64_t> inputs, int64_axis_flags flags = int64_axis_flags::none);
[[nodiscard]] const std::vector<int64_t> &get_inputs() const
{
return m_inputs;
};
[[nodiscard]] const std::vector<int64_t> &get_inputs() const { return m_inputs; };
[[nodiscard]] int64_t get_value(std::size_t i) const { return m_values[i]; };
[[nodiscard]] const std::vector<int64_t> &get_values() const
{
return m_values;
};
[[nodiscard]] const std::vector<int64_t> &get_values() const { return m_values; };
int64_axis_flags get_flags() const { return m_flags; }
// Helper functions for pow2 conversions:
static nvbench::int64_t compute_pow2(nvbench::int64_t exponent)
{
return 1ll << exponent;
}
static nvbench::int64_t compute_pow2(nvbench::int64_t exponent) { return 1ll << exponent; }
// UB if value < 0.
static nvbench::int64_t compute_log2(nvbench::int64_t value)
@@ -95,10 +85,7 @@ struct int64_axis final : public axis_base
};
private:
std::unique_ptr<axis_base> do_clone() const
{
return std::make_unique<int64_axis>(*this);
}
std::unique_ptr<axis_base> do_clone() const { return std::make_unique<int64_axis>(*this); }
std::size_t do_get_size() const final { return m_inputs.size(); }
std::string do_get_input_string(std::size_t) const final;
std::string do_get_description(std::size_t) const final;

View File

@@ -66,9 +66,8 @@ std::string int64_axis::do_get_input_string(std::size_t i) const
std::string int64_axis::do_get_description(std::size_t i) const
{
return this->is_power_of_two()
? fmt::format("2^{} = {}", m_inputs[i], m_values[i])
: std::string{};
return this->is_power_of_two() ? fmt::format("2^{} = {}", m_inputs[i], m_values[i])
: std::string{};
}
std::string_view int64_axis::do_get_flags_as_string() const

View File

@@ -85,8 +85,7 @@ private:
" {:^{}} ",
col.header,
col.max_width);
iter =
fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
}
return fmt::format_to(iter, "\n");
}
@@ -102,8 +101,7 @@ private:
"{:-^{}}",
"",
col.max_width + 2);
iter =
fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
}
return fmt::format_to(iter, "\n");
}
@@ -116,8 +114,7 @@ private:
for (std::size_t row = 0; row < m_num_rows; ++row)
{
iter =
fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
for (const column &col : m_columns)
{
iter = fmt::format_to(iter,
@@ -125,8 +122,7 @@ private:
" {:>{}} ",
col.rows[row],
col.max_width);
iter =
fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
} // cols
iter = fmt::format_to(iter, "\n");

View File

@@ -74,10 +74,7 @@ struct call_failed : error
[[nodiscard]] nvmlReturn_t get_error_code() const { return m_error_code; }
[[nodiscard]] const std::string &get_error_string() const
{
return m_error_string;
}
[[nodiscard]] const std::string &get_error_string() const { return m_error_string; }
private:
nvmlReturn_t m_error_code;
@@ -90,30 +87,26 @@ private:
#ifdef NVBENCH_HAS_NVML
#define NVBENCH_NVML_CALL(call) \
do \
{ \
const auto _rr = call; \
if (_rr != NVML_SUCCESS) \
{ \
throw nvbench::nvml::call_failed(__FILE__, \
__LINE__, \
#call, \
_rr, \
nvmlErrorString(_rr)); \
} \
#define NVBENCH_NVML_CALL(call) \
do \
{ \
const auto _rr = call; \
if (_rr != NVML_SUCCESS) \
{ \
throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, nvmlErrorString(_rr)); \
} \
} while (false)
// Same as above, but used for nvmlInit(), where a failure means that
// nvmlErrorString is not available.
#define NVBENCH_NVML_CALL_NO_API(call) \
do \
{ \
const auto _rr = call; \
if (_rr != NVML_SUCCESS) \
{ \
throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, ""); \
} \
#define NVBENCH_NVML_CALL_NO_API(call) \
do \
{ \
const auto _rr = call; \
if (_rr != NVML_SUCCESS) \
{ \
throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, ""); \
} \
} while (false)
#endif // NVBENCH_HAS_NVML

View File

@@ -47,18 +47,14 @@ struct table_builder
const std::string &header,
std::string value)
{
auto iter = std::find_if(m_columns.begin(),
m_columns.end(),
[&column_key](const column &col) {
return col.key == column_key;
});
auto iter = std::find_if(m_columns.begin(), m_columns.end(), [&column_key](const column &col) {
return col.key == column_key;
});
auto &col = iter == m_columns.end()
? m_columns.emplace_back(column{column_key,
header,
std::vector<std::string>{},
header.size()})
: *iter;
? m_columns.emplace_back(
column{column_key, header, std::vector<std::string>{}, header.size()})
: *iter;
col.max_width = std::max(col.max_width, value.size());
if (col.rows.size() <= row)
@@ -76,11 +72,9 @@ struct table_builder
std::size_t{},
[](const auto &a, const auto &b) { return a > b ? a : b; },
[](const column &col) { return col.rows.size(); });
std::for_each(m_columns.begin(),
m_columns.end(),
[num_rows = m_num_rows](column &col) {
col.rows.resize(num_rows);
});
std::for_each(m_columns.begin(), m_columns.end(), [num_rows = m_num_rows](column &col) {
col.rows.resize(num_rows);
});
}
};

View File

@@ -126,11 +126,10 @@ std::string json_printer::version_t::get_string() const
return fmt::format("{}.{}.{}", this->major, this->minor, this->patch);
}
void json_printer::do_process_bulk_data_float64(
state &state,
const std::string &tag,
const std::string &hint,
const std::vector<nvbench::float64_t> &data)
void json_printer::do_process_bulk_data_float64(state &state,
const std::string &tag,
const std::string &hint,
const std::vector<nvbench::float64_t> &data)
{
printer_base::do_process_bulk_data_float64(state, tag, hint, data);
@@ -157,16 +156,12 @@ void json_printer::do_process_bulk_data_float64(
{
if (!fs::create_directory(result_path))
{
NVBENCH_THROW(std::runtime_error,
"{}",
"Failed to create result directory '{}'.");
NVBENCH_THROW(std::runtime_error, "{}", "Failed to create result directory '{}'.");
}
}
else if (!fs::is_directory(result_path))
{
NVBENCH_THROW(std::runtime_error,
"{}",
"'{}' exists and is not a directory.");
NVBENCH_THROW(std::runtime_error, "{}", "'{}' exists and is not a directory.");
}
const auto file_id = m_num_jsonbin_files++;
@@ -197,16 +192,12 @@ void json_printer::do_process_bulk_data_float64(
}
catch (std::exception &e)
{
if (auto printer_opt_ref = state.get_benchmark().get_printer();
printer_opt_ref.has_value())
if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value())
{
auto &printer = printer_opt_ref.value().get();
printer.log(nvbench::log_level::warn,
fmt::format("Error writing {} ({}) to {}: {}",
tag,
hint,
result_path.string(),
e.what()));
printer.log(
nvbench::log_level::warn,
fmt::format("Error writing {} ({}) to {}: {}", tag, hint, result_path.string(), e.what()));
}
} // end catch
@@ -221,14 +212,12 @@ void json_printer::do_process_bulk_data_float64(
summ.set_string("hide", "Not needed in table.");
timer.stop();
if (auto printer_opt_ref = state.get_benchmark().get_printer();
printer_opt_ref.has_value())
if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value())
{
auto &printer = printer_opt_ref.value().get();
printer.log(nvbench::log_level::info,
fmt::format("Wrote '{}' in {:>6.3f}ms",
result_path.string(),
timer.get_duration() * 1000));
printer.log(
nvbench::log_level::info,
fmt::format("Wrote '{}' in {:>6.3f}ms", result_path.string(), timer.get_duration() * 1000));
}
} // end hint == sample_times
}
@@ -304,15 +293,12 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
device["global_memory_size"] = dev_info.get_global_memory_size();
device["global_memory_bus_peak_clock_rate"] =
dev_info.get_global_memory_bus_peak_clock_rate();
device["global_memory_bus_width"] =
dev_info.get_global_memory_bus_width();
device["global_memory_bus_bandwidth"] =
dev_info.get_global_memory_bus_bandwidth();
device["l2_cache_size"] = dev_info.get_l2_cache_size();
device["shared_memory_per_sm"] = dev_info.get_shared_memory_per_sm();
device["shared_memory_per_block"] =
dev_info.get_shared_memory_per_block();
device["ecc_state"] = dev_info.get_ecc_state();
device["global_memory_bus_width"] = dev_info.get_global_memory_bus_width();
device["global_memory_bus_bandwidth"] = dev_info.get_global_memory_bus_bandwidth();
device["l2_cache_size"] = dev_info.get_l2_cache_size();
device["shared_memory_per_sm"] = dev_info.get_shared_memory_per_sm();
device["shared_memory_per_block"] = dev_info.get_shared_memory_per_block();
device["ecc_state"] = dev_info.get_ecc_state();
}
} // "devices"
@@ -358,23 +344,19 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
switch (axis_ptr->get_type())
{
case nvbench::axis_type::type:
value["is_active"] =
static_cast<type_axis &>(*axis_ptr).get_is_active(i);
value["is_active"] = static_cast<type_axis &>(*axis_ptr).get_is_active(i);
break;
case nvbench::axis_type::int64:
value["value"] =
static_cast<int64_axis &>(*axis_ptr).get_value(i);
value["value"] = static_cast<int64_axis &>(*axis_ptr).get_value(i);
break;
case nvbench::axis_type::float64:
value["value"] =
static_cast<float64_axis &>(*axis_ptr).get_value(i);
value["value"] = static_cast<float64_axis &>(*axis_ptr).get_value(i);
break;
case nvbench::axis_type::string:
value["value"] =
static_cast<string_axis &>(*axis_ptr).get_value(i);
value["value"] = static_cast<string_axis &>(*axis_ptr).get_value(i);
break;
default:
break;

View File

@@ -38,9 +38,7 @@ struct json_printer : nvbench::printer_base
{
using printer_base::printer_base;
json_printer(std::ostream &stream,
std::string stream_name,
bool enable_binary_output)
json_printer(std::ostream &stream, std::string stream_name, bool enable_binary_output)
: printer_base(stream, std::move(stream_name))
, m_enable_binary_output{enable_binary_output}
{}
@@ -59,23 +57,16 @@ struct json_printer : nvbench::printer_base
[[nodiscard]] static version_t get_json_file_version();
[[nodiscard]] bool get_enable_binary_output() const
{
return m_enable_binary_output;
}
[[nodiscard]] bool get_enable_binary_output() const { return m_enable_binary_output; }
void set_enable_binary_output(bool b) { m_enable_binary_output = b; }
protected:
// Virtual API from printer_base:
void do_log_argv(const std::vector<std::string>& argv) override
{
m_argv = argv;
}
void do_process_bulk_data_float64(
nvbench::state &state,
const std::string &tag,
const std::string &hint,
const std::vector<nvbench::float64_t> &data) override;
void do_log_argv(const std::vector<std::string> &argv) override { m_argv = argv; }
void do_process_bulk_data_float64(nvbench::state &state,
const std::string &tag,
const std::string &hint,
const std::vector<nvbench::float64_t> &data) override;
void do_print_benchmark_results(const benchmark_vector &benches) override;
bool m_enable_binary_output{false};

View File

@@ -42,20 +42,17 @@ struct launch
{}
// move-only
launch(const launch &) = delete;
launch(launch &&) = default;
launch(const launch &) = delete;
launch(launch &&) = default;
launch &operator=(const launch &) = delete;
launch &operator=(launch &&) = default;
launch &operator=(launch &&) = default;
/**
* @return a CUDA stream that all kernels and other stream-ordered CUDA work
* must use. This stream can be changed by the `KernelGenerator` using the
* `nvbench::state::set_cuda_stream` method.
*/
__forceinline__ const nvbench::cuda_stream &get_stream() const
{
return m_stream;
};
__forceinline__ const nvbench::cuda_stream &get_stream() const { return m_stream; };
private:
// The stream is owned by the `nvbench::state` associated with this launch.

View File

@@ -27,23 +27,23 @@
#include <iostream>
#define NVBENCH_MAIN \
int main(int argc, char const *const *argv) \
try \
{ \
NVBENCH_MAIN_BODY(argc, argv); \
NVBENCH_CUDA_CALL(cudaDeviceReset()); \
return 0; \
} \
catch (std::exception & e) \
{ \
std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n"; \
return 1; \
} \
catch (...) \
{ \
std::cerr << "\nNVBench encountered an unknown error.\n"; \
return 1; \
#define NVBENCH_MAIN \
int main(int argc, char const *const *argv) \
try \
{ \
NVBENCH_MAIN_BODY(argc, argv); \
NVBENCH_CUDA_CALL(cudaDeviceReset()); \
return 0; \
} \
catch (std::exception & e) \
{ \
std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n"; \
return 1; \
} \
catch (...) \
{ \
std::cerr << "\nNVBench encountered an unknown error.\n"; \
return 1; \
}
#ifdef NVBENCH_HAS_CUPTI
@@ -54,35 +54,35 @@
// clang-format on
#endif
#define NVBENCH_MAIN_PARSE(argc, argv) \
nvbench::option_parser parser; \
#define NVBENCH_MAIN_PARSE(argc, argv) \
nvbench::option_parser parser; \
parser.parse(argc, argv)
#define NVBENCH_MAIN_BODY(argc, argv) \
do \
{ \
NVBENCH_INITIALIZE_DRIVER_API; \
NVBENCH_MAIN_PARSE(argc, argv); \
auto &printer = parser.get_printer(); \
\
printer.print_device_info(); \
printer.print_log_preamble(); \
auto &benchmarks = parser.get_benchmarks(); \
\
std::size_t total_states = 0; \
for (auto &bench_ptr : benchmarks) \
{ \
total_states += bench_ptr->get_config_count(); \
} \
printer.set_total_state_count(total_states); \
\
printer.set_completed_state_count(0); \
for (auto &bench_ptr : benchmarks) \
{ \
bench_ptr->set_printer(printer); \
bench_ptr->run(); \
bench_ptr->clear_printer(); \
} \
printer.print_log_epilogue(); \
printer.print_benchmark_results(benchmarks); \
#define NVBENCH_MAIN_BODY(argc, argv) \
do \
{ \
NVBENCH_INITIALIZE_DRIVER_API; \
NVBENCH_MAIN_PARSE(argc, argv); \
auto &printer = parser.get_printer(); \
\
printer.print_device_info(); \
printer.print_log_preamble(); \
auto &benchmarks = parser.get_benchmarks(); \
\
std::size_t total_states = 0; \
for (auto &bench_ptr : benchmarks) \
{ \
total_states += bench_ptr->get_config_count(); \
} \
printer.set_total_state_count(total_states); \
\
printer.set_completed_state_count(0); \
for (auto &bench_ptr : benchmarks) \
{ \
bench_ptr->set_printer(printer); \
bench_ptr->run(); \
bench_ptr->clear_printer(); \
} \
printer.print_log_epilogue(); \
printer.print_benchmark_results(benchmarks); \
} while (false)

View File

@@ -44,9 +44,8 @@ void markdown_printer::do_print_device_info()
fmt::format_to(buffer, "# Devices\n\n");
const auto &device_mgr = nvbench::device_manager::get();
const auto &devices = device_mgr.get_number_of_used_devices() > 0
? device_mgr.get_used_devices()
: device_mgr.get_devices();
const auto &devices = device_mgr.get_number_of_used_devices() > 0 ? device_mgr.get_used_devices()
: device_mgr.get_devices();
for (const auto &device : devices)
{
const auto [gmem_free, gmem_used] = device.get_global_memory_usage();
@@ -64,22 +63,17 @@ void markdown_printer::do_print_device_info()
"* Global Memory: {} MiB Free / {} MiB Total\n",
gmem_free / 1024 / 1024,
gmem_used / 1024 / 1024);
fmt::format_to(
buffer,
"* Global Memory Bus Peak: {} GB/sec ({}-bit DDR @{}MHz)\n",
device.get_global_memory_bus_bandwidth() / 1000 / 1000 / 1000,
device.get_global_memory_bus_width(),
device.get_global_memory_bus_peak_clock_rate() / 1000 / 1000);
fmt::format_to(buffer,
"* Global Memory Bus Peak: {} GB/sec ({}-bit DDR @{}MHz)\n",
device.get_global_memory_bus_bandwidth() / 1000 / 1000 / 1000,
device.get_global_memory_bus_width(),
device.get_global_memory_bus_peak_clock_rate() / 1000 / 1000);
fmt::format_to(buffer,
"* Max Shared Memory: {} KiB/SM, {} KiB/Block\n",
device.get_shared_memory_per_sm() / 1024,
device.get_shared_memory_per_block() / 1024);
fmt::format_to(buffer,
"* L2 Cache Size: {} KiB\n",
device.get_l2_cache_size() / 1024);
fmt::format_to(buffer,
"* Maximum Active Blocks: {}/SM\n",
device.get_max_blocks_per_sm());
fmt::format_to(buffer, "* L2 Cache Size: {} KiB\n", device.get_l2_cache_size() / 1024);
fmt::format_to(buffer, "* Maximum Active Blocks: {}/SM\n", device.get_max_blocks_per_sm());
fmt::format_to(buffer,
"* Maximum Active Threads: {}/SM, {}/Block\n",
device.get_max_threads_per_sm(),
@@ -88,18 +82,13 @@ void markdown_printer::do_print_device_info()
"* Available Registers: {}/SM, {}/Block\n",
device.get_registers_per_sm(),
device.get_registers_per_block());
fmt::format_to(buffer,
"* ECC Enabled: {}\n",
device.get_ecc_state() ? "Yes" : "No");
fmt::format_to(buffer, "* ECC Enabled: {}\n", device.get_ecc_state() ? "Yes" : "No");
fmt::format_to(buffer, "\n");
}
m_ostream << fmt::to_string(buffer);
}
void markdown_printer::do_print_log_preamble()
{
m_ostream << "# Log\n\n```\n";
}
void markdown_printer::do_print_log_preamble() { m_ostream << "# Log\n\n```\n"; }
void markdown_printer::do_print_log_epilogue() { m_ostream << "```\n\n"; }
@@ -147,8 +136,7 @@ void markdown_printer::do_log_run_state(const nvbench::state &exec_state)
{
if (m_total_state_count == 0)
{ // No progress info
this->log(nvbench::log_level::run,
exec_state.get_short_description(m_color));
this->log(nvbench::log_level::run, exec_state.get_short_description(m_color));
}
else
{ // Add progress
@@ -160,8 +148,7 @@ void markdown_printer::do_log_run_state(const nvbench::state &exec_state)
}
}
void markdown_printer::do_print_benchmark_list(
const printer_base::benchmark_vector &benches)
void markdown_printer::do_print_benchmark_list(const printer_base::benchmark_vector &benches)
{
if (benches.empty())
{
@@ -204,10 +191,7 @@ void markdown_printer::do_print_benchmark_list(
{
desc = fmt::format(" ({})", desc);
}
fmt::format_to(buffer,
" * `{}`{}\n",
axis_ptr->get_input_string(i),
desc);
fmt::format_to(buffer, " * `{}`{}\n", axis_ptr->get_input_string(i), desc);
} // end foreach value
} // end foreach axis
fmt::format_to(buffer, "\n");
@@ -216,8 +200,7 @@ void markdown_printer::do_print_benchmark_list(
m_ostream << fmt::to_string(buffer);
}
void markdown_printer::do_print_benchmark_results(
const printer_base::benchmark_vector &benches)
void markdown_printer::do_print_benchmark_results(const printer_base::benchmark_vector &benches)
{
auto format_visitor = [](const auto &v) {
using T = std::decay_t<decltype(v)>;
@@ -252,19 +235,15 @@ void markdown_printer::do_print_benchmark_results(
// Do a single pass when no devices are specified. This happens for
// benchmarks with `cpu` exec_tags.
const std::size_t num_device_passes = devices.empty() ? 1 : devices.size();
for (std::size_t device_pass = 0; device_pass < num_device_passes;
++device_pass)
for (std::size_t device_pass = 0; device_pass < num_device_passes; ++device_pass)
{
std::optional<nvbench::device_info> device =
devices.empty() ? std::nullopt
: std::make_optional(devices[device_pass]);
std::optional<nvbench::device_info> device = devices.empty()
? std::nullopt
: std::make_optional(devices[device_pass]);
if (device)
{
fmt::format_to(buffer,
"\n### [{}] {}\n\n",
device->get_id(),
device->get_name());
fmt::format_to(buffer, "\n### [{}] {}\n\n", device->get_id(), device->get_name());
}
std::size_t row = 0;
@@ -288,15 +267,11 @@ void markdown_printer::do_print_benchmark_results(
{
const nvbench::int64_t value = axis_values.get_int64(name);
const nvbench::int64_t exponent = int64_axis::compute_log2(value);
table.add_cell(row,
name,
name,
fmt::format("2^{} = {}", exponent, value));
table.add_cell(row, name, name, fmt::format("2^{} = {}", exponent, value));
}
else
{
std::string value = std::visit(format_visitor,
axis_values.get_value(name));
std::string value = std::visit(format_visitor, axis_values.get_value(name));
table.add_cell(row, name + "_axis", name, std::move(value));
}
}
@@ -308,12 +283,9 @@ void markdown_printer::do_print_benchmark_results(
continue;
}
const std::string &tag = summ.get_tag();
const std::string &header = summ.has_value("name")
? summ.get_string("name")
: tag;
const std::string &header = summ.has_value("name") ? summ.get_string("name") : tag;
std::string hint = summ.has_value("hint") ? summ.get_string("hint")
: std::string{};
std::string hint = summ.has_value("hint") ? summ.get_string("hint") : std::string{};
if (hint == "duration")
{
table.add_cell(row, tag, header, this->do_format_duration(summ));
@@ -332,10 +304,7 @@ void markdown_printer::do_print_benchmark_results(
}
else if (hint == "sample_size")
{
table.add_cell(row,
tag,
header,
this->do_format_sample_size(summ));
table.add_cell(row, tag, header, this->do_format_sample_size(summ));
}
else if (hint == "percentage")
{
@@ -353,8 +322,7 @@ void markdown_printer::do_print_benchmark_results(
auto table_str = table.to_string();
fmt::format_to(buffer,
"{}",
table_str.empty() ? "No data -- check log.\n"
: std::move(table_str));
table_str.empty() ? "No data -- check log.\n" : std::move(table_str));
} // end foreach device_pass
}

View File

@@ -33,8 +33,7 @@ namespace nvbench
*/
struct named_values
{
using value_type =
std::variant<nvbench::int64_t, nvbench::float64_t, std::string>;
using value_type = std::variant<nvbench::int64_t, nvbench::float64_t, std::string>;
enum class type
{
@@ -43,7 +42,7 @@ struct named_values
string
};
void append(const named_values& other);
void append(const named_values &other);
[[nodiscard]] std::size_t get_size() const;
[[nodiscard]] std::vector<std::string> get_names() const;
@@ -60,11 +59,11 @@ struct named_values
[[nodiscard]] type get_type(const std::string &name) const;
[[nodiscard]] bool has_value(const std::string &name) const;
[[nodiscard]] const value_type& get_value(const std::string &name) const;
[[nodiscard]] const value_type &get_value(const std::string &name) const;
void clear();
void remove_value(const std::string& name);
void remove_value(const std::string &name);
private:
struct named_value

View File

@@ -33,9 +33,7 @@ namespace nvbench
void named_values::append(const named_values &other)
{
m_storage.insert(m_storage.end(),
other.m_storage.cbegin(),
other.m_storage.cend());
m_storage.insert(m_storage.end(), other.m_storage.cbegin(), other.m_storage.cend());
}
void named_values::clear() { m_storage.clear(); }
@@ -55,20 +53,17 @@ std::vector<std::string> named_values::get_names() const
bool named_values::has_value(const std::string &name) const
{
auto iter =
std::find_if(m_storage.cbegin(),
m_storage.cend(),
[&name](const auto &val) { return val.name == name; });
auto iter = std::find_if(m_storage.cbegin(), m_storage.cend(), [&name](const auto &val) {
return val.name == name;
});
return iter != m_storage.cend();
}
const named_values::value_type &
named_values::get_value(const std::string &name) const
const named_values::value_type &named_values::get_value(const std::string &name) const
{
auto iter =
std::find_if(m_storage.cbegin(),
m_storage.cend(),
[&name](const auto &val) { return val.name == name; });
auto iter = std::find_if(m_storage.cbegin(), m_storage.cend(), [&name](const auto &val) {
return val.name == name;
});
if (iter == m_storage.cend())
{
NVBENCH_THROW(std::runtime_error, "No value with name '{}'.", name);
@@ -96,9 +91,7 @@ named_values::type named_values::get_type(const std::string &name) const
// warning C4702: unreachable code
// This is a future-proofing check, it'll be reachable if something breaks
NVBENCH_MSVC_PUSH_DISABLE_WARNING(4702)
NVBENCH_THROW(std::runtime_error,
"Unknown variant type for entry '{}'.",
name);
NVBENCH_THROW(std::runtime_error, "Unknown variant type for entry '{}'.", name);
},
this->get_value(name));
NVBENCH_MSVC_POP_WARNING()
@@ -111,10 +104,7 @@ try
}
catch (std::exception &err)
{
NVBENCH_THROW(std::runtime_error,
"Error looking up int64 value `{}`:\n{}",
name,
err.what());
NVBENCH_THROW(std::runtime_error, "Error looking up int64 value `{}`:\n{}", name, err.what());
}
nvbench::float64_t named_values::get_float64(const std::string &name) const
@@ -124,10 +114,7 @@ try
}
catch (std::exception &err)
{
NVBENCH_THROW(std::runtime_error,
"Error looking up float64 value `{}`:\n{}",
name,
err.what());
NVBENCH_THROW(std::runtime_error, "Error looking up float64 value `{}`:\n{}", name, err.what());
}
const std::string &named_values::get_string(const std::string &name) const
@@ -137,10 +124,7 @@ try
}
catch (std::exception &err)
{
NVBENCH_THROW(std::runtime_error,
"Error looking up string value `{}`:\n{}",
name,
err.what());
NVBENCH_THROW(std::runtime_error, "Error looking up string value `{}`:\n{}", name, err.what());
}
void named_values::set_int64(std::string name, nvbench::int64_t value)
@@ -165,10 +149,9 @@ void named_values::set_value(std::string name, named_values::value_type value)
void named_values::remove_value(const std::string &name)
{
auto iter =
std::find_if(m_storage.begin(), m_storage.end(), [&name](const auto &val) {
return val.name == name;
});
auto iter = std::find_if(m_storage.begin(), m_storage.end(), [&name](const auto &val) {
return val.name == name;
});
if (iter != m_storage.end())
{
m_storage.erase(iter);

View File

@@ -82,20 +82,11 @@ std::string_view submatch_to_sv(const sv_submatch &in)
//
// So we're stuck with materializing a std::string and calling std::stoX(). Ah
// well. At least it's not istream.
void parse(std::string_view input, nvbench::int32_t &val)
{
val = std::stoi(std::string(input));
}
void parse(std::string_view input, nvbench::int32_t &val) { val = std::stoi(std::string(input)); }
void parse(std::string_view input, nvbench::int64_t &val)
{
val = std::stoll(std::string(input));
}
void parse(std::string_view input, nvbench::int64_t &val) { val = std::stoll(std::string(input)); }
void parse(std::string_view input, nvbench::float64_t &val)
{
val = std::stod(std::string(input));
}
void parse(std::string_view input, nvbench::float64_t &val) { val = std::stod(std::string(input)); }
void parse(std::string_view input, std::string &val) { val = input; }
@@ -112,9 +103,8 @@ std::vector<T> parse_list_values(std::string_view list_spec)
"(?:,|$)" // Delimiters
};
auto values_begin =
sv_regex_iterator(list_spec.cbegin(), list_spec.cend(), value_regex);
auto values_end = sv_regex_iterator{};
auto values_begin = sv_regex_iterator(list_spec.cbegin(), list_spec.cend(), value_regex);
auto values_end = sv_regex_iterator{};
while (values_begin != values_end)
{
auto match = *values_begin++;
@@ -131,8 +121,7 @@ std::vector<T> parse_list_values(std::string_view list_spec)
// Parses a range specification "<start> : <stop> [ : <stride> ]" and returns
// a vector filled with the specified range.
template <typename T>
std::vector<T> parse_range_values(std::string_view range_spec,
nvbench::wrapped_type<T>)
std::vector<T> parse_range_values(std::string_view range_spec, nvbench::wrapped_type<T>)
{
std::vector<T> range_params;
@@ -143,9 +132,8 @@ std::vector<T> parse_range_values(std::string_view range_spec,
"(?:$|:)" // Delimiters
};
auto values_begin =
sv_regex_iterator(range_spec.cbegin(), range_spec.cend(), value_regex);
auto values_end = sv_regex_iterator{};
auto values_begin = sv_regex_iterator(range_spec.cbegin(), range_spec.cend(), value_regex);
auto values_end = sv_regex_iterator{};
for (; values_begin != values_end; ++values_begin)
{
auto match = *values_begin;
@@ -221,25 +209,15 @@ std::vector<T> parse_values(std::string_view value_spec)
"$"}; // EOS
sv_match match;
if (std::regex_search(value_spec.cbegin(),
value_spec.cend(),
match,
list_regex))
if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, list_regex))
{
return parse_list_values<T>(submatch_to_sv(match[1]));
}
else if (std::regex_search(value_spec.cbegin(),
value_spec.cend(),
match,
range_regex))
else if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, range_regex))
{
return parse_range_values(submatch_to_sv(match[1]),
nvbench::wrapped_type<T>{});
return parse_range_values(submatch_to_sv(match[1]), nvbench::wrapped_type<T>{});
}
else if (std::regex_search(value_spec.cbegin(),
value_spec.cend(),
match,
single_regex))
else if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, single_regex))
{
T val;
parse(submatch_to_sv(match[1]), val);
@@ -247,9 +225,7 @@ std::vector<T> parse_values(std::string_view value_spec)
}
else
{
NVBENCH_THROW(std::runtime_error,
"Invalid axis value spec: {}",
value_spec);
NVBENCH_THROW(std::runtime_error, "Invalid axis value spec: {}", value_spec);
}
}
@@ -514,8 +490,8 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
this->update_int64_prop(first[0], first[1]);
first += 2;
}
else if (arg == "--min-time" || arg == "--max-noise" ||
arg == "--skip-time" || arg == "--timeout")
else if (arg == "--min-time" || arg == "--max-noise" || arg == "--skip-time" ||
arg == "--timeout")
{
check_params(1);
this->update_float64_prop(first[0], first[1]);
@@ -523,9 +499,7 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
}
else
{
NVBENCH_THROW(std::runtime_error,
"Unrecognized command-line argument: `{}`.",
arg);
NVBENCH_THROW(std::runtime_error, "Unrecognized command-line argument: `{}`.", arg);
}
}
}
@@ -534,7 +508,7 @@ void option_parser::add_markdown_printer(const std::string &spec)
try
{
std::ostream &stream = this->printer_spec_to_ostream(spec);
auto &printer = m_printer.emplace<nvbench::markdown_printer>(stream, spec);
auto &printer = m_printer.emplace<nvbench::markdown_printer>(stream, spec);
if (spec == "stdout")
{
printer.set_color(m_color_md_stdout_printer);
@@ -556,14 +530,10 @@ try
}
catch (std::exception &e)
{
NVBENCH_THROW(std::runtime_error,
"Error while adding csv output for `{}`:\n{}",
spec,
e.what());
NVBENCH_THROW(std::runtime_error, "Error while adding csv output for `{}`:\n{}", spec, e.what());
}
void option_parser::add_json_printer(const std::string &spec,
bool enable_binary)
void option_parser::add_json_printer(const std::string &spec, bool enable_binary)
try
{
std::ostream &stream = this->printer_spec_to_ostream(spec);
@@ -624,10 +594,7 @@ void option_parser::print_help() const
fmt::print("{}\n{}\n", ::cli_help_text, ::cli_help_axis_text);
}
void option_parser::print_help_axis() const
{
fmt::print("{}\n", ::cli_help_axis_text);
}
void option_parser::print_help_axis() const { fmt::print("{}\n", ::cli_help_axis_text); }
void option_parser::set_persistence_mode(const std::string &state)
try
@@ -685,9 +652,7 @@ try
{
if (rate_val == nvbench::device_info::clock_rate::none)
{
fmt::print("Unlocking clocks for device '{}' ({}).\n",
device.get_name(),
device.get_id());
fmt::print("Unlocking clocks for device '{}' ({}).\n", device.get_name(), device.get_id());
}
else
{
@@ -757,16 +722,12 @@ try
}
catch (std::exception &e)
{
NVBENCH_THROW(std::runtime_error,
"Error handling option --benchmark `{}`:\n{}",
name,
e.what());
NVBENCH_THROW(std::runtime_error, "Error handling option --benchmark `{}`:\n{}", name, e.what());
}
void option_parser::replay_global_args()
{
this->parse_range(m_global_benchmark_args.cbegin(),
m_global_benchmark_args.cend());
this->parse_range(m_global_benchmark_args.cbegin(), m_global_benchmark_args.cend());
}
void option_parser::update_devices(const std::string &devices)
@@ -790,10 +751,7 @@ try
}
catch (std::exception &e)
{
NVBENCH_THROW(std::runtime_error,
"Error handling option --devices `{}`:\n{}",
devices,
e.what());
NVBENCH_THROW(std::runtime_error, "Error handling option --devices `{}`:\n{}", devices, e.what());
}
void option_parser::update_axis(const std::string &spec)
@@ -832,28 +790,20 @@ try
switch (axis.get_type())
{
case axis_type::type:
this->update_type_axis(static_cast<nvbench::type_axis &>(axis),
values,
flags);
this->update_type_axis(static_cast<nvbench::type_axis &>(axis), values, flags);
break;
case axis_type::int64:
this->update_int64_axis(static_cast<nvbench::int64_axis &>(axis),
values,
flags);
this->update_int64_axis(static_cast<nvbench::int64_axis &>(axis), values, flags);
break;
case axis_type::float64:
this->update_float64_axis(static_cast<nvbench::float64_axis &>(axis),
values,
flags);
this->update_float64_axis(static_cast<nvbench::float64_axis &>(axis), values, flags);
break;
case axis_type::string:
this->update_string_axis(static_cast<nvbench::string_axis &>(axis),
values,
flags);
this->update_string_axis(static_cast<nvbench::string_axis &>(axis), values, flags);
break;
@@ -866,10 +816,7 @@ try
}
catch (std::exception &e)
{
NVBENCH_THROW(std::runtime_error,
"Error handling option --axis `{}`:\n{}",
spec,
e.what());
NVBENCH_THROW(std::runtime_error, "Error handling option --axis `{}`:\n{}", spec, e.what());
}
void option_parser::update_int64_axis(int64_axis &axis,
@@ -888,9 +835,7 @@ void option_parser::update_int64_axis(int64_axis &axis,
}
else
{
NVBENCH_THROW(std::runtime_error,
"Invalid flag for int64 axis: `{}`",
flag_spec);
NVBENCH_THROW(std::runtime_error, "Invalid flag for int64 axis: `{}`", flag_spec);
}
auto input_values = parse_values<nvbench::int64_t>(value_spec);
@@ -905,9 +850,7 @@ void option_parser::update_float64_axis(float64_axis &axis,
// Validate flags:
if (!flag_spec.empty())
{
NVBENCH_THROW(std::runtime_error,
"Invalid flag for float64 axis: `{}`",
flag_spec);
NVBENCH_THROW(std::runtime_error, "Invalid flag for float64 axis: `{}`", flag_spec);
}
auto input_values = parse_values<nvbench::float64_t>(value_spec);
@@ -922,9 +865,7 @@ void option_parser::update_string_axis(string_axis &axis,
// Validate flags:
if (!flag_spec.empty())
{
NVBENCH_THROW(std::runtime_error,
"Invalid flag for string axis: `{}`",
flag_spec);
NVBENCH_THROW(std::runtime_error, "Invalid flag for string axis: `{}`", flag_spec);
}
auto input_values = parse_values<std::string>(value_spec);
@@ -939,9 +880,7 @@ void option_parser::update_type_axis(type_axis &axis,
// Validate flags:
if (!flag_spec.empty())
{
NVBENCH_THROW(std::runtime_error,
"Invalid flag for type axis: `{}`",
flag_spec);
NVBENCH_THROW(std::runtime_error, "Invalid flag for type axis: `{}`", flag_spec);
}
auto input_values = parse_values<std::string>(value_spec);
@@ -949,8 +888,7 @@ void option_parser::update_type_axis(type_axis &axis,
axis.set_active_inputs(input_values);
}
void option_parser::update_int64_prop(const std::string &prop_arg,
const std::string &prop_val)
void option_parser::update_int64_prop(const std::string &prop_arg, const std::string &prop_val)
try
{
// If no active benchmark, save args as global.
@@ -983,8 +921,7 @@ catch (std::exception &e)
e.what());
}
void option_parser::update_float64_prop(const std::string &prop_arg,
const std::string &prop_val)
void option_parser::update_float64_prop(const std::string &prop_arg, const std::string &prop_val)
try
{
// If no active benchmark, save args as global.

View File

@@ -41,8 +41,7 @@ struct type_axis;
*/
struct option_parser
{
using benchmark_vector =
std::vector<std::unique_ptr<nvbench::benchmark_base>>;
using benchmark_vector = std::vector<std::unique_ptr<nvbench::benchmark_base>>;
option_parser();
~option_parser();
@@ -51,15 +50,9 @@ struct option_parser
void parse(std::vector<std::string> args);
[[nodiscard]] benchmark_vector &get_benchmarks() { return m_benchmarks; };
[[nodiscard]] const benchmark_vector &get_benchmarks() const
{
return m_benchmarks;
};
[[nodiscard]] const benchmark_vector &get_benchmarks() const { return m_benchmarks; };
[[nodiscard]] const std::vector<std::string> &get_args() const
{
return m_args;
}
[[nodiscard]] const std::vector<std::string> &get_args() const { return m_args; }
/*!
* Returns the output format requested by the parse options.
@@ -115,10 +108,8 @@ private:
std::string_view value_spec,
std::string_view flag_spec);
void update_int64_prop(const std::string &prop_arg,
const std::string &prop_val);
void update_float64_prop(const std::string &prop_arg,
const std::string &prop_val);
void update_int64_prop(const std::string &prop_arg, const std::string &prop_val);
void update_float64_prop(const std::string &prop_arg, const std::string &prop_val);
void update_used_device_state() const;

View File

@@ -76,19 +76,16 @@ struct printer_base
virtual ~printer_base();
// move-only
printer_base(const printer_base &) = delete;
printer_base(printer_base &&) = default;
printer_base(const printer_base &) = delete;
printer_base(printer_base &&) = default;
printer_base &operator=(const printer_base &) = delete;
printer_base &operator=(printer_base &&) = default;
printer_base &operator=(printer_base &&) = default;
/*!
* Called once with the command line arguments used to invoke the current
* executable.
*/
void log_argv(const std::vector<std::string> &argv)
{
this->do_log_argv(argv);
}
void log_argv(const std::vector<std::string> &argv) { this->do_log_argv(argv); }
/*!
* Print a summary of all detected devices, if supported.
@@ -108,19 +105,13 @@ struct printer_base
/*!
* Print a log message at the specified log level.
*/
void log(nvbench::log_level level, const std::string &msg)
{
this->do_log(level, msg);
}
void log(nvbench::log_level level, const std::string &msg) { this->do_log(level, msg); }
/*!
* Called before running the measurements associated with state.
* Implementations are expected to call `log(log_level::run, ...)`.
*/
void log_run_state(const nvbench::state &exec_state)
{
this->do_log_run_state(exec_state);
}
void log_run_state(const nvbench::state &exec_state) { this->do_log_run_state(exec_state); }
/*!
* Measurements may call this to allow a printer to perform extra processing
@@ -181,10 +172,7 @@ struct printer_base
return this->do_get_completed_state_count();
}
virtual void set_total_state_count(std::size_t states)
{
this->do_set_total_state_count(states);
}
virtual void set_total_state_count(std::size_t states) { this->do_set_total_state_count(states); }
[[nodiscard]] virtual std::size_t get_total_state_count() const
{
return this->do_get_total_state_count();
@@ -193,17 +181,16 @@ struct printer_base
protected:
// Implementation hooks for subclasses:
virtual void do_log_argv(const std::vector<std::string>&) {}
virtual void do_log_argv(const std::vector<std::string> &) {}
virtual void do_print_device_info() {}
virtual void do_print_log_preamble() {}
virtual void do_print_log_epilogue() {}
virtual void do_log(nvbench::log_level, const std::string &) {}
virtual void do_log_run_state(const nvbench::state &) {}
virtual void
do_process_bulk_data_float64(nvbench::state &,
const std::string &,
const std::string &,
const std::vector<nvbench::float64_t> &){};
virtual void do_process_bulk_data_float64(nvbench::state &,
const std::string &,
const std::string &,
const std::vector<nvbench::float64_t> &){};
virtual void do_print_benchmark_list(const benchmark_vector &) {}
virtual void do_print_benchmark_results(const benchmark_vector &) {}

View File

@@ -38,19 +38,10 @@ void printer_base::do_set_completed_state_count(std::size_t states)
void printer_base::do_add_completed_state() { ++m_completed_state_count; }
std::size_t printer_base::do_get_completed_state_count() const
{
return m_completed_state_count;
}
std::size_t printer_base::do_get_completed_state_count() const { return m_completed_state_count; }
void printer_base::do_set_total_state_count(std::size_t states)
{
m_total_state_count = states;
}
void printer_base::do_set_total_state_count(std::size_t states) { m_total_state_count = states; }
std::size_t printer_base::do_get_total_state_count() const
{
return m_total_state_count;
}
std::size_t printer_base::do_get_total_state_count() const { return m_total_state_count; }
} // namespace nvbench

View File

@@ -40,10 +40,7 @@ struct printer_multiplex : nvbench::printer_base
return static_cast<Format &>(*m_printers.back());
}
[[nodiscard]] std::size_t get_printer_count() const
{
return m_printers.size();
}
[[nodiscard]] std::size_t get_printer_count() const { return m_printers.size(); }
protected:
void do_log_argv(const std::vector<std::string> &argv) override;
@@ -52,11 +49,10 @@ protected:
void do_print_log_epilogue() override;
void do_log(nvbench::log_level, const std::string &) override;
void do_log_run_state(const nvbench::state &) override;
void do_process_bulk_data_float64(
nvbench::state &,
const std::string &,
const std::string &,
const std::vector<nvbench::float64_t> &) override;
void do_process_bulk_data_float64(nvbench::state &,
const std::string &,
const std::string &,
const std::vector<nvbench::float64_t> &) override;
void do_print_benchmark_list(const benchmark_vector &benches) override;
void do_print_benchmark_results(const benchmark_vector &benches) override;
void do_set_completed_state_count(std::size_t states) override;

View File

@@ -67,11 +67,10 @@ void printer_multiplex::do_log_run_state(const nvbench::state &exec_state)
}
}
void printer_multiplex::do_process_bulk_data_float64(
state &state,
const std::string &tag,
const std::string &hint,
const std::vector<nvbench::float64_t> &data)
void printer_multiplex::do_process_bulk_data_float64(state &state,
const std::string &tag,
const std::string &hint,
const std::vector<nvbench::float64_t> &data)
{
for (auto &format_ptr : m_printers)
{
@@ -87,8 +86,7 @@ void printer_multiplex::do_print_benchmark_list(const benchmark_vector &benches)
}
}
void printer_multiplex::do_print_benchmark_results(
const benchmark_vector &benches)
void printer_multiplex::do_print_benchmark_results(const benchmark_vector &benches)
{
for (auto &format_ptr : m_printers)
{

View File

@@ -29,13 +29,11 @@ namespace nvbench
namespace detail
{
template <typename T>
using range_output_t = std::conditional_t<std::is_floating_point_v<T>,
nvbench::float64_t,
nvbench::int64_t>;
using range_output_t =
std::conditional_t<std::is_floating_point_v<T>, nvbench::float64_t, nvbench::int64_t>;
}
template <typename InT,
typename OutT = nvbench::detail::range_output_t<InT>>
template <typename InT, typename OutT = nvbench::detail::range_output_t<InT>>
auto range(InT start, InT end, InT stride = InT{1})
{
if constexpr (std::is_floating_point_v<InT>)

View File

@@ -37,8 +37,7 @@ struct runner_base
void generate_states();
void handle_sampling_exception(const std::exception &e,
nvbench::state &exec_state) const;
void handle_sampling_exception(const std::exception &e, nvbench::state &exec_state) const;
void run_state_prologue(state &exec_state) const;
void run_state_epilogue(state &exec_state) const;
@@ -51,11 +50,10 @@ struct runner_base
template <typename BenchmarkType>
struct runner : public runner_base
{
using benchmark_type = BenchmarkType;
using kernel_generator = typename benchmark_type::kernel_generator;
using type_configs = typename benchmark_type::type_configs;
static constexpr std::size_t num_type_configs =
benchmark_type::num_type_configs;
using benchmark_type = BenchmarkType;
using kernel_generator = typename benchmark_type::kernel_generator;
using type_configs = typename benchmark_type::type_configs;
static constexpr std::size_t num_type_configs = benchmark_type::num_type_configs;
explicit runner(benchmark_type &bench)
: runner_base{bench}
@@ -86,38 +84,37 @@ private:
// Iterate through type_configs:
std::size_t type_config_index = 0;
nvbench::tl::foreach<type_configs>([&self = *this,
&states = m_benchmark.m_states,
&type_config_index,
&device](auto type_config_wrapper) {
// Get current type_config:
using type_config = typename decltype(type_config_wrapper)::type;
nvbench::tl::foreach<type_configs>(
[&self = *this, &states = m_benchmark.m_states, &type_config_index, &device](
auto type_config_wrapper) {
// Get current type_config:
using type_config = typename decltype(type_config_wrapper)::type;
// Find states with the current device / type_config
for (nvbench::state &cur_state : states)
{
if (cur_state.get_device() == device &&
cur_state.get_type_config_index() == type_config_index)
// Find states with the current device / type_config
for (nvbench::state &cur_state : states)
{
self.run_state_prologue(cur_state);
try
if (cur_state.get_device() == device &&
cur_state.get_type_config_index() == type_config_index)
{
kernel_generator{}(cur_state, type_config{});
if (cur_state.is_skipped())
self.run_state_prologue(cur_state);
try
{
self.print_skip_notification(cur_state);
kernel_generator{}(cur_state, type_config{});
if (cur_state.is_skipped())
{
self.print_skip_notification(cur_state);
}
}
catch (std::exception &e)
{
self.handle_sampling_exception(e, cur_state);
}
self.run_state_epilogue(cur_state);
}
catch (std::exception &e)
{
self.handle_sampling_exception(e, cur_state);
}
self.run_state_epilogue(cur_state);
}
}
++type_config_index;
});
++type_config_index;
});
}
};

View File

@@ -35,8 +35,7 @@ void runner_base::generate_states()
m_benchmark.m_states = nvbench::detail::state_generator::create(m_benchmark);
}
void runner_base::handle_sampling_exception(const std::exception &e,
state &exec_state) const
void runner_base::handle_sampling_exception(const std::exception &e, state &exec_state) const
{
// If the state is skipped, that means the execution framework class handled
// the error already.
@@ -62,8 +61,7 @@ void runner_base::handle_sampling_exception(const std::exception &e,
void runner_base::run_state_prologue(nvbench::state &exec_state) const
{
// Log if a printer exists:
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
printer_opt_ref.has_value())
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
{
auto &printer = printer_opt_ref.value().get();
printer.log_run_state(exec_state);
@@ -73,19 +71,16 @@ void runner_base::run_state_prologue(nvbench::state &exec_state) const
void runner_base::run_state_epilogue(state &exec_state) const
{
// Notify the printer that the state has completed:
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
printer_opt_ref.has_value())
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
{
auto &printer = printer_opt_ref.value().get();
printer.add_completed_state();
}
}
void runner_base::print_skip_notification(state &exec_state) const
{
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
printer_opt_ref.has_value())
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
{
auto &printer = printer_opt_ref.value().get();
printer.log(nvbench::log_level::skip, exec_state.get_skip_reason());

View File

@@ -58,106 +58,68 @@ struct state_tester;
struct state
{
// move-only
state(const state &) = delete;
state(state &&) = default;
state(const state &) = delete;
state(state &&) = default;
state &operator=(const state &) = delete;
state &operator=(state &&) = default;
state &operator=(state &&) = default;
[[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const
{
return m_cuda_stream;
}
void set_cuda_stream(nvbench::cuda_stream &&stream)
{
m_cuda_stream = std::move(stream);
}
[[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const { return m_cuda_stream; }
void set_cuda_stream(nvbench::cuda_stream &&stream) { m_cuda_stream = std::move(stream); }
/// The CUDA device associated with this benchmark state. May be
/// nullopt for CPU-only benchmarks.
[[nodiscard]] const std::optional<nvbench::device_info> &get_device() const
{
return m_device;
}
[[nodiscard]] const std::optional<nvbench::device_info> &get_device() const { return m_device; }
/// An index into a benchmark::type_configs type_list. Returns 0 if no type
/// axes in the associated benchmark.
[[nodiscard]] std::size_t get_type_config_index() const
{
return m_type_config_index;
}
[[nodiscard]] std::size_t get_type_config_index() const { return m_type_config_index; }
[[nodiscard]] nvbench::int64_t get_int64(const std::string &axis_name) const;
[[nodiscard]] nvbench::int64_t
get_int64_or_default(const std::string &axis_name,
nvbench::int64_t default_value) const;
[[nodiscard]] nvbench::int64_t get_int64_or_default(const std::string &axis_name,
nvbench::int64_t default_value) const;
[[nodiscard]] nvbench::float64_t
get_float64(const std::string &axis_name) const;
[[nodiscard]] nvbench::float64_t
get_float64_or_default(const std::string &axis_name,
nvbench::float64_t default_value) const;
[[nodiscard]] nvbench::float64_t get_float64(const std::string &axis_name) const;
[[nodiscard]] nvbench::float64_t get_float64_or_default(const std::string &axis_name,
nvbench::float64_t default_value) const;
[[nodiscard]] const std::string &
get_string(const std::string &axis_name) const;
[[nodiscard]] const std::string &
get_string_or_default(const std::string &axis_name,
const std::string &default_value) const;
[[nodiscard]] const std::string &get_string(const std::string &axis_name) const;
[[nodiscard]] const std::string &get_string_or_default(const std::string &axis_name,
const std::string &default_value) const;
void add_element_count(std::size_t elements, std::string column_name = {});
void set_element_count(std::size_t elements) { m_element_count = elements; }
[[nodiscard]] std::size_t get_element_count() const
{
return m_element_count;
}
[[nodiscard]] std::size_t get_element_count() const { return m_element_count; }
template <typename ElementType>
void add_global_memory_reads(std::size_t count, std::string column_name = {})
{
this->add_global_memory_reads(count * sizeof(ElementType),
std::move(column_name));
this->add_global_memory_reads(count * sizeof(ElementType), std::move(column_name));
}
void add_global_memory_reads(std::size_t bytes, std::string column_name = {});
template <typename ElementType>
void add_global_memory_writes(std::size_t count, std::string column_name = {})
{
this->add_global_memory_writes(count * sizeof(ElementType),
std::move(column_name));
this->add_global_memory_writes(count * sizeof(ElementType), std::move(column_name));
}
void add_global_memory_writes(std::size_t bytes,
std::string column_name = {});
void add_global_memory_writes(std::size_t bytes, std::string column_name = {});
void add_buffer_size(std::size_t num_bytes,
std::string summary_tag,
std::string column_name = {},
std::string description = {});
void set_global_memory_rw_bytes(std::size_t bytes)
{
m_global_memory_rw_bytes = bytes;
}
[[nodiscard]] std::size_t get_global_memory_rw_bytes() const
{
return m_global_memory_rw_bytes;
}
void set_global_memory_rw_bytes(std::size_t bytes) { m_global_memory_rw_bytes = bytes; }
[[nodiscard]] std::size_t get_global_memory_rw_bytes() const { return m_global_memory_rw_bytes; }
void skip(std::string reason) { m_skip_reason = std::move(reason); }
[[nodiscard]] bool is_skipped() const { return !m_skip_reason.empty(); }
[[nodiscard]] const std::string &get_skip_reason() const
{
return m_skip_reason;
}
[[nodiscard]] const std::string &get_skip_reason() const { return m_skip_reason; }
/// Execute at least this many trials per measurement. @{
[[nodiscard]] nvbench::int64_t get_min_samples() const
{
return m_min_samples;
}
void set_min_samples(nvbench::int64_t min_samples)
{
m_min_samples = min_samples;
}
[[nodiscard]] nvbench::int64_t get_min_samples() const { return m_min_samples; }
void set_min_samples(nvbench::int64_t min_samples) { m_min_samples = min_samples; }
/// @}
/// If true, the benchmark is only run once, skipping all warmup runs and only
@@ -222,20 +184,14 @@ struct state
}
///@}
[[nodiscard]] const named_values &get_axis_values() const
{
return m_axis_values;
}
[[nodiscard]] const named_values &get_axis_values() const { return m_axis_values; }
/*!
* Return a string of "axis_name1=input_string1 axis_name2=input_string2 ..."
*/
[[nodiscard]] std::string get_axis_values_as_string(bool color = false) const;
[[nodiscard]] const benchmark_base &get_benchmark() const
{
return m_benchmark;
}
[[nodiscard]] const benchmark_base &get_benchmark() const { return m_benchmark; }
void collect_l1_hit_rates() { m_collect_l1_hit_rates = true; }
void collect_l2_hit_rates() { m_collect_l2_hit_rates = true; }
@@ -252,26 +208,11 @@ struct state
collect_dram_throughput();
}
[[nodiscard]] bool is_l1_hit_rate_collected() const
{
return m_collect_l1_hit_rates;
}
[[nodiscard]] bool is_l2_hit_rate_collected() const
{
return m_collect_l2_hit_rates;
}
[[nodiscard]] bool is_stores_efficiency_collected() const
{
return m_collect_stores_efficiency;
}
[[nodiscard]] bool is_loads_efficiency_collected() const
{
return m_collect_loads_efficiency;
}
[[nodiscard]] bool is_dram_throughput_collected() const
{
return m_collect_dram_throughput;
}
[[nodiscard]] bool is_l1_hit_rate_collected() const { return m_collect_l1_hit_rates; }
[[nodiscard]] bool is_l2_hit_rate_collected() const { return m_collect_l2_hit_rates; }
[[nodiscard]] bool is_stores_efficiency_collected() const { return m_collect_stores_efficiency; }
[[nodiscard]] bool is_loads_efficiency_collected() const { return m_collect_loads_efficiency; }
[[nodiscard]] bool is_dram_throughput_collected() const { return m_collect_dram_throughput; }
[[nodiscard]] bool is_cupti_required() const
{
@@ -306,8 +247,7 @@ struct state
template <typename KernelLauncher>
void exec(KernelLauncher &&kernel_launcher)
{
this->exec(nvbench::exec_tag::none,
std::forward<KernelLauncher>(kernel_launcher));
this->exec(nvbench::exec_tag::none, std::forward<KernelLauncher>(kernel_launcher));
}
private:

View File

@@ -65,9 +65,8 @@ nvbench::int64_t state::get_int64(const std::string &axis_name) const
return m_axis_values.get_int64(axis_name);
}
nvbench::int64_t
state::get_int64_or_default(const std::string &axis_name,
nvbench::int64_t default_value) const
nvbench::int64_t state::get_int64_or_default(const std::string &axis_name,
nvbench::int64_t default_value) const
try
{
return this->get_int64(axis_name);
@@ -82,9 +81,8 @@ nvbench::float64_t state::get_float64(const std::string &axis_name) const
return m_axis_values.get_float64(axis_name);
}
nvbench::float64_t
state::get_float64_or_default(const std::string &axis_name,
nvbench::float64_t default_value) const
nvbench::float64_t state::get_float64_or_default(const std::string &axis_name,
nvbench::float64_t default_value) const
try
{
return this->get_float64(axis_name);
@@ -99,9 +97,8 @@ const std::string &state::get_string(const std::string &axis_name) const
return m_axis_values.get_string(axis_name);
}
const std::string &
state::get_string_or_default(const std::string &axis_name,
const std::string &default_value) const
const std::string &state::get_string_or_default(const std::string &axis_name,
const std::string &default_value) const
try
{
return this->get_string(axis_name);
@@ -125,20 +122,18 @@ summary &state::add_summary(summary s)
const summary &state::get_summary(std::string_view tag) const
{
// Check tags first
auto iter =
std::find_if(m_summaries.cbegin(),
m_summaries.cend(),
[&tag](const auto &s) { return s.get_tag() == tag; });
auto iter = std::find_if(m_summaries.cbegin(), m_summaries.cend(), [&tag](const auto &s) {
return s.get_tag() == tag;
});
if (iter != m_summaries.cend())
{
return *iter;
}
// Then names:
iter =
std::find_if(m_summaries.cbegin(),
m_summaries.cend(),
[&tag](const auto &s) { return s.get_string("name") == tag; });
iter = std::find_if(m_summaries.cbegin(), m_summaries.cend(), [&tag](const auto &s) {
return s.get_string("name") == tag;
});
if (iter != m_summaries.cend())
{
return *iter;
@@ -150,20 +145,18 @@ const summary &state::get_summary(std::string_view tag) const
summary &state::get_summary(std::string_view tag)
{
// Check tags first
auto iter =
std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
return s.get_tag() == tag;
});
auto iter = std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
return s.get_tag() == tag;
});
if (iter != m_summaries.end())
{
return *iter;
}
// Then names:
iter =
std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
return s.get_string("name") == tag;
});
iter = std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
return s.get_string("name") == tag;
});
if (iter != m_summaries.end())
{
return *iter;
@@ -187,18 +180,17 @@ std::string state::get_axis_values_as_string(bool color) const
// Create a Key=Value list of all parameters:
fmt::memory_buffer buffer;
auto append_key_value = [&buffer, &style](const std::string &key,
const auto &value,
std::string value_fmtstr = "{}") {
constexpr auto key_format = fmt::emphasis::italic;
constexpr auto value_format = fmt::emphasis::bold;
auto append_key_value =
[&buffer, &style](const std::string &key, const auto &value, std::string value_fmtstr = "{}") {
constexpr auto key_format = fmt::emphasis::italic;
constexpr auto value_format = fmt::emphasis::bold;
fmt::format_to(buffer,
"{}{}={}",
buffer.size() == 0 ? "" : " ",
fmt::format(style(key_format), "{}", key),
fmt::format(style(value_format), value_fmtstr, value));
};
fmt::format_to(buffer,
"{}{}={}",
buffer.size() == 0 ? "" : " ",
fmt::format(style(key_format), "{}", key),
fmt::format(style(value_format), value_fmtstr, value));
};
if (m_device)
{
@@ -211,8 +203,7 @@ std::string state::get_axis_values_as_string(bool color) const
const auto axis_type = m_axis_values.get_type(name);
// Handle power-of-two int64 axes differently:
if (axis_type == named_values::type::int64 &&
axes.get_int64_axis(name).is_power_of_two())
if (axis_type == named_values::type::int64 && axes.get_int64_axis(name).is_power_of_two())
{
const nvbench::int64_t value = m_axis_values.get_int64(name);
const nvbench::int64_t exponent = int64_axis::compute_log2(value);
@@ -242,10 +233,9 @@ std::string state::get_short_description(bool color) const
return color ? fmt_style : no_style;
};
return fmt::format(
"{} [{}]",
fmt::format(style(fmt::emphasis::bold), "{}", m_benchmark.get().get_name()),
this->get_axis_values_as_string(color));
return fmt::format("{} [{}]",
fmt::format(style(fmt::emphasis::bold), "{}", m_benchmark.get().get_name()),
this->get_axis_values_as_string(color));
}
void state::add_element_count(std::size_t elements, std::string column_name)

View File

@@ -36,25 +36,13 @@ struct string_axis final : public axis_base
~string_axis() final;
void set_inputs(std::vector<std::string> inputs)
{
m_values = std::move(inputs);
}
[[nodiscard]] const std::string &get_value(std::size_t i) const
{
return m_values[i];
}
void set_inputs(std::vector<std::string> inputs) { m_values = std::move(inputs); }
[[nodiscard]] const std::string &get_value(std::size_t i) const { return m_values[i]; }
private:
std::unique_ptr<axis_base> do_clone() const
{
return std::make_unique<string_axis>(*this);
}
std::unique_ptr<axis_base> do_clone() const { return std::make_unique<string_axis>(*this); }
std::size_t do_get_size() const final { return m_values.size(); }
std::string do_get_input_string(std::size_t i) const final
{
return m_values[i];
}
std::string do_get_input_string(std::size_t i) const final { return m_values[i]; }
std::string do_get_description(std::size_t) const final { return {}; }
std::vector<std::string> m_values;

View File

@@ -92,10 +92,10 @@ struct summary : public nvbench::named_values
{}
// move-only
summary(const summary &) = delete;
summary(summary &&) = default;
summary(const summary &) = delete;
summary(summary &&) = default;
summary &operator=(const summary &) = delete;
summary &operator=(summary &&) = default;
summary &operator=(summary &&) = default;
void set_tag(std::string tag) { m_tag = std::move(tag); }
[[nodiscard]] const std::string &get_tag() const { return m_tag; }

View File

@@ -38,8 +38,8 @@ namespace nvbench
__global__ void sleep_kernel(double seconds)
{
const auto start = cuda::std::chrono::high_resolution_clock::now();
const auto ns = cuda::std::chrono::nanoseconds(
static_cast<nvbench::int64_t>(seconds * 1000 * 1000 * 1000));
const auto ns =
cuda::std::chrono::nanoseconds(static_cast<nvbench::int64_t>(seconds * 1000 * 1000 * 1000));
const auto finish = start + ns;
auto now = cuda::std::chrono::high_resolution_clock::now();
@@ -53,7 +53,7 @@ __global__ void sleep_kernel(double seconds)
* Naive copy of `n` values from `in` -> `out`.
*/
template <typename T, typename U>
__global__ void copy_kernel(const T* in, U* out, std::size_t n)
__global__ void copy_kernel(const T *in, U *out, std::size_t n)
{
const auto init = blockIdx.x * blockDim.x + threadIdx.x;
const auto step = blockDim.x * gridDim.x;
@@ -68,7 +68,7 @@ __global__ void copy_kernel(const T* in, U* out, std::size_t n)
* For `i <- [0,n)`, `out[i] = in[i] % 2`.
*/
template <typename T, typename U>
__global__ void mod2_kernel(const T* in, U* out, std::size_t n)
__global__ void mod2_kernel(const T *in, U *out, std::size_t n)
{
const auto init = blockIdx.x * blockDim.x + threadIdx.x;
const auto step = blockDim.x * gridDim.x;
@@ -79,4 +79,4 @@ __global__ void mod2_kernel(const T* in, U* out, std::size_t n)
}
}
}
} // namespace nvbench

View File

@@ -43,7 +43,7 @@ struct type_axis final : public axis_base
template <typename TypeList>
void set_inputs();
void set_active_inputs(const std::vector<std::string>& inputs);
void set_active_inputs(const std::vector<std::string> &inputs);
[[nodiscard]] bool get_is_active(const std::string &input) const;
[[nodiscard]] bool get_is_active(std::size_t index) const;
@@ -57,23 +57,13 @@ struct type_axis final : public axis_base
/**
* The index in this axis of the type with the specified `input_string`.
*/
[[nodiscard]] std::size_t
get_type_index(const std::string &input_string) const;
[[nodiscard]] std::size_t get_type_index(const std::string &input_string) const;
private:
std::unique_ptr<axis_base> do_clone() const
{
return std::make_unique<type_axis>(*this);
}
std::unique_ptr<axis_base> do_clone() const { return std::make_unique<type_axis>(*this); }
std::size_t do_get_size() const final { return m_input_strings.size(); }
std::string do_get_input_string(std::size_t i) const final
{
return m_input_strings[i];
}
std::string do_get_description(std::size_t i) const final
{
return m_descriptions[i];
}
std::string do_get_input_string(std::size_t i) const final { return m_input_strings[i]; }
std::string do_get_description(std::size_t i) const final { return m_descriptions[i]; }
std::vector<std::string> m_input_strings;
std::vector<std::string> m_descriptions;

View File

@@ -35,10 +35,10 @@ void type_axis::set_active_inputs(const std::vector<std::string> &inputs)
{
m_mask.clear();
m_mask.resize(m_input_strings.size(), false);
for (const auto& input : inputs)
for (const auto &input : inputs)
{
const auto idx = this->get_type_index(input);
m_mask[idx] = true;
m_mask[idx] = true;
}
}
@@ -47,21 +47,16 @@ bool type_axis::get_is_active(const std::string &input) const
return this->get_is_active(this->get_type_index(input));
}
bool type_axis::get_is_active(std::size_t idx) const
{
return m_mask.at(idx);
}
bool type_axis::get_is_active(std::size_t idx) const { return m_mask.at(idx); }
std::size_t type_axis::get_active_count() const
{
return static_cast<std::size_t>(
std::count(m_mask.cbegin(), m_mask.cend(), true));
return static_cast<std::size_t>(std::count(m_mask.cbegin(), m_mask.cend(), true));
}
std::size_t type_axis::get_type_index(const std::string &input_string) const
{
auto it =
std::find(m_input_strings.cbegin(), m_input_strings.cend(), input_string);
auto it = std::find(m_input_strings.cbegin(), m_input_strings.cend(), input_string);
if (it == m_input_strings.end())
{
NVBENCH_THROW(std::runtime_error,

View File

@@ -30,17 +30,17 @@ namespace nvbench
std::string demangle(const std::string &str);
template <typename T>
std::string demangle() { return demangle(typeid(T).name()); }
std::string demangle()
{
return demangle(typeid(T).name());
}
template <typename T>
struct type_strings
{
// The string used to identify the type in shorthand (e.g. output tables and
// CLI options):
static std::string input_string()
{
return nvbench::demangle<T>();
}
static std::string input_string() { return nvbench::demangle<T>(); }
// A more descriptive identifier for the type, if input_string is not a common
// identifier. May be blank if `input_string` is obvious.
@@ -56,10 +56,7 @@ struct type_strings<std::integral_constant<T, Value>>
// A more descriptive identifier for the type, if input_string is not a common
// identifier. May be blank if `input_string` is obvious.
static std::string description()
{
return nvbench::demangle<std::integral_constant<T, Value>>();
}
static std::string description() { return nvbench::demangle<std::integral_constant<T, Value>>(); }
};
} // namespace nvbench
@@ -67,15 +64,15 @@ struct type_strings<std::integral_constant<T, Value>>
/*!
* Declare an `input_string` and `description` to use with a specific `type`.
*/
#define NVBENCH_DECLARE_TYPE_STRINGS(Type, InputString, Description) \
namespace nvbench \
{ \
template <> \
struct type_strings<Type> \
{ \
static std::string input_string() { return {InputString}; } \
static std::string description() { return {Description}; } \
}; \
#define NVBENCH_DECLARE_TYPE_STRINGS(Type, InputString, Description) \
namespace nvbench \
{ \
template <> \
struct type_strings<Type> \
{ \
static std::string input_string() { return {InputString}; } \
static std::string description() { return {Description}; } \
}; \
}
NVBENCH_DECLARE_TYPE_STRINGS(nvbench::int8_t, "I8", "int8_t");