mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-03-14 20:27:24 +00:00
Merge pull request #96 from senior-zero/fix-main/github/format
Fix formatting
This commit is contained in:
@@ -36,7 +36,7 @@ BreakBeforeBinaryOperators: None
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializers: BeforeComma
|
||||
BreakInheritanceList: BeforeComma
|
||||
ColumnLimit: 80
|
||||
ColumnLimit: 100
|
||||
CompactNamespaces: false
|
||||
ContinuationIndentWidth: 2
|
||||
IndentCaseLabels: true
|
||||
@@ -55,7 +55,7 @@ PenaltyExcessCharacter: 100
|
||||
PenaltyReturnTypeOnItsOwnLine: 90
|
||||
PointerAlignment: Right
|
||||
ReflowComments: true
|
||||
SortIncludes: true
|
||||
SortIncludes: CaseInsensitive
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceAfterLogicalNot: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
|
||||
17
.git-blame-ignore-revs
Normal file
17
.git-blame-ignore-revs
Normal file
@@ -0,0 +1,17 @@
|
||||
# Exclude these commits from git-blame and similar tools.
|
||||
#
|
||||
# To use this file, run the following command from the repo root:
|
||||
#
|
||||
# ```
|
||||
# $ git config blame.ignoreRevsFile .git-blame-ignore-revs
|
||||
# ```
|
||||
#
|
||||
# Include a brief comment with each commit added, for example:
|
||||
#
|
||||
# ```
|
||||
# 8f1152d4a22287a35be2dde596e3cf86ace8054a # Increase column limit to 100
|
||||
# ```
|
||||
#
|
||||
# Only add commits that are pure formatting changes (e.g. clang-format version changes, etc).
|
||||
8f1152d4a22287a35be2dde596e3cf86ace8054a # Increase column limit to 100
|
||||
|
||||
@@ -41,8 +41,8 @@ struct axes_metadata
|
||||
template <typename... TypeAxes>
|
||||
explicit axes_metadata(nvbench::type_list<TypeAxes...>);
|
||||
|
||||
axes_metadata() = default;
|
||||
axes_metadata(axes_metadata &&) = default;
|
||||
axes_metadata() = default;
|
||||
axes_metadata(axes_metadata &&) = default;
|
||||
axes_metadata &operator=(axes_metadata &&) = default;
|
||||
|
||||
axes_metadata(const axes_metadata &);
|
||||
@@ -58,20 +58,16 @@ struct axes_metadata
|
||||
|
||||
void add_string_axis(std::string name, std::vector<std::string> data);
|
||||
|
||||
[[nodiscard]] const nvbench::int64_axis &
|
||||
get_int64_axis(std::string_view name) const;
|
||||
[[nodiscard]] const nvbench::int64_axis &get_int64_axis(std::string_view name) const;
|
||||
[[nodiscard]] nvbench::int64_axis &get_int64_axis(std::string_view name);
|
||||
|
||||
[[nodiscard]] const nvbench::float64_axis &
|
||||
get_float64_axis(std::string_view name) const;
|
||||
[[nodiscard]] const nvbench::float64_axis &get_float64_axis(std::string_view name) const;
|
||||
[[nodiscard]] nvbench::float64_axis &get_float64_axis(std::string_view name);
|
||||
|
||||
[[nodiscard]] const nvbench::string_axis &
|
||||
get_string_axis(std::string_view name) const;
|
||||
[[nodiscard]] const nvbench::string_axis &get_string_axis(std::string_view name) const;
|
||||
[[nodiscard]] nvbench::string_axis &get_string_axis(std::string_view name);
|
||||
|
||||
[[nodiscard]] const nvbench::type_axis &
|
||||
get_type_axis(std::string_view name) const;
|
||||
[[nodiscard]] const nvbench::type_axis &get_type_axis(std::string_view name) const;
|
||||
[[nodiscard]] nvbench::type_axis &get_type_axis(std::string_view name);
|
||||
|
||||
[[nodiscard]] const nvbench::type_axis &get_type_axis(std::size_t index) const;
|
||||
@@ -83,10 +79,9 @@ struct axes_metadata
|
||||
[[nodiscard]] const nvbench::axis_base &get_axis(std::string_view name) const;
|
||||
[[nodiscard]] nvbench::axis_base &get_axis(std::string_view name);
|
||||
|
||||
[[nodiscard]] const nvbench::axis_base &
|
||||
get_axis(std::string_view name, nvbench::axis_type type) const;
|
||||
[[nodiscard]] nvbench::axis_base &get_axis(std::string_view name,
|
||||
nvbench::axis_type type);
|
||||
[[nodiscard]] const nvbench::axis_base &get_axis(std::string_view name,
|
||||
nvbench::axis_type type) const;
|
||||
[[nodiscard]] nvbench::axis_base &get_axis(std::string_view name, nvbench::axis_type type);
|
||||
|
||||
[[nodiscard]] static std::vector<std::string>
|
||||
generate_default_type_axis_names(std::size_t num_type_axes);
|
||||
@@ -101,7 +96,7 @@ axes_metadata::axes_metadata(nvbench::type_list<TypeAxes...>)
|
||||
{
|
||||
using type_axes_list = nvbench::type_list<TypeAxes...>;
|
||||
constexpr auto num_type_axes = nvbench::tl::size<type_axes_list>::value;
|
||||
auto names = axes_metadata::generate_default_type_axis_names(num_type_axes);
|
||||
auto names = axes_metadata::generate_default_type_axis_names(num_type_axes);
|
||||
|
||||
auto names_iter = names.begin(); // contents will be moved from
|
||||
nvbench::tl::foreach<type_axes_list>(
|
||||
@@ -114,8 +109,7 @@ axes_metadata::axes_metadata(nvbench::type_list<TypeAxes...>)
|
||||
// The word "type" appears 6 times in the next line.
|
||||
// Every. Single. Token.
|
||||
typedef typename decltype(wrapped_type)::type type_list;
|
||||
auto axis = std::make_unique<nvbench::type_axis>(std::move(*names_iter++),
|
||||
type_axis_index);
|
||||
auto axis = std::make_unique<nvbench::type_axis>(std::move(*names_iter++), type_axis_index);
|
||||
axis->template set_inputs<type_list>();
|
||||
axes.push_back(std::move(axis));
|
||||
});
|
||||
|
||||
@@ -64,9 +64,7 @@ try
|
||||
auto &axis = *m_axes[i];
|
||||
if (axis.get_type() != nvbench::axis_type::type)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Number of names exceeds number of type axes ({})",
|
||||
i);
|
||||
NVBENCH_THROW(std::runtime_error, "Number of names exceeds number of type axes ({})", i);
|
||||
}
|
||||
|
||||
axis.set_name(std::move(names[i]));
|
||||
@@ -81,8 +79,7 @@ catch (std::exception &e)
|
||||
names);
|
||||
}
|
||||
|
||||
void axes_metadata::add_float64_axis(std::string name,
|
||||
std::vector<nvbench::float64_t> data)
|
||||
void axes_metadata::add_float64_axis(std::string name, std::vector<nvbench::float64_t> data)
|
||||
{
|
||||
auto axis = std::make_unique<nvbench::float64_axis>(std::move(name));
|
||||
axis->set_inputs(std::move(data));
|
||||
@@ -98,8 +95,7 @@ void axes_metadata::add_int64_axis(std::string name,
|
||||
m_axes.push_back(std::move(axis));
|
||||
}
|
||||
|
||||
void axes_metadata::add_string_axis(std::string name,
|
||||
std::vector<std::string> data)
|
||||
void axes_metadata::add_string_axis(std::string name, std::vector<std::string> data)
|
||||
{
|
||||
auto axis = std::make_unique<nvbench::string_axis>(std::move(name));
|
||||
axis->set_inputs(std::move(data));
|
||||
@@ -188,10 +184,9 @@ nvbench::type_axis &axes_metadata::get_type_axis(std::size_t index)
|
||||
|
||||
const axis_base &axes_metadata::get_axis(std::string_view name) const
|
||||
{
|
||||
auto iter =
|
||||
std::find_if(m_axes.cbegin(), m_axes.cend(), [&name](const auto &axis) {
|
||||
return axis->get_name() == name;
|
||||
});
|
||||
auto iter = std::find_if(m_axes.cbegin(), m_axes.cend(), [&name](const auto &axis) {
|
||||
return axis->get_name() == name;
|
||||
});
|
||||
|
||||
if (iter == m_axes.cend())
|
||||
{
|
||||
@@ -203,10 +198,9 @@ const axis_base &axes_metadata::get_axis(std::string_view name) const
|
||||
|
||||
axis_base &axes_metadata::get_axis(std::string_view name)
|
||||
{
|
||||
auto iter =
|
||||
std::find_if(m_axes.begin(), m_axes.end(), [&name](const auto &axis) {
|
||||
return axis->get_name() == name;
|
||||
});
|
||||
auto iter = std::find_if(m_axes.begin(), m_axes.end(), [&name](const auto &axis) {
|
||||
return axis->get_name() == name;
|
||||
});
|
||||
|
||||
if (iter == m_axes.end())
|
||||
{
|
||||
@@ -216,8 +210,7 @@ axis_base &axes_metadata::get_axis(std::string_view name)
|
||||
return **iter;
|
||||
}
|
||||
|
||||
const axis_base &axes_metadata::get_axis(std::string_view name,
|
||||
nvbench::axis_type type) const
|
||||
const axis_base &axes_metadata::get_axis(std::string_view name, nvbench::axis_type type) const
|
||||
{
|
||||
const auto &axis = this->get_axis(name);
|
||||
if (axis.get_type() != type)
|
||||
@@ -231,8 +224,7 @@ const axis_base &axes_metadata::get_axis(std::string_view name,
|
||||
return axis;
|
||||
}
|
||||
|
||||
axis_base &axes_metadata::get_axis(std::string_view name,
|
||||
nvbench::axis_type type)
|
||||
axis_base &axes_metadata::get_axis(std::string_view name, nvbench::axis_type type)
|
||||
{
|
||||
auto &axis = this->get_axis(name);
|
||||
if (axis.get_type() != type)
|
||||
@@ -246,8 +238,7 @@ axis_base &axes_metadata::get_axis(std::string_view name,
|
||||
return axis;
|
||||
}
|
||||
|
||||
std::vector<std::string>
|
||||
axes_metadata::generate_default_type_axis_names(std::size_t num_type_axes)
|
||||
std::vector<std::string> axes_metadata::generate_default_type_axis_names(std::size_t num_type_axes)
|
||||
{
|
||||
switch (num_type_axes)
|
||||
{
|
||||
|
||||
@@ -47,10 +47,7 @@ struct axis_base
|
||||
|
||||
[[nodiscard]] axis_type get_type() const { return m_type; }
|
||||
|
||||
[[nodiscard]] std::string_view get_type_as_string() const
|
||||
{
|
||||
return axis_type_to_string(m_type);
|
||||
}
|
||||
[[nodiscard]] std::string_view get_type_as_string() const { return axis_type_to_string(m_type); }
|
||||
|
||||
[[nodiscard]] std::string_view get_flags_as_string() const
|
||||
{
|
||||
|
||||
@@ -23,9 +23,6 @@ namespace nvbench
|
||||
|
||||
axis_base::~axis_base() = default;
|
||||
|
||||
std::unique_ptr<axis_base> axis_base::clone() const
|
||||
{
|
||||
return this->do_clone();
|
||||
}
|
||||
std::unique_ptr<axis_base> axis_base::clone() const { return this->do_clone(); }
|
||||
|
||||
} // namespace nvbench
|
||||
|
||||
@@ -57,18 +57,14 @@ struct benchmark final : public benchmark_base
|
||||
using type_axes = TypeAxes;
|
||||
using type_configs = nvbench::tl::cartesian_product<type_axes>;
|
||||
|
||||
static constexpr std::size_t num_type_configs =
|
||||
nvbench::tl::size<type_configs>{};
|
||||
static constexpr std::size_t num_type_configs = nvbench::tl::size<type_configs>{};
|
||||
|
||||
benchmark()
|
||||
: benchmark_base(type_axes{})
|
||||
{}
|
||||
|
||||
private:
|
||||
std::unique_ptr<benchmark_base> do_clone() const final
|
||||
{
|
||||
return std::make_unique<benchmark>();
|
||||
}
|
||||
std::unique_ptr<benchmark_base> do_clone() const final { return std::make_unique<benchmark>(); }
|
||||
|
||||
void do_set_type_axes_names(std::vector<std::string> names) final
|
||||
{
|
||||
|
||||
@@ -80,32 +80,28 @@ struct benchmark_base
|
||||
return *this;
|
||||
}
|
||||
|
||||
benchmark_base &add_float64_axis(std::string name,
|
||||
std::vector<nvbench::float64_t> data)
|
||||
benchmark_base &add_float64_axis(std::string name, std::vector<nvbench::float64_t> data)
|
||||
{
|
||||
m_axes.add_float64_axis(std::move(name), std::move(data));
|
||||
return *this;
|
||||
}
|
||||
|
||||
benchmark_base &add_int64_axis(
|
||||
std::string name,
|
||||
std::vector<nvbench::int64_t> data,
|
||||
nvbench::int64_axis_flags flags = nvbench::int64_axis_flags::none)
|
||||
benchmark_base &add_int64_axis(std::string name,
|
||||
std::vector<nvbench::int64_t> data,
|
||||
nvbench::int64_axis_flags flags = nvbench::int64_axis_flags::none)
|
||||
{
|
||||
m_axes.add_int64_axis(std::move(name), std::move(data), flags);
|
||||
return *this;
|
||||
}
|
||||
|
||||
benchmark_base &add_int64_power_of_two_axis(std::string name,
|
||||
std::vector<nvbench::int64_t> data)
|
||||
benchmark_base &add_int64_power_of_two_axis(std::string name, std::vector<nvbench::int64_t> data)
|
||||
{
|
||||
return this->add_int64_axis(std::move(name),
|
||||
std::move(data),
|
||||
nvbench::int64_axis_flags::power_of_two);
|
||||
}
|
||||
|
||||
benchmark_base &add_string_axis(std::string name,
|
||||
std::vector<std::string> data)
|
||||
benchmark_base &add_string_axis(std::string name, std::vector<std::string> data)
|
||||
{
|
||||
m_axes.add_string_axis(std::move(name), std::move(data));
|
||||
return *this;
|
||||
@@ -133,48 +129,30 @@ struct benchmark_base
|
||||
return *this;
|
||||
}
|
||||
|
||||
[[nodiscard]] const std::vector<nvbench::device_info> &get_devices() const
|
||||
{
|
||||
return m_devices;
|
||||
}
|
||||
[[nodiscard]] const std::vector<nvbench::device_info> &get_devices() const { return m_devices; }
|
||||
|
||||
[[nodiscard]] nvbench::axes_metadata &get_axes() { return m_axes; }
|
||||
|
||||
[[nodiscard]] const nvbench::axes_metadata &get_axes() const
|
||||
{
|
||||
return m_axes;
|
||||
}
|
||||
[[nodiscard]] const nvbench::axes_metadata &get_axes() const { return m_axes; }
|
||||
|
||||
// Computes the number of configs in the benchmark.
|
||||
// Unlike get_states().size(), this method may be used prior to calling run().
|
||||
[[nodiscard]] std::size_t get_config_count() const;
|
||||
|
||||
// Is empty until run() is called.
|
||||
[[nodiscard]] const std::vector<nvbench::state> &get_states() const
|
||||
{
|
||||
return m_states;
|
||||
}
|
||||
[[nodiscard]] const std::vector<nvbench::state> &get_states() const { return m_states; }
|
||||
[[nodiscard]] std::vector<nvbench::state> &get_states() { return m_states; }
|
||||
|
||||
void run() { this->do_run(); }
|
||||
|
||||
void set_printer(nvbench::printer_base &printer)
|
||||
{
|
||||
m_printer = std::ref(printer);
|
||||
}
|
||||
void set_printer(nvbench::printer_base &printer) { m_printer = std::ref(printer); }
|
||||
|
||||
void clear_printer() { m_printer = std::nullopt; }
|
||||
|
||||
[[nodiscard]] optional_ref<nvbench::printer_base> get_printer() const
|
||||
{
|
||||
return m_printer;
|
||||
}
|
||||
[[nodiscard]] optional_ref<nvbench::printer_base> get_printer() const { return m_printer; }
|
||||
|
||||
/// Execute at least this many trials per measurement. @{
|
||||
[[nodiscard]] nvbench::int64_t get_min_samples() const
|
||||
{
|
||||
return m_min_samples;
|
||||
}
|
||||
[[nodiscard]] nvbench::int64_t get_min_samples() const { return m_min_samples; }
|
||||
benchmark_base &set_min_samples(nvbench::int64_t min_samples)
|
||||
{
|
||||
m_min_samples = min_samples;
|
||||
@@ -193,7 +171,7 @@ struct benchmark_base
|
||||
}
|
||||
/// @}
|
||||
|
||||
/// If true, the benchmark does not use the blocking_kernel. This is intended
|
||||
/// If true, the benchmark does not use the blocking_kernel. This is intended
|
||||
/// for use with external profiling tools. @{
|
||||
[[nodiscard]] bool get_disable_blocking_kernel() const { return m_disable_blocking_kernel; }
|
||||
benchmark_base &set_disable_blocking_kernel(bool v)
|
||||
|
||||
@@ -68,8 +68,7 @@ std::size_t benchmark_base::get_config_count() const
|
||||
std::size_t{1},
|
||||
std::multiplies<>{},
|
||||
[](const auto &axis_ptr) {
|
||||
if (const auto *type_axis_ptr =
|
||||
dynamic_cast<const nvbench::type_axis *>(axis_ptr.get());
|
||||
if (const auto *type_axis_ptr = dynamic_cast<const nvbench::type_axis *>(axis_ptr.get());
|
||||
type_axis_ptr != nullptr)
|
||||
{
|
||||
return type_axis_ptr->get_active_count();
|
||||
|
||||
@@ -31,8 +31,7 @@ namespace nvbench
|
||||
*/
|
||||
struct benchmark_manager
|
||||
{
|
||||
using benchmark_vector =
|
||||
std::vector<std::unique_ptr<nvbench::benchmark_base>>;
|
||||
using benchmark_vector = std::vector<std::unique_ptr<nvbench::benchmark_base>>;
|
||||
|
||||
/**
|
||||
* @return The singleton benchmark_manager instance.
|
||||
@@ -53,25 +52,21 @@ struct benchmark_manager
|
||||
* Get a non-mutable reference to benchmark with the specified name/index.
|
||||
* @{
|
||||
*/
|
||||
[[nodiscard]] const benchmark_base &
|
||||
get_benchmark(const std::string &name) const;
|
||||
[[nodiscard]] const benchmark_base &get_benchmark(const std::string &name) const;
|
||||
[[nodiscard]] const benchmark_base &get_benchmark(std::size_t idx) const
|
||||
{
|
||||
return *m_benchmarks.at(idx);
|
||||
}
|
||||
/**@}*/
|
||||
|
||||
[[nodiscard]] const benchmark_vector &get_benchmarks() const
|
||||
{
|
||||
return m_benchmarks;
|
||||
};
|
||||
[[nodiscard]] const benchmark_vector &get_benchmarks() const { return m_benchmarks; };
|
||||
|
||||
private:
|
||||
benchmark_manager() = default;
|
||||
benchmark_manager(const benchmark_manager &) = delete;
|
||||
benchmark_manager(benchmark_manager &&) = delete;
|
||||
benchmark_manager() = default;
|
||||
benchmark_manager(const benchmark_manager &) = delete;
|
||||
benchmark_manager(benchmark_manager &&) = delete;
|
||||
benchmark_manager &operator=(const benchmark_manager &) = delete;
|
||||
benchmark_manager &operator=(benchmark_manager &&) = delete;
|
||||
benchmark_manager &operator=(benchmark_manager &&) = delete;
|
||||
|
||||
benchmark_vector m_benchmarks;
|
||||
};
|
||||
|
||||
@@ -43,21 +43,18 @@ benchmark_base &benchmark_manager::add(std::unique_ptr<benchmark_base> bench)
|
||||
benchmark_manager::benchmark_vector benchmark_manager::clone_benchmarks() const
|
||||
{
|
||||
benchmark_vector result(m_benchmarks.size());
|
||||
std::transform(m_benchmarks.cbegin(),
|
||||
m_benchmarks.cend(),
|
||||
result.begin(),
|
||||
[](const auto &bench) { return bench->clone(); });
|
||||
std::transform(m_benchmarks.cbegin(), m_benchmarks.cend(), result.begin(), [](const auto &bench) {
|
||||
return bench->clone();
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
const benchmark_base &
|
||||
benchmark_manager::get_benchmark(const std::string &name) const
|
||||
const benchmark_base &benchmark_manager::get_benchmark(const std::string &name) const
|
||||
{
|
||||
auto iter = std::find_if(m_benchmarks.cbegin(),
|
||||
m_benchmarks.cend(),
|
||||
[&name](const auto &bench_ptr) {
|
||||
return bench_ptr->get_name() == name;
|
||||
});
|
||||
auto iter =
|
||||
std::find_if(m_benchmarks.cbegin(), m_benchmarks.cend(), [&name](const auto &bench_ptr) {
|
||||
return bench_ptr->get_name() == name;
|
||||
});
|
||||
if (iter == m_benchmarks.cend())
|
||||
{
|
||||
NVBENCH_THROW(std::out_of_range, "No benchmark named '{}'.", name);
|
||||
|
||||
@@ -42,8 +42,8 @@ __global__ void block_stream(const volatile nvbench::int32_t *flag,
|
||||
nvbench::float64_t timeout)
|
||||
{
|
||||
const auto start_point = cuda::std::chrono::high_resolution_clock::now();
|
||||
const auto timeout_ns = cuda::std::chrono::nanoseconds(
|
||||
static_cast<nvbench::int64_t>(timeout * 1e9));
|
||||
const auto timeout_ns =
|
||||
cuda::std::chrono::nanoseconds(static_cast<nvbench::int64_t>(timeout * 1e9));
|
||||
const auto timeout_point = start_point + timeout_ns;
|
||||
|
||||
const bool use_timeout = timeout >= 0.;
|
||||
@@ -57,41 +57,40 @@ __global__ void block_stream(const volatile nvbench::int32_t *flag,
|
||||
{
|
||||
*timeout_flag = 1;
|
||||
__threadfence_system(); // Ensure timeout flag visibility on host.
|
||||
printf(
|
||||
"\n"
|
||||
"######################################################################\n"
|
||||
"##################### Possible Deadlock Detected #####################\n"
|
||||
"######################################################################\n"
|
||||
"\n"
|
||||
"Forcing unblock: The current measurement appears to have deadlocked\n"
|
||||
"and the results cannot be trusted.\n"
|
||||
"\n"
|
||||
"This happens when the KernelLauncher synchronizes the CUDA device.\n"
|
||||
"If this is the case, pass the `sync` exec_tag to the `exec` call:\n"
|
||||
"\n"
|
||||
" state.exec(<KernelLauncher>); // Deadlock\n"
|
||||
" state.exec(nvbench::exec_tag::sync, <KernelLauncher>); // Safe\n"
|
||||
"\n"
|
||||
"This tells NVBench about the sync so it can run the benchmark safely.\n"
|
||||
"\n"
|
||||
"If the KernelLauncher does not synchronize but has a very long \n"
|
||||
"execution time, this may be a false positive. If so, disable this\n"
|
||||
"check with:\n"
|
||||
"\n"
|
||||
" state.set_blocking_kernel_timeout(-1);\n"
|
||||
"\n"
|
||||
"The current timeout is set to %0.5g seconds.\n"
|
||||
"\n"
|
||||
"For more information, see the 'Benchmarks that sync' section of the\n"
|
||||
"NVBench documentation.\n"
|
||||
"\n"
|
||||
"If this happens while profiling with an external tool,\n"
|
||||
"pass the `--disable-blocking-kernel` flag or the `--profile` flag\n"
|
||||
"(to also only run the benchmark once) to the executable.\n"
|
||||
"\n"
|
||||
"For more information, see the 'Benchmark Properties' section of the\n"
|
||||
"NVBench documentation.\n\n",
|
||||
timeout);
|
||||
printf("\n"
|
||||
"######################################################################\n"
|
||||
"##################### Possible Deadlock Detected #####################\n"
|
||||
"######################################################################\n"
|
||||
"\n"
|
||||
"Forcing unblock: The current measurement appears to have deadlocked\n"
|
||||
"and the results cannot be trusted.\n"
|
||||
"\n"
|
||||
"This happens when the KernelLauncher synchronizes the CUDA device.\n"
|
||||
"If this is the case, pass the `sync` exec_tag to the `exec` call:\n"
|
||||
"\n"
|
||||
" state.exec(<KernelLauncher>); // Deadlock\n"
|
||||
" state.exec(nvbench::exec_tag::sync, <KernelLauncher>); // Safe\n"
|
||||
"\n"
|
||||
"This tells NVBench about the sync so it can run the benchmark safely.\n"
|
||||
"\n"
|
||||
"If the KernelLauncher does not synchronize but has a very long \n"
|
||||
"execution time, this may be a false positive. If so, disable this\n"
|
||||
"check with:\n"
|
||||
"\n"
|
||||
" state.set_blocking_kernel_timeout(-1);\n"
|
||||
"\n"
|
||||
"The current timeout is set to %0.5g seconds.\n"
|
||||
"\n"
|
||||
"For more information, see the 'Benchmarks that sync' section of the\n"
|
||||
"NVBench documentation.\n"
|
||||
"\n"
|
||||
"If this happens while profiling with an external tool,\n"
|
||||
"pass the `--disable-blocking-kernel` flag or the `--profile` flag\n"
|
||||
"(to also only run the benchmark once) to the executable.\n"
|
||||
"\n"
|
||||
"For more information, see the 'Benchmark Properties' section of the\n"
|
||||
"NVBench documentation.\n\n",
|
||||
timeout);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -102,15 +101,11 @@ namespace nvbench
|
||||
|
||||
blocking_kernel::blocking_kernel()
|
||||
{
|
||||
NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_flag,
|
||||
sizeof(m_host_flag),
|
||||
cudaHostRegisterMapped));
|
||||
NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_flag, sizeof(m_host_flag), cudaHostRegisterMapped));
|
||||
NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_flag, &m_host_flag, 0));
|
||||
NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_timeout_flag,
|
||||
sizeof(m_host_timeout_flag),
|
||||
cudaHostRegisterMapped));
|
||||
NVBENCH_CUDA_CALL(
|
||||
cudaHostGetDevicePointer(&m_device_timeout_flag, &m_host_timeout_flag, 0));
|
||||
cudaHostRegister(&m_host_timeout_flag, sizeof(m_host_timeout_flag), cudaHostRegisterMapped));
|
||||
NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_timeout_flag, &m_host_timeout_flag, 0));
|
||||
}
|
||||
|
||||
blocking_kernel::~blocking_kernel()
|
||||
@@ -119,14 +114,11 @@ blocking_kernel::~blocking_kernel()
|
||||
NVBENCH_CUDA_CALL_NOEXCEPT(cudaHostUnregister(&m_host_timeout_flag));
|
||||
}
|
||||
|
||||
void blocking_kernel::block(const nvbench::cuda_stream &stream,
|
||||
nvbench::float64_t timeout)
|
||||
void blocking_kernel::block(const nvbench::cuda_stream &stream, nvbench::float64_t timeout)
|
||||
{
|
||||
m_host_flag = 0;
|
||||
m_host_timeout_flag = 0;
|
||||
block_stream<<<1, 1, 0, stream>>>(m_device_flag,
|
||||
m_device_timeout_flag,
|
||||
timeout);
|
||||
block_stream<<<1, 1, 0, stream>>>(m_device_flag, m_device_timeout_flag, timeout);
|
||||
}
|
||||
|
||||
void blocking_kernel::timeout_detected()
|
||||
|
||||
@@ -97,10 +97,10 @@ struct blocking_kernel
|
||||
}
|
||||
|
||||
// move-only
|
||||
blocking_kernel(const blocking_kernel &) = delete;
|
||||
blocking_kernel(blocking_kernel &&) = default;
|
||||
blocking_kernel(const blocking_kernel &) = delete;
|
||||
blocking_kernel(blocking_kernel &&) = default;
|
||||
blocking_kernel &operator=(const blocking_kernel &) = delete;
|
||||
blocking_kernel &operator=(blocking_kernel &&) = default;
|
||||
blocking_kernel &operator=(blocking_kernel &&) = default;
|
||||
|
||||
private:
|
||||
nvbench::int32_t m_host_flag{};
|
||||
|
||||
@@ -30,35 +30,29 @@ struct state;
|
||||
// Define a simple callable wrapper around a function. This allows the function
|
||||
// to be used as a class template parameter. Intended for use with kernel
|
||||
// generators and `NVBENCH_BENCH` macros.
|
||||
#define NVBENCH_DEFINE_UNIQUE_CALLABLE(function) \
|
||||
#define NVBENCH_DEFINE_UNIQUE_CALLABLE(function) \
|
||||
NVBENCH_DEFINE_CALLABLE(function, NVBENCH_UNIQUE_IDENTIFIER(function))
|
||||
|
||||
#define NVBENCH_DEFINE_CALLABLE(function, callable_name) \
|
||||
struct callable_name \
|
||||
{ \
|
||||
void operator()(nvbench::state &state, nvbench::type_list<>) \
|
||||
{ \
|
||||
function(state); \
|
||||
} \
|
||||
#define NVBENCH_DEFINE_CALLABLE(function, callable_name) \
|
||||
struct callable_name \
|
||||
{ \
|
||||
void operator()(nvbench::state &state, nvbench::type_list<>) { function(state); } \
|
||||
}
|
||||
|
||||
#define NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(function) \
|
||||
NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, \
|
||||
NVBENCH_UNIQUE_IDENTIFIER(function))
|
||||
#define NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(function) \
|
||||
NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, NVBENCH_UNIQUE_IDENTIFIER(function))
|
||||
|
||||
#define NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, callable_name) \
|
||||
struct callable_name \
|
||||
{ \
|
||||
template <typename... Ts> \
|
||||
void operator()(nvbench::state &state, nvbench::type_list<Ts...>) \
|
||||
{ \
|
||||
function(state, nvbench::type_list<Ts...>{}); \
|
||||
} \
|
||||
#define NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, callable_name) \
|
||||
struct callable_name \
|
||||
{ \
|
||||
template <typename... Ts> \
|
||||
void operator()(nvbench::state &state, nvbench::type_list<Ts...>) \
|
||||
{ \
|
||||
function(state, nvbench::type_list<Ts...>{}); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define NVBENCH_UNIQUE_IDENTIFIER(prefix) \
|
||||
NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, __LINE__)
|
||||
#define NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, unique_id) \
|
||||
#define NVBENCH_UNIQUE_IDENTIFIER(prefix) NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, __LINE__)
|
||||
#define NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, unique_id) \
|
||||
NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id)
|
||||
#define NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id) \
|
||||
prefix##_line_##unique_id
|
||||
#define NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id) prefix##_line_##unique_id
|
||||
|
||||
@@ -30,27 +30,20 @@ struct cpu_timer
|
||||
__forceinline__ cpu_timer() = default;
|
||||
|
||||
// move-only
|
||||
cpu_timer(const cpu_timer &) = delete;
|
||||
cpu_timer(cpu_timer &&) = default;
|
||||
cpu_timer(const cpu_timer &) = delete;
|
||||
cpu_timer(cpu_timer &&) = default;
|
||||
cpu_timer &operator=(const cpu_timer &) = delete;
|
||||
cpu_timer &operator=(cpu_timer &&) = default;
|
||||
cpu_timer &operator=(cpu_timer &&) = default;
|
||||
|
||||
__forceinline__ void start()
|
||||
{
|
||||
m_start = std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
__forceinline__ void start() { m_start = std::chrono::high_resolution_clock::now(); }
|
||||
|
||||
__forceinline__ void stop()
|
||||
{
|
||||
m_stop = std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
__forceinline__ void stop() { m_stop = std::chrono::high_resolution_clock::now(); }
|
||||
|
||||
// In seconds:
|
||||
[[nodiscard]] __forceinline__ nvbench::float64_t get_duration()
|
||||
{
|
||||
const auto duration = m_stop - m_start;
|
||||
const auto ns =
|
||||
std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
|
||||
const auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
|
||||
return static_cast<nvbench::float64_t>(ns) * (1e-9);
|
||||
}
|
||||
|
||||
|
||||
@@ -27,19 +27,17 @@
|
||||
|
||||
#define NVBENCH_TYPE_AXES(...) nvbench::type_list<__VA_ARGS__>
|
||||
|
||||
#define NVBENCH_BENCH(KernelGenerator) \
|
||||
NVBENCH_DEFINE_UNIQUE_CALLABLE(KernelGenerator); \
|
||||
nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) = \
|
||||
nvbench::benchmark_manager::get() \
|
||||
.add(std::make_unique< \
|
||||
nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator)>>()) \
|
||||
#define NVBENCH_BENCH(KernelGenerator) \
|
||||
NVBENCH_DEFINE_UNIQUE_CALLABLE(KernelGenerator); \
|
||||
nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) = \
|
||||
nvbench::benchmark_manager::get() \
|
||||
.add(std::make_unique<nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator)>>()) \
|
||||
.set_name(#KernelGenerator)
|
||||
|
||||
#define NVBENCH_BENCH_TYPES(KernelGenerator, TypeAxes) \
|
||||
NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(KernelGenerator); \
|
||||
nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) = \
|
||||
nvbench::benchmark_manager::get() \
|
||||
.add(std::make_unique< \
|
||||
nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator), \
|
||||
TypeAxes>>()) \
|
||||
#define NVBENCH_BENCH_TYPES(KernelGenerator, TypeAxes) \
|
||||
NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(KernelGenerator); \
|
||||
nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) = \
|
||||
nvbench::benchmark_manager::get() \
|
||||
.add(std::make_unique< \
|
||||
nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator), TypeAxes>>()) \
|
||||
.set_name(#KernelGenerator)
|
||||
|
||||
@@ -66,10 +66,8 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
{
|
||||
std::optional<nvbench::device_info> device = cur_state.get_device();
|
||||
|
||||
std::string device_id = device ? fmt::to_string(device->get_id())
|
||||
: std::string{};
|
||||
std::string device_name = device ? std::string{device->get_name()}
|
||||
: std::string{};
|
||||
std::string device_id = device ? fmt::to_string(device->get_id()) : std::string{};
|
||||
std::string device_name = device ? std::string{device->get_name()} : std::string{};
|
||||
|
||||
table.add_cell(row, "_bench_name", "Benchmark", bench_name);
|
||||
table.add_cell(row, "_device_id", "Device", std::move(device_id));
|
||||
@@ -88,15 +86,11 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
name + "_axis_pow2_pretty",
|
||||
name + " (pow2)",
|
||||
fmt::format("2^{}", exponent));
|
||||
table.add_cell(row,
|
||||
name + "_axis_plain",
|
||||
fmt::format("{}", name),
|
||||
fmt::to_string(value));
|
||||
table.add_cell(row, name + "_axis_plain", fmt::format("{}", name), fmt::to_string(value));
|
||||
}
|
||||
else
|
||||
{
|
||||
std::string value = std::visit(format_visitor,
|
||||
axis_values.get_value(name));
|
||||
std::string value = std::visit(format_visitor, axis_values.get_value(name));
|
||||
table.add_cell(row, name + "_axis", name, std::move(value));
|
||||
}
|
||||
}
|
||||
@@ -117,14 +111,10 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
continue;
|
||||
}
|
||||
const std::string &tag = summ.get_tag();
|
||||
const std::string &header = summ.has_value("name")
|
||||
? summ.get_string("name")
|
||||
: tag;
|
||||
const std::string &header = summ.has_value("name") ? summ.get_string("name") : tag;
|
||||
|
||||
const std::string hint = summ.has_value("hint")
|
||||
? summ.get_string("hint")
|
||||
: std::string{};
|
||||
std::string value = std::visit(format_visitor, summ.get_value("value"));
|
||||
const std::string hint = summ.has_value("hint") ? summ.get_string("hint") : std::string{};
|
||||
std::string value = std::visit(format_visitor, summ.get_value("value"));
|
||||
if (hint == "duration")
|
||||
{
|
||||
table.add_cell(row, tag, header + " (sec)", std::move(value));
|
||||
@@ -182,10 +172,7 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
std::size_t remaining = table.m_columns.size();
|
||||
for (const auto &col : table.m_columns)
|
||||
{
|
||||
fmt::format_to(buffer,
|
||||
"{}{}",
|
||||
col.rows[i],
|
||||
(--remaining == 0) ? "" : ",");
|
||||
fmt::format_to(buffer, "{}{}", col.rows[i], (--remaining == 0) ? "" : ",");
|
||||
}
|
||||
fmt::format_to(buffer, "\n");
|
||||
}
|
||||
|
||||
@@ -16,8 +16,8 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <nvbench/cuda_call.cuh>
|
||||
#include <nvbench/config.cuh>
|
||||
#include <nvbench/cuda_call.cuh>
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
@@ -66,11 +66,7 @@ void throw_error(const std::string &filename,
|
||||
command));
|
||||
}
|
||||
#else
|
||||
void throw_error(const std::string &,
|
||||
std::size_t,
|
||||
const std::string &,
|
||||
CUresult)
|
||||
{}
|
||||
void throw_error(const std::string &, std::size_t, const std::string &, CUresult) {}
|
||||
#endif
|
||||
|
||||
void exit_error(const std::string &filename,
|
||||
|
||||
@@ -18,52 +18,43 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
/// Throws a std::runtime_error if `call` doesn't return `cudaSuccess`.
|
||||
#define NVBENCH_CUDA_CALL(call) \
|
||||
do \
|
||||
{ \
|
||||
const cudaError_t nvbench_cuda_call_error = call; \
|
||||
if (nvbench_cuda_call_error != cudaSuccess) \
|
||||
{ \
|
||||
nvbench::cuda_call::throw_error(__FILE__, \
|
||||
__LINE__, \
|
||||
#call, \
|
||||
nvbench_cuda_call_error); \
|
||||
} \
|
||||
#define NVBENCH_CUDA_CALL(call) \
|
||||
do \
|
||||
{ \
|
||||
const cudaError_t nvbench_cuda_call_error = call; \
|
||||
if (nvbench_cuda_call_error != cudaSuccess) \
|
||||
{ \
|
||||
nvbench::cuda_call::throw_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
/// Throws a std::runtime_error if `call` doesn't return `CUDA_SUCCESS`.
|
||||
#define NVBENCH_DRIVER_API_CALL(call) \
|
||||
do \
|
||||
{ \
|
||||
const CUresult nvbench_cuda_call_error = call; \
|
||||
if (nvbench_cuda_call_error != CUDA_SUCCESS) \
|
||||
{ \
|
||||
nvbench::cuda_call::throw_error(__FILE__, \
|
||||
__LINE__, \
|
||||
#call, \
|
||||
nvbench_cuda_call_error); \
|
||||
} \
|
||||
#define NVBENCH_DRIVER_API_CALL(call) \
|
||||
do \
|
||||
{ \
|
||||
const CUresult nvbench_cuda_call_error = call; \
|
||||
if (nvbench_cuda_call_error != CUDA_SUCCESS) \
|
||||
{ \
|
||||
nvbench::cuda_call::throw_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
/// Terminates process with failure status if `call` doesn't return
|
||||
/// `cudaSuccess`.
|
||||
#define NVBENCH_CUDA_CALL_NOEXCEPT(call) \
|
||||
do \
|
||||
{ \
|
||||
const cudaError_t nvbench_cuda_call_error = call; \
|
||||
if (nvbench_cuda_call_error != cudaSuccess) \
|
||||
{ \
|
||||
nvbench::cuda_call::exit_error(__FILE__, \
|
||||
__LINE__, \
|
||||
#call, \
|
||||
nvbench_cuda_call_error); \
|
||||
} \
|
||||
#define NVBENCH_CUDA_CALL_NOEXCEPT(call) \
|
||||
do \
|
||||
{ \
|
||||
const cudaError_t nvbench_cuda_call_error = call; \
|
||||
if (nvbench_cuda_call_error != cudaSuccess) \
|
||||
{ \
|
||||
nvbench::cuda_call::exit_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
namespace nvbench::cuda_call
|
||||
|
||||
@@ -66,10 +66,10 @@ struct cuda_stream
|
||||
~cuda_stream() = default;
|
||||
|
||||
// move-only
|
||||
cuda_stream(const cuda_stream &) = delete;
|
||||
cuda_stream(const cuda_stream &) = delete;
|
||||
cuda_stream &operator=(const cuda_stream &) = delete;
|
||||
cuda_stream(cuda_stream &&) = default;
|
||||
cuda_stream &operator=(cuda_stream &&) = default;
|
||||
cuda_stream &operator=(cuda_stream &&) = default;
|
||||
|
||||
/**
|
||||
* @return The `cudaStream_t` managed by this `cuda_stream`.
|
||||
|
||||
@@ -42,10 +42,10 @@ struct cuda_timer
|
||||
}
|
||||
|
||||
// move-only
|
||||
cuda_timer(const cuda_timer &) = delete;
|
||||
cuda_timer(cuda_timer &&) = default;
|
||||
cuda_timer(const cuda_timer &) = delete;
|
||||
cuda_timer(cuda_timer &&) = default;
|
||||
cuda_timer &operator=(const cuda_timer &) = delete;
|
||||
cuda_timer &operator=(cuda_timer &&) = default;
|
||||
cuda_timer &operator=(cuda_timer &&) = default;
|
||||
|
||||
__forceinline__ void start(cudaStream_t stream)
|
||||
{
|
||||
|
||||
@@ -21,14 +21,13 @@
|
||||
#include <nvbench/config.cuh>
|
||||
#include <nvbench/device_info.cuh>
|
||||
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <optional>
|
||||
|
||||
namespace nvbench::detail
|
||||
{
|
||||
|
||||
|
||||
#ifdef NVBENCH_HAS_CUPTI
|
||||
/**
|
||||
* Pass required metrics in the constructor and organize your code as follows
|
||||
@@ -62,7 +61,7 @@ namespace nvbench::detail
|
||||
*/
|
||||
class cupti_profiler
|
||||
{
|
||||
bool m_available {};
|
||||
bool m_available{};
|
||||
std::string m_chip_name;
|
||||
|
||||
// Counter data
|
||||
@@ -87,11 +86,10 @@ public:
|
||||
cupti_profiler(cupti_profiler &&) noexcept;
|
||||
cupti_profiler &operator=(cupti_profiler &&) noexcept;
|
||||
|
||||
cupti_profiler(const cupti_profiler &) = delete;
|
||||
cupti_profiler(const cupti_profiler &) = delete;
|
||||
cupti_profiler &operator=(const cupti_profiler &) = delete;
|
||||
|
||||
cupti_profiler(nvbench::device_info device,
|
||||
std::vector<std::string> &&metric_names);
|
||||
cupti_profiler(nvbench::device_info device, std::vector<std::string> &&metric_names);
|
||||
~cupti_profiler();
|
||||
|
||||
[[nodiscard]] bool is_initialized() const;
|
||||
@@ -125,5 +123,4 @@ private:
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
} // namespace nvbench::detail
|
||||
|
||||
@@ -59,8 +59,7 @@ void nvpw_call(const NVPA_Status status)
|
||||
|
||||
} // namespace
|
||||
|
||||
cupti_profiler::cupti_profiler(nvbench::device_info device,
|
||||
std::vector<std::string> &&metric_names)
|
||||
cupti_profiler::cupti_profiler(nvbench::device_info device, std::vector<std::string> &&metric_names)
|
||||
: m_metric_names(metric_names)
|
||||
, m_device(device)
|
||||
{
|
||||
@@ -154,12 +153,10 @@ class eval_request
|
||||
NVPW_MetricsEvaluator *evaluator_ptr;
|
||||
|
||||
public:
|
||||
eval_request(NVPW_MetricsEvaluator *evaluator_ptr,
|
||||
const std::string &metric_name)
|
||||
eval_request(NVPW_MetricsEvaluator *evaluator_ptr, const std::string &metric_name)
|
||||
: evaluator_ptr(evaluator_ptr)
|
||||
{
|
||||
NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params params =
|
||||
{};
|
||||
NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params params = {};
|
||||
|
||||
params.structSize =
|
||||
NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params_STRUCT_SIZE;
|
||||
@@ -168,8 +165,7 @@ public:
|
||||
params.pMetricEvalRequest = &request;
|
||||
params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
|
||||
|
||||
nvpw_call(
|
||||
NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(¶ms));
|
||||
nvpw_call(NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(¶ms));
|
||||
}
|
||||
|
||||
[[nodiscard]] std::vector<const char *> get_raw_dependencies()
|
||||
@@ -178,10 +174,9 @@ public:
|
||||
|
||||
NVPW_MetricsEvaluator_GetMetricRawDependencies_Params params{};
|
||||
|
||||
params.structSize =
|
||||
NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE;
|
||||
params.pMetricsEvaluator = evaluator_ptr;
|
||||
params.pMetricEvalRequests = &request;
|
||||
params.structSize = NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE;
|
||||
params.pMetricsEvaluator = evaluator_ptr;
|
||||
params.pMetricEvalRequests = &request;
|
||||
params.numMetricEvalRequests = 1;
|
||||
params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
|
||||
params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest);
|
||||
@@ -211,26 +206,23 @@ public:
|
||||
const std::uint8_t *counter_data_image = nullptr,
|
||||
const std::size_t counter_data_image_size = 0)
|
||||
{
|
||||
NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params
|
||||
scratch_buffer_param{};
|
||||
NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params scratch_buffer_param{};
|
||||
|
||||
scratch_buffer_param.structSize =
|
||||
NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE;
|
||||
scratch_buffer_param.pChipName = chip_name.c_str();
|
||||
scratch_buffer_param.pCounterAvailabilityImage = counter_availability_image;
|
||||
|
||||
nvpw_call(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(
|
||||
&scratch_buffer_param));
|
||||
nvpw_call(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(&scratch_buffer_param));
|
||||
|
||||
scratch_buffer.resize(scratch_buffer_param.scratchBufferSize);
|
||||
|
||||
NVPW_CUDA_MetricsEvaluator_Initialize_Params evaluator_params{};
|
||||
|
||||
evaluator_params.structSize =
|
||||
NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE;
|
||||
evaluator_params.scratchBufferSize = scratch_buffer.size();
|
||||
evaluator_params.pScratchBuffer = scratch_buffer.data();
|
||||
evaluator_params.pChipName = chip_name.c_str();
|
||||
evaluator_params.structSize = NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE;
|
||||
evaluator_params.scratchBufferSize = scratch_buffer.size();
|
||||
evaluator_params.pScratchBuffer = scratch_buffer.data();
|
||||
evaluator_params.pChipName = chip_name.c_str();
|
||||
evaluator_params.pCounterAvailabilityImage = counter_availability_image;
|
||||
evaluator_params.pCounterDataImage = counter_data_image;
|
||||
evaluator_params.counterDataImageSize = counter_data_image_size;
|
||||
@@ -247,7 +239,7 @@ public:
|
||||
{
|
||||
NVPW_MetricsEvaluator_Destroy_Params params{};
|
||||
|
||||
params.structSize = NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE;
|
||||
params.structSize = NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE;
|
||||
params.pMetricsEvaluator = evaluator_ptr;
|
||||
|
||||
nvpw_call(NVPW_MetricsEvaluator_Destroy(¶ms));
|
||||
@@ -259,10 +251,7 @@ public:
|
||||
return {evaluator_ptr, metric_name};
|
||||
}
|
||||
|
||||
[[nodiscard]] operator NVPW_MetricsEvaluator *() const
|
||||
{
|
||||
return evaluator_ptr;
|
||||
}
|
||||
[[nodiscard]] operator NVPW_MetricsEvaluator *() const { return evaluator_ptr; }
|
||||
};
|
||||
|
||||
} // namespace
|
||||
@@ -270,10 +259,10 @@ public:
|
||||
namespace
|
||||
{
|
||||
|
||||
[[nodiscard]] std::vector<NVPA_RawMetricRequest> get_raw_metric_requests(
|
||||
const std::string &chip_name,
|
||||
const std::vector<std::string> &metric_names,
|
||||
const std::uint8_t *counter_availability_image = nullptr)
|
||||
[[nodiscard]] std::vector<NVPA_RawMetricRequest>
|
||||
get_raw_metric_requests(const std::string &chip_name,
|
||||
const std::vector<std::string> &metric_names,
|
||||
const std::uint8_t *counter_availability_image = nullptr)
|
||||
{
|
||||
metric_evaluator evaluator(chip_name, counter_availability_image);
|
||||
|
||||
@@ -282,8 +271,7 @@ namespace
|
||||
|
||||
for (auto &metric_name : metric_names)
|
||||
{
|
||||
for (auto &raw_dependency :
|
||||
evaluator.create_request(metric_name).get_raw_dependencies())
|
||||
for (auto &raw_dependency : evaluator.create_request(metric_name).get_raw_dependencies())
|
||||
{
|
||||
raw_metric_names.push_back(raw_dependency);
|
||||
}
|
||||
@@ -295,10 +283,10 @@ namespace
|
||||
for (auto &raw_name : raw_metric_names)
|
||||
{
|
||||
NVPA_RawMetricRequest metricRequest{};
|
||||
metricRequest.structSize = NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE;
|
||||
metricRequest.pMetricName = raw_name;
|
||||
metricRequest.isolated = true;
|
||||
metricRequest.keepInstances = true;
|
||||
metricRequest.structSize = NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE;
|
||||
metricRequest.pMetricName = raw_name;
|
||||
metricRequest.isolated = true;
|
||||
metricRequest.keepInstances = true;
|
||||
raw_requests.push_back(metricRequest);
|
||||
}
|
||||
|
||||
@@ -309,12 +297,11 @@ class metrics_config
|
||||
{
|
||||
bool initialized{};
|
||||
|
||||
void create(const std::string &chip_name,
|
||||
const std::uint8_t *availability_image)
|
||||
void create(const std::string &chip_name, const std::uint8_t *availability_image)
|
||||
{
|
||||
NVPW_CUDA_RawMetricsConfig_Create_V2_Params params{};
|
||||
|
||||
params.structSize = NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE;
|
||||
params.structSize = NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE;
|
||||
params.activityKind = NVPA_ACTIVITY_KIND_PROFILER;
|
||||
params.pChipName = chip_name.c_str();
|
||||
params.pCounterAvailabilityImage = availability_image;
|
||||
@@ -329,9 +316,8 @@ class metrics_config
|
||||
{
|
||||
NVPW_RawMetricsConfig_SetCounterAvailability_Params params{};
|
||||
|
||||
params.structSize =
|
||||
NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE;
|
||||
params.pRawMetricsConfig = raw_metrics_config;
|
||||
params.structSize = NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE;
|
||||
params.pRawMetricsConfig = raw_metrics_config;
|
||||
params.pCounterAvailabilityImage = availability_image;
|
||||
|
||||
nvpw_call(NVPW_RawMetricsConfig_SetCounterAvailability(¶ms));
|
||||
@@ -341,7 +327,7 @@ class metrics_config
|
||||
{
|
||||
NVPW_RawMetricsConfig_BeginPassGroup_Params params{};
|
||||
|
||||
params.structSize = NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE;
|
||||
params.structSize = NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE;
|
||||
params.pRawMetricsConfig = raw_metrics_config;
|
||||
|
||||
nvpw_call(NVPW_RawMetricsConfig_BeginPassGroup(¶ms));
|
||||
@@ -351,7 +337,7 @@ class metrics_config
|
||||
{
|
||||
NVPW_RawMetricsConfig_AddMetrics_Params params{};
|
||||
|
||||
params.structSize = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE;
|
||||
params.structSize = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE;
|
||||
params.pRawMetricsConfig = raw_metrics_config;
|
||||
params.pRawMetricRequests = raw_metric_requests.data();
|
||||
params.numMetricRequests = raw_metric_requests.size();
|
||||
@@ -363,7 +349,7 @@ class metrics_config
|
||||
{
|
||||
NVPW_RawMetricsConfig_EndPassGroup_Params params{};
|
||||
|
||||
params.structSize = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE;
|
||||
params.structSize = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE;
|
||||
params.pRawMetricsConfig = raw_metrics_config;
|
||||
|
||||
nvpw_call(NVPW_RawMetricsConfig_EndPassGroup(¶ms));
|
||||
@@ -373,8 +359,7 @@ class metrics_config
|
||||
{
|
||||
NVPW_RawMetricsConfig_GenerateConfigImage_Params params{};
|
||||
|
||||
params.structSize =
|
||||
NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE;
|
||||
params.structSize = NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE;
|
||||
params.pRawMetricsConfig = raw_metrics_config;
|
||||
|
||||
nvpw_call(NVPW_RawMetricsConfig_GenerateConfigImage(¶ms));
|
||||
@@ -398,7 +383,7 @@ public:
|
||||
{
|
||||
NVPW_RawMetricsConfig_GetConfigImage_Params params{};
|
||||
|
||||
params.structSize = NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE;
|
||||
params.structSize = NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE;
|
||||
params.pRawMetricsConfig = raw_metrics_config;
|
||||
params.bytesAllocated = 0;
|
||||
params.pBuffer = nullptr;
|
||||
@@ -419,7 +404,7 @@ public:
|
||||
{
|
||||
NVPW_RawMetricsConfig_Destroy_Params params{};
|
||||
|
||||
params.structSize = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE;
|
||||
params.structSize = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE;
|
||||
params.pRawMetricsConfig = raw_metrics_config;
|
||||
|
||||
NVPW_RawMetricsConfig_Destroy(¶ms);
|
||||
@@ -433,13 +418,12 @@ public:
|
||||
|
||||
void cupti_profiler::initialize_config_image()
|
||||
{
|
||||
m_config_image =
|
||||
metrics_config(m_chip_name,
|
||||
get_raw_metric_requests(m_chip_name,
|
||||
m_metric_names,
|
||||
m_availability_image.data()),
|
||||
m_availability_image.data())
|
||||
.get_config_image();
|
||||
m_config_image = metrics_config(m_chip_name,
|
||||
get_raw_metric_requests(m_chip_name,
|
||||
m_metric_names,
|
||||
m_availability_image.data()),
|
||||
m_availability_image.data())
|
||||
.get_config_image();
|
||||
}
|
||||
|
||||
namespace
|
||||
@@ -450,12 +434,11 @@ class counter_data_builder
|
||||
bool initialized{};
|
||||
|
||||
public:
|
||||
counter_data_builder(const std::string &chip_name,
|
||||
const std::uint8_t *pCounterAvailabilityImage)
|
||||
counter_data_builder(const std::string &chip_name, const std::uint8_t *pCounterAvailabilityImage)
|
||||
{
|
||||
NVPW_CUDA_CounterDataBuilder_Create_Params params{};
|
||||
|
||||
params.structSize = NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE;
|
||||
params.structSize = NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE;
|
||||
params.pChipName = chip_name.c_str();
|
||||
params.pCounterAvailabilityImage = pCounterAvailabilityImage;
|
||||
|
||||
@@ -471,7 +454,7 @@ public:
|
||||
{
|
||||
NVPW_CounterDataBuilder_Destroy_Params params{};
|
||||
|
||||
params.structSize = NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE;
|
||||
params.structSize = NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE;
|
||||
params.pCounterDataBuilder = builder;
|
||||
|
||||
NVPW_CounterDataBuilder_Destroy(¶ms);
|
||||
@@ -488,16 +471,14 @@ void cupti_profiler::initialize_counter_data_prefix_image()
|
||||
const std::uint8_t *counter_availability_image = nullptr;
|
||||
|
||||
std::vector<NVPA_RawMetricRequest> raw_metric_requests =
|
||||
get_raw_metric_requests(m_chip_name,
|
||||
m_metric_names,
|
||||
counter_availability_image);
|
||||
get_raw_metric_requests(m_chip_name, m_metric_names, counter_availability_image);
|
||||
|
||||
counter_data_builder data_builder(m_chip_name, counter_availability_image);
|
||||
|
||||
{
|
||||
NVPW_CounterDataBuilder_AddMetrics_Params params{};
|
||||
|
||||
params.structSize = NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE;
|
||||
params.structSize = NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE;
|
||||
params.pCounterDataBuilder = data_builder.builder;
|
||||
params.pRawMetricRequests = raw_metric_requests.data();
|
||||
params.numMetricRequests = raw_metric_requests.size();
|
||||
@@ -508,8 +489,7 @@ void cupti_profiler::initialize_counter_data_prefix_image()
|
||||
{
|
||||
NVPW_CounterDataBuilder_GetCounterDataPrefix_Params params{};
|
||||
|
||||
params.structSize =
|
||||
NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE;
|
||||
params.structSize = NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE;
|
||||
params.pCounterDataBuilder = data_builder.builder;
|
||||
params.bytesAllocated = 0;
|
||||
params.pBuffer = nullptr;
|
||||
@@ -532,11 +512,9 @@ get_counter_data_image_size(CUpti_Profiler_CounterDataImageOptions *options)
|
||||
{
|
||||
CUpti_Profiler_CounterDataImage_CalculateSize_Params params{};
|
||||
|
||||
params.structSize =
|
||||
CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE;
|
||||
params.pOptions = options;
|
||||
params.sizeofCounterDataImageOptions =
|
||||
CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
|
||||
params.structSize = CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE;
|
||||
params.pOptions = options;
|
||||
params.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
|
||||
|
||||
cupti_call(cuptiProfilerCounterDataImageCalculateSize(¶ms));
|
||||
return params.counterDataImageSize;
|
||||
@@ -559,12 +537,10 @@ void cupti_profiler::initialize_counter_data_image()
|
||||
{
|
||||
CUpti_Profiler_CounterDataImage_Initialize_Params params{};
|
||||
|
||||
params.structSize =
|
||||
CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE;
|
||||
params.sizeofCounterDataImageOptions =
|
||||
CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
|
||||
params.pOptions = &counter_data_image_options;
|
||||
params.counterDataImageSize = m_data_image.size();
|
||||
params.structSize = CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE;
|
||||
params.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
|
||||
params.pOptions = &counter_data_image_options;
|
||||
params.counterDataImageSize = m_data_image.size();
|
||||
|
||||
params.pCounterDataImage = &m_data_image[0];
|
||||
cupti_call(cuptiProfilerCounterDataImageInitialize(¶ms));
|
||||
@@ -578,8 +554,7 @@ void cupti_profiler::initialize_counter_data_image()
|
||||
params.counterDataImageSize = m_data_image.size();
|
||||
params.pCounterDataImage = &m_data_image[0];
|
||||
|
||||
cupti_call(
|
||||
cuptiProfilerCounterDataImageCalculateScratchBufferSize(¶ms));
|
||||
cupti_call(cuptiProfilerCounterDataImageCalculateScratchBufferSize(¶ms));
|
||||
|
||||
m_data_scratch_buffer.resize(params.counterDataScratchBufferSize);
|
||||
}
|
||||
@@ -587,8 +562,7 @@ void cupti_profiler::initialize_counter_data_image()
|
||||
{
|
||||
CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params params{};
|
||||
|
||||
params.structSize =
|
||||
CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE;
|
||||
params.structSize = CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE;
|
||||
params.counterDataImageSize = m_data_image.size();
|
||||
params.pCounterDataImage = &m_data_image[0];
|
||||
params.counterDataScratchBufferSize = m_data_scratch_buffer.size();
|
||||
@@ -608,17 +582,14 @@ cupti_profiler::~cupti_profiler()
|
||||
}
|
||||
}
|
||||
|
||||
bool cupti_profiler::is_initialized() const
|
||||
{
|
||||
return m_available;
|
||||
}
|
||||
bool cupti_profiler::is_initialized() const { return m_available; }
|
||||
|
||||
void cupti_profiler::prepare_user_loop()
|
||||
{
|
||||
{
|
||||
CUpti_Profiler_BeginSession_Params params{};
|
||||
|
||||
params.structSize = CUpti_Profiler_BeginSession_Params_STRUCT_SIZE;
|
||||
params.structSize = CUpti_Profiler_BeginSession_Params_STRUCT_SIZE;
|
||||
params.ctx = nullptr;
|
||||
params.counterDataImageSize = m_data_image.size();
|
||||
params.pCounterDataImage = &m_data_image[0];
|
||||
@@ -735,9 +706,7 @@ std::vector<double> cupti_profiler::get_counter_values()
|
||||
|
||||
if (params.numRanges != 1)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"{}",
|
||||
"Something's gone wrong, one range is expected");
|
||||
NVBENCH_THROW(std::runtime_error, "{}", "Something's gone wrong, one range is expected");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -752,8 +721,7 @@ std::vector<double> cupti_profiler::get_counter_values()
|
||||
{
|
||||
NVPW_MetricsEvaluator_SetDeviceAttributes_Params params{};
|
||||
|
||||
params.structSize =
|
||||
NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE;
|
||||
params.structSize = NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE;
|
||||
params.pMetricsEvaluator = evaluator;
|
||||
params.pCounterDataImage = m_data_image.data();
|
||||
params.counterDataImageSize = m_data_image.size();
|
||||
@@ -764,11 +732,10 @@ std::vector<double> cupti_profiler::get_counter_values()
|
||||
{
|
||||
NVPW_MetricsEvaluator_EvaluateToGpuValues_Params params{};
|
||||
|
||||
params.structSize =
|
||||
NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE;
|
||||
params.pMetricsEvaluator = evaluator;
|
||||
params.pMetricEvalRequests = &request.request;
|
||||
params.numMetricEvalRequests = 1;
|
||||
params.structSize = NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE;
|
||||
params.pMetricsEvaluator = evaluator;
|
||||
params.pMetricEvalRequests = &request.request;
|
||||
params.numMetricEvalRequests = 1;
|
||||
params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
|
||||
params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest);
|
||||
params.pCounterDataImage = m_data_image.data();
|
||||
|
||||
@@ -39,9 +39,9 @@ struct [[maybe_unused]] device_scope
|
||||
~device_scope() { NVBENCH_CUDA_CALL(cudaSetDevice(m_old_device_id)); }
|
||||
|
||||
// move-only
|
||||
device_scope(device_scope &&) = default;
|
||||
device_scope &operator=(device_scope &&) = default;
|
||||
device_scope(const device_scope &) = delete;
|
||||
device_scope(device_scope &&) = default;
|
||||
device_scope &operator=(device_scope &&) = default;
|
||||
device_scope(const device_scope &) = delete;
|
||||
device_scope &operator=(const device_scope &) = delete;
|
||||
|
||||
private:
|
||||
|
||||
@@ -33,7 +33,7 @@ namespace detail
|
||||
template <typename KernelLauncher>
|
||||
struct kernel_launch_timer_wrapper
|
||||
{
|
||||
explicit kernel_launch_timer_wrapper(KernelLauncher &launcher)
|
||||
explicit kernel_launch_timer_wrapper(KernelLauncher &launcher)
|
||||
: m_kernel_launcher{launcher}
|
||||
{}
|
||||
|
||||
|
||||
@@ -31,13 +31,12 @@ struct l2flush
|
||||
{
|
||||
int dev_id{};
|
||||
NVBENCH_CUDA_CALL(cudaGetDevice(&dev_id));
|
||||
NVBENCH_CUDA_CALL(
|
||||
cudaDeviceGetAttribute(&m_l2_size, cudaDevAttrL2CacheSize, dev_id));
|
||||
NVBENCH_CUDA_CALL(cudaDeviceGetAttribute(&m_l2_size, cudaDevAttrL2CacheSize, dev_id));
|
||||
if (m_l2_size > 0)
|
||||
{
|
||||
void* buffer = m_l2_buffer;
|
||||
void *buffer = m_l2_buffer;
|
||||
NVBENCH_CUDA_CALL(cudaMalloc(&buffer, m_l2_size));
|
||||
m_l2_buffer = reinterpret_cast<int*>(buffer);
|
||||
m_l2_buffer = reinterpret_cast<int *>(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -54,15 +54,11 @@ void measure_cold_base::check()
|
||||
const auto device = m_state.get_device();
|
||||
if (!device)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"{}",
|
||||
"Device required for `cold` measurement.");
|
||||
NVBENCH_THROW(std::runtime_error, "{}", "Device required for `cold` measurement.");
|
||||
}
|
||||
if (!device->is_active())
|
||||
{ // This means something went wrong higher up. Throw an error.
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"{}",
|
||||
"Internal error: Current device is not active.");
|
||||
NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,13 +88,11 @@ void measure_cold_base::record_measurements()
|
||||
++m_total_samples;
|
||||
|
||||
// Compute convergence statistics using CUDA timings:
|
||||
const auto mean_cuda_time = m_total_cuda_time /
|
||||
static_cast<nvbench::float64_t>(m_total_samples);
|
||||
const auto cuda_stdev =
|
||||
nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
|
||||
m_cuda_times.cend(),
|
||||
mean_cuda_time);
|
||||
auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
|
||||
const auto mean_cuda_time = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
|
||||
const auto cuda_stdev = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
|
||||
m_cuda_times.cend(),
|
||||
mean_cuda_time);
|
||||
auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
|
||||
if (std::isfinite(cuda_rel_stdev))
|
||||
{
|
||||
m_noise_tracker.push_back(cuda_rel_stdev);
|
||||
@@ -132,10 +126,10 @@ bool measure_cold_base::is_finished()
|
||||
{
|
||||
// Use the current noise as the stdev reference.
|
||||
const auto current_noise = m_noise_tracker.back();
|
||||
const auto noise_stdev = nvbench::detail::statistics::standard_deviation(
|
||||
m_noise_tracker.cbegin(),
|
||||
m_noise_tracker.cend(),
|
||||
current_noise);
|
||||
const auto noise_stdev =
|
||||
nvbench::detail::statistics::standard_deviation(m_noise_tracker.cbegin(),
|
||||
m_noise_tracker.cend(),
|
||||
current_noise);
|
||||
const auto noise_rel_stdev = noise_stdev / current_noise;
|
||||
|
||||
// If the rel stdev of the last N cuda noise measurements is less than
|
||||
@@ -162,13 +156,11 @@ bool measure_cold_base::is_finished()
|
||||
void measure_cold_base::run_trials_epilogue()
|
||||
{
|
||||
// Only need to compute this at the end, not per iteration.
|
||||
const auto cpu_mean = m_total_cuda_time /
|
||||
static_cast<nvbench::float64_t>(m_total_samples);
|
||||
const auto cpu_stdev =
|
||||
nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
|
||||
m_cpu_times.cend(),
|
||||
cpu_mean);
|
||||
m_cpu_noise = cpu_stdev / cpu_mean;
|
||||
const auto cpu_mean = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
|
||||
const auto cpu_stdev = nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
|
||||
m_cpu_times.cend(),
|
||||
cpu_mean);
|
||||
m_cpu_noise = cpu_stdev / cpu_mean;
|
||||
|
||||
m_walltime_timer.stop();
|
||||
}
|
||||
@@ -199,8 +191,7 @@ void measure_cold_base::generate_summaries()
|
||||
auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/relative");
|
||||
summ.set_string("name", "Noise");
|
||||
summ.set_string("hint", "percentage");
|
||||
summ.set_string("description",
|
||||
"Relative standard deviation of isolated CPU times");
|
||||
summ.set_string("description", "Relative standard deviation of isolated CPU times");
|
||||
summ.set_float64("value", m_cpu_noise);
|
||||
}
|
||||
|
||||
@@ -219,12 +210,10 @@ void measure_cold_base::generate_summaries()
|
||||
auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/relative");
|
||||
summ.set_string("name", "Noise");
|
||||
summ.set_string("hint", "percentage");
|
||||
summ.set_string("description",
|
||||
"Relative standard deviation of isolated GPU times");
|
||||
summ.set_string("description", "Relative standard deviation of isolated GPU times");
|
||||
summ.set_float64("value",
|
||||
m_noise_tracker.empty()
|
||||
? std::numeric_limits<nvbench::float64_t>::infinity()
|
||||
: m_noise_tracker.back());
|
||||
m_noise_tracker.empty() ? std::numeric_limits<nvbench::float64_t>::infinity()
|
||||
: m_noise_tracker.back());
|
||||
}
|
||||
|
||||
if (const auto items = m_state.get_element_count(); items != 0)
|
||||
@@ -232,8 +221,7 @@ void measure_cold_base::generate_summaries()
|
||||
auto &summ = m_state.add_summary("nv/cold/bw/item_rate");
|
||||
summ.set_string("name", "Elem/s");
|
||||
summ.set_string("hint", "item_rate");
|
||||
summ.set_string("description",
|
||||
"Number of input elements processed per second");
|
||||
summ.set_string("description", "Number of input elements processed per second");
|
||||
summ.set_float64("value", static_cast<double>(items) / avg_cuda_time);
|
||||
}
|
||||
|
||||
@@ -251,8 +239,8 @@ void measure_cold_base::generate_summaries()
|
||||
}
|
||||
|
||||
{
|
||||
const auto peak_gmem_bw = static_cast<double>(
|
||||
m_state.get_device()->get_global_memory_bus_bandwidth());
|
||||
const auto peak_gmem_bw =
|
||||
static_cast<double>(m_state.get_device()->get_global_memory_bus_bandwidth());
|
||||
|
||||
auto &summ = m_state.add_summary("nv/cold/bw/global/utilization");
|
||||
summ.set_string("name", "BWUtil");
|
||||
@@ -274,8 +262,7 @@ void measure_cold_base::generate_summaries()
|
||||
}
|
||||
|
||||
// Log if a printer exists:
|
||||
if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
|
||||
printer_opt_ref.has_value())
|
||||
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
|
||||
{
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
|
||||
@@ -324,10 +311,7 @@ void measure_cold_base::generate_summaries()
|
||||
m_walltime_timer.get_duration(),
|
||||
m_total_samples));
|
||||
|
||||
printer.process_bulk_data(m_state,
|
||||
"nv/cold/sample_times",
|
||||
"sample_times",
|
||||
m_cuda_times);
|
||||
printer.process_bulk_data(m_state, "nv/cold/sample_times", "sample_times", m_cuda_times);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -49,10 +49,10 @@ namespace detail
|
||||
struct measure_cold_base
|
||||
{
|
||||
explicit measure_cold_base(nvbench::state &exec_state);
|
||||
measure_cold_base(const measure_cold_base &) = delete;
|
||||
measure_cold_base(measure_cold_base &&) = delete;
|
||||
measure_cold_base(const measure_cold_base &) = delete;
|
||||
measure_cold_base(measure_cold_base &&) = delete;
|
||||
measure_cold_base &operator=(const measure_cold_base &) = delete;
|
||||
measure_cold_base &operator=(measure_cold_base &&) = delete;
|
||||
measure_cold_base &operator=(measure_cold_base &&) = delete;
|
||||
|
||||
protected:
|
||||
template <bool use_blocking_kernel>
|
||||
@@ -68,10 +68,7 @@ protected:
|
||||
|
||||
void check_skip_time(nvbench::float64_t warmup_time);
|
||||
|
||||
__forceinline__ void flush_device_l2()
|
||||
{
|
||||
m_l2flush.flush(m_launch.get_stream());
|
||||
}
|
||||
__forceinline__ void flush_device_l2() { m_l2flush.flush(m_launch.get_stream()); }
|
||||
|
||||
__forceinline__ void sync_stream() const
|
||||
{
|
||||
|
||||
@@ -50,8 +50,7 @@ struct metric_traits;
|
||||
template <>
|
||||
struct metric_traits<metric_id::dram_peak_sustained_throughput>
|
||||
{
|
||||
static constexpr const char *metric_name =
|
||||
"dram__throughput.avg.pct_of_peak_sustained_elapsed";
|
||||
static constexpr const char *metric_name = "dram__throughput.avg.pct_of_peak_sustained_elapsed";
|
||||
|
||||
static constexpr const char *name = "HBWPeak";
|
||||
static constexpr const char *hint = "percentage";
|
||||
@@ -119,10 +118,7 @@ struct metric_traits<metric_id::l1_hit_rate>
|
||||
static constexpr const char *description = "Hit rate at L1 cache.";
|
||||
static constexpr double divider = 100.0;
|
||||
|
||||
static bool is_collected(nvbench::state &m_state)
|
||||
{
|
||||
return m_state.is_l1_hit_rate_collected();
|
||||
};
|
||||
static bool is_collected(nvbench::state &m_state) { return m_state.is_l1_hit_rate_collected(); };
|
||||
};
|
||||
|
||||
template <>
|
||||
@@ -134,10 +130,7 @@ struct metric_traits<metric_id::l2_hit_rate>
|
||||
static constexpr const char *description = "Hit rate at L2 cache.";
|
||||
static constexpr double divider = 100.0;
|
||||
|
||||
static bool is_collected(nvbench::state &m_state)
|
||||
{
|
||||
return m_state.is_l2_hit_rate_collected();
|
||||
};
|
||||
static bool is_collected(nvbench::state &m_state) { return m_state.is_l2_hit_rate_collected(); };
|
||||
};
|
||||
|
||||
template <metric_id id = metric_id::dram_peak_sustained_throughput>
|
||||
@@ -153,8 +146,7 @@ void add_metrics_impl(nvbench::state &state, std::vector<std::string> &metrics)
|
||||
}
|
||||
|
||||
template <>
|
||||
void add_metrics_impl<metric_id::count>(nvbench::state &,
|
||||
std::vector<std::string> &)
|
||||
void add_metrics_impl<metric_id::count>(nvbench::state &, std::vector<std::string> &)
|
||||
{}
|
||||
|
||||
std::vector<std::string> add_metrics(nvbench::state &state)
|
||||
@@ -179,13 +171,11 @@ try
|
||||
// clang-format on
|
||||
catch (const std::exception &ex)
|
||||
{
|
||||
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
|
||||
printer_opt_ref)
|
||||
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref)
|
||||
{
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
printer.log(nvbench::log_level::warn,
|
||||
fmt::format("CUPTI failed to construct profiler: {}",
|
||||
ex.what()));
|
||||
fmt::format("CUPTI failed to construct profiler: {}", ex.what()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -194,15 +184,11 @@ void measure_cupti_base::check()
|
||||
const auto device = m_state.get_device();
|
||||
if (!device)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"{}",
|
||||
"Device required for `cupti` measurement.");
|
||||
NVBENCH_THROW(std::runtime_error, "{}", "Device required for `cupti` measurement.");
|
||||
}
|
||||
if (!device->is_active())
|
||||
{ // This means something went wrong higher up. Throw an error.
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"{}",
|
||||
"Internal error: Current device is not active.");
|
||||
NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -210,16 +196,13 @@ namespace
|
||||
{
|
||||
|
||||
template <metric_id id = metric_id::dram_peak_sustained_throughput>
|
||||
void gen_summary(std::size_t result_id,
|
||||
nvbench::state &m_state,
|
||||
const std::vector<double> &result)
|
||||
void gen_summary(std::size_t result_id, nvbench::state &m_state, const std::vector<double> &result)
|
||||
{
|
||||
using metric = metric_traits<id>;
|
||||
|
||||
if (metric::is_collected(m_state))
|
||||
{
|
||||
auto &summ =
|
||||
m_state.add_summary(fmt::format("nv/cupti/{}", metric::metric_name));
|
||||
auto &summ = m_state.add_summary(fmt::format("nv/cupti/{}", metric::metric_name));
|
||||
summ.set_string("name", metric::name);
|
||||
summ.set_string("hint", metric::hint);
|
||||
summ.set_string("description", metric::description);
|
||||
@@ -231,9 +214,7 @@ void gen_summary(std::size_t result_id,
|
||||
}
|
||||
|
||||
template <>
|
||||
void gen_summary<metric_id::count>(std::size_t,
|
||||
nvbench::state &,
|
||||
const std::vector<double> &)
|
||||
void gen_summary<metric_id::count>(std::size_t, nvbench::state &, const std::vector<double> &)
|
||||
{}
|
||||
|
||||
void gen_summaries(nvbench::state &state, const std::vector<double> &result)
|
||||
@@ -266,8 +247,7 @@ try
|
||||
}
|
||||
|
||||
// Log if a printer exists:
|
||||
if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
|
||||
printer_opt_ref.has_value())
|
||||
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
|
||||
{
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
printer.log(nvbench::log_level::pass,
|
||||
@@ -278,13 +258,11 @@ try
|
||||
}
|
||||
catch (const std::exception &ex)
|
||||
{
|
||||
if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
|
||||
printer_opt_ref)
|
||||
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref)
|
||||
{
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
printer.log(nvbench::log_level::warn,
|
||||
fmt::format("CUPTI failed to generate the summary: {}",
|
||||
ex.what()));
|
||||
fmt::format("CUPTI failed to generate the summary: {}", ex.what()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -50,10 +50,10 @@ namespace detail
|
||||
struct measure_cupti_base
|
||||
{
|
||||
explicit measure_cupti_base(nvbench::state &exec_state);
|
||||
measure_cupti_base(const measure_cupti_base &) = delete;
|
||||
measure_cupti_base(measure_cupti_base &&) = delete;
|
||||
measure_cupti_base(const measure_cupti_base &) = delete;
|
||||
measure_cupti_base(measure_cupti_base &&) = delete;
|
||||
measure_cupti_base &operator=(const measure_cupti_base &) = delete;
|
||||
measure_cupti_base &operator=(measure_cupti_base &&) = delete;
|
||||
measure_cupti_base &operator=(measure_cupti_base &&) = delete;
|
||||
|
||||
protected:
|
||||
struct kernel_launch_timer;
|
||||
@@ -61,10 +61,7 @@ protected:
|
||||
void check();
|
||||
void generate_summaries();
|
||||
|
||||
__forceinline__ void flush_device_l2()
|
||||
{
|
||||
m_l2flush.flush(m_launch.get_stream());
|
||||
}
|
||||
__forceinline__ void flush_device_l2() { m_l2flush.flush(m_launch.get_stream()); }
|
||||
|
||||
__forceinline__ void sync_stream() const
|
||||
{
|
||||
|
||||
@@ -47,9 +47,8 @@ measure_hot_base::measure_hot_base(state &exec_state)
|
||||
// to match the cold result if available.
|
||||
try
|
||||
{
|
||||
nvbench::int64_t cold_samples =
|
||||
m_state.get_summary("nv/cold/sample_size").get_int64("value");
|
||||
m_min_samples = std::max(m_min_samples, cold_samples);
|
||||
nvbench::int64_t cold_samples = m_state.get_summary("nv/cold/sample_size").get_int64("value");
|
||||
m_min_samples = std::max(m_min_samples, cold_samples);
|
||||
|
||||
// If the cold measurement ran successfully, disable skip_time. It'd just
|
||||
// be annoying to skip now.
|
||||
@@ -72,15 +71,11 @@ void measure_hot_base::check()
|
||||
const auto device = m_state.get_device();
|
||||
if (!device)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"{}",
|
||||
"Device required for `hot` measurement.");
|
||||
NVBENCH_THROW(std::runtime_error, "{}", "Device required for `hot` measurement.");
|
||||
}
|
||||
if (!device->is_active())
|
||||
{ // This means something went wrong higher up. Throw an error.
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"{}",
|
||||
"Internal error: Current device is not active.");
|
||||
NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -116,8 +111,7 @@ void measure_hot_base::generate_summaries()
|
||||
}
|
||||
|
||||
// Log if a printer exists:
|
||||
if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
|
||||
printer_opt_ref.has_value())
|
||||
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
|
||||
{
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
|
||||
|
||||
@@ -41,10 +41,10 @@ namespace detail
|
||||
struct measure_hot_base
|
||||
{
|
||||
explicit measure_hot_base(nvbench::state &exec_state);
|
||||
measure_hot_base(const measure_hot_base &) = delete;
|
||||
measure_hot_base(measure_hot_base &&) = delete;
|
||||
measure_hot_base(const measure_hot_base &) = delete;
|
||||
measure_hot_base(measure_hot_base &&) = delete;
|
||||
measure_hot_base &operator=(const measure_hot_base &) = delete;
|
||||
measure_hot_base &operator=(measure_hot_base &&) = delete;
|
||||
measure_hot_base &operator=(measure_hot_base &&) = delete;
|
||||
|
||||
protected:
|
||||
void check();
|
||||
@@ -131,7 +131,7 @@ private:
|
||||
// The .95 factor here pads the batch_size a bit to avoid needing a second
|
||||
// batch due to noise.
|
||||
const auto time_estimate = m_cuda_timer.get_duration() * 0.95;
|
||||
auto batch_size = static_cast<nvbench::int64_t>(m_min_time / time_estimate);
|
||||
auto batch_size = static_cast<nvbench::int64_t>(m_min_time / time_estimate);
|
||||
|
||||
do
|
||||
{
|
||||
@@ -142,7 +142,7 @@ private:
|
||||
// Block stream until some work is queued.
|
||||
// Limit the number of kernel executions while blocked to prevent
|
||||
// deadlocks. See warnings on blocking_kernel.
|
||||
const auto blocked_launches = std::min(batch_size, nvbench::int64_t{2});
|
||||
const auto blocked_launches = std::min(batch_size, nvbench::int64_t{2});
|
||||
const auto unblocked_launches = batch_size - blocked_launches;
|
||||
|
||||
this->block_stream();
|
||||
@@ -189,7 +189,6 @@ private:
|
||||
break; // Stop iterating
|
||||
}
|
||||
|
||||
|
||||
m_walltime_timer.stop();
|
||||
if (m_walltime_timer.get_duration() > m_timeout)
|
||||
{
|
||||
|
||||
@@ -58,18 +58,12 @@ struct ring_buffer
|
||||
/**
|
||||
* The number of valid values in the ring buffer. Always <= capacity().
|
||||
*/
|
||||
[[nodiscard]] std::size_t size() const
|
||||
{
|
||||
return m_full ? m_buffer.size() : m_index;
|
||||
}
|
||||
[[nodiscard]] std::size_t size() const { return m_full ? m_buffer.size() : m_index; }
|
||||
|
||||
/**
|
||||
* The maximum size of the ring buffer.
|
||||
*/
|
||||
[[nodiscard]] std::size_t capacity() const
|
||||
{
|
||||
return m_buffer.size();
|
||||
}
|
||||
[[nodiscard]] std::size_t capacity() const { return m_buffer.size(); }
|
||||
|
||||
/**
|
||||
* @return True if the ring buffer is empty.
|
||||
|
||||
@@ -53,7 +53,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
|
||||
"`ExecTags` argument must be a member (or combination of "
|
||||
"members) from nvbench::exec_tag.");
|
||||
|
||||
constexpr auto measure_tags = tags & measure_mask;
|
||||
constexpr auto measure_tags = tags & measure_mask;
|
||||
constexpr auto modifier_tags = tags & modifier_mask;
|
||||
|
||||
// "run once" is handled by the cold measurement:
|
||||
@@ -81,8 +81,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
|
||||
}
|
||||
else
|
||||
{
|
||||
this->exec(cold | hot | tags,
|
||||
std::forward<KernelLauncher>(kernel_launcher));
|
||||
this->exec(cold | hot | tags, std::forward<KernelLauncher>(kernel_launcher));
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -99,8 +98,8 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
|
||||
constexpr bool use_blocking_kernel = !(tags & no_block);
|
||||
if constexpr (tags & timer)
|
||||
{
|
||||
// Estimate bandwidth here
|
||||
#ifdef NVBENCH_HAS_CUPTI
|
||||
// Estimate bandwidth here
|
||||
#ifdef NVBENCH_HAS_CUPTI
|
||||
if constexpr (!(modifier_tags & run_once))
|
||||
{
|
||||
if (this->is_cupti_required())
|
||||
@@ -110,7 +109,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
|
||||
measure();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
using measure_t = nvbench::detail::measure_cold<KL, use_blocking_kernel>;
|
||||
measure_t measure{*this, kernel_launcher};
|
||||
@@ -121,8 +120,8 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
|
||||
using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
|
||||
wrapper_t wrapper{kernel_launcher};
|
||||
|
||||
// Estimate bandwidth here
|
||||
#ifdef NVBENCH_HAS_CUPTI
|
||||
// Estimate bandwidth here
|
||||
#ifdef NVBENCH_HAS_CUPTI
|
||||
if constexpr (!(modifier_tags & run_once))
|
||||
{
|
||||
if (this->is_cupti_required())
|
||||
@@ -132,10 +131,9 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
|
||||
measure();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
using measure_t =
|
||||
nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
|
||||
using measure_t = nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
|
||||
measure_t measure(*this, wrapper);
|
||||
measure();
|
||||
}
|
||||
@@ -143,12 +141,10 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
|
||||
|
||||
if constexpr (tags & hot)
|
||||
{
|
||||
static_assert(!(tags & sync),
|
||||
"Hot measurement doesn't support the `sync` exec_tag.");
|
||||
static_assert(!(tags & timer),
|
||||
"Hot measurement doesn't support the `timer` exec_tag.");
|
||||
static_assert(!(tags & sync), "Hot measurement doesn't support the `sync` exec_tag.");
|
||||
static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag.");
|
||||
constexpr bool use_blocking_kernel = !(tags & no_block);
|
||||
using measure_t = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
|
||||
using measure_t = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
|
||||
measure_t measure{*this, kernel_launcher};
|
||||
measure();
|
||||
}
|
||||
|
||||
@@ -40,9 +40,7 @@ void state_iterator::add_axis(const nvbench::axis_base &axis)
|
||||
this->add_axis(axis.get_name(), axis.get_type(), axis.get_size());
|
||||
}
|
||||
|
||||
void state_iterator::add_axis(std::string axis,
|
||||
nvbench::axis_type type,
|
||||
std::size_t size)
|
||||
void state_iterator::add_axis(std::string axis, nvbench::axis_type type, std::size_t size)
|
||||
{
|
||||
m_indices.push_back({std::move(axis), type, std::size_t{0}, size});
|
||||
}
|
||||
@@ -74,10 +72,7 @@ state_iterator::get_current_indices() const
|
||||
return m_indices;
|
||||
}
|
||||
|
||||
[[nodiscard]] bool state_iterator::iter_valid() const
|
||||
{
|
||||
return m_current < m_total;
|
||||
}
|
||||
[[nodiscard]] bool state_iterator::iter_valid() const { return m_current < m_total; }
|
||||
|
||||
void state_iterator::next()
|
||||
{
|
||||
@@ -102,7 +97,7 @@ state_generator::state_generator(const benchmark_base &bench)
|
||||
|
||||
void state_generator::build_axis_configs()
|
||||
{
|
||||
const axes_metadata &axes = m_benchmark.get_axes();
|
||||
const axes_metadata &axes = m_benchmark.get_axes();
|
||||
const std::vector<std::unique_ptr<axis_base>> &axes_vec = axes.get_axes();
|
||||
|
||||
// Construct two state_generators:
|
||||
@@ -118,35 +113,29 @@ void state_generator::build_axis_configs()
|
||||
type_axes.reserve(axes_vec.size());
|
||||
|
||||
// Filter all axes into type and non-type:
|
||||
std::for_each(axes_vec.cbegin(),
|
||||
axes_vec.cend(),
|
||||
[&non_type_si, &type_axes](const auto &axis) {
|
||||
if (axis->get_type() == nvbench::axis_type::type)
|
||||
{
|
||||
type_axes.push_back(
|
||||
std::cref(static_cast<const type_axis &>(*axis)));
|
||||
}
|
||||
else
|
||||
{
|
||||
non_type_si.add_axis(*axis);
|
||||
}
|
||||
});
|
||||
std::for_each(axes_vec.cbegin(), axes_vec.cend(), [&non_type_si, &type_axes](const auto &axis) {
|
||||
if (axis->get_type() == nvbench::axis_type::type)
|
||||
{
|
||||
type_axes.push_back(std::cref(static_cast<const type_axis &>(*axis)));
|
||||
}
|
||||
else
|
||||
{
|
||||
non_type_si.add_axis(*axis);
|
||||
}
|
||||
});
|
||||
|
||||
// Reverse sort type axes by index. This way the state_generator's cartesian
|
||||
// product of the type axes values will be enumerated in the same order as
|
||||
// nvbench::tl::cartesian_product<type_axes>. This is necessary to ensure
|
||||
// that the correct states are passed to the corresponding benchmark
|
||||
// instantiations.
|
||||
std::sort(type_axes.begin(),
|
||||
type_axes.end(),
|
||||
[](const auto &axis_1, const auto &axis_2) {
|
||||
return axis_1.get().get_axis_index() >
|
||||
axis_2.get().get_axis_index();
|
||||
});
|
||||
std::sort(type_axes.begin(), type_axes.end(), [](const auto &axis_1, const auto &axis_2) {
|
||||
return axis_1.get().get_axis_index() > axis_2.get().get_axis_index();
|
||||
});
|
||||
|
||||
std::for_each(type_axes.cbegin(),
|
||||
type_axes.cend(),
|
||||
[&type_si](const auto &axis) { type_si.add_axis(axis); });
|
||||
std::for_each(type_axes.cbegin(), type_axes.cend(), [&type_si](const auto &axis) {
|
||||
type_si.add_axis(axis);
|
||||
});
|
||||
}
|
||||
|
||||
// type_axis_configs generation:
|
||||
@@ -157,8 +146,8 @@ void state_generator::build_axis_configs()
|
||||
// Build type_axis_configs
|
||||
for (type_si.init(); type_si.iter_valid(); type_si.next())
|
||||
{
|
||||
auto &[config, active_mask] = m_type_axis_configs.emplace_back(
|
||||
std::make_pair(nvbench::named_values{}, true));
|
||||
auto &[config, active_mask] =
|
||||
m_type_axis_configs.emplace_back(std::make_pair(nvbench::named_values{}, true));
|
||||
|
||||
// Reverse the indices so they're once again in the same order as
|
||||
// specified:
|
||||
@@ -173,8 +162,7 @@ void state_generator::build_axis_configs()
|
||||
active_mask = false;
|
||||
}
|
||||
|
||||
config.set_string(axis_info.axis,
|
||||
axis.get_input_string(axis_info.index));
|
||||
config.set_string(axis_info.axis, axis.get_input_string(axis_info.index));
|
||||
}
|
||||
} // type_si
|
||||
} // type_axis_config generation
|
||||
@@ -199,21 +187,18 @@ void state_generator::build_axis_configs()
|
||||
break;
|
||||
|
||||
case axis_type::int64:
|
||||
config.set_int64(
|
||||
axis_info.axis,
|
||||
axes.get_int64_axis(axis_info.axis).get_value(axis_info.index));
|
||||
config.set_int64(axis_info.axis,
|
||||
axes.get_int64_axis(axis_info.axis).get_value(axis_info.index));
|
||||
break;
|
||||
|
||||
case axis_type::float64:
|
||||
config.set_float64(
|
||||
axis_info.axis,
|
||||
axes.get_float64_axis(axis_info.axis).get_value(axis_info.index));
|
||||
config.set_float64(axis_info.axis,
|
||||
axes.get_float64_axis(axis_info.axis).get_value(axis_info.index));
|
||||
break;
|
||||
|
||||
case axis_type::string:
|
||||
config.set_string(
|
||||
axis_info.axis,
|
||||
axes.get_string_axis(axis_info.axis).get_value(axis_info.index));
|
||||
config.set_string(axis_info.axis,
|
||||
axes.get_string_axis(axis_info.axis).get_value(axis_info.index));
|
||||
break;
|
||||
} // switch (type)
|
||||
} // for (axis_info : current_indices)
|
||||
@@ -239,15 +224,12 @@ void state_generator::build_states()
|
||||
}
|
||||
}
|
||||
|
||||
void state_generator::add_states_for_device(
|
||||
const std::optional<device_info> &device)
|
||||
void state_generator::add_states_for_device(const std::optional<device_info> &device)
|
||||
{
|
||||
const auto num_type_configs = m_type_axis_configs.size();
|
||||
for (std::size_t type_config_index = 0; type_config_index < num_type_configs;
|
||||
++type_config_index)
|
||||
for (std::size_t type_config_index = 0; type_config_index < num_type_configs; ++type_config_index)
|
||||
{
|
||||
const auto &[type_config,
|
||||
axis_mask] = m_type_axis_configs[type_config_index];
|
||||
const auto &[type_config, axis_mask] = m_type_axis_configs[type_config_index];
|
||||
|
||||
if (!axis_mask)
|
||||
{ // Don't generate inner vector if the type config is masked out.
|
||||
@@ -261,10 +243,7 @@ void state_generator::add_states_for_device(
|
||||
config.append(non_type_config);
|
||||
|
||||
// Create benchmark:
|
||||
m_states.push_back(nvbench::state{m_benchmark,
|
||||
std::move(config),
|
||||
device,
|
||||
type_config_index});
|
||||
m_states.push_back(nvbench::state{m_benchmark, std::move(config), device, type_config_index});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,8 +36,7 @@ namespace nvbench::detail::statistics
|
||||
*
|
||||
* If the input has fewer than 5 samples, infinity is returned.
|
||||
*/
|
||||
template <typename Iter,
|
||||
typename ValueType = typename std::iterator_traits<Iter>::value_type>
|
||||
template <typename Iter, typename ValueType = typename std::iterator_traits<Iter>::value_type>
|
||||
ValueType standard_deviation(Iter first, Iter last, ValueType mean)
|
||||
{
|
||||
static_assert(std::is_floating_point_v<ValueType>);
|
||||
|
||||
@@ -21,17 +21,15 @@
|
||||
#include <fmt/format.h>
|
||||
#include <stdexcept>
|
||||
|
||||
#define NVBENCH_THROW(exception_type, format_str, ...) \
|
||||
throw exception_type(fmt::format("{}:{}: {}", \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
fmt::format(format_str, __VA_ARGS__)))
|
||||
#define NVBENCH_THROW(exception_type, format_str, ...) \
|
||||
throw exception_type( \
|
||||
fmt::format("{}:{}: {}", __FILE__, __LINE__, fmt::format(format_str, __VA_ARGS__)))
|
||||
|
||||
#define NVBENCH_THROW_IF(condition, exception_type, format_str, ...) \
|
||||
do \
|
||||
{ \
|
||||
if (condition) \
|
||||
{ \
|
||||
NVBENCH_THROW(exception_type, format_str, __VA_ARGS__); \
|
||||
} \
|
||||
#define NVBENCH_THROW_IF(condition, exception_type, format_str, ...) \
|
||||
do \
|
||||
{ \
|
||||
if (condition) \
|
||||
{ \
|
||||
NVBENCH_THROW(exception_type, format_str, __VA_ARGS__); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
@@ -27,10 +27,7 @@
|
||||
namespace nvbench::detail
|
||||
{
|
||||
|
||||
template <typename InIterT,
|
||||
typename InitValueT,
|
||||
typename ReduceOp,
|
||||
typename TransformOp>
|
||||
template <typename InIterT, typename InitValueT, typename ReduceOp, typename TransformOp>
|
||||
InitValueT transform_reduce(InIterT first,
|
||||
InIterT last,
|
||||
InitValueT init,
|
||||
|
||||
@@ -20,12 +20,10 @@ namespace tl::detail
|
||||
{
|
||||
|
||||
template <typename... Ts>
|
||||
auto size(nvbench::type_list<Ts...>)
|
||||
-> std::integral_constant<std::size_t, sizeof...(Ts)>;
|
||||
auto size(nvbench::type_list<Ts...>) -> std::integral_constant<std::size_t, sizeof...(Ts)>;
|
||||
|
||||
template <std::size_t I, typename... Ts>
|
||||
auto get(nvbench::type_list<Ts...>)
|
||||
-> std::tuple_element_t<I, std::tuple<Ts...>>;
|
||||
auto get(nvbench::type_list<Ts...>) -> std::tuple_element_t<I, std::tuple<Ts...>>;
|
||||
|
||||
template <typename... Ts, typename... Us>
|
||||
auto concat(nvbench::type_list<Ts...>, nvbench::type_list<Us...>)
|
||||
@@ -44,9 +42,8 @@ struct prepend_each<T, nvbench::type_list<>>
|
||||
template <typename T, typename TL, typename... TLTail>
|
||||
struct prepend_each<T, nvbench::type_list<TL, TLTail...>>
|
||||
{
|
||||
using cur = decltype(detail::concat(nvbench::type_list<T>{}, TL{}));
|
||||
using next =
|
||||
typename detail::prepend_each<T, nvbench::type_list<TLTail...>>::type;
|
||||
using cur = decltype(detail::concat(nvbench::type_list<T>{}, TL{}));
|
||||
using next = typename detail::prepend_each<T, nvbench::type_list<TLTail...>>::type;
|
||||
using type = decltype(detail::concat(nvbench::type_list<cur>{}, next{}));
|
||||
};
|
||||
|
||||
@@ -71,23 +68,20 @@ struct cartesian_product<nvbench::type_list<nvbench::type_list<>, TLTail...>>
|
||||
template <typename T, typename... Ts>
|
||||
struct cartesian_product<nvbench::type_list<nvbench::type_list<T, Ts...>>>
|
||||
{
|
||||
using cur = nvbench::type_list<nvbench::type_list<T>>;
|
||||
using next =
|
||||
std::conditional_t<sizeof...(Ts) != 0,
|
||||
typename detail::cartesian_product<
|
||||
nvbench::type_list<nvbench::type_list<Ts...>>>::type,
|
||||
nvbench::type_list<>>;
|
||||
using cur = nvbench::type_list<nvbench::type_list<T>>;
|
||||
using next = std::conditional_t<
|
||||
sizeof...(Ts) != 0,
|
||||
typename detail::cartesian_product<nvbench::type_list<nvbench::type_list<Ts...>>>::type,
|
||||
nvbench::type_list<>>;
|
||||
using type = decltype(detail::concat(cur{}, next{}));
|
||||
};
|
||||
|
||||
template <typename T, typename... Tail, typename TL, typename... TLTail>
|
||||
struct cartesian_product<
|
||||
nvbench::type_list<nvbench::type_list<T, Tail...>, TL, TLTail...>>
|
||||
struct cartesian_product<nvbench::type_list<nvbench::type_list<T, Tail...>, TL, TLTail...>>
|
||||
{
|
||||
using tail_prod =
|
||||
typename detail::cartesian_product<nvbench::type_list<TL, TLTail...>>::type;
|
||||
using cur = typename detail::prepend_each<T, tail_prod>::type;
|
||||
using next = typename detail::cartesian_product<
|
||||
using tail_prod = typename detail::cartesian_product<nvbench::type_list<TL, TLTail...>>::type;
|
||||
using cur = typename detail::prepend_each<T, tail_prod>::type;
|
||||
using next = typename detail::cartesian_product<
|
||||
nvbench::type_list<nvbench::type_list<Tail...>, TL, TLTail...>>::type;
|
||||
using type = decltype(detail::concat(cur{}, next{}));
|
||||
};
|
||||
|
||||
@@ -65,17 +65,15 @@ void device_info::set_persistence_mode(bool state)
|
||||
#else // NVBENCH_HAS_NVML
|
||||
try
|
||||
{
|
||||
NVBENCH_NVML_CALL(nvmlDeviceSetPersistenceMode(
|
||||
m_nvml_device,
|
||||
state ? NVML_FEATURE_ENABLED : NVML_FEATURE_DISABLED));
|
||||
NVBENCH_NVML_CALL(
|
||||
nvmlDeviceSetPersistenceMode(m_nvml_device,
|
||||
state ? NVML_FEATURE_ENABLED : NVML_FEATURE_DISABLED));
|
||||
}
|
||||
catch (nvml::call_failed &e)
|
||||
{
|
||||
if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"{}",
|
||||
"Persistence mode is only supported on Linux.");
|
||||
NVBENCH_THROW(std::runtime_error, "{}", "Persistence mode is only supported on Linux.");
|
||||
}
|
||||
else if (e.get_error_code() == NVML_ERROR_NO_PERMISSION)
|
||||
{
|
||||
@@ -104,30 +102,26 @@ try
|
||||
break;
|
||||
|
||||
case clock_rate::base:
|
||||
NVBENCH_NVML_CALL(nvmlDeviceSetGpuLockedClocks(
|
||||
m_nvml_device,
|
||||
static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP),
|
||||
static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP)));
|
||||
NVBENCH_NVML_CALL(
|
||||
nvmlDeviceSetGpuLockedClocks(m_nvml_device,
|
||||
static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP),
|
||||
static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP)));
|
||||
break;
|
||||
|
||||
case clock_rate::maximum: {
|
||||
const auto max_mhz = static_cast<unsigned int>(
|
||||
this->get_sm_default_clock_rate() / (1000 * 1000));
|
||||
NVBENCH_NVML_CALL(
|
||||
nvmlDeviceSetGpuLockedClocks(m_nvml_device, max_mhz, max_mhz));
|
||||
const auto max_mhz =
|
||||
static_cast<unsigned int>(this->get_sm_default_clock_rate() / (1000 * 1000));
|
||||
NVBENCH_NVML_CALL(nvmlDeviceSetGpuLockedClocks(m_nvml_device, max_mhz, max_mhz));
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Unrecognized clock rate: {}",
|
||||
static_cast<int>(rate));
|
||||
NVBENCH_THROW(std::runtime_error, "Unrecognized clock rate: {}", static_cast<int>(rate));
|
||||
}
|
||||
}
|
||||
catch (nvml::call_failed &e)
|
||||
{
|
||||
if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED &&
|
||||
this->get_sm_version() < 700)
|
||||
if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED && this->get_sm_version() < 700)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"GPU clock rates can only be modified for Volta and later. "
|
||||
@@ -156,9 +150,7 @@ catch (nvml::call_failed &e)
|
||||
{
|
||||
if (!is_active())
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"{}",
|
||||
"get_context is called for inactive device");
|
||||
NVBENCH_THROW(std::runtime_error, "{}", "get_context is called for inactive device");
|
||||
}
|
||||
|
||||
CUcontext cu_context;
|
||||
|
||||
@@ -54,10 +54,7 @@ struct device_info
|
||||
[[nodiscard]] int get_id() const { return m_id; }
|
||||
|
||||
/// @return The name of the device.
|
||||
[[nodiscard]] std::string_view get_name() const
|
||||
{
|
||||
return std::string_view(m_prop.name);
|
||||
}
|
||||
[[nodiscard]] std::string_view get_name() const { return std::string_view(m_prop.name); }
|
||||
|
||||
[[nodiscard]] bool is_active() const
|
||||
{
|
||||
@@ -83,7 +80,6 @@ struct device_info
|
||||
/// @note Requires root / admin privileges.
|
||||
void set_persistence_mode(bool state);
|
||||
|
||||
|
||||
/// Symbolic values for special clock rates
|
||||
enum class clock_rate
|
||||
{
|
||||
@@ -101,10 +97,7 @@ struct device_info
|
||||
void lock_gpu_clocks(clock_rate rate);
|
||||
|
||||
/// @return The SM version of the current device as (major*100) + (minor*10).
|
||||
[[nodiscard]] int get_sm_version() const
|
||||
{
|
||||
return m_prop.major * 100 + m_prop.minor * 10;
|
||||
}
|
||||
[[nodiscard]] int get_sm_version() const { return m_prop.major * 100 + m_prop.minor * 10; }
|
||||
|
||||
/// @return The PTX version of the current device, e.g. sm_80 returns 800.
|
||||
[[nodiscard]] __forceinline__ int get_ptx_version() const
|
||||
@@ -119,46 +112,25 @@ struct device_info
|
||||
}
|
||||
|
||||
/// @return The number of physical streaming multiprocessors on this device.
|
||||
[[nodiscard]] int get_number_of_sms() const
|
||||
{
|
||||
return m_prop.multiProcessorCount;
|
||||
}
|
||||
[[nodiscard]] int get_number_of_sms() const { return m_prop.multiProcessorCount; }
|
||||
|
||||
/// @return The maximum number of resident blocks per SM.
|
||||
[[nodiscard]] int get_max_blocks_per_sm() const
|
||||
{
|
||||
return m_prop.maxBlocksPerMultiProcessor;
|
||||
}
|
||||
[[nodiscard]] int get_max_blocks_per_sm() const { return m_prop.maxBlocksPerMultiProcessor; }
|
||||
|
||||
/// @return The maximum number of resident threads per SM.
|
||||
[[nodiscard]] int get_max_threads_per_sm() const
|
||||
{
|
||||
return m_prop.maxThreadsPerMultiProcessor;
|
||||
}
|
||||
[[nodiscard]] int get_max_threads_per_sm() const { return m_prop.maxThreadsPerMultiProcessor; }
|
||||
|
||||
/// @return The maximum number of threads per block.
|
||||
[[nodiscard]] int get_max_threads_per_block() const
|
||||
{
|
||||
return m_prop.maxThreadsPerBlock;
|
||||
}
|
||||
[[nodiscard]] int get_max_threads_per_block() const { return m_prop.maxThreadsPerBlock; }
|
||||
|
||||
/// @return The number of registers per SM.
|
||||
[[nodiscard]] int get_registers_per_sm() const
|
||||
{
|
||||
return m_prop.regsPerMultiprocessor;
|
||||
}
|
||||
[[nodiscard]] int get_registers_per_sm() const { return m_prop.regsPerMultiprocessor; }
|
||||
|
||||
/// @return The number of registers per block.
|
||||
[[nodiscard]] int get_registers_per_block() const
|
||||
{
|
||||
return m_prop.regsPerBlock;
|
||||
}
|
||||
[[nodiscard]] int get_registers_per_block() const { return m_prop.regsPerBlock; }
|
||||
|
||||
/// @return The total number of bytes available in global memory.
|
||||
[[nodiscard]] std::size_t get_global_memory_size() const
|
||||
{
|
||||
return m_prop.totalGlobalMem;
|
||||
}
|
||||
[[nodiscard]] std::size_t get_global_memory_size() const { return m_prop.totalGlobalMem; }
|
||||
|
||||
struct memory_info
|
||||
{
|
||||
@@ -176,10 +148,7 @@ struct device_info
|
||||
}
|
||||
|
||||
/// @return The width of the global memory bus in bits.
|
||||
[[nodiscard]] int get_global_memory_bus_width() const
|
||||
{
|
||||
return m_prop.memoryBusWidth;
|
||||
}
|
||||
[[nodiscard]] int get_global_memory_bus_width() const { return m_prop.memoryBusWidth; }
|
||||
|
||||
//// @return The global memory bus bandwidth in bytes/sec.
|
||||
[[nodiscard]] std::size_t get_global_memory_bus_bandwidth() const
|
||||
@@ -201,10 +170,7 @@ struct device_info
|
||||
}
|
||||
|
||||
/// @return The available amount of shared memory in bytes per block.
|
||||
[[nodiscard]] std::size_t get_shared_memory_per_block() const
|
||||
{
|
||||
return m_prop.sharedMemPerBlock;
|
||||
}
|
||||
[[nodiscard]] std::size_t get_shared_memory_per_block() const { return m_prop.sharedMemPerBlock; }
|
||||
|
||||
/// @return True if ECC is enabled on this device.
|
||||
[[nodiscard]] bool get_ecc_state() const { return m_prop.ECCEnabled; }
|
||||
@@ -224,23 +190,11 @@ struct device_info
|
||||
#endif
|
||||
|
||||
/// @return A cached copy of the device's cudaDeviceProp.
|
||||
[[nodiscard]] const cudaDeviceProp &get_cuda_device_prop() const
|
||||
{
|
||||
return m_prop;
|
||||
}
|
||||
[[nodiscard]] const cudaDeviceProp &get_cuda_device_prop() const { return m_prop; }
|
||||
|
||||
[[nodiscard]] bool operator<(const device_info &o) const
|
||||
{
|
||||
return m_id < o.m_id;
|
||||
}
|
||||
[[nodiscard]] bool operator==(const device_info &o) const
|
||||
{
|
||||
return m_id == o.m_id;
|
||||
}
|
||||
[[nodiscard]] bool operator!=(const device_info &o) const
|
||||
{
|
||||
return m_id != o.m_id;
|
||||
}
|
||||
[[nodiscard]] bool operator<(const device_info &o) const { return m_id < o.m_id; }
|
||||
[[nodiscard]] bool operator==(const device_info &o) const { return m_id == o.m_id; }
|
||||
[[nodiscard]] bool operator!=(const device_info &o) const { return m_id != o.m_id; }
|
||||
|
||||
private:
|
||||
int m_id;
|
||||
@@ -267,11 +221,10 @@ try
|
||||
{
|
||||
nvbench::detail::device_scope _{dev_id};
|
||||
cudaFuncAttributes attr{};
|
||||
NVBENCH_CUDA_CALL(
|
||||
cudaFuncGetAttributes(&attr, ((const void*)nvbench::detail::noop_kernel_ptr) ));
|
||||
NVBENCH_CUDA_CALL(cudaFuncGetAttributes(&attr, ((const void *)nvbench::detail::noop_kernel_ptr)));
|
||||
return attr.ptxVersion * 10;
|
||||
}
|
||||
catch(...)
|
||||
catch (...)
|
||||
{ // Fail gracefully when no appropriate PTX is found for this device.
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -40,10 +40,7 @@ struct device_manager
|
||||
/**
|
||||
* @return The total number of detected CUDA devices.
|
||||
*/
|
||||
[[nodiscard]] int get_number_of_devices() const
|
||||
{
|
||||
return static_cast<int>(m_devices.size());
|
||||
}
|
||||
[[nodiscard]] int get_number_of_devices() const { return static_cast<int>(m_devices.size()); }
|
||||
|
||||
/**
|
||||
* @return The number of devices actually used by all benchmarks.
|
||||
@@ -57,39 +54,27 @@ struct device_manager
|
||||
/**
|
||||
* @return The device_info object corresponding to `id`.
|
||||
*/
|
||||
[[nodiscard]] const nvbench::device_info &get_device(int id)
|
||||
{
|
||||
return m_devices.at(id);
|
||||
}
|
||||
[[nodiscard]] const nvbench::device_info &get_device(int id) { return m_devices.at(id); }
|
||||
|
||||
/**
|
||||
* @return A vector containing device_info objects for all detected CUDA
|
||||
* devices.
|
||||
*/
|
||||
[[nodiscard]] const device_info_vector &get_devices() const
|
||||
{
|
||||
return m_devices;
|
||||
}
|
||||
[[nodiscard]] const device_info_vector &get_devices() const { return m_devices; }
|
||||
|
||||
/**
|
||||
* @return A vector containing device_info objects for devices that are
|
||||
* actively used by all benchmarks.
|
||||
* @note This is only valid after nvbench::option_parser::parse executes.
|
||||
*/
|
||||
[[nodiscard]] const device_info_vector &get_used_devices() const
|
||||
{
|
||||
return m_used_devices;
|
||||
}
|
||||
[[nodiscard]] const device_info_vector &get_used_devices() const { return m_used_devices; }
|
||||
|
||||
private:
|
||||
device_manager();
|
||||
|
||||
friend struct option_parser;
|
||||
|
||||
void set_used_devices(device_info_vector devices)
|
||||
{
|
||||
m_used_devices = std::move(devices);
|
||||
}
|
||||
void set_used_devices(device_info_vector devices) { m_used_devices = std::move(devices); }
|
||||
|
||||
device_info_vector m_devices;
|
||||
device_info_vector m_used_devices;
|
||||
|
||||
@@ -64,10 +64,7 @@ struct type_strings<nvbench::enum_type<Value, T>>
|
||||
return std::to_string(Value);
|
||||
}
|
||||
|
||||
static std::string description()
|
||||
{
|
||||
return nvbench::demangle<nvbench::enum_type<Value, T>>();
|
||||
}
|
||||
static std::string description() { return nvbench::demangle<nvbench::enum_type<Value, T>>(); }
|
||||
};
|
||||
|
||||
} // namespace nvbench
|
||||
@@ -86,15 +83,13 @@ struct type_strings<nvbench::enum_type<Value, T>>
|
||||
* \relatesalso enum_type_list
|
||||
* \relatesalso nvbench::enum_type_list
|
||||
*/
|
||||
#define NVBENCH_DECLARE_ENUM_TYPE_STRINGS(T, \
|
||||
input_generator, \
|
||||
description_generator) \
|
||||
namespace nvbench \
|
||||
{ \
|
||||
template <T Value> \
|
||||
struct type_strings<enum_type<Value, T>> \
|
||||
{ \
|
||||
static std::string input_string() { return input_generator(Value); } \
|
||||
static std::string description() { return description_generator(Value); } \
|
||||
}; \
|
||||
#define NVBENCH_DECLARE_ENUM_TYPE_STRINGS(T, input_generator, description_generator) \
|
||||
namespace nvbench \
|
||||
{ \
|
||||
template <T Value> \
|
||||
struct type_strings<enum_type<Value, T>> \
|
||||
{ \
|
||||
static std::string input_string() { return input_generator(Value); } \
|
||||
static std::string description() { return description_generator(Value); } \
|
||||
}; \
|
||||
}
|
||||
|
||||
@@ -31,16 +31,16 @@ enum class exec_flag
|
||||
none = 0x0,
|
||||
|
||||
// Modifiers:
|
||||
timer = 0x01, // KernelLauncher uses manual timing
|
||||
no_block = 0x02, // Disables use of `blocking_kernel`.
|
||||
sync = 0x04, // KernelLauncher has indicated that it will sync
|
||||
run_once = 0x08, // Only run the benchmark once (for profiling).
|
||||
timer = 0x01, // KernelLauncher uses manual timing
|
||||
no_block = 0x02, // Disables use of `blocking_kernel`.
|
||||
sync = 0x04, // KernelLauncher has indicated that it will sync
|
||||
run_once = 0x08, // Only run the benchmark once (for profiling).
|
||||
modifier_mask = timer | no_block | sync | run_once,
|
||||
|
||||
// Measurement types:
|
||||
cold = 0x0100, // measure_hot
|
||||
hot = 0x0200, // measure_cold
|
||||
measure_mask = cold | hot
|
||||
cold = 0x0100, // measure_hot
|
||||
hot = 0x0200, // measure_cold
|
||||
measure_mask = cold | hot
|
||||
};
|
||||
|
||||
} // namespace nvbench::detail
|
||||
@@ -120,7 +120,6 @@ constexpr inline auto timer = nvbench::exec_tag::impl::timer;
|
||||
|
||||
/// Modifier used to indicate that the KernelGenerator will perform CUDA
|
||||
/// synchronizations. Without this flag such benchmarks will deadlock.
|
||||
constexpr inline auto sync = nvbench::exec_tag::impl::no_block |
|
||||
nvbench::exec_tag::impl::sync;
|
||||
constexpr inline auto sync = nvbench::exec_tag::impl::no_block | nvbench::exec_tag::impl::sync;
|
||||
|
||||
} // namespace nvbench::exec_tag
|
||||
|
||||
@@ -20,24 +20,24 @@
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
#define NVBENCH_DECLARE_FLAGS(T) \
|
||||
constexpr inline T operator|(T v1, T v2) \
|
||||
{ \
|
||||
using UT = std::underlying_type_t<T>; \
|
||||
return static_cast<T>(static_cast<UT>(v1) | static_cast<UT>(v2)); \
|
||||
} \
|
||||
constexpr inline T operator&(T v1, T v2) \
|
||||
{ \
|
||||
using UT = std::underlying_type_t<T>; \
|
||||
return static_cast<T>(static_cast<UT>(v1) & static_cast<UT>(v2)); \
|
||||
} \
|
||||
constexpr inline T operator^(T v1, T v2) \
|
||||
{ \
|
||||
using UT = std::underlying_type_t<T>; \
|
||||
return static_cast<T>(static_cast<UT>(v1) ^ static_cast<UT>(v2)); \
|
||||
} \
|
||||
constexpr inline T operator~(T v1) \
|
||||
{ \
|
||||
using UT = std::underlying_type_t<T>; \
|
||||
return static_cast<T>(~static_cast<UT>(v1)); \
|
||||
#define NVBENCH_DECLARE_FLAGS(T) \
|
||||
constexpr inline T operator|(T v1, T v2) \
|
||||
{ \
|
||||
using UT = std::underlying_type_t<T>; \
|
||||
return static_cast<T>(static_cast<UT>(v1) | static_cast<UT>(v2)); \
|
||||
} \
|
||||
constexpr inline T operator&(T v1, T v2) \
|
||||
{ \
|
||||
using UT = std::underlying_type_t<T>; \
|
||||
return static_cast<T>(static_cast<UT>(v1) & static_cast<UT>(v2)); \
|
||||
} \
|
||||
constexpr inline T operator^(T v1, T v2) \
|
||||
{ \
|
||||
using UT = std::underlying_type_t<T>; \
|
||||
return static_cast<T>(static_cast<UT>(v1) ^ static_cast<UT>(v2)); \
|
||||
} \
|
||||
constexpr inline T operator~(T v1) \
|
||||
{ \
|
||||
using UT = std::underlying_type_t<T>; \
|
||||
return static_cast<T>(~static_cast<UT>(v1)); \
|
||||
}
|
||||
|
||||
@@ -36,20 +36,11 @@ struct float64_axis final : public axis_base
|
||||
|
||||
~float64_axis() final;
|
||||
|
||||
void set_inputs(std::vector<nvbench::float64_t> inputs)
|
||||
{
|
||||
m_values = std::move(inputs);
|
||||
}
|
||||
[[nodiscard]] nvbench::float64_t get_value(std::size_t i) const
|
||||
{
|
||||
return m_values[i];
|
||||
}
|
||||
void set_inputs(std::vector<nvbench::float64_t> inputs) { m_values = std::move(inputs); }
|
||||
[[nodiscard]] nvbench::float64_t get_value(std::size_t i) const { return m_values[i]; }
|
||||
|
||||
private:
|
||||
std::unique_ptr<axis_base> do_clone() const
|
||||
{
|
||||
return std::make_unique<float64_axis>(*this);
|
||||
}
|
||||
std::unique_ptr<axis_base> do_clone() const { return std::make_unique<float64_axis>(*this); }
|
||||
std::size_t do_get_size() const final { return m_values.size(); }
|
||||
std::string do_get_input_string(std::size_t i) const final;
|
||||
std::string do_get_description(std::size_t i) const final;
|
||||
|
||||
@@ -1,20 +1,20 @@
|
||||
/*
|
||||
* Copyright 2021 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
* Copyright 2021 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -25,7 +25,5 @@
|
||||
#define NVBENCH_GIT_SHA1 NVBench_GIT_SHA1
|
||||
#define NVBENCH_GIT_VERSION NVBench_GIT_VERSION
|
||||
#ifdef NVBench_GIT_IS_DIRTY
|
||||
# define NVBENCH_GIT_IS_DIRTY
|
||||
#define NVBENCH_GIT_IS_DIRTY
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@@ -58,28 +58,18 @@ struct int64_axis final : public axis_base
|
||||
return static_cast<bool>(m_flags & int64_axis_flags::power_of_two);
|
||||
}
|
||||
|
||||
void set_inputs(std::vector<int64_t> inputs,
|
||||
int64_axis_flags flags = int64_axis_flags::none);
|
||||
void set_inputs(std::vector<int64_t> inputs, int64_axis_flags flags = int64_axis_flags::none);
|
||||
|
||||
[[nodiscard]] const std::vector<int64_t> &get_inputs() const
|
||||
{
|
||||
return m_inputs;
|
||||
};
|
||||
[[nodiscard]] const std::vector<int64_t> &get_inputs() const { return m_inputs; };
|
||||
|
||||
[[nodiscard]] int64_t get_value(std::size_t i) const { return m_values[i]; };
|
||||
|
||||
[[nodiscard]] const std::vector<int64_t> &get_values() const
|
||||
{
|
||||
return m_values;
|
||||
};
|
||||
[[nodiscard]] const std::vector<int64_t> &get_values() const { return m_values; };
|
||||
|
||||
int64_axis_flags get_flags() const { return m_flags; }
|
||||
|
||||
// Helper functions for pow2 conversions:
|
||||
static nvbench::int64_t compute_pow2(nvbench::int64_t exponent)
|
||||
{
|
||||
return 1ll << exponent;
|
||||
}
|
||||
static nvbench::int64_t compute_pow2(nvbench::int64_t exponent) { return 1ll << exponent; }
|
||||
|
||||
// UB if value < 0.
|
||||
static nvbench::int64_t compute_log2(nvbench::int64_t value)
|
||||
@@ -95,10 +85,7 @@ struct int64_axis final : public axis_base
|
||||
};
|
||||
|
||||
private:
|
||||
std::unique_ptr<axis_base> do_clone() const
|
||||
{
|
||||
return std::make_unique<int64_axis>(*this);
|
||||
}
|
||||
std::unique_ptr<axis_base> do_clone() const { return std::make_unique<int64_axis>(*this); }
|
||||
std::size_t do_get_size() const final { return m_inputs.size(); }
|
||||
std::string do_get_input_string(std::size_t) const final;
|
||||
std::string do_get_description(std::size_t) const final;
|
||||
|
||||
@@ -66,9 +66,8 @@ std::string int64_axis::do_get_input_string(std::size_t i) const
|
||||
|
||||
std::string int64_axis::do_get_description(std::size_t i) const
|
||||
{
|
||||
return this->is_power_of_two()
|
||||
? fmt::format("2^{} = {}", m_inputs[i], m_values[i])
|
||||
: std::string{};
|
||||
return this->is_power_of_two() ? fmt::format("2^{} = {}", m_inputs[i], m_values[i])
|
||||
: std::string{};
|
||||
}
|
||||
|
||||
std::string_view int64_axis::do_get_flags_as_string() const
|
||||
|
||||
@@ -85,8 +85,7 @@ private:
|
||||
" {:^{}} ",
|
||||
col.header,
|
||||
col.max_width);
|
||||
iter =
|
||||
fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
|
||||
iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
|
||||
}
|
||||
return fmt::format_to(iter, "\n");
|
||||
}
|
||||
@@ -102,8 +101,7 @@ private:
|
||||
"{:-^{}}",
|
||||
"",
|
||||
col.max_width + 2);
|
||||
iter =
|
||||
fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
|
||||
iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
|
||||
}
|
||||
return fmt::format_to(iter, "\n");
|
||||
}
|
||||
@@ -116,8 +114,7 @@ private:
|
||||
|
||||
for (std::size_t row = 0; row < m_num_rows; ++row)
|
||||
{
|
||||
iter =
|
||||
fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
|
||||
iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
|
||||
for (const column &col : m_columns)
|
||||
{
|
||||
iter = fmt::format_to(iter,
|
||||
@@ -125,8 +122,7 @@ private:
|
||||
" {:>{}} ",
|
||||
col.rows[row],
|
||||
col.max_width);
|
||||
iter =
|
||||
fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
|
||||
iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
|
||||
} // cols
|
||||
|
||||
iter = fmt::format_to(iter, "\n");
|
||||
|
||||
@@ -74,10 +74,7 @@ struct call_failed : error
|
||||
|
||||
[[nodiscard]] nvmlReturn_t get_error_code() const { return m_error_code; }
|
||||
|
||||
[[nodiscard]] const std::string &get_error_string() const
|
||||
{
|
||||
return m_error_string;
|
||||
}
|
||||
[[nodiscard]] const std::string &get_error_string() const { return m_error_string; }
|
||||
|
||||
private:
|
||||
nvmlReturn_t m_error_code;
|
||||
@@ -90,30 +87,26 @@ private:
|
||||
|
||||
#ifdef NVBENCH_HAS_NVML
|
||||
|
||||
#define NVBENCH_NVML_CALL(call) \
|
||||
do \
|
||||
{ \
|
||||
const auto _rr = call; \
|
||||
if (_rr != NVML_SUCCESS) \
|
||||
{ \
|
||||
throw nvbench::nvml::call_failed(__FILE__, \
|
||||
__LINE__, \
|
||||
#call, \
|
||||
_rr, \
|
||||
nvmlErrorString(_rr)); \
|
||||
} \
|
||||
#define NVBENCH_NVML_CALL(call) \
|
||||
do \
|
||||
{ \
|
||||
const auto _rr = call; \
|
||||
if (_rr != NVML_SUCCESS) \
|
||||
{ \
|
||||
throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, nvmlErrorString(_rr)); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
// Same as above, but used for nvmlInit(), where a failure means that
|
||||
// nvmlErrorString is not available.
|
||||
#define NVBENCH_NVML_CALL_NO_API(call) \
|
||||
do \
|
||||
{ \
|
||||
const auto _rr = call; \
|
||||
if (_rr != NVML_SUCCESS) \
|
||||
{ \
|
||||
throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, ""); \
|
||||
} \
|
||||
#define NVBENCH_NVML_CALL_NO_API(call) \
|
||||
do \
|
||||
{ \
|
||||
const auto _rr = call; \
|
||||
if (_rr != NVML_SUCCESS) \
|
||||
{ \
|
||||
throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, ""); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
#endif // NVBENCH_HAS_NVML
|
||||
|
||||
@@ -47,18 +47,14 @@ struct table_builder
|
||||
const std::string &header,
|
||||
std::string value)
|
||||
{
|
||||
auto iter = std::find_if(m_columns.begin(),
|
||||
m_columns.end(),
|
||||
[&column_key](const column &col) {
|
||||
return col.key == column_key;
|
||||
});
|
||||
auto iter = std::find_if(m_columns.begin(), m_columns.end(), [&column_key](const column &col) {
|
||||
return col.key == column_key;
|
||||
});
|
||||
|
||||
auto &col = iter == m_columns.end()
|
||||
? m_columns.emplace_back(column{column_key,
|
||||
header,
|
||||
std::vector<std::string>{},
|
||||
header.size()})
|
||||
: *iter;
|
||||
? m_columns.emplace_back(
|
||||
column{column_key, header, std::vector<std::string>{}, header.size()})
|
||||
: *iter;
|
||||
|
||||
col.max_width = std::max(col.max_width, value.size());
|
||||
if (col.rows.size() <= row)
|
||||
@@ -76,11 +72,9 @@ struct table_builder
|
||||
std::size_t{},
|
||||
[](const auto &a, const auto &b) { return a > b ? a : b; },
|
||||
[](const column &col) { return col.rows.size(); });
|
||||
std::for_each(m_columns.begin(),
|
||||
m_columns.end(),
|
||||
[num_rows = m_num_rows](column &col) {
|
||||
col.rows.resize(num_rows);
|
||||
});
|
||||
std::for_each(m_columns.begin(), m_columns.end(), [num_rows = m_num_rows](column &col) {
|
||||
col.rows.resize(num_rows);
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -126,11 +126,10 @@ std::string json_printer::version_t::get_string() const
|
||||
return fmt::format("{}.{}.{}", this->major, this->minor, this->patch);
|
||||
}
|
||||
|
||||
void json_printer::do_process_bulk_data_float64(
|
||||
state &state,
|
||||
const std::string &tag,
|
||||
const std::string &hint,
|
||||
const std::vector<nvbench::float64_t> &data)
|
||||
void json_printer::do_process_bulk_data_float64(state &state,
|
||||
const std::string &tag,
|
||||
const std::string &hint,
|
||||
const std::vector<nvbench::float64_t> &data)
|
||||
{
|
||||
printer_base::do_process_bulk_data_float64(state, tag, hint, data);
|
||||
|
||||
@@ -157,16 +156,12 @@ void json_printer::do_process_bulk_data_float64(
|
||||
{
|
||||
if (!fs::create_directory(result_path))
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"{}",
|
||||
"Failed to create result directory '{}'.");
|
||||
NVBENCH_THROW(std::runtime_error, "{}", "Failed to create result directory '{}'.");
|
||||
}
|
||||
}
|
||||
else if (!fs::is_directory(result_path))
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"{}",
|
||||
"'{}' exists and is not a directory.");
|
||||
NVBENCH_THROW(std::runtime_error, "{}", "'{}' exists and is not a directory.");
|
||||
}
|
||||
|
||||
const auto file_id = m_num_jsonbin_files++;
|
||||
@@ -197,16 +192,12 @@ void json_printer::do_process_bulk_data_float64(
|
||||
}
|
||||
catch (std::exception &e)
|
||||
{
|
||||
if (auto printer_opt_ref = state.get_benchmark().get_printer();
|
||||
printer_opt_ref.has_value())
|
||||
if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value())
|
||||
{
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
printer.log(nvbench::log_level::warn,
|
||||
fmt::format("Error writing {} ({}) to {}: {}",
|
||||
tag,
|
||||
hint,
|
||||
result_path.string(),
|
||||
e.what()));
|
||||
printer.log(
|
||||
nvbench::log_level::warn,
|
||||
fmt::format("Error writing {} ({}) to {}: {}", tag, hint, result_path.string(), e.what()));
|
||||
}
|
||||
} // end catch
|
||||
|
||||
@@ -221,14 +212,12 @@ void json_printer::do_process_bulk_data_float64(
|
||||
summ.set_string("hide", "Not needed in table.");
|
||||
|
||||
timer.stop();
|
||||
if (auto printer_opt_ref = state.get_benchmark().get_printer();
|
||||
printer_opt_ref.has_value())
|
||||
if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value())
|
||||
{
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
printer.log(nvbench::log_level::info,
|
||||
fmt::format("Wrote '{}' in {:>6.3f}ms",
|
||||
result_path.string(),
|
||||
timer.get_duration() * 1000));
|
||||
printer.log(
|
||||
nvbench::log_level::info,
|
||||
fmt::format("Wrote '{}' in {:>6.3f}ms", result_path.string(), timer.get_duration() * 1000));
|
||||
}
|
||||
} // end hint == sample_times
|
||||
}
|
||||
@@ -304,15 +293,12 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
device["global_memory_size"] = dev_info.get_global_memory_size();
|
||||
device["global_memory_bus_peak_clock_rate"] =
|
||||
dev_info.get_global_memory_bus_peak_clock_rate();
|
||||
device["global_memory_bus_width"] =
|
||||
dev_info.get_global_memory_bus_width();
|
||||
device["global_memory_bus_bandwidth"] =
|
||||
dev_info.get_global_memory_bus_bandwidth();
|
||||
device["l2_cache_size"] = dev_info.get_l2_cache_size();
|
||||
device["shared_memory_per_sm"] = dev_info.get_shared_memory_per_sm();
|
||||
device["shared_memory_per_block"] =
|
||||
dev_info.get_shared_memory_per_block();
|
||||
device["ecc_state"] = dev_info.get_ecc_state();
|
||||
device["global_memory_bus_width"] = dev_info.get_global_memory_bus_width();
|
||||
device["global_memory_bus_bandwidth"] = dev_info.get_global_memory_bus_bandwidth();
|
||||
device["l2_cache_size"] = dev_info.get_l2_cache_size();
|
||||
device["shared_memory_per_sm"] = dev_info.get_shared_memory_per_sm();
|
||||
device["shared_memory_per_block"] = dev_info.get_shared_memory_per_block();
|
||||
device["ecc_state"] = dev_info.get_ecc_state();
|
||||
}
|
||||
} // "devices"
|
||||
|
||||
@@ -358,23 +344,19 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
switch (axis_ptr->get_type())
|
||||
{
|
||||
case nvbench::axis_type::type:
|
||||
value["is_active"] =
|
||||
static_cast<type_axis &>(*axis_ptr).get_is_active(i);
|
||||
value["is_active"] = static_cast<type_axis &>(*axis_ptr).get_is_active(i);
|
||||
break;
|
||||
|
||||
case nvbench::axis_type::int64:
|
||||
value["value"] =
|
||||
static_cast<int64_axis &>(*axis_ptr).get_value(i);
|
||||
value["value"] = static_cast<int64_axis &>(*axis_ptr).get_value(i);
|
||||
break;
|
||||
|
||||
case nvbench::axis_type::float64:
|
||||
value["value"] =
|
||||
static_cast<float64_axis &>(*axis_ptr).get_value(i);
|
||||
value["value"] = static_cast<float64_axis &>(*axis_ptr).get_value(i);
|
||||
break;
|
||||
|
||||
case nvbench::axis_type::string:
|
||||
value["value"] =
|
||||
static_cast<string_axis &>(*axis_ptr).get_value(i);
|
||||
value["value"] = static_cast<string_axis &>(*axis_ptr).get_value(i);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
||||
@@ -38,9 +38,7 @@ struct json_printer : nvbench::printer_base
|
||||
{
|
||||
using printer_base::printer_base;
|
||||
|
||||
json_printer(std::ostream &stream,
|
||||
std::string stream_name,
|
||||
bool enable_binary_output)
|
||||
json_printer(std::ostream &stream, std::string stream_name, bool enable_binary_output)
|
||||
: printer_base(stream, std::move(stream_name))
|
||||
, m_enable_binary_output{enable_binary_output}
|
||||
{}
|
||||
@@ -59,23 +57,16 @@ struct json_printer : nvbench::printer_base
|
||||
|
||||
[[nodiscard]] static version_t get_json_file_version();
|
||||
|
||||
[[nodiscard]] bool get_enable_binary_output() const
|
||||
{
|
||||
return m_enable_binary_output;
|
||||
}
|
||||
[[nodiscard]] bool get_enable_binary_output() const { return m_enable_binary_output; }
|
||||
void set_enable_binary_output(bool b) { m_enable_binary_output = b; }
|
||||
|
||||
protected:
|
||||
// Virtual API from printer_base:
|
||||
void do_log_argv(const std::vector<std::string>& argv) override
|
||||
{
|
||||
m_argv = argv;
|
||||
}
|
||||
void do_process_bulk_data_float64(
|
||||
nvbench::state &state,
|
||||
const std::string &tag,
|
||||
const std::string &hint,
|
||||
const std::vector<nvbench::float64_t> &data) override;
|
||||
void do_log_argv(const std::vector<std::string> &argv) override { m_argv = argv; }
|
||||
void do_process_bulk_data_float64(nvbench::state &state,
|
||||
const std::string &tag,
|
||||
const std::string &hint,
|
||||
const std::vector<nvbench::float64_t> &data) override;
|
||||
void do_print_benchmark_results(const benchmark_vector &benches) override;
|
||||
|
||||
bool m_enable_binary_output{false};
|
||||
|
||||
@@ -42,20 +42,17 @@ struct launch
|
||||
{}
|
||||
|
||||
// move-only
|
||||
launch(const launch &) = delete;
|
||||
launch(launch &&) = default;
|
||||
launch(const launch &) = delete;
|
||||
launch(launch &&) = default;
|
||||
launch &operator=(const launch &) = delete;
|
||||
launch &operator=(launch &&) = default;
|
||||
launch &operator=(launch &&) = default;
|
||||
|
||||
/**
|
||||
* @return a CUDA stream that all kernels and other stream-ordered CUDA work
|
||||
* must use. This stream can be changed by the `KernelGenerator` using the
|
||||
* `nvbench::state::set_cuda_stream` method.
|
||||
*/
|
||||
__forceinline__ const nvbench::cuda_stream &get_stream() const
|
||||
{
|
||||
return m_stream;
|
||||
};
|
||||
__forceinline__ const nvbench::cuda_stream &get_stream() const { return m_stream; };
|
||||
|
||||
private:
|
||||
// The stream is owned by the `nvbench::state` associated with this launch.
|
||||
|
||||
@@ -27,23 +27,23 @@
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#define NVBENCH_MAIN \
|
||||
int main(int argc, char const *const *argv) \
|
||||
try \
|
||||
{ \
|
||||
NVBENCH_MAIN_BODY(argc, argv); \
|
||||
NVBENCH_CUDA_CALL(cudaDeviceReset()); \
|
||||
return 0; \
|
||||
} \
|
||||
catch (std::exception & e) \
|
||||
{ \
|
||||
std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n"; \
|
||||
return 1; \
|
||||
} \
|
||||
catch (...) \
|
||||
{ \
|
||||
std::cerr << "\nNVBench encountered an unknown error.\n"; \
|
||||
return 1; \
|
||||
#define NVBENCH_MAIN \
|
||||
int main(int argc, char const *const *argv) \
|
||||
try \
|
||||
{ \
|
||||
NVBENCH_MAIN_BODY(argc, argv); \
|
||||
NVBENCH_CUDA_CALL(cudaDeviceReset()); \
|
||||
return 0; \
|
||||
} \
|
||||
catch (std::exception & e) \
|
||||
{ \
|
||||
std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n"; \
|
||||
return 1; \
|
||||
} \
|
||||
catch (...) \
|
||||
{ \
|
||||
std::cerr << "\nNVBench encountered an unknown error.\n"; \
|
||||
return 1; \
|
||||
}
|
||||
|
||||
#ifdef NVBENCH_HAS_CUPTI
|
||||
@@ -54,35 +54,35 @@
|
||||
// clang-format on
|
||||
#endif
|
||||
|
||||
#define NVBENCH_MAIN_PARSE(argc, argv) \
|
||||
nvbench::option_parser parser; \
|
||||
#define NVBENCH_MAIN_PARSE(argc, argv) \
|
||||
nvbench::option_parser parser; \
|
||||
parser.parse(argc, argv)
|
||||
|
||||
#define NVBENCH_MAIN_BODY(argc, argv) \
|
||||
do \
|
||||
{ \
|
||||
NVBENCH_INITIALIZE_DRIVER_API; \
|
||||
NVBENCH_MAIN_PARSE(argc, argv); \
|
||||
auto &printer = parser.get_printer(); \
|
||||
\
|
||||
printer.print_device_info(); \
|
||||
printer.print_log_preamble(); \
|
||||
auto &benchmarks = parser.get_benchmarks(); \
|
||||
\
|
||||
std::size_t total_states = 0; \
|
||||
for (auto &bench_ptr : benchmarks) \
|
||||
{ \
|
||||
total_states += bench_ptr->get_config_count(); \
|
||||
} \
|
||||
printer.set_total_state_count(total_states); \
|
||||
\
|
||||
printer.set_completed_state_count(0); \
|
||||
for (auto &bench_ptr : benchmarks) \
|
||||
{ \
|
||||
bench_ptr->set_printer(printer); \
|
||||
bench_ptr->run(); \
|
||||
bench_ptr->clear_printer(); \
|
||||
} \
|
||||
printer.print_log_epilogue(); \
|
||||
printer.print_benchmark_results(benchmarks); \
|
||||
#define NVBENCH_MAIN_BODY(argc, argv) \
|
||||
do \
|
||||
{ \
|
||||
NVBENCH_INITIALIZE_DRIVER_API; \
|
||||
NVBENCH_MAIN_PARSE(argc, argv); \
|
||||
auto &printer = parser.get_printer(); \
|
||||
\
|
||||
printer.print_device_info(); \
|
||||
printer.print_log_preamble(); \
|
||||
auto &benchmarks = parser.get_benchmarks(); \
|
||||
\
|
||||
std::size_t total_states = 0; \
|
||||
for (auto &bench_ptr : benchmarks) \
|
||||
{ \
|
||||
total_states += bench_ptr->get_config_count(); \
|
||||
} \
|
||||
printer.set_total_state_count(total_states); \
|
||||
\
|
||||
printer.set_completed_state_count(0); \
|
||||
for (auto &bench_ptr : benchmarks) \
|
||||
{ \
|
||||
bench_ptr->set_printer(printer); \
|
||||
bench_ptr->run(); \
|
||||
bench_ptr->clear_printer(); \
|
||||
} \
|
||||
printer.print_log_epilogue(); \
|
||||
printer.print_benchmark_results(benchmarks); \
|
||||
} while (false)
|
||||
|
||||
@@ -44,9 +44,8 @@ void markdown_printer::do_print_device_info()
|
||||
fmt::format_to(buffer, "# Devices\n\n");
|
||||
|
||||
const auto &device_mgr = nvbench::device_manager::get();
|
||||
const auto &devices = device_mgr.get_number_of_used_devices() > 0
|
||||
? device_mgr.get_used_devices()
|
||||
: device_mgr.get_devices();
|
||||
const auto &devices = device_mgr.get_number_of_used_devices() > 0 ? device_mgr.get_used_devices()
|
||||
: device_mgr.get_devices();
|
||||
for (const auto &device : devices)
|
||||
{
|
||||
const auto [gmem_free, gmem_used] = device.get_global_memory_usage();
|
||||
@@ -64,22 +63,17 @@ void markdown_printer::do_print_device_info()
|
||||
"* Global Memory: {} MiB Free / {} MiB Total\n",
|
||||
gmem_free / 1024 / 1024,
|
||||
gmem_used / 1024 / 1024);
|
||||
fmt::format_to(
|
||||
buffer,
|
||||
"* Global Memory Bus Peak: {} GB/sec ({}-bit DDR @{}MHz)\n",
|
||||
device.get_global_memory_bus_bandwidth() / 1000 / 1000 / 1000,
|
||||
device.get_global_memory_bus_width(),
|
||||
device.get_global_memory_bus_peak_clock_rate() / 1000 / 1000);
|
||||
fmt::format_to(buffer,
|
||||
"* Global Memory Bus Peak: {} GB/sec ({}-bit DDR @{}MHz)\n",
|
||||
device.get_global_memory_bus_bandwidth() / 1000 / 1000 / 1000,
|
||||
device.get_global_memory_bus_width(),
|
||||
device.get_global_memory_bus_peak_clock_rate() / 1000 / 1000);
|
||||
fmt::format_to(buffer,
|
||||
"* Max Shared Memory: {} KiB/SM, {} KiB/Block\n",
|
||||
device.get_shared_memory_per_sm() / 1024,
|
||||
device.get_shared_memory_per_block() / 1024);
|
||||
fmt::format_to(buffer,
|
||||
"* L2 Cache Size: {} KiB\n",
|
||||
device.get_l2_cache_size() / 1024);
|
||||
fmt::format_to(buffer,
|
||||
"* Maximum Active Blocks: {}/SM\n",
|
||||
device.get_max_blocks_per_sm());
|
||||
fmt::format_to(buffer, "* L2 Cache Size: {} KiB\n", device.get_l2_cache_size() / 1024);
|
||||
fmt::format_to(buffer, "* Maximum Active Blocks: {}/SM\n", device.get_max_blocks_per_sm());
|
||||
fmt::format_to(buffer,
|
||||
"* Maximum Active Threads: {}/SM, {}/Block\n",
|
||||
device.get_max_threads_per_sm(),
|
||||
@@ -88,18 +82,13 @@ void markdown_printer::do_print_device_info()
|
||||
"* Available Registers: {}/SM, {}/Block\n",
|
||||
device.get_registers_per_sm(),
|
||||
device.get_registers_per_block());
|
||||
fmt::format_to(buffer,
|
||||
"* ECC Enabled: {}\n",
|
||||
device.get_ecc_state() ? "Yes" : "No");
|
||||
fmt::format_to(buffer, "* ECC Enabled: {}\n", device.get_ecc_state() ? "Yes" : "No");
|
||||
fmt::format_to(buffer, "\n");
|
||||
}
|
||||
m_ostream << fmt::to_string(buffer);
|
||||
}
|
||||
|
||||
void markdown_printer::do_print_log_preamble()
|
||||
{
|
||||
m_ostream << "# Log\n\n```\n";
|
||||
}
|
||||
void markdown_printer::do_print_log_preamble() { m_ostream << "# Log\n\n```\n"; }
|
||||
|
||||
void markdown_printer::do_print_log_epilogue() { m_ostream << "```\n\n"; }
|
||||
|
||||
@@ -147,8 +136,7 @@ void markdown_printer::do_log_run_state(const nvbench::state &exec_state)
|
||||
{
|
||||
if (m_total_state_count == 0)
|
||||
{ // No progress info
|
||||
this->log(nvbench::log_level::run,
|
||||
exec_state.get_short_description(m_color));
|
||||
this->log(nvbench::log_level::run, exec_state.get_short_description(m_color));
|
||||
}
|
||||
else
|
||||
{ // Add progress
|
||||
@@ -160,8 +148,7 @@ void markdown_printer::do_log_run_state(const nvbench::state &exec_state)
|
||||
}
|
||||
}
|
||||
|
||||
void markdown_printer::do_print_benchmark_list(
|
||||
const printer_base::benchmark_vector &benches)
|
||||
void markdown_printer::do_print_benchmark_list(const printer_base::benchmark_vector &benches)
|
||||
{
|
||||
if (benches.empty())
|
||||
{
|
||||
@@ -204,10 +191,7 @@ void markdown_printer::do_print_benchmark_list(
|
||||
{
|
||||
desc = fmt::format(" ({})", desc);
|
||||
}
|
||||
fmt::format_to(buffer,
|
||||
" * `{}`{}\n",
|
||||
axis_ptr->get_input_string(i),
|
||||
desc);
|
||||
fmt::format_to(buffer, " * `{}`{}\n", axis_ptr->get_input_string(i), desc);
|
||||
} // end foreach value
|
||||
} // end foreach axis
|
||||
fmt::format_to(buffer, "\n");
|
||||
@@ -216,8 +200,7 @@ void markdown_printer::do_print_benchmark_list(
|
||||
m_ostream << fmt::to_string(buffer);
|
||||
}
|
||||
|
||||
void markdown_printer::do_print_benchmark_results(
|
||||
const printer_base::benchmark_vector &benches)
|
||||
void markdown_printer::do_print_benchmark_results(const printer_base::benchmark_vector &benches)
|
||||
{
|
||||
auto format_visitor = [](const auto &v) {
|
||||
using T = std::decay_t<decltype(v)>;
|
||||
@@ -252,19 +235,15 @@ void markdown_printer::do_print_benchmark_results(
|
||||
// Do a single pass when no devices are specified. This happens for
|
||||
// benchmarks with `cpu` exec_tags.
|
||||
const std::size_t num_device_passes = devices.empty() ? 1 : devices.size();
|
||||
for (std::size_t device_pass = 0; device_pass < num_device_passes;
|
||||
++device_pass)
|
||||
for (std::size_t device_pass = 0; device_pass < num_device_passes; ++device_pass)
|
||||
{
|
||||
std::optional<nvbench::device_info> device =
|
||||
devices.empty() ? std::nullopt
|
||||
: std::make_optional(devices[device_pass]);
|
||||
std::optional<nvbench::device_info> device = devices.empty()
|
||||
? std::nullopt
|
||||
: std::make_optional(devices[device_pass]);
|
||||
|
||||
if (device)
|
||||
{
|
||||
fmt::format_to(buffer,
|
||||
"\n### [{}] {}\n\n",
|
||||
device->get_id(),
|
||||
device->get_name());
|
||||
fmt::format_to(buffer, "\n### [{}] {}\n\n", device->get_id(), device->get_name());
|
||||
}
|
||||
|
||||
std::size_t row = 0;
|
||||
@@ -288,15 +267,11 @@ void markdown_printer::do_print_benchmark_results(
|
||||
{
|
||||
const nvbench::int64_t value = axis_values.get_int64(name);
|
||||
const nvbench::int64_t exponent = int64_axis::compute_log2(value);
|
||||
table.add_cell(row,
|
||||
name,
|
||||
name,
|
||||
fmt::format("2^{} = {}", exponent, value));
|
||||
table.add_cell(row, name, name, fmt::format("2^{} = {}", exponent, value));
|
||||
}
|
||||
else
|
||||
{
|
||||
std::string value = std::visit(format_visitor,
|
||||
axis_values.get_value(name));
|
||||
std::string value = std::visit(format_visitor, axis_values.get_value(name));
|
||||
table.add_cell(row, name + "_axis", name, std::move(value));
|
||||
}
|
||||
}
|
||||
@@ -308,12 +283,9 @@ void markdown_printer::do_print_benchmark_results(
|
||||
continue;
|
||||
}
|
||||
const std::string &tag = summ.get_tag();
|
||||
const std::string &header = summ.has_value("name")
|
||||
? summ.get_string("name")
|
||||
: tag;
|
||||
const std::string &header = summ.has_value("name") ? summ.get_string("name") : tag;
|
||||
|
||||
std::string hint = summ.has_value("hint") ? summ.get_string("hint")
|
||||
: std::string{};
|
||||
std::string hint = summ.has_value("hint") ? summ.get_string("hint") : std::string{};
|
||||
if (hint == "duration")
|
||||
{
|
||||
table.add_cell(row, tag, header, this->do_format_duration(summ));
|
||||
@@ -332,10 +304,7 @@ void markdown_printer::do_print_benchmark_results(
|
||||
}
|
||||
else if (hint == "sample_size")
|
||||
{
|
||||
table.add_cell(row,
|
||||
tag,
|
||||
header,
|
||||
this->do_format_sample_size(summ));
|
||||
table.add_cell(row, tag, header, this->do_format_sample_size(summ));
|
||||
}
|
||||
else if (hint == "percentage")
|
||||
{
|
||||
@@ -353,8 +322,7 @@ void markdown_printer::do_print_benchmark_results(
|
||||
auto table_str = table.to_string();
|
||||
fmt::format_to(buffer,
|
||||
"{}",
|
||||
table_str.empty() ? "No data -- check log.\n"
|
||||
: std::move(table_str));
|
||||
table_str.empty() ? "No data -- check log.\n" : std::move(table_str));
|
||||
} // end foreach device_pass
|
||||
}
|
||||
|
||||
|
||||
@@ -33,8 +33,7 @@ namespace nvbench
|
||||
*/
|
||||
struct named_values
|
||||
{
|
||||
using value_type =
|
||||
std::variant<nvbench::int64_t, nvbench::float64_t, std::string>;
|
||||
using value_type = std::variant<nvbench::int64_t, nvbench::float64_t, std::string>;
|
||||
|
||||
enum class type
|
||||
{
|
||||
@@ -43,7 +42,7 @@ struct named_values
|
||||
string
|
||||
};
|
||||
|
||||
void append(const named_values& other);
|
||||
void append(const named_values &other);
|
||||
|
||||
[[nodiscard]] std::size_t get_size() const;
|
||||
[[nodiscard]] std::vector<std::string> get_names() const;
|
||||
@@ -60,11 +59,11 @@ struct named_values
|
||||
|
||||
[[nodiscard]] type get_type(const std::string &name) const;
|
||||
[[nodiscard]] bool has_value(const std::string &name) const;
|
||||
[[nodiscard]] const value_type& get_value(const std::string &name) const;
|
||||
[[nodiscard]] const value_type &get_value(const std::string &name) const;
|
||||
|
||||
void clear();
|
||||
|
||||
void remove_value(const std::string& name);
|
||||
void remove_value(const std::string &name);
|
||||
|
||||
private:
|
||||
struct named_value
|
||||
|
||||
@@ -33,9 +33,7 @@ namespace nvbench
|
||||
|
||||
void named_values::append(const named_values &other)
|
||||
{
|
||||
m_storage.insert(m_storage.end(),
|
||||
other.m_storage.cbegin(),
|
||||
other.m_storage.cend());
|
||||
m_storage.insert(m_storage.end(), other.m_storage.cbegin(), other.m_storage.cend());
|
||||
}
|
||||
|
||||
void named_values::clear() { m_storage.clear(); }
|
||||
@@ -55,20 +53,17 @@ std::vector<std::string> named_values::get_names() const
|
||||
|
||||
bool named_values::has_value(const std::string &name) const
|
||||
{
|
||||
auto iter =
|
||||
std::find_if(m_storage.cbegin(),
|
||||
m_storage.cend(),
|
||||
[&name](const auto &val) { return val.name == name; });
|
||||
auto iter = std::find_if(m_storage.cbegin(), m_storage.cend(), [&name](const auto &val) {
|
||||
return val.name == name;
|
||||
});
|
||||
return iter != m_storage.cend();
|
||||
}
|
||||
|
||||
const named_values::value_type &
|
||||
named_values::get_value(const std::string &name) const
|
||||
const named_values::value_type &named_values::get_value(const std::string &name) const
|
||||
{
|
||||
auto iter =
|
||||
std::find_if(m_storage.cbegin(),
|
||||
m_storage.cend(),
|
||||
[&name](const auto &val) { return val.name == name; });
|
||||
auto iter = std::find_if(m_storage.cbegin(), m_storage.cend(), [&name](const auto &val) {
|
||||
return val.name == name;
|
||||
});
|
||||
if (iter == m_storage.cend())
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error, "No value with name '{}'.", name);
|
||||
@@ -96,9 +91,7 @@ named_values::type named_values::get_type(const std::string &name) const
|
||||
// warning C4702: unreachable code
|
||||
// This is a future-proofing check, it'll be reachable if something breaks
|
||||
NVBENCH_MSVC_PUSH_DISABLE_WARNING(4702)
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Unknown variant type for entry '{}'.",
|
||||
name);
|
||||
NVBENCH_THROW(std::runtime_error, "Unknown variant type for entry '{}'.", name);
|
||||
},
|
||||
this->get_value(name));
|
||||
NVBENCH_MSVC_POP_WARNING()
|
||||
@@ -111,10 +104,7 @@ try
|
||||
}
|
||||
catch (std::exception &err)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Error looking up int64 value `{}`:\n{}",
|
||||
name,
|
||||
err.what());
|
||||
NVBENCH_THROW(std::runtime_error, "Error looking up int64 value `{}`:\n{}", name, err.what());
|
||||
}
|
||||
|
||||
nvbench::float64_t named_values::get_float64(const std::string &name) const
|
||||
@@ -124,10 +114,7 @@ try
|
||||
}
|
||||
catch (std::exception &err)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Error looking up float64 value `{}`:\n{}",
|
||||
name,
|
||||
err.what());
|
||||
NVBENCH_THROW(std::runtime_error, "Error looking up float64 value `{}`:\n{}", name, err.what());
|
||||
}
|
||||
|
||||
const std::string &named_values::get_string(const std::string &name) const
|
||||
@@ -137,10 +124,7 @@ try
|
||||
}
|
||||
catch (std::exception &err)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Error looking up string value `{}`:\n{}",
|
||||
name,
|
||||
err.what());
|
||||
NVBENCH_THROW(std::runtime_error, "Error looking up string value `{}`:\n{}", name, err.what());
|
||||
}
|
||||
|
||||
void named_values::set_int64(std::string name, nvbench::int64_t value)
|
||||
@@ -165,10 +149,9 @@ void named_values::set_value(std::string name, named_values::value_type value)
|
||||
|
||||
void named_values::remove_value(const std::string &name)
|
||||
{
|
||||
auto iter =
|
||||
std::find_if(m_storage.begin(), m_storage.end(), [&name](const auto &val) {
|
||||
return val.name == name;
|
||||
});
|
||||
auto iter = std::find_if(m_storage.begin(), m_storage.end(), [&name](const auto &val) {
|
||||
return val.name == name;
|
||||
});
|
||||
if (iter != m_storage.end())
|
||||
{
|
||||
m_storage.erase(iter);
|
||||
|
||||
@@ -82,20 +82,11 @@ std::string_view submatch_to_sv(const sv_submatch &in)
|
||||
//
|
||||
// So we're stuck with materializing a std::string and calling std::stoX(). Ah
|
||||
// well. At least it's not istream.
|
||||
void parse(std::string_view input, nvbench::int32_t &val)
|
||||
{
|
||||
val = std::stoi(std::string(input));
|
||||
}
|
||||
void parse(std::string_view input, nvbench::int32_t &val) { val = std::stoi(std::string(input)); }
|
||||
|
||||
void parse(std::string_view input, nvbench::int64_t &val)
|
||||
{
|
||||
val = std::stoll(std::string(input));
|
||||
}
|
||||
void parse(std::string_view input, nvbench::int64_t &val) { val = std::stoll(std::string(input)); }
|
||||
|
||||
void parse(std::string_view input, nvbench::float64_t &val)
|
||||
{
|
||||
val = std::stod(std::string(input));
|
||||
}
|
||||
void parse(std::string_view input, nvbench::float64_t &val) { val = std::stod(std::string(input)); }
|
||||
|
||||
void parse(std::string_view input, std::string &val) { val = input; }
|
||||
|
||||
@@ -112,9 +103,8 @@ std::vector<T> parse_list_values(std::string_view list_spec)
|
||||
"(?:,|$)" // Delimiters
|
||||
};
|
||||
|
||||
auto values_begin =
|
||||
sv_regex_iterator(list_spec.cbegin(), list_spec.cend(), value_regex);
|
||||
auto values_end = sv_regex_iterator{};
|
||||
auto values_begin = sv_regex_iterator(list_spec.cbegin(), list_spec.cend(), value_regex);
|
||||
auto values_end = sv_regex_iterator{};
|
||||
while (values_begin != values_end)
|
||||
{
|
||||
auto match = *values_begin++;
|
||||
@@ -131,8 +121,7 @@ std::vector<T> parse_list_values(std::string_view list_spec)
|
||||
// Parses a range specification "<start> : <stop> [ : <stride> ]" and returns
|
||||
// a vector filled with the specified range.
|
||||
template <typename T>
|
||||
std::vector<T> parse_range_values(std::string_view range_spec,
|
||||
nvbench::wrapped_type<T>)
|
||||
std::vector<T> parse_range_values(std::string_view range_spec, nvbench::wrapped_type<T>)
|
||||
{
|
||||
std::vector<T> range_params;
|
||||
|
||||
@@ -143,9 +132,8 @@ std::vector<T> parse_range_values(std::string_view range_spec,
|
||||
"(?:$|:)" // Delimiters
|
||||
};
|
||||
|
||||
auto values_begin =
|
||||
sv_regex_iterator(range_spec.cbegin(), range_spec.cend(), value_regex);
|
||||
auto values_end = sv_regex_iterator{};
|
||||
auto values_begin = sv_regex_iterator(range_spec.cbegin(), range_spec.cend(), value_regex);
|
||||
auto values_end = sv_regex_iterator{};
|
||||
for (; values_begin != values_end; ++values_begin)
|
||||
{
|
||||
auto match = *values_begin;
|
||||
@@ -221,25 +209,15 @@ std::vector<T> parse_values(std::string_view value_spec)
|
||||
"$"}; // EOS
|
||||
|
||||
sv_match match;
|
||||
if (std::regex_search(value_spec.cbegin(),
|
||||
value_spec.cend(),
|
||||
match,
|
||||
list_regex))
|
||||
if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, list_regex))
|
||||
{
|
||||
return parse_list_values<T>(submatch_to_sv(match[1]));
|
||||
}
|
||||
else if (std::regex_search(value_spec.cbegin(),
|
||||
value_spec.cend(),
|
||||
match,
|
||||
range_regex))
|
||||
else if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, range_regex))
|
||||
{
|
||||
return parse_range_values(submatch_to_sv(match[1]),
|
||||
nvbench::wrapped_type<T>{});
|
||||
return parse_range_values(submatch_to_sv(match[1]), nvbench::wrapped_type<T>{});
|
||||
}
|
||||
else if (std::regex_search(value_spec.cbegin(),
|
||||
value_spec.cend(),
|
||||
match,
|
||||
single_regex))
|
||||
else if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, single_regex))
|
||||
{
|
||||
T val;
|
||||
parse(submatch_to_sv(match[1]), val);
|
||||
@@ -247,9 +225,7 @@ std::vector<T> parse_values(std::string_view value_spec)
|
||||
}
|
||||
else
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Invalid axis value spec: {}",
|
||||
value_spec);
|
||||
NVBENCH_THROW(std::runtime_error, "Invalid axis value spec: {}", value_spec);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -514,8 +490,8 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
|
||||
this->update_int64_prop(first[0], first[1]);
|
||||
first += 2;
|
||||
}
|
||||
else if (arg == "--min-time" || arg == "--max-noise" ||
|
||||
arg == "--skip-time" || arg == "--timeout")
|
||||
else if (arg == "--min-time" || arg == "--max-noise" || arg == "--skip-time" ||
|
||||
arg == "--timeout")
|
||||
{
|
||||
check_params(1);
|
||||
this->update_float64_prop(first[0], first[1]);
|
||||
@@ -523,9 +499,7 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
|
||||
}
|
||||
else
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Unrecognized command-line argument: `{}`.",
|
||||
arg);
|
||||
NVBENCH_THROW(std::runtime_error, "Unrecognized command-line argument: `{}`.", arg);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -534,7 +508,7 @@ void option_parser::add_markdown_printer(const std::string &spec)
|
||||
try
|
||||
{
|
||||
std::ostream &stream = this->printer_spec_to_ostream(spec);
|
||||
auto &printer = m_printer.emplace<nvbench::markdown_printer>(stream, spec);
|
||||
auto &printer = m_printer.emplace<nvbench::markdown_printer>(stream, spec);
|
||||
if (spec == "stdout")
|
||||
{
|
||||
printer.set_color(m_color_md_stdout_printer);
|
||||
@@ -556,14 +530,10 @@ try
|
||||
}
|
||||
catch (std::exception &e)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Error while adding csv output for `{}`:\n{}",
|
||||
spec,
|
||||
e.what());
|
||||
NVBENCH_THROW(std::runtime_error, "Error while adding csv output for `{}`:\n{}", spec, e.what());
|
||||
}
|
||||
|
||||
void option_parser::add_json_printer(const std::string &spec,
|
||||
bool enable_binary)
|
||||
void option_parser::add_json_printer(const std::string &spec, bool enable_binary)
|
||||
try
|
||||
{
|
||||
std::ostream &stream = this->printer_spec_to_ostream(spec);
|
||||
@@ -624,10 +594,7 @@ void option_parser::print_help() const
|
||||
fmt::print("{}\n{}\n", ::cli_help_text, ::cli_help_axis_text);
|
||||
}
|
||||
|
||||
void option_parser::print_help_axis() const
|
||||
{
|
||||
fmt::print("{}\n", ::cli_help_axis_text);
|
||||
}
|
||||
void option_parser::print_help_axis() const { fmt::print("{}\n", ::cli_help_axis_text); }
|
||||
|
||||
void option_parser::set_persistence_mode(const std::string &state)
|
||||
try
|
||||
@@ -685,9 +652,7 @@ try
|
||||
{
|
||||
if (rate_val == nvbench::device_info::clock_rate::none)
|
||||
{
|
||||
fmt::print("Unlocking clocks for device '{}' ({}).\n",
|
||||
device.get_name(),
|
||||
device.get_id());
|
||||
fmt::print("Unlocking clocks for device '{}' ({}).\n", device.get_name(), device.get_id());
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -757,16 +722,12 @@ try
|
||||
}
|
||||
catch (std::exception &e)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Error handling option --benchmark `{}`:\n{}",
|
||||
name,
|
||||
e.what());
|
||||
NVBENCH_THROW(std::runtime_error, "Error handling option --benchmark `{}`:\n{}", name, e.what());
|
||||
}
|
||||
|
||||
void option_parser::replay_global_args()
|
||||
{
|
||||
this->parse_range(m_global_benchmark_args.cbegin(),
|
||||
m_global_benchmark_args.cend());
|
||||
this->parse_range(m_global_benchmark_args.cbegin(), m_global_benchmark_args.cend());
|
||||
}
|
||||
|
||||
void option_parser::update_devices(const std::string &devices)
|
||||
@@ -790,10 +751,7 @@ try
|
||||
}
|
||||
catch (std::exception &e)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Error handling option --devices `{}`:\n{}",
|
||||
devices,
|
||||
e.what());
|
||||
NVBENCH_THROW(std::runtime_error, "Error handling option --devices `{}`:\n{}", devices, e.what());
|
||||
}
|
||||
|
||||
void option_parser::update_axis(const std::string &spec)
|
||||
@@ -832,28 +790,20 @@ try
|
||||
switch (axis.get_type())
|
||||
{
|
||||
case axis_type::type:
|
||||
this->update_type_axis(static_cast<nvbench::type_axis &>(axis),
|
||||
values,
|
||||
flags);
|
||||
this->update_type_axis(static_cast<nvbench::type_axis &>(axis), values, flags);
|
||||
break;
|
||||
|
||||
case axis_type::int64:
|
||||
this->update_int64_axis(static_cast<nvbench::int64_axis &>(axis),
|
||||
values,
|
||||
flags);
|
||||
this->update_int64_axis(static_cast<nvbench::int64_axis &>(axis), values, flags);
|
||||
break;
|
||||
|
||||
case axis_type::float64:
|
||||
this->update_float64_axis(static_cast<nvbench::float64_axis &>(axis),
|
||||
values,
|
||||
flags);
|
||||
this->update_float64_axis(static_cast<nvbench::float64_axis &>(axis), values, flags);
|
||||
|
||||
break;
|
||||
|
||||
case axis_type::string:
|
||||
this->update_string_axis(static_cast<nvbench::string_axis &>(axis),
|
||||
values,
|
||||
flags);
|
||||
this->update_string_axis(static_cast<nvbench::string_axis &>(axis), values, flags);
|
||||
|
||||
break;
|
||||
|
||||
@@ -866,10 +816,7 @@ try
|
||||
}
|
||||
catch (std::exception &e)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Error handling option --axis `{}`:\n{}",
|
||||
spec,
|
||||
e.what());
|
||||
NVBENCH_THROW(std::runtime_error, "Error handling option --axis `{}`:\n{}", spec, e.what());
|
||||
}
|
||||
|
||||
void option_parser::update_int64_axis(int64_axis &axis,
|
||||
@@ -888,9 +835,7 @@ void option_parser::update_int64_axis(int64_axis &axis,
|
||||
}
|
||||
else
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Invalid flag for int64 axis: `{}`",
|
||||
flag_spec);
|
||||
NVBENCH_THROW(std::runtime_error, "Invalid flag for int64 axis: `{}`", flag_spec);
|
||||
}
|
||||
|
||||
auto input_values = parse_values<nvbench::int64_t>(value_spec);
|
||||
@@ -905,9 +850,7 @@ void option_parser::update_float64_axis(float64_axis &axis,
|
||||
// Validate flags:
|
||||
if (!flag_spec.empty())
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Invalid flag for float64 axis: `{}`",
|
||||
flag_spec);
|
||||
NVBENCH_THROW(std::runtime_error, "Invalid flag for float64 axis: `{}`", flag_spec);
|
||||
}
|
||||
|
||||
auto input_values = parse_values<nvbench::float64_t>(value_spec);
|
||||
@@ -922,9 +865,7 @@ void option_parser::update_string_axis(string_axis &axis,
|
||||
// Validate flags:
|
||||
if (!flag_spec.empty())
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Invalid flag for string axis: `{}`",
|
||||
flag_spec);
|
||||
NVBENCH_THROW(std::runtime_error, "Invalid flag for string axis: `{}`", flag_spec);
|
||||
}
|
||||
|
||||
auto input_values = parse_values<std::string>(value_spec);
|
||||
@@ -939,9 +880,7 @@ void option_parser::update_type_axis(type_axis &axis,
|
||||
// Validate flags:
|
||||
if (!flag_spec.empty())
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Invalid flag for type axis: `{}`",
|
||||
flag_spec);
|
||||
NVBENCH_THROW(std::runtime_error, "Invalid flag for type axis: `{}`", flag_spec);
|
||||
}
|
||||
|
||||
auto input_values = parse_values<std::string>(value_spec);
|
||||
@@ -949,8 +888,7 @@ void option_parser::update_type_axis(type_axis &axis,
|
||||
axis.set_active_inputs(input_values);
|
||||
}
|
||||
|
||||
void option_parser::update_int64_prop(const std::string &prop_arg,
|
||||
const std::string &prop_val)
|
||||
void option_parser::update_int64_prop(const std::string &prop_arg, const std::string &prop_val)
|
||||
try
|
||||
{
|
||||
// If no active benchmark, save args as global.
|
||||
@@ -983,8 +921,7 @@ catch (std::exception &e)
|
||||
e.what());
|
||||
}
|
||||
|
||||
void option_parser::update_float64_prop(const std::string &prop_arg,
|
||||
const std::string &prop_val)
|
||||
void option_parser::update_float64_prop(const std::string &prop_arg, const std::string &prop_val)
|
||||
try
|
||||
{
|
||||
// If no active benchmark, save args as global.
|
||||
|
||||
@@ -41,8 +41,7 @@ struct type_axis;
|
||||
*/
|
||||
struct option_parser
|
||||
{
|
||||
using benchmark_vector =
|
||||
std::vector<std::unique_ptr<nvbench::benchmark_base>>;
|
||||
using benchmark_vector = std::vector<std::unique_ptr<nvbench::benchmark_base>>;
|
||||
|
||||
option_parser();
|
||||
~option_parser();
|
||||
@@ -51,15 +50,9 @@ struct option_parser
|
||||
void parse(std::vector<std::string> args);
|
||||
|
||||
[[nodiscard]] benchmark_vector &get_benchmarks() { return m_benchmarks; };
|
||||
[[nodiscard]] const benchmark_vector &get_benchmarks() const
|
||||
{
|
||||
return m_benchmarks;
|
||||
};
|
||||
[[nodiscard]] const benchmark_vector &get_benchmarks() const { return m_benchmarks; };
|
||||
|
||||
[[nodiscard]] const std::vector<std::string> &get_args() const
|
||||
{
|
||||
return m_args;
|
||||
}
|
||||
[[nodiscard]] const std::vector<std::string> &get_args() const { return m_args; }
|
||||
|
||||
/*!
|
||||
* Returns the output format requested by the parse options.
|
||||
@@ -115,10 +108,8 @@ private:
|
||||
std::string_view value_spec,
|
||||
std::string_view flag_spec);
|
||||
|
||||
void update_int64_prop(const std::string &prop_arg,
|
||||
const std::string &prop_val);
|
||||
void update_float64_prop(const std::string &prop_arg,
|
||||
const std::string &prop_val);
|
||||
void update_int64_prop(const std::string &prop_arg, const std::string &prop_val);
|
||||
void update_float64_prop(const std::string &prop_arg, const std::string &prop_val);
|
||||
|
||||
void update_used_device_state() const;
|
||||
|
||||
|
||||
@@ -76,19 +76,16 @@ struct printer_base
|
||||
virtual ~printer_base();
|
||||
|
||||
// move-only
|
||||
printer_base(const printer_base &) = delete;
|
||||
printer_base(printer_base &&) = default;
|
||||
printer_base(const printer_base &) = delete;
|
||||
printer_base(printer_base &&) = default;
|
||||
printer_base &operator=(const printer_base &) = delete;
|
||||
printer_base &operator=(printer_base &&) = default;
|
||||
printer_base &operator=(printer_base &&) = default;
|
||||
|
||||
/*!
|
||||
* Called once with the command line arguments used to invoke the current
|
||||
* executable.
|
||||
*/
|
||||
void log_argv(const std::vector<std::string> &argv)
|
||||
{
|
||||
this->do_log_argv(argv);
|
||||
}
|
||||
void log_argv(const std::vector<std::string> &argv) { this->do_log_argv(argv); }
|
||||
|
||||
/*!
|
||||
* Print a summary of all detected devices, if supported.
|
||||
@@ -108,19 +105,13 @@ struct printer_base
|
||||
/*!
|
||||
* Print a log message at the specified log level.
|
||||
*/
|
||||
void log(nvbench::log_level level, const std::string &msg)
|
||||
{
|
||||
this->do_log(level, msg);
|
||||
}
|
||||
void log(nvbench::log_level level, const std::string &msg) { this->do_log(level, msg); }
|
||||
|
||||
/*!
|
||||
* Called before running the measurements associated with state.
|
||||
* Implementations are expected to call `log(log_level::run, ...)`.
|
||||
*/
|
||||
void log_run_state(const nvbench::state &exec_state)
|
||||
{
|
||||
this->do_log_run_state(exec_state);
|
||||
}
|
||||
void log_run_state(const nvbench::state &exec_state) { this->do_log_run_state(exec_state); }
|
||||
|
||||
/*!
|
||||
* Measurements may call this to allow a printer to perform extra processing
|
||||
@@ -181,10 +172,7 @@ struct printer_base
|
||||
return this->do_get_completed_state_count();
|
||||
}
|
||||
|
||||
virtual void set_total_state_count(std::size_t states)
|
||||
{
|
||||
this->do_set_total_state_count(states);
|
||||
}
|
||||
virtual void set_total_state_count(std::size_t states) { this->do_set_total_state_count(states); }
|
||||
[[nodiscard]] virtual std::size_t get_total_state_count() const
|
||||
{
|
||||
return this->do_get_total_state_count();
|
||||
@@ -193,17 +181,16 @@ struct printer_base
|
||||
|
||||
protected:
|
||||
// Implementation hooks for subclasses:
|
||||
virtual void do_log_argv(const std::vector<std::string>&) {}
|
||||
virtual void do_log_argv(const std::vector<std::string> &) {}
|
||||
virtual void do_print_device_info() {}
|
||||
virtual void do_print_log_preamble() {}
|
||||
virtual void do_print_log_epilogue() {}
|
||||
virtual void do_log(nvbench::log_level, const std::string &) {}
|
||||
virtual void do_log_run_state(const nvbench::state &) {}
|
||||
virtual void
|
||||
do_process_bulk_data_float64(nvbench::state &,
|
||||
const std::string &,
|
||||
const std::string &,
|
||||
const std::vector<nvbench::float64_t> &){};
|
||||
virtual void do_process_bulk_data_float64(nvbench::state &,
|
||||
const std::string &,
|
||||
const std::string &,
|
||||
const std::vector<nvbench::float64_t> &){};
|
||||
virtual void do_print_benchmark_list(const benchmark_vector &) {}
|
||||
virtual void do_print_benchmark_results(const benchmark_vector &) {}
|
||||
|
||||
|
||||
@@ -38,19 +38,10 @@ void printer_base::do_set_completed_state_count(std::size_t states)
|
||||
|
||||
void printer_base::do_add_completed_state() { ++m_completed_state_count; }
|
||||
|
||||
std::size_t printer_base::do_get_completed_state_count() const
|
||||
{
|
||||
return m_completed_state_count;
|
||||
}
|
||||
std::size_t printer_base::do_get_completed_state_count() const { return m_completed_state_count; }
|
||||
|
||||
void printer_base::do_set_total_state_count(std::size_t states)
|
||||
{
|
||||
m_total_state_count = states;
|
||||
}
|
||||
void printer_base::do_set_total_state_count(std::size_t states) { m_total_state_count = states; }
|
||||
|
||||
std::size_t printer_base::do_get_total_state_count() const
|
||||
{
|
||||
return m_total_state_count;
|
||||
}
|
||||
std::size_t printer_base::do_get_total_state_count() const { return m_total_state_count; }
|
||||
|
||||
} // namespace nvbench
|
||||
|
||||
@@ -40,10 +40,7 @@ struct printer_multiplex : nvbench::printer_base
|
||||
return static_cast<Format &>(*m_printers.back());
|
||||
}
|
||||
|
||||
[[nodiscard]] std::size_t get_printer_count() const
|
||||
{
|
||||
return m_printers.size();
|
||||
}
|
||||
[[nodiscard]] std::size_t get_printer_count() const { return m_printers.size(); }
|
||||
|
||||
protected:
|
||||
void do_log_argv(const std::vector<std::string> &argv) override;
|
||||
@@ -52,11 +49,10 @@ protected:
|
||||
void do_print_log_epilogue() override;
|
||||
void do_log(nvbench::log_level, const std::string &) override;
|
||||
void do_log_run_state(const nvbench::state &) override;
|
||||
void do_process_bulk_data_float64(
|
||||
nvbench::state &,
|
||||
const std::string &,
|
||||
const std::string &,
|
||||
const std::vector<nvbench::float64_t> &) override;
|
||||
void do_process_bulk_data_float64(nvbench::state &,
|
||||
const std::string &,
|
||||
const std::string &,
|
||||
const std::vector<nvbench::float64_t> &) override;
|
||||
void do_print_benchmark_list(const benchmark_vector &benches) override;
|
||||
void do_print_benchmark_results(const benchmark_vector &benches) override;
|
||||
void do_set_completed_state_count(std::size_t states) override;
|
||||
|
||||
@@ -67,11 +67,10 @@ void printer_multiplex::do_log_run_state(const nvbench::state &exec_state)
|
||||
}
|
||||
}
|
||||
|
||||
void printer_multiplex::do_process_bulk_data_float64(
|
||||
state &state,
|
||||
const std::string &tag,
|
||||
const std::string &hint,
|
||||
const std::vector<nvbench::float64_t> &data)
|
||||
void printer_multiplex::do_process_bulk_data_float64(state &state,
|
||||
const std::string &tag,
|
||||
const std::string &hint,
|
||||
const std::vector<nvbench::float64_t> &data)
|
||||
{
|
||||
for (auto &format_ptr : m_printers)
|
||||
{
|
||||
@@ -87,8 +86,7 @@ void printer_multiplex::do_print_benchmark_list(const benchmark_vector &benches)
|
||||
}
|
||||
}
|
||||
|
||||
void printer_multiplex::do_print_benchmark_results(
|
||||
const benchmark_vector &benches)
|
||||
void printer_multiplex::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
{
|
||||
for (auto &format_ptr : m_printers)
|
||||
{
|
||||
|
||||
@@ -29,13 +29,11 @@ namespace nvbench
|
||||
namespace detail
|
||||
{
|
||||
template <typename T>
|
||||
using range_output_t = std::conditional_t<std::is_floating_point_v<T>,
|
||||
nvbench::float64_t,
|
||||
nvbench::int64_t>;
|
||||
using range_output_t =
|
||||
std::conditional_t<std::is_floating_point_v<T>, nvbench::float64_t, nvbench::int64_t>;
|
||||
}
|
||||
|
||||
template <typename InT,
|
||||
typename OutT = nvbench::detail::range_output_t<InT>>
|
||||
template <typename InT, typename OutT = nvbench::detail::range_output_t<InT>>
|
||||
auto range(InT start, InT end, InT stride = InT{1})
|
||||
{
|
||||
if constexpr (std::is_floating_point_v<InT>)
|
||||
|
||||
@@ -37,8 +37,7 @@ struct runner_base
|
||||
|
||||
void generate_states();
|
||||
|
||||
void handle_sampling_exception(const std::exception &e,
|
||||
nvbench::state &exec_state) const;
|
||||
void handle_sampling_exception(const std::exception &e, nvbench::state &exec_state) const;
|
||||
|
||||
void run_state_prologue(state &exec_state) const;
|
||||
void run_state_epilogue(state &exec_state) const;
|
||||
@@ -51,11 +50,10 @@ struct runner_base
|
||||
template <typename BenchmarkType>
|
||||
struct runner : public runner_base
|
||||
{
|
||||
using benchmark_type = BenchmarkType;
|
||||
using kernel_generator = typename benchmark_type::kernel_generator;
|
||||
using type_configs = typename benchmark_type::type_configs;
|
||||
static constexpr std::size_t num_type_configs =
|
||||
benchmark_type::num_type_configs;
|
||||
using benchmark_type = BenchmarkType;
|
||||
using kernel_generator = typename benchmark_type::kernel_generator;
|
||||
using type_configs = typename benchmark_type::type_configs;
|
||||
static constexpr std::size_t num_type_configs = benchmark_type::num_type_configs;
|
||||
|
||||
explicit runner(benchmark_type &bench)
|
||||
: runner_base{bench}
|
||||
@@ -86,38 +84,37 @@ private:
|
||||
|
||||
// Iterate through type_configs:
|
||||
std::size_t type_config_index = 0;
|
||||
nvbench::tl::foreach<type_configs>([&self = *this,
|
||||
&states = m_benchmark.m_states,
|
||||
&type_config_index,
|
||||
&device](auto type_config_wrapper) {
|
||||
// Get current type_config:
|
||||
using type_config = typename decltype(type_config_wrapper)::type;
|
||||
nvbench::tl::foreach<type_configs>(
|
||||
[&self = *this, &states = m_benchmark.m_states, &type_config_index, &device](
|
||||
auto type_config_wrapper) {
|
||||
// Get current type_config:
|
||||
using type_config = typename decltype(type_config_wrapper)::type;
|
||||
|
||||
// Find states with the current device / type_config
|
||||
for (nvbench::state &cur_state : states)
|
||||
{
|
||||
if (cur_state.get_device() == device &&
|
||||
cur_state.get_type_config_index() == type_config_index)
|
||||
// Find states with the current device / type_config
|
||||
for (nvbench::state &cur_state : states)
|
||||
{
|
||||
self.run_state_prologue(cur_state);
|
||||
try
|
||||
if (cur_state.get_device() == device &&
|
||||
cur_state.get_type_config_index() == type_config_index)
|
||||
{
|
||||
kernel_generator{}(cur_state, type_config{});
|
||||
if (cur_state.is_skipped())
|
||||
self.run_state_prologue(cur_state);
|
||||
try
|
||||
{
|
||||
self.print_skip_notification(cur_state);
|
||||
kernel_generator{}(cur_state, type_config{});
|
||||
if (cur_state.is_skipped())
|
||||
{
|
||||
self.print_skip_notification(cur_state);
|
||||
}
|
||||
}
|
||||
catch (std::exception &e)
|
||||
{
|
||||
self.handle_sampling_exception(e, cur_state);
|
||||
}
|
||||
self.run_state_epilogue(cur_state);
|
||||
}
|
||||
catch (std::exception &e)
|
||||
{
|
||||
self.handle_sampling_exception(e, cur_state);
|
||||
}
|
||||
self.run_state_epilogue(cur_state);
|
||||
}
|
||||
}
|
||||
|
||||
++type_config_index;
|
||||
});
|
||||
++type_config_index;
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -35,8 +35,7 @@ void runner_base::generate_states()
|
||||
m_benchmark.m_states = nvbench::detail::state_generator::create(m_benchmark);
|
||||
}
|
||||
|
||||
void runner_base::handle_sampling_exception(const std::exception &e,
|
||||
state &exec_state) const
|
||||
void runner_base::handle_sampling_exception(const std::exception &e, state &exec_state) const
|
||||
{
|
||||
// If the state is skipped, that means the execution framework class handled
|
||||
// the error already.
|
||||
@@ -62,8 +61,7 @@ void runner_base::handle_sampling_exception(const std::exception &e,
|
||||
void runner_base::run_state_prologue(nvbench::state &exec_state) const
|
||||
{
|
||||
// Log if a printer exists:
|
||||
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
|
||||
printer_opt_ref.has_value())
|
||||
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
|
||||
{
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
printer.log_run_state(exec_state);
|
||||
@@ -73,19 +71,16 @@ void runner_base::run_state_prologue(nvbench::state &exec_state) const
|
||||
void runner_base::run_state_epilogue(state &exec_state) const
|
||||
{
|
||||
// Notify the printer that the state has completed:
|
||||
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
|
||||
printer_opt_ref.has_value())
|
||||
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
|
||||
{
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
printer.add_completed_state();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void runner_base::print_skip_notification(state &exec_state) const
|
||||
{
|
||||
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
|
||||
printer_opt_ref.has_value())
|
||||
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
|
||||
{
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
printer.log(nvbench::log_level::skip, exec_state.get_skip_reason());
|
||||
|
||||
@@ -58,106 +58,68 @@ struct state_tester;
|
||||
struct state
|
||||
{
|
||||
// move-only
|
||||
state(const state &) = delete;
|
||||
state(state &&) = default;
|
||||
state(const state &) = delete;
|
||||
state(state &&) = default;
|
||||
state &operator=(const state &) = delete;
|
||||
state &operator=(state &&) = default;
|
||||
state &operator=(state &&) = default;
|
||||
|
||||
[[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const
|
||||
{
|
||||
return m_cuda_stream;
|
||||
}
|
||||
void set_cuda_stream(nvbench::cuda_stream &&stream)
|
||||
{
|
||||
m_cuda_stream = std::move(stream);
|
||||
}
|
||||
[[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const { return m_cuda_stream; }
|
||||
void set_cuda_stream(nvbench::cuda_stream &&stream) { m_cuda_stream = std::move(stream); }
|
||||
|
||||
/// The CUDA device associated with this benchmark state. May be
|
||||
/// nullopt for CPU-only benchmarks.
|
||||
[[nodiscard]] const std::optional<nvbench::device_info> &get_device() const
|
||||
{
|
||||
return m_device;
|
||||
}
|
||||
[[nodiscard]] const std::optional<nvbench::device_info> &get_device() const { return m_device; }
|
||||
|
||||
/// An index into a benchmark::type_configs type_list. Returns 0 if no type
|
||||
/// axes in the associated benchmark.
|
||||
[[nodiscard]] std::size_t get_type_config_index() const
|
||||
{
|
||||
return m_type_config_index;
|
||||
}
|
||||
[[nodiscard]] std::size_t get_type_config_index() const { return m_type_config_index; }
|
||||
|
||||
[[nodiscard]] nvbench::int64_t get_int64(const std::string &axis_name) const;
|
||||
[[nodiscard]] nvbench::int64_t
|
||||
get_int64_or_default(const std::string &axis_name,
|
||||
nvbench::int64_t default_value) const;
|
||||
[[nodiscard]] nvbench::int64_t get_int64_or_default(const std::string &axis_name,
|
||||
nvbench::int64_t default_value) const;
|
||||
|
||||
[[nodiscard]] nvbench::float64_t
|
||||
get_float64(const std::string &axis_name) const;
|
||||
[[nodiscard]] nvbench::float64_t
|
||||
get_float64_or_default(const std::string &axis_name,
|
||||
nvbench::float64_t default_value) const;
|
||||
[[nodiscard]] nvbench::float64_t get_float64(const std::string &axis_name) const;
|
||||
[[nodiscard]] nvbench::float64_t get_float64_or_default(const std::string &axis_name,
|
||||
nvbench::float64_t default_value) const;
|
||||
|
||||
[[nodiscard]] const std::string &
|
||||
get_string(const std::string &axis_name) const;
|
||||
[[nodiscard]] const std::string &
|
||||
get_string_or_default(const std::string &axis_name,
|
||||
const std::string &default_value) const;
|
||||
[[nodiscard]] const std::string &get_string(const std::string &axis_name) const;
|
||||
[[nodiscard]] const std::string &get_string_or_default(const std::string &axis_name,
|
||||
const std::string &default_value) const;
|
||||
|
||||
void add_element_count(std::size_t elements, std::string column_name = {});
|
||||
|
||||
void set_element_count(std::size_t elements) { m_element_count = elements; }
|
||||
[[nodiscard]] std::size_t get_element_count() const
|
||||
{
|
||||
return m_element_count;
|
||||
}
|
||||
[[nodiscard]] std::size_t get_element_count() const { return m_element_count; }
|
||||
|
||||
template <typename ElementType>
|
||||
void add_global_memory_reads(std::size_t count, std::string column_name = {})
|
||||
{
|
||||
this->add_global_memory_reads(count * sizeof(ElementType),
|
||||
std::move(column_name));
|
||||
this->add_global_memory_reads(count * sizeof(ElementType), std::move(column_name));
|
||||
}
|
||||
void add_global_memory_reads(std::size_t bytes, std::string column_name = {});
|
||||
|
||||
template <typename ElementType>
|
||||
void add_global_memory_writes(std::size_t count, std::string column_name = {})
|
||||
{
|
||||
this->add_global_memory_writes(count * sizeof(ElementType),
|
||||
std::move(column_name));
|
||||
this->add_global_memory_writes(count * sizeof(ElementType), std::move(column_name));
|
||||
}
|
||||
void add_global_memory_writes(std::size_t bytes,
|
||||
std::string column_name = {});
|
||||
void add_global_memory_writes(std::size_t bytes, std::string column_name = {});
|
||||
|
||||
void add_buffer_size(std::size_t num_bytes,
|
||||
std::string summary_tag,
|
||||
std::string column_name = {},
|
||||
std::string description = {});
|
||||
|
||||
void set_global_memory_rw_bytes(std::size_t bytes)
|
||||
{
|
||||
m_global_memory_rw_bytes = bytes;
|
||||
}
|
||||
[[nodiscard]] std::size_t get_global_memory_rw_bytes() const
|
||||
{
|
||||
return m_global_memory_rw_bytes;
|
||||
}
|
||||
void set_global_memory_rw_bytes(std::size_t bytes) { m_global_memory_rw_bytes = bytes; }
|
||||
[[nodiscard]] std::size_t get_global_memory_rw_bytes() const { return m_global_memory_rw_bytes; }
|
||||
|
||||
void skip(std::string reason) { m_skip_reason = std::move(reason); }
|
||||
[[nodiscard]] bool is_skipped() const { return !m_skip_reason.empty(); }
|
||||
[[nodiscard]] const std::string &get_skip_reason() const
|
||||
{
|
||||
return m_skip_reason;
|
||||
}
|
||||
[[nodiscard]] const std::string &get_skip_reason() const { return m_skip_reason; }
|
||||
|
||||
/// Execute at least this many trials per measurement. @{
|
||||
[[nodiscard]] nvbench::int64_t get_min_samples() const
|
||||
{
|
||||
return m_min_samples;
|
||||
}
|
||||
void set_min_samples(nvbench::int64_t min_samples)
|
||||
{
|
||||
m_min_samples = min_samples;
|
||||
}
|
||||
[[nodiscard]] nvbench::int64_t get_min_samples() const { return m_min_samples; }
|
||||
void set_min_samples(nvbench::int64_t min_samples) { m_min_samples = min_samples; }
|
||||
/// @}
|
||||
|
||||
/// If true, the benchmark is only run once, skipping all warmup runs and only
|
||||
@@ -222,20 +184,14 @@ struct state
|
||||
}
|
||||
///@}
|
||||
|
||||
[[nodiscard]] const named_values &get_axis_values() const
|
||||
{
|
||||
return m_axis_values;
|
||||
}
|
||||
[[nodiscard]] const named_values &get_axis_values() const { return m_axis_values; }
|
||||
|
||||
/*!
|
||||
* Return a string of "axis_name1=input_string1 axis_name2=input_string2 ..."
|
||||
*/
|
||||
[[nodiscard]] std::string get_axis_values_as_string(bool color = false) const;
|
||||
|
||||
[[nodiscard]] const benchmark_base &get_benchmark() const
|
||||
{
|
||||
return m_benchmark;
|
||||
}
|
||||
[[nodiscard]] const benchmark_base &get_benchmark() const { return m_benchmark; }
|
||||
|
||||
void collect_l1_hit_rates() { m_collect_l1_hit_rates = true; }
|
||||
void collect_l2_hit_rates() { m_collect_l2_hit_rates = true; }
|
||||
@@ -252,26 +208,11 @@ struct state
|
||||
collect_dram_throughput();
|
||||
}
|
||||
|
||||
[[nodiscard]] bool is_l1_hit_rate_collected() const
|
||||
{
|
||||
return m_collect_l1_hit_rates;
|
||||
}
|
||||
[[nodiscard]] bool is_l2_hit_rate_collected() const
|
||||
{
|
||||
return m_collect_l2_hit_rates;
|
||||
}
|
||||
[[nodiscard]] bool is_stores_efficiency_collected() const
|
||||
{
|
||||
return m_collect_stores_efficiency;
|
||||
}
|
||||
[[nodiscard]] bool is_loads_efficiency_collected() const
|
||||
{
|
||||
return m_collect_loads_efficiency;
|
||||
}
|
||||
[[nodiscard]] bool is_dram_throughput_collected() const
|
||||
{
|
||||
return m_collect_dram_throughput;
|
||||
}
|
||||
[[nodiscard]] bool is_l1_hit_rate_collected() const { return m_collect_l1_hit_rates; }
|
||||
[[nodiscard]] bool is_l2_hit_rate_collected() const { return m_collect_l2_hit_rates; }
|
||||
[[nodiscard]] bool is_stores_efficiency_collected() const { return m_collect_stores_efficiency; }
|
||||
[[nodiscard]] bool is_loads_efficiency_collected() const { return m_collect_loads_efficiency; }
|
||||
[[nodiscard]] bool is_dram_throughput_collected() const { return m_collect_dram_throughput; }
|
||||
|
||||
[[nodiscard]] bool is_cupti_required() const
|
||||
{
|
||||
@@ -306,8 +247,7 @@ struct state
|
||||
template <typename KernelLauncher>
|
||||
void exec(KernelLauncher &&kernel_launcher)
|
||||
{
|
||||
this->exec(nvbench::exec_tag::none,
|
||||
std::forward<KernelLauncher>(kernel_launcher));
|
||||
this->exec(nvbench::exec_tag::none, std::forward<KernelLauncher>(kernel_launcher));
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
@@ -65,9 +65,8 @@ nvbench::int64_t state::get_int64(const std::string &axis_name) const
|
||||
return m_axis_values.get_int64(axis_name);
|
||||
}
|
||||
|
||||
nvbench::int64_t
|
||||
state::get_int64_or_default(const std::string &axis_name,
|
||||
nvbench::int64_t default_value) const
|
||||
nvbench::int64_t state::get_int64_or_default(const std::string &axis_name,
|
||||
nvbench::int64_t default_value) const
|
||||
try
|
||||
{
|
||||
return this->get_int64(axis_name);
|
||||
@@ -82,9 +81,8 @@ nvbench::float64_t state::get_float64(const std::string &axis_name) const
|
||||
return m_axis_values.get_float64(axis_name);
|
||||
}
|
||||
|
||||
nvbench::float64_t
|
||||
state::get_float64_or_default(const std::string &axis_name,
|
||||
nvbench::float64_t default_value) const
|
||||
nvbench::float64_t state::get_float64_or_default(const std::string &axis_name,
|
||||
nvbench::float64_t default_value) const
|
||||
try
|
||||
{
|
||||
return this->get_float64(axis_name);
|
||||
@@ -99,9 +97,8 @@ const std::string &state::get_string(const std::string &axis_name) const
|
||||
return m_axis_values.get_string(axis_name);
|
||||
}
|
||||
|
||||
const std::string &
|
||||
state::get_string_or_default(const std::string &axis_name,
|
||||
const std::string &default_value) const
|
||||
const std::string &state::get_string_or_default(const std::string &axis_name,
|
||||
const std::string &default_value) const
|
||||
try
|
||||
{
|
||||
return this->get_string(axis_name);
|
||||
@@ -125,20 +122,18 @@ summary &state::add_summary(summary s)
|
||||
const summary &state::get_summary(std::string_view tag) const
|
||||
{
|
||||
// Check tags first
|
||||
auto iter =
|
||||
std::find_if(m_summaries.cbegin(),
|
||||
m_summaries.cend(),
|
||||
[&tag](const auto &s) { return s.get_tag() == tag; });
|
||||
auto iter = std::find_if(m_summaries.cbegin(), m_summaries.cend(), [&tag](const auto &s) {
|
||||
return s.get_tag() == tag;
|
||||
});
|
||||
if (iter != m_summaries.cend())
|
||||
{
|
||||
return *iter;
|
||||
}
|
||||
|
||||
// Then names:
|
||||
iter =
|
||||
std::find_if(m_summaries.cbegin(),
|
||||
m_summaries.cend(),
|
||||
[&tag](const auto &s) { return s.get_string("name") == tag; });
|
||||
iter = std::find_if(m_summaries.cbegin(), m_summaries.cend(), [&tag](const auto &s) {
|
||||
return s.get_string("name") == tag;
|
||||
});
|
||||
if (iter != m_summaries.cend())
|
||||
{
|
||||
return *iter;
|
||||
@@ -150,20 +145,18 @@ const summary &state::get_summary(std::string_view tag) const
|
||||
summary &state::get_summary(std::string_view tag)
|
||||
{
|
||||
// Check tags first
|
||||
auto iter =
|
||||
std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
|
||||
return s.get_tag() == tag;
|
||||
});
|
||||
auto iter = std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
|
||||
return s.get_tag() == tag;
|
||||
});
|
||||
if (iter != m_summaries.end())
|
||||
{
|
||||
return *iter;
|
||||
}
|
||||
|
||||
// Then names:
|
||||
iter =
|
||||
std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
|
||||
return s.get_string("name") == tag;
|
||||
});
|
||||
iter = std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
|
||||
return s.get_string("name") == tag;
|
||||
});
|
||||
if (iter != m_summaries.end())
|
||||
{
|
||||
return *iter;
|
||||
@@ -187,18 +180,17 @@ std::string state::get_axis_values_as_string(bool color) const
|
||||
// Create a Key=Value list of all parameters:
|
||||
fmt::memory_buffer buffer;
|
||||
|
||||
auto append_key_value = [&buffer, &style](const std::string &key,
|
||||
const auto &value,
|
||||
std::string value_fmtstr = "{}") {
|
||||
constexpr auto key_format = fmt::emphasis::italic;
|
||||
constexpr auto value_format = fmt::emphasis::bold;
|
||||
auto append_key_value =
|
||||
[&buffer, &style](const std::string &key, const auto &value, std::string value_fmtstr = "{}") {
|
||||
constexpr auto key_format = fmt::emphasis::italic;
|
||||
constexpr auto value_format = fmt::emphasis::bold;
|
||||
|
||||
fmt::format_to(buffer,
|
||||
"{}{}={}",
|
||||
buffer.size() == 0 ? "" : " ",
|
||||
fmt::format(style(key_format), "{}", key),
|
||||
fmt::format(style(value_format), value_fmtstr, value));
|
||||
};
|
||||
fmt::format_to(buffer,
|
||||
"{}{}={}",
|
||||
buffer.size() == 0 ? "" : " ",
|
||||
fmt::format(style(key_format), "{}", key),
|
||||
fmt::format(style(value_format), value_fmtstr, value));
|
||||
};
|
||||
|
||||
if (m_device)
|
||||
{
|
||||
@@ -211,8 +203,7 @@ std::string state::get_axis_values_as_string(bool color) const
|
||||
const auto axis_type = m_axis_values.get_type(name);
|
||||
|
||||
// Handle power-of-two int64 axes differently:
|
||||
if (axis_type == named_values::type::int64 &&
|
||||
axes.get_int64_axis(name).is_power_of_two())
|
||||
if (axis_type == named_values::type::int64 && axes.get_int64_axis(name).is_power_of_two())
|
||||
{
|
||||
const nvbench::int64_t value = m_axis_values.get_int64(name);
|
||||
const nvbench::int64_t exponent = int64_axis::compute_log2(value);
|
||||
@@ -242,10 +233,9 @@ std::string state::get_short_description(bool color) const
|
||||
return color ? fmt_style : no_style;
|
||||
};
|
||||
|
||||
return fmt::format(
|
||||
"{} [{}]",
|
||||
fmt::format(style(fmt::emphasis::bold), "{}", m_benchmark.get().get_name()),
|
||||
this->get_axis_values_as_string(color));
|
||||
return fmt::format("{} [{}]",
|
||||
fmt::format(style(fmt::emphasis::bold), "{}", m_benchmark.get().get_name()),
|
||||
this->get_axis_values_as_string(color));
|
||||
}
|
||||
|
||||
void state::add_element_count(std::size_t elements, std::string column_name)
|
||||
|
||||
@@ -36,25 +36,13 @@ struct string_axis final : public axis_base
|
||||
|
||||
~string_axis() final;
|
||||
|
||||
void set_inputs(std::vector<std::string> inputs)
|
||||
{
|
||||
m_values = std::move(inputs);
|
||||
}
|
||||
[[nodiscard]] const std::string &get_value(std::size_t i) const
|
||||
{
|
||||
return m_values[i];
|
||||
}
|
||||
void set_inputs(std::vector<std::string> inputs) { m_values = std::move(inputs); }
|
||||
[[nodiscard]] const std::string &get_value(std::size_t i) const { return m_values[i]; }
|
||||
|
||||
private:
|
||||
std::unique_ptr<axis_base> do_clone() const
|
||||
{
|
||||
return std::make_unique<string_axis>(*this);
|
||||
}
|
||||
std::unique_ptr<axis_base> do_clone() const { return std::make_unique<string_axis>(*this); }
|
||||
std::size_t do_get_size() const final { return m_values.size(); }
|
||||
std::string do_get_input_string(std::size_t i) const final
|
||||
{
|
||||
return m_values[i];
|
||||
}
|
||||
std::string do_get_input_string(std::size_t i) const final { return m_values[i]; }
|
||||
std::string do_get_description(std::size_t) const final { return {}; }
|
||||
|
||||
std::vector<std::string> m_values;
|
||||
|
||||
@@ -92,10 +92,10 @@ struct summary : public nvbench::named_values
|
||||
{}
|
||||
|
||||
// move-only
|
||||
summary(const summary &) = delete;
|
||||
summary(summary &&) = default;
|
||||
summary(const summary &) = delete;
|
||||
summary(summary &&) = default;
|
||||
summary &operator=(const summary &) = delete;
|
||||
summary &operator=(summary &&) = default;
|
||||
summary &operator=(summary &&) = default;
|
||||
|
||||
void set_tag(std::string tag) { m_tag = std::move(tag); }
|
||||
[[nodiscard]] const std::string &get_tag() const { return m_tag; }
|
||||
|
||||
@@ -38,8 +38,8 @@ namespace nvbench
|
||||
__global__ void sleep_kernel(double seconds)
|
||||
{
|
||||
const auto start = cuda::std::chrono::high_resolution_clock::now();
|
||||
const auto ns = cuda::std::chrono::nanoseconds(
|
||||
static_cast<nvbench::int64_t>(seconds * 1000 * 1000 * 1000));
|
||||
const auto ns =
|
||||
cuda::std::chrono::nanoseconds(static_cast<nvbench::int64_t>(seconds * 1000 * 1000 * 1000));
|
||||
const auto finish = start + ns;
|
||||
|
||||
auto now = cuda::std::chrono::high_resolution_clock::now();
|
||||
@@ -53,7 +53,7 @@ __global__ void sleep_kernel(double seconds)
|
||||
* Naive copy of `n` values from `in` -> `out`.
|
||||
*/
|
||||
template <typename T, typename U>
|
||||
__global__ void copy_kernel(const T* in, U* out, std::size_t n)
|
||||
__global__ void copy_kernel(const T *in, U *out, std::size_t n)
|
||||
{
|
||||
const auto init = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const auto step = blockDim.x * gridDim.x;
|
||||
@@ -68,7 +68,7 @@ __global__ void copy_kernel(const T* in, U* out, std::size_t n)
|
||||
* For `i <- [0,n)`, `out[i] = in[i] % 2`.
|
||||
*/
|
||||
template <typename T, typename U>
|
||||
__global__ void mod2_kernel(const T* in, U* out, std::size_t n)
|
||||
__global__ void mod2_kernel(const T *in, U *out, std::size_t n)
|
||||
{
|
||||
const auto init = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const auto step = blockDim.x * gridDim.x;
|
||||
@@ -79,4 +79,4 @@ __global__ void mod2_kernel(const T* in, U* out, std::size_t n)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace nvbench
|
||||
|
||||
@@ -43,7 +43,7 @@ struct type_axis final : public axis_base
|
||||
template <typename TypeList>
|
||||
void set_inputs();
|
||||
|
||||
void set_active_inputs(const std::vector<std::string>& inputs);
|
||||
void set_active_inputs(const std::vector<std::string> &inputs);
|
||||
|
||||
[[nodiscard]] bool get_is_active(const std::string &input) const;
|
||||
[[nodiscard]] bool get_is_active(std::size_t index) const;
|
||||
@@ -57,23 +57,13 @@ struct type_axis final : public axis_base
|
||||
/**
|
||||
* The index in this axis of the type with the specified `input_string`.
|
||||
*/
|
||||
[[nodiscard]] std::size_t
|
||||
get_type_index(const std::string &input_string) const;
|
||||
[[nodiscard]] std::size_t get_type_index(const std::string &input_string) const;
|
||||
|
||||
private:
|
||||
std::unique_ptr<axis_base> do_clone() const
|
||||
{
|
||||
return std::make_unique<type_axis>(*this);
|
||||
}
|
||||
std::unique_ptr<axis_base> do_clone() const { return std::make_unique<type_axis>(*this); }
|
||||
std::size_t do_get_size() const final { return m_input_strings.size(); }
|
||||
std::string do_get_input_string(std::size_t i) const final
|
||||
{
|
||||
return m_input_strings[i];
|
||||
}
|
||||
std::string do_get_description(std::size_t i) const final
|
||||
{
|
||||
return m_descriptions[i];
|
||||
}
|
||||
std::string do_get_input_string(std::size_t i) const final { return m_input_strings[i]; }
|
||||
std::string do_get_description(std::size_t i) const final { return m_descriptions[i]; }
|
||||
|
||||
std::vector<std::string> m_input_strings;
|
||||
std::vector<std::string> m_descriptions;
|
||||
|
||||
@@ -35,10 +35,10 @@ void type_axis::set_active_inputs(const std::vector<std::string> &inputs)
|
||||
{
|
||||
m_mask.clear();
|
||||
m_mask.resize(m_input_strings.size(), false);
|
||||
for (const auto& input : inputs)
|
||||
for (const auto &input : inputs)
|
||||
{
|
||||
const auto idx = this->get_type_index(input);
|
||||
m_mask[idx] = true;
|
||||
m_mask[idx] = true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,21 +47,16 @@ bool type_axis::get_is_active(const std::string &input) const
|
||||
return this->get_is_active(this->get_type_index(input));
|
||||
}
|
||||
|
||||
bool type_axis::get_is_active(std::size_t idx) const
|
||||
{
|
||||
return m_mask.at(idx);
|
||||
}
|
||||
bool type_axis::get_is_active(std::size_t idx) const { return m_mask.at(idx); }
|
||||
|
||||
std::size_t type_axis::get_active_count() const
|
||||
{
|
||||
return static_cast<std::size_t>(
|
||||
std::count(m_mask.cbegin(), m_mask.cend(), true));
|
||||
return static_cast<std::size_t>(std::count(m_mask.cbegin(), m_mask.cend(), true));
|
||||
}
|
||||
|
||||
std::size_t type_axis::get_type_index(const std::string &input_string) const
|
||||
{
|
||||
auto it =
|
||||
std::find(m_input_strings.cbegin(), m_input_strings.cend(), input_string);
|
||||
auto it = std::find(m_input_strings.cbegin(), m_input_strings.cend(), input_string);
|
||||
if (it == m_input_strings.end())
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
|
||||
@@ -30,17 +30,17 @@ namespace nvbench
|
||||
std::string demangle(const std::string &str);
|
||||
|
||||
template <typename T>
|
||||
std::string demangle() { return demangle(typeid(T).name()); }
|
||||
std::string demangle()
|
||||
{
|
||||
return demangle(typeid(T).name());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct type_strings
|
||||
{
|
||||
// The string used to identify the type in shorthand (e.g. output tables and
|
||||
// CLI options):
|
||||
static std::string input_string()
|
||||
{
|
||||
return nvbench::demangle<T>();
|
||||
}
|
||||
static std::string input_string() { return nvbench::demangle<T>(); }
|
||||
|
||||
// A more descriptive identifier for the type, if input_string is not a common
|
||||
// identifier. May be blank if `input_string` is obvious.
|
||||
@@ -56,10 +56,7 @@ struct type_strings<std::integral_constant<T, Value>>
|
||||
|
||||
// A more descriptive identifier for the type, if input_string is not a common
|
||||
// identifier. May be blank if `input_string` is obvious.
|
||||
static std::string description()
|
||||
{
|
||||
return nvbench::demangle<std::integral_constant<T, Value>>();
|
||||
}
|
||||
static std::string description() { return nvbench::demangle<std::integral_constant<T, Value>>(); }
|
||||
};
|
||||
|
||||
} // namespace nvbench
|
||||
@@ -67,15 +64,15 @@ struct type_strings<std::integral_constant<T, Value>>
|
||||
/*!
|
||||
* Declare an `input_string` and `description` to use with a specific `type`.
|
||||
*/
|
||||
#define NVBENCH_DECLARE_TYPE_STRINGS(Type, InputString, Description) \
|
||||
namespace nvbench \
|
||||
{ \
|
||||
template <> \
|
||||
struct type_strings<Type> \
|
||||
{ \
|
||||
static std::string input_string() { return {InputString}; } \
|
||||
static std::string description() { return {Description}; } \
|
||||
}; \
|
||||
#define NVBENCH_DECLARE_TYPE_STRINGS(Type, InputString, Description) \
|
||||
namespace nvbench \
|
||||
{ \
|
||||
template <> \
|
||||
struct type_strings<Type> \
|
||||
{ \
|
||||
static std::string input_string() { return {InputString}; } \
|
||||
static std::string description() { return {Description}; } \
|
||||
}; \
|
||||
}
|
||||
|
||||
NVBENCH_DECLARE_TYPE_STRINGS(nvbench::int8_t, "I8", "int8_t");
|
||||
|
||||
Reference in New Issue
Block a user