Files
nvbench/nvbench/state.cuh
2026-02-02 14:42:07 -06:00

354 lines
13 KiB
Plaintext

/*
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nvbench/cuda_stream.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/named_values.cuh>
#include <nvbench/stopping_criterion.cuh>
#include <nvbench/summary.cuh>
#include <nvbench/types.cuh>

#include <cstddef>
#include <functional>
#include <optional>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
namespace nvbench
{
// Forward declaration: the nvbench::state class definition only stores and
// returns references to its benchmark, so the complete type is not needed
// for the declarations that follow.
struct benchmark_base;
namespace detail
{
// Forward declarations of the two types that nvbench::state declares as
// friends, which gives them access to its private constructors.
struct state_generator;
struct state_tester;
} // namespace detail
/**
* Stores all information about a particular benchmark configuration.
*
* One state object exists for every combination of a benchmark's parameter
* axes. It provides access to:
* - Parameter values (get_int64, get_float64, get_string)
* - The names of parameters from type axes are stored as strings.
* - Skip information (skip, is_skipped, get_skip_reason)
* - If the benchmark fails or is invalid, it may be skipped with an
* informative message.
* - Summaries (add_summary, get_summary, get_summaries)
* - Summaries store measurement information as key/value pairs.
* See nvbench::summary for details.
*/
struct state
{
// move-only
state(const state &) = delete;
state(state &&) = default;
state &operator=(const state &) = delete;
state &operator=(state &&) = default;
/// If a stream exists, return that. Otherwise, create a new stream using the current
/// device (or the current device if none is set), save it, and return it.
/// @sa get_cuda_stream_optional
[[nodiscard]] nvbench::cuda_stream &get_cuda_stream()
{
if (!m_cuda_stream.has_value())
{
m_cuda_stream = nvbench::cuda_stream{m_device};
}
return m_cuda_stream.value();
}
[[nodiscard]] const std::optional<nvbench::cuda_stream> &get_cuda_stream_optional() const
{
return m_cuda_stream;
}
void set_cuda_stream(nvbench::cuda_stream &&stream) { m_cuda_stream = std::move(stream); }
/// The CUDA device associated with with this benchmark state. May be
/// nullopt for CPU-only benchmarks.
[[nodiscard]] const std::optional<nvbench::device_info> &get_device() const { return m_device; }
/// If true, the benchmark measurements only record CPU time and assume no GPU work is performed.
/// @{
// No setter, this should not be modified after construction, as it is a benchmark-wide property.
[[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; }
/// @}
/// An index into a benchmark::type_configs type_list. Returns 0 if no type
/// axes in the associated benchmark.
[[nodiscard]] std::size_t get_type_config_index() const { return m_type_config_index; }
[[nodiscard]] nvbench::int64_t get_int64(const std::string &axis_name) const;
[[nodiscard]] nvbench::int64_t get_int64_or_default(const std::string &axis_name,
nvbench::int64_t default_value) const;
[[nodiscard]] nvbench::float64_t get_float64(const std::string &axis_name) const;
[[nodiscard]] nvbench::float64_t get_float64_or_default(const std::string &axis_name,
nvbench::float64_t default_value) const;
[[nodiscard]] const std::string &get_string(const std::string &axis_name) const;
[[nodiscard]] const std::string &get_string_or_default(const std::string &axis_name,
const std::string &default_value) const;
void add_element_count(std::size_t elements, std::string column_name = {});
void set_element_count(std::size_t elements) { m_element_count = elements; }
[[nodiscard]] std::size_t get_element_count() const { return m_element_count; }
template <typename ElementType>
void add_global_memory_reads(std::size_t count, std::string column_name = {})
{
this->add_global_memory_reads(count * sizeof(ElementType), std::move(column_name));
}
void add_global_memory_reads(std::size_t bytes, std::string column_name = {});
template <typename ElementType>
void add_global_memory_writes(std::size_t count, std::string column_name = {})
{
this->add_global_memory_writes(count * sizeof(ElementType), std::move(column_name));
}
void add_global_memory_writes(std::size_t bytes, std::string column_name = {});
void add_buffer_size(std::size_t num_bytes,
std::string summary_tag,
std::string column_name = {},
std::string description = {});
void set_global_memory_rw_bytes(std::size_t bytes) { m_global_memory_rw_bytes = bytes; }
[[nodiscard]] std::size_t get_global_memory_rw_bytes() const { return m_global_memory_rw_bytes; }
void skip(std::string reason) { m_skip_reason = std::move(reason); }
[[nodiscard]] bool is_skipped() const { return !m_skip_reason.empty(); }
[[nodiscard]] const std::string &get_skip_reason() const { return m_skip_reason; }
/// Execute at least this many trials per measurement. @{
[[nodiscard]] nvbench::int64_t get_min_samples() const { return m_min_samples; }
void set_min_samples(nvbench::int64_t min_samples) { m_min_samples = min_samples; }
/// @}
[[nodiscard]] const nvbench::criterion_params &get_criterion_params() const
{
return m_criterion_params;
}
/// Control the stopping criterion for the measurement loop.
/// @{
[[nodiscard]] const std::string &get_stopping_criterion() const { return m_stopping_criterion; }
void set_stopping_criterion(std::string criterion);
/// @}
/// If true, the benchmark is only run once, skipping all warmup runs and only
/// executing a single non-batched measurement. This is intended for use with
/// external profiling tools. @{
[[nodiscard]] bool get_run_once() const { return m_run_once; }
void set_run_once(bool v) { m_run_once = v; }
/// @}
/// If true, the batched measurements of benchmark are not run. This is intended for use to
/// save resources when only non-batched measurements are of interest, although batched
/// measurements are meaningful and code to exercise them is compiled. This option has no
/// effect for CPU only benchmarks and for benchmarks tagged with no_batch tag. @{
[[nodiscard]] bool get_skip_batched() const { return m_skip_batched; }
void set_skip_batched(bool v) { m_skip_batched = v; }
/// @}
/// If true, the benchmark does not use the blocking_kernel. This is intended
/// for use with external profiling tools. @{
[[nodiscard]] bool get_disable_blocking_kernel() const { return m_disable_blocking_kernel; }
void set_disable_blocking_kernel(bool v) { m_disable_blocking_kernel = v; }
/// @}
/// If a warmup run finishes in less than `skip_time`, the measurement will
/// be skipped.
/// Extremely fast kernels (< 5000 ns) often timeout before they can
/// accumulate `min_time` measurements, and are often uninteresting. Setting
/// this value can help improve performance by skipping time consuming
/// measurement that don't provide much information.
/// Default value is -1., which disables the feature.
/// @{
[[nodiscard]] nvbench::float64_t get_skip_time() const { return m_skip_time; }
void set_skip_time(nvbench::float64_t skip_time) { m_skip_time = skip_time; }
/// @}
/// If a measurement take more than `timeout` seconds to complete, stop the
/// measurement early. A warning should be printed if this happens.
/// This setting overrides all other termination criteria.
/// Note that this is measured in CPU walltime, not sample time.
/// @{
[[nodiscard]] nvbench::float64_t get_timeout() const { return m_timeout; }
void set_timeout(nvbench::float64_t timeout) { m_timeout = timeout; }
/// @}
[[nodiscard]] nvbench::float32_t get_throttle_threshold() const { return m_throttle_threshold; }
void set_throttle_threshold(nvbench::float32_t throttle_threshold)
{
m_throttle_threshold = throttle_threshold;
}
[[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const
{
return m_throttle_recovery_delay;
}
void set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay)
{
m_throttle_recovery_delay = throttle_recovery_delay;
}
/// If a `KernelLauncher` syncs and `nvbench::exec_tag::sync` is not passed
/// to `state.exec(...)`, a deadlock may occur. If a `blocking_kernel` blocks
/// for more than `blocking_kernel_timeout` seconds, an error will be printed
/// and the kernel will unblock to prevent deadlocks.
/// A negative value disables the timeout.
/// @{
[[nodiscard]] nvbench::float64_t get_blocking_kernel_timeout() const
{
return m_blocking_kernel_timeout;
}
void set_blocking_kernel_timeout(nvbench::float64_t timeout)
{
m_blocking_kernel_timeout = timeout;
}
///@}
[[nodiscard]] const named_values &get_axis_values() const { return m_axis_values; }
/*!
* Return a string of "axis_name1=input_string1 axis_name2=input_string2 ..."
*/
[[nodiscard]] std::string get_axis_values_as_string(bool color = false) const;
[[nodiscard]] const benchmark_base &get_benchmark() const { return m_benchmark; }
void collect_l1_hit_rates() { m_collect_l1_hit_rates = true; }
void collect_l2_hit_rates() { m_collect_l2_hit_rates = true; }
void collect_stores_efficiency() { m_collect_stores_efficiency = true; }
void collect_loads_efficiency() { m_collect_loads_efficiency = true; }
void collect_dram_throughput() { m_collect_dram_throughput = true; }
void collect_cupti_metrics()
{
collect_l1_hit_rates();
collect_l2_hit_rates();
collect_stores_efficiency();
collect_loads_efficiency();
collect_dram_throughput();
}
[[nodiscard]] bool is_l1_hit_rate_collected() const { return m_collect_l1_hit_rates; }
[[nodiscard]] bool is_l2_hit_rate_collected() const { return m_collect_l2_hit_rates; }
[[nodiscard]] bool is_stores_efficiency_collected() const { return m_collect_stores_efficiency; }
[[nodiscard]] bool is_loads_efficiency_collected() const { return m_collect_loads_efficiency; }
[[nodiscard]] bool is_dram_throughput_collected() const { return m_collect_dram_throughput; }
[[nodiscard]] bool is_cupti_required() const
{
// clang-format off
return is_l2_hit_rate_collected() ||
is_l1_hit_rate_collected() ||
is_stores_efficiency_collected() ||
is_loads_efficiency_collected() ||
is_dram_throughput_collected();
// clang-format on
}
summary &add_summary(std::string summary_tag);
summary &add_summary(summary s);
[[nodiscard]] const summary &get_summary(std::string_view tag) const;
[[nodiscard]] summary &get_summary(std::string_view tag);
[[nodiscard]] const std::vector<summary> &get_summaries() const;
[[nodiscard]] std::vector<summary> &get_summaries();
/// A single line description of the state:
///
/// ```
/// <bench_name> [<parameters>]
/// ```
[[nodiscard]] std::string get_short_description(bool color = false) const;
// TODO This will need detailed docs and include a reference to an appropriate
// section of the user's guide
template <typename ExecTags, typename KernelLauncher>
void exec(ExecTags, KernelLauncher &&kernel_launcher);
template <typename KernelLauncher>
void exec(KernelLauncher &&kernel_launcher)
{
this->exec(nvbench::exec_tag::none, std::forward<KernelLauncher>(kernel_launcher));
}
private:
friend struct nvbench::detail::state_generator;
friend struct nvbench::detail::state_tester;
explicit state(const benchmark_base &bench);
state(const benchmark_base &bench,
nvbench::named_values values,
std::optional<nvbench::device_info> device,
std::size_t type_config_index);
[[nodiscard]] bool skip_hot_measurement() const { return get_run_once() || get_skip_batched(); }
std::reference_wrapper<const nvbench::benchmark_base> m_benchmark;
nvbench::named_values m_axis_values;
std::optional<nvbench::device_info> m_device;
std::size_t m_type_config_index{};
bool m_is_cpu_only{false};
bool m_run_once{false};
bool m_disable_blocking_kernel{false};
bool m_skip_batched{false};
nvbench::criterion_params m_criterion_params;
std::string m_stopping_criterion;
nvbench::int64_t m_min_samples;
nvbench::float64_t m_skip_time;
nvbench::float64_t m_timeout;
nvbench::float32_t m_throttle_threshold; // [% of default SM clock rate]
nvbench::float32_t m_throttle_recovery_delay; // [seconds]
std::optional<nvbench::cuda_stream> m_cuda_stream;
// Deadlock protection. See blocking_kernel's class doc for details.
nvbench::float64_t m_blocking_kernel_timeout{30.0};
std::vector<nvbench::summary> m_summaries;
std::string m_skip_reason;
std::size_t m_element_count{};
std::size_t m_global_memory_rw_bytes{};
bool m_collect_l1_hit_rates{};
bool m_collect_l2_hit_rates{};
bool m_collect_stores_efficiency{};
bool m_collect_loads_efficiency{};
bool m_collect_dram_throughput{};
};
} // namespace nvbench
// The tagged state::exec(ExecTags, KernelLauncher &&) template is only
// declared inside the class above.
// NOTE(review): the header included here presumably supplies its definition
// and checks NVBENCH_STATE_EXEC_GUARD to reject being included directly --
// confirm against nvbench/detail/state_exec.cuh.
// The macro is defined only for the duration of this include.
#define NVBENCH_STATE_EXEC_GUARD
#include <nvbench/detail/state_exec.cuh>
#undef NVBENCH_STATE_EXEC_GUARD