mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-03-14 20:27:24 +00:00
Merge pull request #322 from oleksandr-pavlyk/feature/save-frequencies
Save frequencies when bulk-saving of times is enabled. SM clock rates are now always collected, even if the throttling threshold is set to zero.
This commit is contained in:
@@ -44,7 +44,7 @@ measure_cold_base::measure_cold_base(state &exec_state)
|
||||
exec_state.get_stopping_criterion())}
|
||||
, m_disable_blocking_kernel{exec_state.get_disable_blocking_kernel()}
|
||||
, m_run_once{exec_state.get_run_once()}
|
||||
, m_check_throttling(!exec_state.get_run_once() && exec_state.get_throttle_threshold() > 0.f)
|
||||
, m_check_throttling(!exec_state.get_run_once())
|
||||
, m_min_samples{exec_state.get_min_samples()}
|
||||
, m_skip_time{exec_state.get_skip_time()}
|
||||
, m_timeout{exec_state.get_timeout()}
|
||||
@@ -53,8 +53,10 @@ measure_cold_base::measure_cold_base(state &exec_state)
|
||||
{
|
||||
if (m_min_samples > 0)
|
||||
{
|
||||
m_cuda_times.reserve(static_cast<std::size_t>(m_min_samples));
|
||||
m_cpu_times.reserve(static_cast<std::size_t>(m_min_samples));
|
||||
const auto reserve_size = static_cast<std::size_t>(m_min_samples);
|
||||
m_sm_clock_rates.reserve(reserve_size);
|
||||
m_cuda_times.reserve(reserve_size);
|
||||
m_cpu_times.reserve(reserve_size);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -86,6 +88,7 @@ void measure_cold_base::initialize()
|
||||
m_dynamic_throttle_recovery_delay = m_throttle_recovery_delay;
|
||||
m_throttle_discard_count = 0;
|
||||
|
||||
m_sm_clock_rates.clear();
|
||||
m_cuda_times.clear();
|
||||
m_cpu_times.clear();
|
||||
|
||||
@@ -140,6 +143,7 @@ void measure_cold_base::record_measurements()
|
||||
}
|
||||
m_throttle_discard_count = 0;
|
||||
|
||||
m_sm_clock_rates.push_back(current_clock_rate);
|
||||
m_sm_clock_rate_accumulator += current_clock_rate;
|
||||
}
|
||||
|
||||
@@ -445,6 +449,7 @@ void measure_cold_base::generate_summaries()
|
||||
m_total_samples));
|
||||
|
||||
printer.process_bulk_data(m_state, "nv/cold/sample_times", "sample_times", m_cuda_times);
|
||||
printer.process_bulk_data(m_state, "nv/cold/sample_freqs", "sample_freqs", m_sm_clock_rates);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -85,27 +85,27 @@ protected:
|
||||
nvbench::state &m_state;
|
||||
|
||||
nvbench::launch m_launch;
|
||||
nvbench::cuda_timer m_cuda_timer;
|
||||
nvbench::cpu_timer m_cpu_timer;
|
||||
nvbench::cpu_timer m_walltime_timer;
|
||||
nvbench::detail::l2flush m_l2flush;
|
||||
nvbench::blocking_kernel m_blocker;
|
||||
nvbench::cuda_timer m_cuda_timer{};
|
||||
nvbench::cpu_timer m_cpu_timer{};
|
||||
nvbench::cpu_timer m_walltime_timer{};
|
||||
nvbench::detail::l2flush m_l2flush{};
|
||||
nvbench::blocking_kernel m_blocker{};
|
||||
|
||||
nvbench::criterion_params m_criterion_params;
|
||||
nvbench::criterion_params m_criterion_params{};
|
||||
nvbench::stopping_criterion_base &m_stopping_criterion;
|
||||
nvbench::detail::gpu_frequency m_gpu_frequency;
|
||||
nvbench::detail::gpu_frequency m_gpu_frequency{};
|
||||
|
||||
bool m_disable_blocking_kernel{false};
|
||||
bool m_run_once{false};
|
||||
bool m_check_throttling;
|
||||
bool m_check_throttling{true};
|
||||
|
||||
nvbench::int64_t m_min_samples{};
|
||||
|
||||
nvbench::float64_t m_skip_time{};
|
||||
nvbench::float64_t m_timeout{};
|
||||
|
||||
nvbench::float32_t m_throttle_threshold; // [% of default SM clock rate]
|
||||
nvbench::float32_t m_throttle_recovery_delay; // [seconds]
|
||||
nvbench::float32_t m_throttle_threshold{}; // [% of default SM clock rate]
|
||||
nvbench::float32_t m_throttle_recovery_delay{}; // [seconds]
|
||||
|
||||
// Dynamically increased when repeated throttling occurs
|
||||
// without successfully recording a sample.
|
||||
@@ -123,11 +123,12 @@ protected:
|
||||
nvbench::float64_t m_total_cpu_time{};
|
||||
|
||||
nvbench::float64_t m_sm_clock_rate_accumulator{};
|
||||
std::vector<nvbench::float64_t> m_sm_clock_rates{};
|
||||
|
||||
std::vector<nvbench::float64_t> m_cuda_times;
|
||||
std::vector<nvbench::float64_t> m_cpu_times;
|
||||
std::vector<nvbench::float64_t> m_cuda_times{};
|
||||
std::vector<nvbench::float64_t> m_cpu_times{};
|
||||
|
||||
bool m_max_time_exceeded{};
|
||||
bool m_max_time_exceeded{false};
|
||||
};
|
||||
|
||||
struct measure_cold_base::kernel_launch_timer
|
||||
@@ -136,25 +137,40 @@ struct measure_cold_base::kernel_launch_timer
|
||||
: m_measure{measure}
|
||||
, m_disable_blocking_kernel{measure.m_disable_blocking_kernel}
|
||||
, m_run_once{measure.m_run_once}
|
||||
, m_check_throttling{measure.m_check_throttling}
|
||||
{}
|
||||
|
||||
explicit kernel_launch_timer(measure_cold_base &measure, bool disable_blocking_kernel)
|
||||
: m_measure{measure}
|
||||
, m_disable_blocking_kernel{disable_blocking_kernel}
|
||||
, m_run_once{measure.m_run_once}
|
||||
, m_check_throttling{measure.m_check_throttling}
|
||||
{}
|
||||
|
||||
explicit kernel_launch_timer(measure_cold_base &measure,
|
||||
bool disable_blocking_kernel,
|
||||
bool run_once)
|
||||
bool run_once,
|
||||
bool check_throttling)
|
||||
: m_measure{measure}
|
||||
, m_disable_blocking_kernel{disable_blocking_kernel}
|
||||
, m_run_once{run_once}
|
||||
, m_check_throttling{check_throttling}
|
||||
{}
|
||||
|
||||
__forceinline__ void start()
|
||||
{
|
||||
m_measure.flush_device_l2();
|
||||
m_measure.sync_stream();
|
||||
|
||||
// start CPU timer irrespective of use of blocking kernel
|
||||
// Ref: https://github.com/NVIDIA/nvbench/issues/249
|
||||
m_measure.m_cpu_timer.start();
|
||||
|
||||
if (!m_disable_blocking_kernel)
|
||||
{
|
||||
m_measure.block_stream();
|
||||
}
|
||||
if (m_measure.m_check_throttling)
|
||||
if (m_check_throttling)
|
||||
{
|
||||
m_measure.gpu_frequency_start();
|
||||
}
|
||||
@@ -163,22 +179,19 @@ struct measure_cold_base::kernel_launch_timer
|
||||
m_measure.profiler_start();
|
||||
}
|
||||
m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
|
||||
// start CPU timer irrespective of use of blocking kernel
|
||||
// Ref: https://github.com/NVIDIA/nvbench/issues/249
|
||||
m_measure.m_cpu_timer.start();
|
||||
}
|
||||
|
||||
__forceinline__ void stop()
|
||||
{
|
||||
m_measure.m_cuda_timer.stop(m_measure.m_launch.get_stream());
|
||||
if (m_check_throttling)
|
||||
{
|
||||
m_measure.gpu_frequency_stop();
|
||||
}
|
||||
if (!m_disable_blocking_kernel)
|
||||
{
|
||||
m_measure.unblock_stream();
|
||||
}
|
||||
if (m_measure.m_check_throttling)
|
||||
{
|
||||
m_measure.gpu_frequency_stop();
|
||||
}
|
||||
m_measure.sync_stream();
|
||||
if (m_run_once)
|
||||
{
|
||||
@@ -191,6 +204,7 @@ private:
|
||||
measure_cold_base &m_measure;
|
||||
bool m_disable_blocking_kernel;
|
||||
bool m_run_once;
|
||||
bool m_check_throttling;
|
||||
};
|
||||
|
||||
template <typename KernelLauncher>
|
||||
@@ -227,7 +241,7 @@ private:
|
||||
// disable use of blocking kernel for warm-up run
|
||||
// see https://github.com/NVIDIA/nvbench/issues/240
|
||||
constexpr bool disable_blocking_kernel = true;
|
||||
kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once);
|
||||
kernel_launch_timer timer(*this, disable_blocking_kernel);
|
||||
|
||||
this->launch_kernel(timer);
|
||||
this->check_skip_time(m_cuda_timer.get_duration());
|
||||
@@ -238,7 +252,7 @@ private:
|
||||
// do not use blocking kernel if benchmark is only run once, e.g., when profiling
|
||||
// ref: https://github.com/NVIDIA/nvbench/issues/242
|
||||
const bool disable_blocking_kernel = m_run_once || m_disable_blocking_kernel;
|
||||
kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once);
|
||||
kernel_launch_timer timer(*this, disable_blocking_kernel);
|
||||
do
|
||||
{
|
||||
this->launch_kernel(timer);
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
#include <ostream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
@@ -106,27 +107,49 @@ void write_named_values(JsonNode &node, const nvbench::named_values &values)
|
||||
} // end foreach value name
|
||||
}
|
||||
|
||||
template <std::size_t buffer_nbytes>
|
||||
void write_out_values(std::ofstream &out, const std::vector<nvbench::float64_t> &data)
|
||||
// choose buffer to be block size of modern SSD
|
||||
// see: https://github.com/NVIDIA/nvbench/issues/255
|
||||
static constexpr std::size_t preferred_buffer_nbytes = 4096;
|
||||
|
||||
// Exchanges p[I] with p[N - 1 - I] for every index I in the pack `Is`.
//
// p:  pointer to the first of N bytes; modified in place.
// Is: indices to swap. The caller below passes 0 .. N/2 - 1, which mirrors
//     each byte exactly once and therefore reverses the whole word.
//
// The swaps are expanded with a fold over the comma operator, so there is
// no run-time loop.
template <std::size_t N, std::size_t... Is>
void swap_bytes_impl(char *p, std::index_sequence<Is...>)
{
  ((std::swap(p[Is], p[N - 1 - Is])), ...);
}

// Reverses the byte order of the WordSize-byte word at `word`, in place.
//
// Reversal is its own inverse, so the same routine converts in either
// direction between big- and little-endian layouts.
//
// WordSize: size of the word in bytes. Sizes of 0 and 1 are accepted and
//           leave the buffer untouched; larger sizes must be a power of two.
// word:     pointer to at least WordSize writable bytes.
template <std::size_t WordSize>
void big_endian_to_little_endian(char *word)
{
  if constexpr (WordSize > 1)
  {
    static_assert((WordSize & (WordSize - 1)) == 0, "WordSize must be a power of two");
    // Only the first half of the indices is needed: each swap also places
    // the mirrored byte from the second half.
    swap_bytes_impl<WordSize>(word, std::make_index_sequence<WordSize / 2>{});
  }
}
|
||||
|
||||
template <typename StorageT, std::size_t buffer_nbytes = preferred_buffer_nbytes>
|
||||
void write_out_values_as(std::ofstream &out, const std::vector<nvbench::float64_t> &data)
|
||||
{
|
||||
static_assert(std::is_floating_point_v<StorageT>);
|
||||
static_assert(std::is_convertible_v<nvbench::float64_t, StorageT>);
|
||||
|
||||
static constexpr std::size_t value_nbytes = sizeof(StorageT);
|
||||
static_assert(buffer_nbytes % value_nbytes == 0);
|
||||
|
||||
alignas(alignof(nvbench::float32_t)) char buffer[buffer_nbytes];
|
||||
alignas(alignof(StorageT)) char buffer[buffer_nbytes];
|
||||
std::size_t bytes_in_buffer = 0;
|
||||
|
||||
for (auto value64 : data)
|
||||
{
|
||||
const auto value32 = static_cast<nvbench::float32_t>(value64);
|
||||
const auto value = static_cast<StorageT>(value64);
|
||||
auto value_subbuffer = &buffer[bytes_in_buffer];
|
||||
std::memcpy(value_subbuffer, &value32, value_nbytes);
|
||||
std::memcpy(value_subbuffer, &value, value_nbytes);
|
||||
|
||||
// the c++17 implementation of is_little_endian isn't constexpr, but
|
||||
// all supported compilers optimize this branch as if it were.
|
||||
if (!is_little_endian())
|
||||
{
|
||||
std::swap(value_subbuffer[0], value_subbuffer[3]);
|
||||
std::swap(value_subbuffer[1], value_subbuffer[2]);
|
||||
big_endian_to_little_endian<value_nbytes>(value_subbuffer);
|
||||
}
|
||||
bytes_in_buffer += value_nbytes;
|
||||
|
||||
@@ -145,6 +168,20 @@ void write_out_values(std::ofstream &out, const std::vector<nvbench::float64_t>
|
||||
}
|
||||
}
|
||||
|
||||
// save data using statically downcasting to float32 format
|
||||
template <std::size_t buffer_nbytes = preferred_buffer_nbytes>
|
||||
void write_out_values_as_float32(std::ofstream &out, const std::vector<nvbench::float64_t> &data)
|
||||
{
|
||||
write_out_values_as<nvbench::float32_t, buffer_nbytes>(out, data);
|
||||
}
|
||||
|
||||
// save data using float64 format
|
||||
template <std::size_t buffer_nbytes = preferred_buffer_nbytes>
|
||||
void write_out_values_as_float64(std::ofstream &out, const std::vector<nvbench::float64_t> &data)
|
||||
{
|
||||
write_out_values_as<nvbench::float64_t, buffer_nbytes>(out, data);
|
||||
}
|
||||
|
||||
} // end namespace
|
||||
|
||||
namespace nvbench
|
||||
@@ -206,10 +243,7 @@ void json_printer::do_process_bulk_data_float64(state &state,
|
||||
out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit);
|
||||
out.open(result_path, std::ios::binary | std::ios::out);
|
||||
|
||||
// choose buffer to be block size of modern SSD
|
||||
// see: https://github.com/NVIDIA/nvbench/issues/255
|
||||
constexpr std::size_t buffer_nbytes = 4096;
|
||||
write_out_values<buffer_nbytes>(out, data);
|
||||
write_out_values_as_float32(out, data);
|
||||
}
|
||||
catch (std::exception &e)
|
||||
{
|
||||
@@ -241,6 +275,66 @@ void json_printer::do_process_bulk_data_float64(state &state,
|
||||
fmt::format("Wrote '{}' in {:>6.3f}ms", result_path.string(), timer.get_duration() * 1000));
|
||||
}
|
||||
} // end hint == sample_times
|
||||
|
||||
// Bulk data tagged with the "sample_freqs" hint is written to
// "<stream name>-freqs-bin/<N>.bin" as float32 values, and a summary that
// points at the file is attached to `state`.
if (hint == "sample_freqs")
{
  // Times the whole export so the duration can be reported in the log.
  nvbench::cpu_timer timer;
  timer.start();

  fs::path result_path{m_stream_name + "-freqs-bin/"};
  try
  {
    // Make sure the output directory exists and really is a directory.
    if (!fs::exists(result_path))
    {
      if (!fs::create_directory(result_path))
      {
        // The message is the format string, so the path is interpolated.
        NVBENCH_THROW(std::runtime_error,
                      "Failed to create result directory '{}'.",
                      result_path.string());
      }
    }
    else if (!fs::is_directory(result_path))
    {
      NVBENCH_THROW(std::runtime_error,
                    "'{}' exists and is not a directory.",
                    result_path.string());
    }

    // Files are numbered in the order they are written by this printer.
    const auto file_id = m_num_jsonbin_freq_files++;
    result_path /= fmt::format("{:d}.bin", file_id);

    // Turn stream failures into exceptions so they reach the handler below.
    std::ofstream out;
    out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit);
    out.open(result_path, std::ios::binary | std::ios::out);

    write_out_values_as_float32(out, data);
  }
  catch (const std::exception &e)
  {
    // Best effort: a failed write is reported as a warning, not propagated.
    if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value())
    {
      auto &printer = printer_opt_ref.value().get();
      printer.log(
        nvbench::log_level::warn,
        fmt::format("Error writing {} ({}) to {}: {}", tag, hint, result_path.string(), e.what()));
    }
  } // end catch

  // NOTE(review): the summary and the "Wrote" log below are emitted even when
  // the write above failed -- confirm that this is intended.
  auto &summ = state.add_summary(fmt::format("nv/json/freqs-bin:{}", tag));
  summ.set_string("name", "Samples Frequencies File");
  summ.set_string("hint", "file/sample_freqs");
  summ.set_string("description",
                  "Binary file containing sample frequencies as little-endian "
                  "float32.");
  summ.set_string("filename", result_path.string());
  summ.set_int64("size", static_cast<nvbench::int64_t>(data.size()));
  summ.set_string("hide", "Not needed in table.");

  timer.stop();
  if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value())
  {
    auto &printer = printer_opt_ref.value().get();
    printer.log(
      nvbench::log_level::info,
      fmt::format("Wrote '{}' in {:>6.3f}ms", result_path.string(), timer.get_duration() * 1000));
  }
} // end hint == sample_freqs
|
||||
}
|
||||
|
||||
static void add_devices_section(nlohmann::ordered_json &root)
|
||||
|
||||
@@ -73,6 +73,7 @@ protected:
|
||||
|
||||
bool m_enable_binary_output{false};
|
||||
std::size_t m_num_jsonbin_files{};
|
||||
std::size_t m_num_jsonbin_freq_files{};
|
||||
|
||||
std::vector<std::string> m_argv;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user