mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-03-14 20:27:24 +00:00
Merge pull request #206 from gevtushenko/throttle
Discard measurements while GPU is throttling
This commit is contained in:
@@ -133,6 +133,18 @@
|
||||
* Applies to the most recent `--benchmark`, or all benchmarks if specified
|
||||
before any `--benchmark` arguments.
|
||||
|
||||
* `--throttle-threshold <value>`
|
||||
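* Set the GPU throttle threshold as a percentage of the default (peak) SM clock rate. Cold measurements taken while the measured SM clock rate is below this threshold are discarded.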
|
||||
* Default is 75%.
|
||||
* Applies to the most recent `--benchmark`, or all benchmarks if specified
|
||||
before any `--benchmark` arguments.
|
||||
|
||||
* `--throttle-recovery-delay <value>`
|
||||
* Set the GPU throttle recovery delay: how long, in seconds, to pause after a throttled measurement is discarded before measuring again.
|
||||
* Default is 0.05 seconds.
|
||||
* Applies to the most recent `--benchmark`, or all benchmarks if specified
|
||||
before any `--benchmark` arguments.
|
||||
|
||||
* `--run-once`
|
||||
* Only run the benchmark once, skipping any warmup runs and batched
|
||||
measurements.
|
||||
|
||||
@@ -29,6 +29,8 @@ set(srcs
|
||||
detail/measure_hot.cu
|
||||
detail/state_generator.cxx
|
||||
detail/stdrel_criterion.cxx
|
||||
detail/gpu_frequency.cxx
|
||||
detail/timestamps_kernel.cu
|
||||
|
||||
internal/nvml.cxx
|
||||
)
|
||||
|
||||
@@ -247,12 +247,32 @@ struct benchmark_base
|
||||
}
|
||||
/// @}
|
||||
|
||||
[[nodiscard]] nvbench::criterion_params& get_criterion_params() { return m_criterion_params; }
|
||||
[[nodiscard]] const nvbench::criterion_params& get_criterion_params() const { return m_criterion_params; }
|
||||
/// @return The throttle threshold as stored by `set_throttle_threshold`.
/// The default member value is 0.75f, i.e. the value is a fraction of the
/// peak SM clock rate rather than a percentage.
[[nodiscard]] nvbench::float32_t get_throttle_threshold() const
{
  return m_throttle_threshold;
}
|
||||
|
||||
/// Store the throttle threshold for this benchmark.
/// @param throttle_threshold Fraction of the peak SM clock rate (the default
/// member value is 0.75f). The command-line parser converts the user-facing
/// percentage to a fraction before calling this.
void set_throttle_threshold(nvbench::float32_t throttle_threshold) { m_throttle_threshold = throttle_threshold; }
|
||||
|
||||
/// @return The throttle recovery delay, in seconds, as stored by
/// `set_throttle_recovery_delay`.
[[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const { return m_throttle_recovery_delay; }
|
||||
|
||||
/// Store the throttle recovery delay for this benchmark.
/// @param throttle_recovery_delay Delay in seconds.
void set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay) { m_throttle_recovery_delay = throttle_recovery_delay; }
|
||||
|
||||
[[nodiscard]] nvbench::criterion_params &get_criterion_params() { return m_criterion_params; }
|
||||
[[nodiscard]] const nvbench::criterion_params &get_criterion_params() const
|
||||
{
|
||||
return m_criterion_params;
|
||||
}
|
||||
|
||||
/// Control the stopping criterion for the measurement loop.
|
||||
/// @{
|
||||
[[nodiscard]] const std::string& get_stopping_criterion() const { return m_stopping_criterion; }
|
||||
[[nodiscard]] const std::string &get_stopping_criterion() const { return m_stopping_criterion; }
|
||||
benchmark_base &set_stopping_criterion(std::string criterion)
|
||||
{
|
||||
m_stopping_criterion = std::move(criterion);
|
||||
@@ -282,6 +302,9 @@ protected:
|
||||
nvbench::float64_t m_skip_time{-1.};
|
||||
nvbench::float64_t m_timeout{15.};
|
||||
|
||||
nvbench::float32_t m_throttle_threshold{0.75f}; // [fraction of peak SM clock rate; 0.75 == 75%]
|
||||
nvbench::float32_t m_throttle_recovery_delay{0.05f}; // [seconds]
|
||||
|
||||
nvbench::criterion_params m_criterion_params;
|
||||
std::string m_stopping_criterion{"stdrel"};
|
||||
|
||||
|
||||
@@ -17,7 +17,6 @@
|
||||
*/
|
||||
|
||||
#include <nvbench/benchmark_base.cuh>
|
||||
|
||||
#include <nvbench/detail/transform_reduce.cuh>
|
||||
|
||||
namespace nvbench
|
||||
@@ -45,7 +44,10 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
|
||||
result->m_skip_time = m_skip_time;
|
||||
result->m_timeout = m_timeout;
|
||||
|
||||
result->m_criterion_params = m_criterion_params;
|
||||
result->m_criterion_params = m_criterion_params;
|
||||
result->m_throttle_threshold = m_throttle_threshold;
|
||||
result->m_throttle_recovery_delay = m_throttle_recovery_delay;
|
||||
|
||||
result->m_stopping_criterion = m_stopping_criterion;
|
||||
|
||||
return result;
|
||||
|
||||
53
nvbench/detail/gpu_frequency.cuh
Normal file
53
nvbench/detail/gpu_frequency.cuh
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
* Copyright 2025 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <nvbench/detail/timestamps_kernel.cuh>
|
||||
#include <nvbench/types.cuh>
|
||||
|
||||
namespace nvbench::detail
|
||||
{
|
||||
|
||||
struct cuda_stream;
|
||||
|
||||
struct gpu_frequency
|
||||
{
|
||||
gpu_frequency() = default;
|
||||
|
||||
// move-only
|
||||
gpu_frequency(const gpu_frequency &) = delete;
|
||||
gpu_frequency(gpu_frequency &&) = default;
|
||||
gpu_frequency &operator=(const gpu_frequency &) = delete;
|
||||
gpu_frequency &operator=(gpu_frequency &&) = default;
|
||||
|
||||
void start(const nvbench::cuda_stream &stream) { m_start.record(stream); }
|
||||
|
||||
void stop(const nvbench::cuda_stream &stream) { m_stop.record(stream); }
|
||||
|
||||
[[nodiscard]] bool has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
|
||||
nvbench::float32_t throttle_threshold);
|
||||
|
||||
[[nodiscard]] nvbench::float32_t get_clock_frequency();
|
||||
|
||||
private:
|
||||
nvbench::detail::timestamps_kernel m_start;
|
||||
nvbench::detail::timestamps_kernel m_stop;
|
||||
};
|
||||
|
||||
} // namespace nvbench::detail
|
||||
47
nvbench/detail/gpu_frequency.cxx
Normal file
47
nvbench/detail/gpu_frequency.cxx
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Copyright 2025 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <nvbench/detail/gpu_frequency.cuh>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
namespace nvbench::detail
|
||||
{
|
||||
|
||||
// Computes the SM clock rate in Hz over the start/stop interval.
// Slot 0 of each recording is treated as a nanosecond timestamp and slot 1 as
// an SM clock-tick counter, so ticks / nanoseconds * 1e9 yields ticks per
// second.
nvbench::float32_t gpu_frequency::get_clock_frequency()
{
  const auto &begin = m_start.m_host_timestamps;
  const auto &end   = m_stop.m_host_timestamps;

  const nvbench::uint64_t delta_ns     = end[0] - begin[0];
  const nvbench::uint64_t delta_cycles = end[1] - begin[1];

  return static_cast<float>(delta_cycles) / static_cast<float>(delta_ns) * 1000000000.f;
}
|
||||
|
||||
// Reports whether the measured clock frequency is strictly below the allowed
// minimum, i.e. `peak_sm_clock_rate_hz * throttle_threshold`.
bool gpu_frequency::has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
                                  nvbench::float32_t throttle_threshold)
{
  const float min_allowed_hz = peak_sm_clock_rate_hz * throttle_threshold;
  return this->get_clock_frequency() < min_allowed_hz;
}
|
||||
|
||||
} // namespace nvbench::detail
|
||||
@@ -26,7 +26,9 @@
|
||||
#include <nvbench/summary.cuh>
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <limits>
|
||||
#include <thread>
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
@@ -44,11 +46,14 @@ measure_cold_base::measure_cold_base(state &exec_state)
|
||||
, m_min_samples{exec_state.get_min_samples()}
|
||||
, m_skip_time{exec_state.get_skip_time()}
|
||||
, m_timeout{exec_state.get_timeout()}
|
||||
, m_throttle_threshold(exec_state.get_throttle_threshold())
|
||||
, m_throttle_recovery_delay(exec_state.get_throttle_recovery_delay())
|
||||
{
|
||||
if (m_min_samples > 0)
|
||||
{
|
||||
m_cuda_times.reserve(static_cast<std::size_t>(m_min_samples));
|
||||
m_cpu_times.reserve(static_cast<std::size_t>(m_min_samples));
|
||||
m_sm_clock_rates.reserve(static_cast<std::size_t>(m_min_samples));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,6 +83,7 @@ void measure_cold_base::initialize()
|
||||
|
||||
m_cuda_times.clear();
|
||||
m_cpu_times.clear();
|
||||
m_sm_clock_rates.clear();
|
||||
|
||||
m_stopping_criterion.initialize(m_criterion_params);
|
||||
}
|
||||
@@ -86,6 +92,40 @@ void measure_cold_base::run_trials_prologue() { m_walltime_timer.start(); }
|
||||
|
||||
void measure_cold_base::record_measurements()
|
||||
{
|
||||
if (!m_run_once)
|
||||
{
|
||||
auto peak_clock_rate = static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
|
||||
|
||||
if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold))
|
||||
{
|
||||
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
|
||||
{
|
||||
auto current_clock_rate = m_gpu_frequency.get_clock_frequency();
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
printer.log(nvbench::log_level::warn,
|
||||
fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) "
|
||||
"({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous sample "
|
||||
"and pausing for {}s.",
|
||||
current_clock_rate / 1000000.0f,
|
||||
peak_clock_rate / 1000000.0f,
|
||||
100.0f * (current_clock_rate / peak_clock_rate),
|
||||
100.0f * m_throttle_threshold,
|
||||
m_total_samples,
|
||||
m_throttle_recovery_delay));
|
||||
}
|
||||
|
||||
if (m_throttle_recovery_delay > 0.0f)
|
||||
{ // let the GPU cool down
|
||||
std::this_thread::sleep_for(std::chrono::duration<float>(m_throttle_recovery_delay));
|
||||
}
|
||||
|
||||
// ignore this measurement
|
||||
return;
|
||||
}
|
||||
|
||||
m_sm_clock_rates.push_back(peak_clock_rate);
|
||||
}
|
||||
|
||||
// Update and record timers and counters:
|
||||
const auto cur_cuda_time = m_cuda_timer.get_duration();
|
||||
const auto cur_cpu_time = m_cpu_timer.get_duration();
|
||||
@@ -298,6 +338,18 @@ void measure_cold_base::generate_summaries()
|
||||
summ.set_string("hide", "Hidden by default.");
|
||||
}
|
||||
|
||||
if (!m_sm_clock_rates.empty())
|
||||
{
|
||||
auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean");
|
||||
summ.set_string("name", "Clock Rate");
|
||||
summ.set_string("hint", "frequency");
|
||||
summ.set_string("description", "Mean SM clock rate");
|
||||
summ.set_string("hide", "Hidden by default.");
|
||||
summ.set_float64("value",
|
||||
nvbench::detail::statistics::compute_mean(m_sm_clock_rates.cbegin(),
|
||||
m_sm_clock_rates.cend()));
|
||||
}
|
||||
|
||||
// Log if a printer exists:
|
||||
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
|
||||
{
|
||||
|
||||
@@ -18,24 +18,26 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <nvbench/blocking_kernel.cuh>
|
||||
#include <nvbench/cpu_timer.cuh>
|
||||
#include <nvbench/cuda_call.cuh>
|
||||
#include <nvbench/cuda_timer.cuh>
|
||||
#include <nvbench/detail/gpu_frequency.cuh>
|
||||
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
|
||||
#include <nvbench/detail/l2flush.cuh>
|
||||
#include <nvbench/detail/statistics.cuh>
|
||||
#include <nvbench/device_info.cuh>
|
||||
#include <nvbench/exec_tag.cuh>
|
||||
#include <nvbench/launch.cuh>
|
||||
#include <nvbench/stopping_criterion.cuh>
|
||||
|
||||
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
|
||||
#include <nvbench/detail/l2flush.cuh>
|
||||
#include <nvbench/detail/statistics.cuh>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "nvbench/types.cuh"
|
||||
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
@@ -64,6 +66,8 @@ protected:
|
||||
bool is_finished();
|
||||
void run_trials_epilogue();
|
||||
void generate_summaries();
|
||||
/// Enqueue the begin-of-interval frequency timestamp on the launch stream.
void gpu_frequency_start()
{
  m_gpu_frequency.start(m_launch.get_stream());
}
|
||||
/// Enqueue the end-of-interval frequency timestamp on the launch stream.
void gpu_frequency_stop()
{
  m_gpu_frequency.stop(m_launch.get_stream());
}
|
||||
|
||||
void check_skip_time(nvbench::float64_t warmup_time);
|
||||
|
||||
@@ -87,7 +91,8 @@ protected:
|
||||
nvbench::blocking_kernel m_blocker;
|
||||
|
||||
nvbench::criterion_params m_criterion_params;
|
||||
nvbench::stopping_criterion_base& m_stopping_criterion;
|
||||
nvbench::stopping_criterion_base &m_stopping_criterion;
|
||||
nvbench::detail::gpu_frequency m_gpu_frequency;
|
||||
|
||||
bool m_disable_blocking_kernel{false};
|
||||
bool m_run_once{false};
|
||||
@@ -97,6 +102,9 @@ protected:
|
||||
nvbench::float64_t m_skip_time{};
|
||||
nvbench::float64_t m_timeout{};
|
||||
|
||||
nvbench::float32_t m_throttle_threshold; // [fraction of peak SM clock rate; 0.75 == 75%]
|
||||
nvbench::float32_t m_throttle_recovery_delay; // [seconds]
|
||||
|
||||
nvbench::int64_t m_total_samples{};
|
||||
|
||||
nvbench::float64_t m_min_cuda_time{};
|
||||
@@ -109,6 +117,7 @@ protected:
|
||||
|
||||
std::vector<nvbench::float64_t> m_cuda_times;
|
||||
std::vector<nvbench::float64_t> m_cpu_times;
|
||||
std::vector<nvbench::float32_t> m_sm_clock_rates;
|
||||
|
||||
bool m_max_time_exceeded{};
|
||||
};
|
||||
@@ -128,6 +137,10 @@ struct measure_cold_base::kernel_launch_timer
|
||||
{
|
||||
m_measure.block_stream();
|
||||
}
|
||||
if (!m_measure.m_run_once)
|
||||
{
|
||||
m_measure.gpu_frequency_start();
|
||||
}
|
||||
m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
|
||||
if (m_disable_blocking_kernel)
|
||||
{
|
||||
@@ -143,6 +156,10 @@ struct measure_cold_base::kernel_launch_timer
|
||||
m_measure.m_cpu_timer.start();
|
||||
m_measure.unblock_stream();
|
||||
}
|
||||
if (!m_measure.m_run_once)
|
||||
{
|
||||
m_measure.gpu_frequency_stop();
|
||||
}
|
||||
m_measure.sync_stream();
|
||||
m_measure.m_cpu_timer.stop();
|
||||
}
|
||||
|
||||
82
nvbench/detail/timestamps_kernel.cu
Normal file
82
nvbench/detail/timestamps_kernel.cu
Normal file
@@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Copyright 2025 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <nvbench/cuda_call.cuh>
|
||||
#include <nvbench/cuda_stream.cuh>
|
||||
#include <nvbench/detail/timestamps_kernel.cuh>
|
||||
#include <nvbench/types.cuh>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
// Writes one pair of timestamps from a thread that is running on SM 0.
//
// global_timestamp receives the value of the PTX %globaltimer register and
// sm0_timestamp receives clock64(). Threads whose %smid is not 0 do nothing,
// so both readings always come from the same SM.
__global__ void get_timestamps_kernel(nvbench::uint64_t *global_timestamp,
                                      nvbench::uint64_t *sm0_timestamp)
{
  nvbench::uint32_t sm_index;
  asm volatile("mov.u32 %0, %%smid;" : "=r"(sm_index));
  if (sm_index != 0)
  {
    return;
  }

  // Sample the global timer first, then the SM-local cycle counter.
  nvbench::uint64_t global_ticks;
  asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(global_ticks));
  const nvbench::uint64_t sm_ticks = clock64();

  *global_timestamp = global_ticks;
  *sm0_timestamp    = sm_ticks;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace nvbench::detail
|
||||
{
|
||||
|
||||
// Registers this object's two-element host timestamp array with CUDA as mapped
// host memory and looks up the device pointer that aliases it, so the kernel
// can write the timestamps straight into the member array.
timestamps_kernel::timestamps_kernel()
{
  NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_timestamps,
                                     sizeof(nvbench::uint64_t) * 2,
                                     cudaHostRegisterMapped));

  NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_timestamps,
                                             &m_host_timestamps,
                                             0));
}
|
||||
|
||||
// Undoes the host-memory registration performed by the constructor. Uses the
// NOEXCEPT flavour of the CUDA call macro since this runs in a destructor.
timestamps_kernel::~timestamps_kernel()
{
  NVBENCH_CUDA_CALL_NOEXCEPT(
    cudaHostUnregister(&m_host_timestamps)
  );
}
|
||||
|
||||
// Enqueues the timestamp kernel on `stream`.
//
// The previously recorded values are cleared first. The kernel is launched
// with one single-thread block per SM of the current device; only threads
// that find themselves on SM 0 write a result (presumably the grid size is
// chosen so that some block lands on SM 0 -- not guaranteed by this code).
void timestamps_kernel::record(const nvbench::cuda_stream &stream)
{
  for (auto &timestamp : m_host_timestamps)
  {
    timestamp = 0;
  }

  int device_id = 0;
  NVBENCH_CUDA_CALL(cudaGetDevice(&device_id));

  int num_sms = 0;
  NVBENCH_CUDA_CALL(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device_id));

  const auto grid_size           = static_cast<unsigned int>(num_sms);
  nvbench::uint64_t *global_slot = m_device_timestamps;
  nvbench::uint64_t *sm_slot     = m_device_timestamps + 1;

  get_timestamps_kernel<<<grid_size, 1, 0, stream.get_stream()>>>(global_slot, sm_slot);
}
|
||||
|
||||
} // namespace nvbench::detail
|
||||
50
nvbench/detail/timestamps_kernel.cuh
Normal file
50
nvbench/detail/timestamps_kernel.cuh
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Copyright 2025 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <nvbench/types.cuh>
|
||||
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
struct cuda_stream;
|
||||
|
||||
namespace detail
|
||||
{
|
||||
|
||||
struct timestamps_kernel
|
||||
{
|
||||
timestamps_kernel();
|
||||
~timestamps_kernel();
|
||||
|
||||
void record(const nvbench::cuda_stream &stream);
|
||||
|
||||
// move-only
|
||||
timestamps_kernel(const timestamps_kernel &) = delete;
|
||||
timestamps_kernel(timestamps_kernel &&) = default;
|
||||
timestamps_kernel &operator=(const timestamps_kernel &) = delete;
|
||||
timestamps_kernel &operator=(timestamps_kernel &&) = default;
|
||||
|
||||
nvbench::uint64_t m_host_timestamps[2];
|
||||
nvbench::uint64_t *m_device_timestamps{};
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
} // namespace nvbench
|
||||
@@ -18,17 +18,18 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include <nvbench/config.cuh>
|
||||
#include <nvbench/cuda_call.cuh>
|
||||
#include <nvbench/detail/device_scope.cuh>
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include <cstdint> // CHAR_BIT
|
||||
#include <stdexcept>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
|
||||
#include <string_view>
|
||||
|
||||
// forward declare this for internal storage
|
||||
struct nvmlDevice_st;
|
||||
|
||||
@@ -108,7 +109,7 @@ struct device_info
|
||||
/// @return The default clock rate of the SM in Hz.
|
||||
[[nodiscard]] std::size_t get_sm_default_clock_rate() const
|
||||
{ // kHz -> Hz
|
||||
return static_cast<std::size_t>(m_prop.clockRate * 1000);
|
||||
return static_cast<std::size_t>(m_prop.clockRate) * 1000;
|
||||
}
|
||||
|
||||
/// @return The number of physical streaming multiprocessors on this device.
|
||||
|
||||
@@ -294,6 +294,10 @@ void markdown_printer::do_print_benchmark_results(const printer_base::benchmark_
|
||||
{
|
||||
table.add_cell(row, tag, header, this->do_format_item_rate(summ));
|
||||
}
|
||||
else if (hint == "frequency")
|
||||
{
|
||||
table.add_cell(row, tag, header, this->do_format_frequency(summ));
|
||||
}
|
||||
else if (hint == "bytes")
|
||||
{
|
||||
table.add_cell(row, tag, header, this->do_format_bytes(summ));
|
||||
@@ -399,6 +403,27 @@ std::string markdown_printer::do_format_item_rate(const summary &data)
|
||||
}
|
||||
}
|
||||
|
||||
// Formats the summary's float64 "value" (a frequency in Hz) with three
// decimals, using the largest unit among GHz / MHz / KHz / Hz whose scaled
// value is at least 1.
std::string markdown_printer::do_format_frequency(const nvbench::summary &data)
{
  const auto hz = data.get_float64("value");

  if (hz >= 1e9)
  {
    return fmt::format("{:0.3f} GHz", hz * 1e-9);
  }

  if (hz >= 1e6)
  {
    return fmt::format("{:0.3f} MHz", hz * 1e-6);
  }

  if (hz >= 1e3)
  {
    return fmt::format("{:0.3f} KHz", hz * 1e-3);
  }

  return fmt::format("{:0.3f} Hz", hz);
}
|
||||
|
||||
std::string markdown_printer::do_format_bytes(const summary &data)
|
||||
{
|
||||
const auto bytes = static_cast<nvbench::float64_t>(data.get_int64("value"));
|
||||
|
||||
@@ -64,6 +64,7 @@ protected:
|
||||
virtual std::string do_format_default(const nvbench::summary &data);
|
||||
virtual std::string do_format_duration(const nvbench::summary &seconds);
|
||||
virtual std::string do_format_item_rate(const nvbench::summary &items_per_sec);
|
||||
virtual std::string do_format_frequency(const nvbench::summary &frequency_hz);
|
||||
virtual std::string do_format_bytes(const nvbench::summary &bytes);
|
||||
virtual std::string do_format_byte_rate(const nvbench::summary &bytes_per_sec);
|
||||
virtual std::string do_format_sample_size(const nvbench::summary &count);
|
||||
|
||||
@@ -16,28 +16,24 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <nvbench/option_parser.cuh>
|
||||
|
||||
#include <nvbench/benchmark_base.cuh>
|
||||
#include <nvbench/benchmark_manager.cuh>
|
||||
#include <nvbench/csv_printer.cuh>
|
||||
#include <nvbench/criterion_manager.cuh>
|
||||
#include <nvbench/csv_printer.cuh>
|
||||
#include <nvbench/detail/throw.cuh>
|
||||
#include <nvbench/device_manager.cuh>
|
||||
#include <nvbench/git_revision.cuh>
|
||||
#include <nvbench/json_printer.cuh>
|
||||
#include <nvbench/markdown_printer.cuh>
|
||||
#include <nvbench/option_parser.cuh>
|
||||
#include <nvbench/printer_base.cuh>
|
||||
#include <nvbench/range.cuh>
|
||||
#include <nvbench/version.cuh>
|
||||
|
||||
#include <nvbench/detail/throw.cuh>
|
||||
|
||||
// These are generated from the markdown docs by CMake in the build directory:
|
||||
#include <nvbench/internal/cli_help.cuh>
|
||||
#include <nvbench/internal/cli_help_axis.cuh>
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
@@ -48,10 +44,12 @@
|
||||
#include <regex>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
#include <fmt/format.h>
|
||||
#include <string_view>
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
@@ -515,7 +513,8 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
|
||||
this->update_int64_prop(first[0], first[1]);
|
||||
first += 2;
|
||||
}
|
||||
else if (arg == "--skip-time" || arg == "--timeout")
|
||||
else if (arg == "--skip-time" || arg == "--timeout" || arg == "--throttle-threshold" ||
|
||||
arg == "--throttle-recovery-delay")
|
||||
{
|
||||
check_params(1);
|
||||
this->update_float64_prop(first[0], first[1]);
|
||||
@@ -623,7 +622,7 @@ void option_parser::print_version() const
|
||||
NVBENCH_GIT_VERSION);
|
||||
}
|
||||
|
||||
void option_parser::print_list(printer_base& printer) const
|
||||
void option_parser::print_list(printer_base &printer) const
|
||||
{
|
||||
const auto &bench_mgr = nvbench::benchmark_manager::get();
|
||||
printer.print_device_info();
|
||||
@@ -979,10 +978,9 @@ catch (std::exception &e)
|
||||
e.what());
|
||||
}
|
||||
|
||||
void option_parser::update_criterion_prop(
|
||||
const std::string &prop_arg,
|
||||
const std::string &prop_val,
|
||||
const nvbench::named_values::type type)
|
||||
void option_parser::update_criterion_prop(const std::string &prop_arg,
|
||||
const std::string &prop_val,
|
||||
const nvbench::named_values::type type)
|
||||
try
|
||||
{
|
||||
// If no active benchmark, save args as global.
|
||||
@@ -993,8 +991,8 @@ try
|
||||
return;
|
||||
}
|
||||
|
||||
benchmark_base &bench = *m_benchmarks.back();
|
||||
nvbench::criterion_params& criterion_params = bench.get_criterion_params();
|
||||
benchmark_base &bench = *m_benchmarks.back();
|
||||
nvbench::criterion_params &criterion_params = bench.get_criterion_params();
|
||||
std::string name(prop_arg.begin() + 2, prop_arg.end());
|
||||
if (type == nvbench::named_values::type::float64)
|
||||
{
|
||||
@@ -1022,7 +1020,7 @@ try
|
||||
NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg);
|
||||
}
|
||||
}
|
||||
catch (std::exception& e)
|
||||
catch (std::exception &e)
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error,
|
||||
"Error handling option `{} {}`:\n{}",
|
||||
@@ -1054,6 +1052,14 @@ try
|
||||
{
|
||||
bench.set_timeout(value);
|
||||
}
|
||||
else if (prop_arg == "--throttle-threshold")
|
||||
{
|
||||
bench.set_throttle_threshold(static_cast<nvbench::float32_t>(value) / 100.0f);
|
||||
}
|
||||
else if (prop_arg == "--throttle-recovery-delay")
|
||||
{
|
||||
bench.set_throttle_recovery_delay(static_cast<nvbench::float32_t>(value));
|
||||
}
|
||||
else
|
||||
{
|
||||
NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg);
|
||||
|
||||
@@ -80,7 +80,7 @@ private:
|
||||
std::ostream &printer_spec_to_ostream(const std::string &spec);
|
||||
|
||||
void print_version() const;
|
||||
void print_list(printer_base& printer) const;
|
||||
void print_list(printer_base &printer) const;
|
||||
void print_help() const;
|
||||
void print_help_axis() const;
|
||||
|
||||
|
||||
@@ -22,9 +22,9 @@
|
||||
#include <nvbench/device_info.cuh>
|
||||
#include <nvbench/exec_tag.cuh>
|
||||
#include <nvbench/named_values.cuh>
|
||||
#include <nvbench/stopping_criterion.cuh>
|
||||
#include <nvbench/summary.cuh>
|
||||
#include <nvbench/types.cuh>
|
||||
#include <nvbench/stopping_criterion.cuh>
|
||||
|
||||
#include <functional>
|
||||
#include <optional>
|
||||
@@ -136,8 +136,11 @@ struct state
|
||||
|
||||
/// Control the stopping criterion for the measurement loop.
|
||||
/// @{
|
||||
[[nodiscard]] const std::string& get_stopping_criterion() const { return m_stopping_criterion; }
|
||||
void set_stopping_criterion(std::string criterion) { m_stopping_criterion = std::move(criterion); }
|
||||
[[nodiscard]] const std::string &get_stopping_criterion() const { return m_stopping_criterion; }
|
||||
void set_stopping_criterion(std::string criterion)
|
||||
{
|
||||
m_stopping_criterion = std::move(criterion);
|
||||
}
|
||||
/// @}
|
||||
|
||||
/// If true, the benchmark is only run once, skipping all warmup runs and only
|
||||
@@ -200,6 +203,23 @@ struct state
|
||||
void set_timeout(nvbench::float64_t timeout) { m_timeout = timeout; }
|
||||
/// @}
|
||||
|
||||
/// @return The throttle threshold stored in this state (initialized from the
/// owning benchmark's throttle threshold).
[[nodiscard]] nvbench::float32_t get_throttle_threshold() const
{
  return m_throttle_threshold;
}
|
||||
|
||||
/// Override the throttle threshold for this state.
/// @param throttle_threshold New threshold; stored as-is.
void set_throttle_threshold(nvbench::float32_t throttle_threshold) { m_throttle_threshold = throttle_threshold; }
|
||||
|
||||
/// @return The throttle recovery delay, in seconds, stored in this state
/// (initialized from the owning benchmark's recovery delay).
[[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const { return m_throttle_recovery_delay; }
|
||||
|
||||
/// Override the throttle recovery delay for this state.
/// @param throttle_recovery_delay Delay in seconds; stored as-is.
void set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay) { m_throttle_recovery_delay = throttle_recovery_delay; }
|
||||
|
||||
/// If a `KernelLauncher` syncs and `nvbench::exec_tag::sync` is not passed
|
||||
/// to `state.exec(...)`, a deadlock may occur. If a `blocking_kernel` blocks
|
||||
/// for more than `blocking_kernel_timeout` seconds, an error will be printed
|
||||
@@ -311,6 +331,9 @@ private:
|
||||
nvbench::float64_t m_skip_time;
|
||||
nvbench::float64_t m_timeout;
|
||||
|
||||
nvbench::float32_t m_throttle_threshold; // [fraction of peak SM clock rate; 0.75 == 75%]
|
||||
nvbench::float32_t m_throttle_recovery_delay; // [seconds]
|
||||
|
||||
// Deadlock protection. See blocking_kernel's class doc for details.
|
||||
nvbench::float64_t m_blocking_kernel_timeout{30.0};
|
||||
|
||||
|
||||
@@ -15,20 +15,18 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <nvbench/state.cuh>
|
||||
|
||||
#include <nvbench/benchmark_base.cuh>
|
||||
#include <nvbench/detail/throw.cuh>
|
||||
#include <nvbench/state.cuh>
|
||||
#include <nvbench/types.cuh>
|
||||
|
||||
#include <fmt/color.h>
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
#include <fmt/color.h>
|
||||
#include <fmt/format.h>
|
||||
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
@@ -42,6 +40,8 @@ state::state(const benchmark_base &bench)
|
||||
, m_min_samples{bench.get_min_samples()}
|
||||
, m_skip_time{bench.get_skip_time()}
|
||||
, m_timeout{bench.get_timeout()}
|
||||
, m_throttle_threshold{bench.get_throttle_threshold()}
|
||||
, m_throttle_recovery_delay{bench.get_throttle_recovery_delay()}
|
||||
{}
|
||||
|
||||
state::state(const benchmark_base &bench,
|
||||
@@ -60,6 +60,8 @@ state::state(const benchmark_base &bench,
|
||||
, m_min_samples{bench.get_min_samples()}
|
||||
, m_skip_time{bench.get_skip_time()}
|
||||
, m_timeout{bench.get_timeout()}
|
||||
, m_throttle_threshold{bench.get_throttle_threshold()}
|
||||
, m_throttle_recovery_delay{bench.get_throttle_recovery_delay()}
|
||||
{}
|
||||
|
||||
nvbench::int64_t state::get_int64(const std::string &axis_name) const
|
||||
|
||||
Reference in New Issue
Block a user