Profile only the kernels involved in the benchmark (#277)

Co-authored-by: Allison Piper <alliepiper16@gmail.com>
This commit is contained in:
comeyrd
2025-10-21 13:51:37 -04:00
committed by GitHub
parent 9b133a94bc
commit 92d2e01cd1

View File

@@ -32,6 +32,7 @@
#include <nvbench/stopping_criterion.cuh>
#include <nvbench/types.cuh>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <utility>
@@ -76,7 +77,8 @@ protected:
{
NVBENCH_CUDA_CALL(cudaStreamSynchronize(m_launch.get_stream()));
}
__forceinline__ void profiler_start() const { NVBENCH_CUDA_CALL(cudaProfilerStart()); }
__forceinline__ void profiler_stop() const { NVBENCH_CUDA_CALL(cudaProfilerStop()); }
void block_stream();
__forceinline__ void unblock_stream() { m_blocker.unblock(); }
@@ -133,11 +135,15 @@ struct measure_cold_base::kernel_launch_timer
kernel_launch_timer(measure_cold_base &measure)
: m_measure{measure}
, m_disable_blocking_kernel{measure.m_disable_blocking_kernel}
, m_run_once{measure.m_run_once}
{}
explicit kernel_launch_timer(measure_cold_base &measure, bool disable_blocking_kernel)
explicit kernel_launch_timer(measure_cold_base &measure,
bool disable_blocking_kernel,
bool run_once)
: m_measure{measure}
, m_disable_blocking_kernel{disable_blocking_kernel}
, m_run_once{run_once}
{}
__forceinline__ void start()
@@ -152,6 +158,10 @@ struct measure_cold_base::kernel_launch_timer
{
m_measure.gpu_frequency_start();
}
if (m_run_once)
{
m_measure.profiler_start();
}
m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
// start CPU timer irrespective of use of blocking kernel
// Ref: https://github.com/NVIDIA/nvbench/issues/249
@@ -170,12 +180,17 @@ struct measure_cold_base::kernel_launch_timer
m_measure.gpu_frequency_stop();
}
m_measure.sync_stream();
if (m_run_once)
{
m_measure.profiler_stop();
}
m_measure.m_cpu_timer.stop();
}
private:
measure_cold_base &m_measure;
bool m_disable_blocking_kernel;
bool m_run_once;
};
template <typename KernelLauncher>
@@ -212,7 +227,7 @@ private:
// disable use of blocking kernel for warm-up run
// see https://github.com/NVIDIA/nvbench/issues/240
constexpr bool disable_blocking_kernel = true;
kernel_launch_timer timer(*this, disable_blocking_kernel);
kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once);
this->launch_kernel(timer);
this->check_skip_time(m_cuda_timer.get_duration());
@@ -223,7 +238,7 @@ private:
// do not use blocking kernel if benchmark is only run once, e.g., when profiling
// ref: https://github.com/NVIDIA/nvbench/issue/242
const bool disable_blocking_kernel = m_run_once || m_disable_blocking_kernel;
kernel_launch_timer timer(*this, disable_blocking_kernel);
kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once);
do
{
this->launch_kernel(timer);