diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index 66cb635..f3fd80a 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -32,6 +32,7 @@ #include #include +#include #include #include @@ -76,7 +77,8 @@ protected: { NVBENCH_CUDA_CALL(cudaStreamSynchronize(m_launch.get_stream())); } - + __forceinline__ void profiler_start() const { NVBENCH_CUDA_CALL(cudaProfilerStart()); } + __forceinline__ void profiler_stop() const { NVBENCH_CUDA_CALL(cudaProfilerStop()); } void block_stream(); __forceinline__ void unblock_stream() { m_blocker.unblock(); } @@ -133,11 +135,15 @@ struct measure_cold_base::kernel_launch_timer kernel_launch_timer(measure_cold_base &measure) : m_measure{measure} , m_disable_blocking_kernel{measure.m_disable_blocking_kernel} + , m_run_once{measure.m_run_once} {} - explicit kernel_launch_timer(measure_cold_base &measure, bool disable_blocking_kernel) + explicit kernel_launch_timer(measure_cold_base &measure, + bool disable_blocking_kernel, + bool run_once) : m_measure{measure} , m_disable_blocking_kernel{disable_blocking_kernel} + , m_run_once{run_once} {} __forceinline__ void start() @@ -152,6 +158,10 @@ struct measure_cold_base::kernel_launch_timer { m_measure.gpu_frequency_start(); } + if (m_run_once) + { + m_measure.profiler_start(); + } m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream()); // start CPU timer irrespective of use of blocking kernel // Ref: https://github.com/NVIDIA/nvbench/issues/249 @@ -170,12 +180,17 @@ struct measure_cold_base::kernel_launch_timer m_measure.gpu_frequency_stop(); } m_measure.sync_stream(); + if (m_run_once) + { + m_measure.profiler_stop(); + } m_measure.m_cpu_timer.stop(); } private: measure_cold_base &m_measure; bool m_disable_blocking_kernel; + bool m_run_once; }; template @@ -212,7 +227,7 @@ private: // disable use of blocking kernel for warm-up run // see https://github.com/NVIDIA/nvbench/issues/240 constexpr bool disable_blocking_kernel = true; - kernel_launch_timer timer(*this, disable_blocking_kernel); + kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once); this->launch_kernel(timer); this->check_skip_time(m_cuda_timer.get_duration()); @@ -223,7 +238,7 @@ private: // do not use blocking kernel if benchmark is only run once, e.g., when profiling // ref: https://github.com/NVIDIA/nvbench/issue/242 const bool disable_blocking_kernel = m_run_once || m_disable_blocking_kernel; - kernel_launch_timer timer(*this, disable_blocking_kernel); + kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once); do { this->launch_kernel(timer);