// Mirror of https://github.com/NVIDIA/nvbench.git (synced 2026-03-14)
/*
 * Copyright 2021 NVIDIA Corporation
 *
 * Licensed under the Apache License, Version 2.0 with the LLVM exception
 * (the "License"); you may not use this file except in compliance with
 * the License.
 *
 * You may obtain a copy of the License at
 *
 *     http://llvm.org/foundation/relicensing/LICENSE.txt
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <nvbench/types.cuh>
namespace nvbench
{

struct cuda_stream;
/**
 * Blocks a CUDA stream -- many sharp edges, read docs carefully.
 *
 * @warning This helper breaks the CUDA programming model and will cause
 * deadlocks if misused. It should not be used outside of benchmarking.
 * See caveats section below.
 *
 * This is used to improve the precision of timing with CUDA events. Consider
 * the following pattern for timing a kernel launch:
 *
 * ```
 * NVBENCH_CUDA_CALL(cudaEventRecord(start_event));
 * my_kernel<<<...>>>();
 * NVBENCH_CUDA_CALL(cudaEventRecord(stop_event));
 * ```
 *
 * The `start_event` may be recorded a non-trivial amount of time before
 * `my_kernel` is ready to launch due to various work submission latencies. To
 * reduce the impact of these latencies, blocking_kernel can be used to prevent
 * the `start_event` from being recorded until all work is queued:
 *
 * ```
 * blocking_kernel blocker;
 * blocker.block(stream);
 *
 * NVBENCH_CUDA_CALL(cudaEventRecord(start_event));
 * my_kernel<<<...>>>();
 * NVBENCH_CUDA_CALL(cudaEventRecord(stop_event));
 *
 * blocker.unblock();
 * ```
 *
 * The work submitted after `blocker.block(stream)` will not execute until
 * `blocker.unblock()` is called.
 *
 * ## Timeout
 *
 * The `block` method takes a `timeout` argument. If this is greater than 0,
 * the blocking kernel will print an error message and unblock after `timeout`
 * seconds.
 *
 * ## Caveats and warnings
 *
 * - Every call to `block()` must be followed by a call to `unblock()`.
 * - Do not queue "too much" work while blocking.
 *   - Amount of work depends on device and driver.
 *   - Do tests and schedule conservatively (~32 kernel launches max).
 * - This helper does NOT guarantee that the work submitted while blocking will
 *   execute uninterrupted.
 *   - Kernels on other streams may run between the `cudaEventRecord` calls
 *     in the above example.
 */
struct blocking_kernel
{
  // Ctor/dtor are defined in the corresponding .cu translation unit; the
  // exact resource acquisition/release (e.g. how the device-visible flag
  // pointers below are obtained) is not visible from this header.
  blocking_kernel();
  ~blocking_kernel();

  // Launches a kernel on `stream` that spins until unblock() is called (or
  // until `timeout` seconds elapse, when `timeout` > 0 -- see class docs).
  // Implementation lives in the .cu file.
  void block(const nvbench::cuda_stream &stream, nvbench::float64_t timeout);

  // Releases the spinning device kernel, then reports if it had already
  // timed out. Must be called exactly once per block() call (see caveats).
  __forceinline__ void unblock()
  {
    // Volatile write: prevents the compiler from caching/eliding the store,
    // since the value is observed outside normal C++ control flow --
    // presumably by the device kernel through m_device_flag; confirm against
    // the .cu implementation.
    volatile nvbench::int32_t &flag = m_host_flag;
    flag = 1;

    // Volatile read of the timeout indicator; nonzero means the blocking
    // kernel gave up waiting before unblock() was called.
    const volatile nvbench::int32_t &timeout_flag = m_host_timeout_flag;
    if (timeout_flag)
    {
      blocking_kernel::timeout_detected();
    }
  }

  // move-only
  // NOTE(review): the defaulted moves copy the raw device pointers without
  // nulling the source, and leave the moved-from host flags in place. If the
  // destructor releases those device resources, or if the device pointers map
  // onto this object's own m_host_* members, a moved-from object's
  // destruction would invalidate the moved-to object. Verify against the .cu
  // implementation before relying on move semantics.
  blocking_kernel(const blocking_kernel &) = delete;
  blocking_kernel(blocking_kernel &&) = default;
  blocking_kernel &operator=(const blocking_kernel &) = delete;
  blocking_kernel &operator=(blocking_kernel &&) = default;

private:
  // Host-side "go" flag; unblock() sets it to 1 (see volatile write above).
  nvbench::int32_t m_host_flag{};
  // Host-side timeout indicator; read by unblock(), presumably written by the
  // device kernel on timeout -- confirm in the .cu file.
  nvbench::int32_t m_host_timeout_flag{};
  // Device-visible counterparts of the host flags. How they are produced
  // (mapped/registered host memory vs. separate device allocations) is not
  // visible from this header.
  nvbench::int32_t *m_device_flag{};
  nvbench::int32_t *m_device_timeout_flag{};

  // Reports a timeout to the user; defined in the .cu translation unit.
  static void timeout_detected();
};

} // namespace nvbench