mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-04-19 22:38:52 +00:00
Merge pull request #76 from PointKernel/add-implicit-stream-support
Add implicit stream benchmarking support
This commit is contained in:
@@ -60,6 +60,7 @@ various NVBench features and usecases:
|
||||
- [Enums and compile-time-constant-integral parameter axes](examples/enums.cu)
|
||||
- [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu)
|
||||
- [Skipping benchmark configurations](examples/skip.cu)
|
||||
- [Benchmarking on a specific stream](examples/stream.cu)
|
||||
- [Benchmarks that sync CUDA devices: `nvbench::exec_tag::sync`](examples/exec_tag_sync.cu)
|
||||
- [Manual timing: `nvbench::exec_tag::timer`](examples/exec_tag_timer.cu)
|
||||
|
||||
|
||||
@@ -41,6 +41,36 @@ attributes to be modified.
|
||||
NVBENCH_BENCH(my_benchmark).set_name("my_kernel<<<num_blocks, 256>>>");
|
||||
```
|
||||
|
||||
# CUDA Streams
|
||||
|
||||
NVBench records GPU execution times on a specific CUDA stream. By default, a new
|
||||
stream is created and passed to the `KernelLauncher` via the
|
||||
`nvbench::launch::get_stream()` method, as shown in
|
||||
[Minimal Benchmark](#minimal-benchmark). All benchmarked kernels and other
|
||||
stream-ordered work must be launched on this stream for NVBench to capture it.
|
||||
|
||||
In some instances, it may be inconvenient or impossible to specify an explicit
|
||||
CUDA stream for the benchmarked operation to use. For example, a library may
|
||||
manage and use its own streams, or an opaque API may always launch work on the
|
||||
default stream. In these situations, users may provide NVBench with an explicit
|
||||
stream via `nvbench::state::set_cuda_stream` and `nvbench::make_cuda_stream_view`.
|
||||
It is assumed that all work of interest executes on or synchronizes with this
|
||||
stream.
|
||||
|
||||
```cpp
|
||||
void my_benchmark(nvbench::state& state) {
|
||||
cudaStream_t default_stream = 0;
|
||||
state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
|
||||
state.exec([](nvbench::launch&) {
|
||||
my_func(); // a host API invoking GPU kernels on the default stream
|
||||
my_kernel<<<num_blocks, 256>>>(); // or a kernel launched with the default stream
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(my_benchmark);
|
||||
```
|
||||
|
||||
A full example can be found in [examples/stream.cu](../examples/stream.cu).
|
||||
|
||||
# Parameter Axes
|
||||
|
||||
Some kernels will be used with a variety of options, input data types/sizes, and
|
||||
|
||||
@@ -4,6 +4,7 @@ set(example_srcs
|
||||
exec_tag_sync.cu
|
||||
exec_tag_timer.cu
|
||||
skip.cu
|
||||
stream.cu
|
||||
throughput.cu
|
||||
auto_throughput.cu
|
||||
)
|
||||
|
||||
60
examples/stream.cu
Normal file
60
examples/stream.cu
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright 2022 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <nvbench/nvbench.cuh>
|
||||
|
||||
// Grab some testing kernels from NVBench:
|
||||
#include <nvbench/test_kernels.cuh>
|
||||
|
||||
// Thrust vectors simplify memory management:
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
// A function to benchmark that does not expose an explicit stream argument;
// the kernel below is launched with no stream specified, so it runs on the
// default stream.
void copy(int32_t *input, int32_t *output, std::size_t const num_values)
{
  nvbench::copy_kernel<<<256, 256>>>(input, output, num_values);
}
|
||||
|
||||
// `stream_bench` copies a 64 MiB buffer of int32_t on a CUDA stream specified
|
||||
// by the user.
|
||||
//
|
||||
// By default, NVBench creates and provides an explicit stream via
|
||||
// `launch::get_stream()` to pass to every stream-ordered operation. Sometimes
|
||||
// it is inconvenient or impossible to specify an explicit CUDA stream to every
|
||||
// stream-ordered operation. In this case, users may specify a target stream via
|
||||
// `state::set_cuda_stream`. It is assumed that all work of interest executes on
|
||||
// or synchronizes with this stream.
|
||||
void stream_bench(nvbench::state &state)
|
||||
{
|
||||
// Allocate input data:
|
||||
const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t);
|
||||
thrust::device_vector<nvbench::int32_t> input(num_values);
|
||||
thrust::device_vector<nvbench::int32_t> output(num_values);
|
||||
|
||||
// Set CUDA default stream as the target stream. Note the default stream
|
||||
// is non-owning.
|
||||
cudaStream_t default_stream = 0;
|
||||
state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
|
||||
|
||||
state.exec([&input, &output, num_values](nvbench::launch &) {
|
||||
copy(thrust::raw_pointer_cast(input.data()),
|
||||
thrust::raw_pointer_cast(output.data()),
|
||||
num_values);
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(stream_bench);
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright 2021 NVIDIA Corporation
|
||||
* Copyright 2021-2022 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
@@ -22,25 +22,88 @@
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
// RAII wrapper for a cudaStream_t.
|
||||
/**
|
||||
* Manages and provides access to a CUDA stream.
|
||||
*
|
||||
* May be owning or non-owning. If the stream is owned, it will be freed with
|
||||
* `cudaStreamDestroy` when the `cuda_stream`'s lifetime ends. Non-owning
|
||||
* `cuda_stream`s are sometimes referred to as views.
|
||||
*
|
||||
* @sa nvbench::make_cuda_stream_view
|
||||
*/
|
||||
struct cuda_stream
|
||||
{
|
||||
cuda_stream() { NVBENCH_CUDA_CALL(cudaStreamCreate(&m_stream)); }
|
||||
~cuda_stream() { NVBENCH_CUDA_CALL(cudaStreamDestroy(m_stream)); }
|
||||
/**
|
||||
* Constructs a cuda_stream that owns a new stream, created with
|
||||
* `cudaStreamCreate`.
|
||||
*/
|
||||
cuda_stream()
|
||||
: m_stream{[]() {
|
||||
cudaStream_t s;
|
||||
NVBENCH_CUDA_CALL(cudaStreamCreate(&s));
|
||||
return s;
|
||||
}(),
|
||||
stream_deleter{true}}
|
||||
{}
|
||||
|
||||
/**
|
||||
* Constructs a `cuda_stream` from an explicit cudaStream_t.
|
||||
*
|
||||
* @param owning If true, `cudaStreamCreate(stream)` will be called from this
|
||||
* `cuda_stream`'s destructor.
|
||||
*
|
||||
* @sa nvbench::make_cuda_stream_view
|
||||
*/
|
||||
cuda_stream(cudaStream_t stream, bool owning)
|
||||
: m_stream{stream, stream_deleter{owning}}
|
||||
{}
|
||||
|
||||
~cuda_stream() = default;
|
||||
|
||||
// move-only
|
||||
cuda_stream(const cuda_stream &) = delete;
|
||||
cuda_stream(cuda_stream &&) = default;
|
||||
cuda_stream &operator=(const cuda_stream &) = delete;
|
||||
cuda_stream(cuda_stream &&) = default;
|
||||
cuda_stream &operator=(cuda_stream &&) = default;
|
||||
|
||||
operator cudaStream_t() const { return m_stream; }
|
||||
/**
|
||||
* @return The `cudaStream_t` managed by this `cuda_stream`.
|
||||
* @{
|
||||
*/
|
||||
operator cudaStream_t() const { return m_stream.get(); }
|
||||
|
||||
cudaStream_t get_stream() const { return m_stream.get(); }
|
||||
/**@}*/
|
||||
|
||||
private:
|
||||
cudaStream_t m_stream;
|
||||
struct stream_deleter
|
||||
{
|
||||
using pointer = cudaStream_t;
|
||||
bool owning;
|
||||
|
||||
constexpr void operator()(pointer s) const noexcept
|
||||
{
|
||||
if (owning)
|
||||
{
|
||||
NVBENCH_CUDA_CALL_NOEXCEPT(cudaStreamDestroy(s));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
std::unique_ptr<cudaStream_t, stream_deleter> m_stream;
|
||||
};
|
||||
|
||||
/**
|
||||
* Creates a non-owning view of the specified `stream`.
|
||||
*/
|
||||
inline nvbench::cuda_stream make_cuda_stream_view(cudaStream_t stream)
|
||||
{
|
||||
return cuda_stream{stream, false};
|
||||
}
|
||||
|
||||
} // namespace nvbench
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright 2021 NVIDIA Corporation
|
||||
* Copyright 2021-2022 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
@@ -39,6 +39,7 @@ namespace nvbench::detail
|
||||
|
||||
measure_cold_base::measure_cold_base(state &exec_state)
|
||||
: m_state{exec_state}
|
||||
, m_launch{m_state.get_cuda_stream()}
|
||||
, m_run_once{exec_state.get_run_once()}
|
||||
, m_min_samples{exec_state.get_min_samples()}
|
||||
, m_max_noise{exec_state.get_max_noise()}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright 2021 NVIDIA Corporation
|
||||
* Copyright 2021-2022 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
@@ -169,8 +169,14 @@ std::vector<std::string> add_metrics(nvbench::state &state)
|
||||
} // namespace
|
||||
|
||||
measure_cupti_base::measure_cupti_base(state &exec_state)
|
||||
try : m_state{exec_state}, m_cupti(*m_state.get_device(), add_metrics(m_state))
|
||||
// clang-format off
|
||||
// (formatter doesn't handle `try :` very well...)
|
||||
try
|
||||
: m_state{exec_state}
|
||||
, m_launch{m_state.get_cuda_stream()}
|
||||
, m_cupti{*m_state.get_device(), add_metrics(m_state)}
|
||||
{}
|
||||
// clang-format on
|
||||
catch (const std::exception &ex)
|
||||
{
|
||||
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright 2021 NVIDIA Corporation
|
||||
* Copyright 2021-2022 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
@@ -37,6 +37,7 @@ namespace nvbench::detail
|
||||
|
||||
measure_hot_base::measure_hot_base(state &exec_state)
|
||||
: m_state{exec_state}
|
||||
, m_launch{m_state.get_cuda_stream()}
|
||||
, m_min_samples{exec_state.get_min_samples()}
|
||||
, m_min_time{exec_state.get_min_time()}
|
||||
, m_skip_time{exec_state.get_skip_time()}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright 2021 NVIDIA Corporation
|
||||
* Copyright 2021-2022 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
@@ -23,22 +23,43 @@
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
/**
|
||||
* Configuration object used to communicate with a `KernelLauncher`.
|
||||
*
|
||||
* The `KernelLauncher` passed into `nvbench::state::exec` is required to
|
||||
* accept an `nvbench::launch` argument:
|
||||
*
|
||||
* ```cpp
|
||||
* state.exec([](nvbench::launch &launch) {
|
||||
* kernel<<<M, N, 0, launch.get_stream()>>>();
|
||||
* }
|
||||
* ```
|
||||
*/
|
||||
struct launch
|
||||
{
|
||||
explicit launch(const nvbench::cuda_stream &stream)
|
||||
: m_stream{stream}
|
||||
{}
|
||||
|
||||
// move-only
|
||||
launch() = default;
|
||||
launch(const launch &) = delete;
|
||||
launch(launch &&) = default;
|
||||
launch &operator=(const launch &) = delete;
|
||||
launch &operator=(launch &&) = default;
|
||||
|
||||
/**
|
||||
* @return a CUDA stream that all kernels and other stream-ordered CUDA work
|
||||
* must use. This stream can be changed by the `KernelGenerator` using the
|
||||
* `nvbench::state::set_cuda_stream` method.
|
||||
*/
|
||||
__forceinline__ const nvbench::cuda_stream &get_stream() const
|
||||
{
|
||||
return m_stream;
|
||||
};
|
||||
|
||||
private:
|
||||
nvbench::cuda_stream m_stream;
|
||||
// The stream is owned by the `nvbench::state` associated with this launch.
|
||||
const nvbench::cuda_stream &m_stream;
|
||||
};
|
||||
|
||||
} // namespace nvbench
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright 2021 NVIDIA Corporation
|
||||
* Copyright 2021-2022 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
@@ -18,6 +18,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <nvbench/cuda_stream.cuh>
|
||||
#include <nvbench/device_info.cuh>
|
||||
#include <nvbench/exec_tag.cuh>
|
||||
#include <nvbench/named_values.cuh>
|
||||
@@ -62,6 +63,15 @@ struct state
|
||||
state &operator=(const state &) = delete;
|
||||
state &operator=(state &&) = default;
|
||||
|
||||
/// @return The `nvbench::cuda_stream` associated with this state; GPU
/// measurements are performed on (or synchronized with) this stream.
[[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const
{
  return m_cuda_stream;
}
|
||||
/// Replace the stream this state uses for measurements by moving `stream`
/// (owning or a non-owning view) into the state. All benchmarked work is
/// assumed to execute on or synchronize with this stream.
void set_cuda_stream(nvbench::cuda_stream &&stream)
{
  m_cuda_stream = std::move(stream);
}
|
||||
|
||||
/// The CUDA device associated with with this benchmark state. May be
|
||||
/// nullopt for CPU-only benchmarks.
|
||||
[[nodiscard]] const std::optional<nvbench::device_info> &get_device() const
|
||||
@@ -259,11 +269,13 @@ struct state
|
||||
|
||||
[[nodiscard]] bool is_cupti_required() const
|
||||
{
|
||||
return is_l2_hit_rate_collected()
|
||||
|| is_l1_hit_rate_collected()
|
||||
|| is_stores_efficiency_collected()
|
||||
|| is_loads_efficiency_collected()
|
||||
|| is_dram_throughput_collected();
|
||||
// clang-format off
|
||||
return is_l2_hit_rate_collected() ||
|
||||
is_l1_hit_rate_collected() ||
|
||||
is_stores_efficiency_collected() ||
|
||||
is_loads_efficiency_collected() ||
|
||||
is_dram_throughput_collected();
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
summary &add_summary(std::string summary_tag);
|
||||
@@ -303,6 +315,7 @@ private:
|
||||
std::optional<nvbench::device_info> device,
|
||||
std::size_t type_config_index);
|
||||
|
||||
nvbench::cuda_stream m_cuda_stream;
|
||||
std::reference_wrapper<const nvbench::benchmark_base> m_benchmark;
|
||||
nvbench::named_values m_axis_values;
|
||||
std::optional<nvbench::device_info> m_device;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright 2021 NVIDIA Corporation
|
||||
* Copyright 2021-2022 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
@@ -51,6 +51,24 @@ struct state_tester : public nvbench::state
|
||||
|
||||
using nvbench::detail::state_tester;
|
||||
|
||||
void test_streams()
|
||||
{
|
||||
dummy_bench bench;
|
||||
|
||||
state_tester state{bench};
|
||||
|
||||
// Test non-owning stream
|
||||
cudaStream_t default_stream = 0;
|
||||
state.set_cuda_stream(nvbench::cuda_stream{default_stream, false});
|
||||
ASSERT(state.get_cuda_stream() == default_stream);
|
||||
|
||||
// Test owning stream
|
||||
auto stream = nvbench::cuda_stream{};
|
||||
auto gold = stream.get_stream();
|
||||
state.set_cuda_stream(std::move(stream));
|
||||
ASSERT(state.get_cuda_stream() == gold);
|
||||
}
|
||||
|
||||
void test_params()
|
||||
{
|
||||
dummy_bench bench;
|
||||
@@ -110,6 +128,7 @@ void test_defaults()
|
||||
|
||||
int main()
|
||||
{
|
||||
test_streams();
|
||||
test_params();
|
||||
test_summaries();
|
||||
test_defaults();
|
||||
|
||||
Reference in New Issue
Block a user