Merge pull request #76 from PointKernel/add-implicit-stream-support

Add implicit stream benchmarking support
This commit is contained in:
Allison Vacanti
2022-02-11 13:38:06 -05:00
committed by GitHub
11 changed files with 237 additions and 21 deletions

View File

@@ -60,6 +60,7 @@ various NVBench features and usecases:
- [Enums and compile-time-constant-integral parameter axes](examples/enums.cu)
- [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu)
- [Skipping benchmark configurations](examples/skip.cu)
- [Benchmarking on a specific stream](examples/stream.cu)
- [Benchmarks that sync CUDA devices: `nvbench::exec_tag::sync`](examples/exec_tag_sync.cu)
- [Manual timing: `nvbench::exec_tag::timer`](examples/exec_tag_timer.cu)

View File

@@ -41,6 +41,36 @@ attributes to be modified.
NVBENCH_BENCH(my_benchmark).set_name("my_kernel<<<num_blocks, 256>>>");
```
# CUDA Streams
NVBench records GPU execution times on a specific CUDA stream. By default, a new
stream is created and passed to the `KernelLauncher` via the
`nvbench::launch::get_stream()` method, as shown in
[Minimal Benchmark](#minimal-benchmark). All benchmarked kernels and other
stream-ordered work must be launched on this stream for NVBench to capture it.
In some instances, it may be inconvenient or impossible to specify an explicit
CUDA stream for the benchmarked operation to use. For example, a library may
manage and use its own streams, or an opaque API may always launch work on the
default stream. In these situations, users may provide NVBench with an explicit
stream via `nvbench::state::set_cuda_stream` and `nvbench::make_cuda_stream_view`.
It is assumed that all work of interest executes on or synchronizes with this
stream.
```cpp
void my_benchmark(nvbench::state& state) {
cudaStream_t default_stream = 0;
state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
state.exec([](nvbench::launch&) {
my_func(); // a host API invoking GPU kernels on the default stream
my_kernel<<<num_blocks, 256>>>(); // or a kernel launched with the default stream
});
}
NVBENCH_BENCH(my_benchmark);
```
A full example can be found in [examples/stream.cu](../examples/stream.cu).
# Parameter Axes
Some kernels will be used with a variety of options, input data types/sizes, and

View File

@@ -4,6 +4,7 @@ set(example_srcs
exec_tag_sync.cu
exec_tag_timer.cu
skip.cu
stream.cu
throughput.cu
auto_throughput.cu
)

60
examples/stream.cu Normal file
View File

@@ -0,0 +1,60 @@
/*
* Copyright 2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
// A function to benchmark that does not expose an explicit stream argument:
// the kernel below is launched without a stream parameter, i.e. on the CUDA
// default stream. (`nvbench::copy_kernel` is a test helper declared in
// <nvbench/test_kernels.cuh>; presumably it copies `num_values` int32_t
// elements from `input` to `output` — confirm against that header.)
void copy(int32_t *input, int32_t *output, std::size_t const num_values)
{
nvbench::copy_kernel<<<256, 256>>>(input, output, num_values);
}
// `stream_bench` copies a 64 MiB buffer of int32_t on a CUDA stream specified
// by the user.
//
// By default, NVBench creates and provides an explicit stream via
// `launch::get_stream()` to pass to every stream-ordered operation. Sometimes
// it is inconvenient or impossible to specify an explicit CUDA stream to every
// stream-ordered operation. In this case, users may specify a target stream via
// `state::set_cuda_stream`. It is assumed that all work of interest executes on
// or synchronizes with this stream.
// Benchmark entry point: copies a 64 MiB int32_t buffer using `copy`, which
// always runs on the CUDA default stream, so NVBench is told to time that
// stream instead of its internally-created one.
void stream_bench(nvbench::state &state)
{
  // Size the buffers so the copy moves exactly 64 MiB of payload:
  const std::size_t element_count = 64 * 1024 * 1024 / sizeof(nvbench::int32_t);
  thrust::device_vector<nvbench::int32_t> src(element_count);
  thrust::device_vector<nvbench::int32_t> dst(element_count);

  // Point NVBench at the default stream. The view is non-owning, so the
  // state will not try to destroy stream 0 when it is replaced or destroyed.
  cudaStream_t legacy_default_stream = 0;
  state.set_cuda_stream(nvbench::make_cuda_stream_view(legacy_default_stream));

  // The launch argument is intentionally unused: the timed work is issued on
  // the stream registered above, not on `launch.get_stream()`.
  state.exec([&src, &dst, element_count](nvbench::launch &) {
    copy(thrust::raw_pointer_cast(src.data()),
         thrust::raw_pointer_cast(dst.data()),
         element_count);
  });
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
@@ -22,25 +22,88 @@
#include <cuda_runtime_api.h>
#include <memory>
namespace nvbench
{
/**
 * Manages and provides access to a CUDA stream.
 *
 * May be owning or non-owning. If the stream is owned, it will be freed with
 * `cudaStreamDestroy` when the `cuda_stream`'s lifetime ends. Non-owning
 * `cuda_stream`s are sometimes referred to as views.
 *
 * @sa nvbench::make_cuda_stream_view
 */
struct cuda_stream
{
  /**
   * Constructs a cuda_stream that owns a new stream, created with
   * `cudaStreamCreate`.
   */
  cuda_stream()
      : m_stream{[]() {
                   cudaStream_t s;
                   NVBENCH_CUDA_CALL(cudaStreamCreate(&s));
                   return s;
                 }(),
                 stream_deleter{true}}
  {}

  /**
   * Constructs a `cuda_stream` from an explicit cudaStream_t.
   *
   * @param owning If true, `cudaStreamDestroy(stream)` will be called from
   * this `cuda_stream`'s destructor.
   *
   * @sa nvbench::make_cuda_stream_view
   */
  cuda_stream(cudaStream_t stream, bool owning)
      : m_stream{stream, stream_deleter{owning}}
  {}

  ~cuda_stream() = default;

  // move-only
  cuda_stream(const cuda_stream &) = delete;
  cuda_stream(cuda_stream &&)      = default;
  cuda_stream &operator=(const cuda_stream &) = delete;
  cuda_stream &operator=(cuda_stream &&) = default;

  /**
   * @return The `cudaStream_t` managed by this `cuda_stream`.
   * @{
   */
  operator cudaStream_t() const { return m_stream.get(); }
  cudaStream_t get_stream() const { return m_stream.get(); }
  /**@}*/

private:
  // Deleter for the unique_ptr below. `using pointer = cudaStream_t` makes
  // the unique_ptr store the raw stream handle directly; destruction is a
  // no-op for non-owning (view) streams.
  struct stream_deleter
  {
    using pointer = cudaStream_t;
    bool owning;

    constexpr void operator()(pointer s) const noexcept
    {
      if (owning)
      {
        // NOEXCEPT variant: destructors must not throw.
        NVBENCH_CUDA_CALL_NOEXCEPT(cudaStreamDestroy(s));
      }
    }
  };

  std::unique_ptr<cudaStream_t, stream_deleter> m_stream;
};
/**
* Creates a non-owning view of the specified `stream`.
*/
/**
 * Creates a non-owning view of the specified `stream`.
 *
 * @param stream An existing `cudaStream_t` whose lifetime is managed by the
 * caller; it will NOT be destroyed when the returned object goes away.
 */
inline nvbench::cuda_stream make_cuda_stream_view(cudaStream_t stream)
{
  return {stream, /* owning = */ false};
}
} // namespace nvbench

View File

@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
@@ -39,6 +39,7 @@ namespace nvbench::detail
measure_cold_base::measure_cold_base(state &exec_state)
: m_state{exec_state}
, m_launch{m_state.get_cuda_stream()}
, m_run_once{exec_state.get_run_once()}
, m_min_samples{exec_state.get_min_samples()}
, m_max_noise{exec_state.get_max_noise()}

View File

@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
@@ -169,8 +169,14 @@ std::vector<std::string> add_metrics(nvbench::state &state)
} // namespace
measure_cupti_base::measure_cupti_base(state &exec_state)
try : m_state{exec_state}, m_cupti(*m_state.get_device(), add_metrics(m_state))
// clang-format off
// (formatter doesn't handle `try :` very well...)
try
: m_state{exec_state}
, m_launch{m_state.get_cuda_stream()}
, m_cupti{*m_state.get_device(), add_metrics(m_state)}
{}
// clang-format on
catch (const std::exception &ex)
{
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();

View File

@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
@@ -37,6 +37,7 @@ namespace nvbench::detail
measure_hot_base::measure_hot_base(state &exec_state)
: m_state{exec_state}
, m_launch{m_state.get_cuda_stream()}
, m_min_samples{exec_state.get_min_samples()}
, m_min_time{exec_state.get_min_time()}
, m_skip_time{exec_state.get_skip_time()}

View File

@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
@@ -23,22 +23,43 @@
namespace nvbench
{
/**
 * Configuration object used to communicate with a `KernelLauncher`.
 *
 * The `KernelLauncher` passed into `nvbench::state::exec` is required to
 * accept an `nvbench::launch` argument:
 *
 * ```cpp
 * state.exec([](nvbench::launch &launch) {
 *   kernel<<<M, N, 0, launch.get_stream()>>>();
 * });
 * ```
 */
struct launch
{
  explicit launch(const nvbench::cuda_stream &stream)
      : m_stream{stream}
  {}

  // move-only; no default construction — a launch always refers to a stream.
  launch(const launch &) = delete;
  launch(launch &&)      = default;
  launch &operator=(const launch &) = delete;
  launch &operator=(launch &&) = default;

  /**
   * @return a CUDA stream that all kernels and other stream-ordered CUDA work
   * must use. This stream can be changed by the `KernelGenerator` using the
   * `nvbench::state::set_cuda_stream` method.
   */
  __forceinline__ const nvbench::cuda_stream &get_stream() const
  {
    return m_stream;
  }

private:
  // Non-owning: the stream is owned by the `nvbench::state` associated with
  // this launch, which must outlive it.
  const nvbench::cuda_stream &m_stream;
};
} // namespace nvbench

View File

@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
@@ -18,6 +18,7 @@
#pragma once
#include <nvbench/cuda_stream.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/named_values.cuh>
@@ -62,6 +63,15 @@ struct state
state &operator=(const state &) = delete;
state &operator=(state &&) = default;
/// @return The CUDA stream this state's benchmarked work is recorded on.
/// Defaults to a stream owned by this state; see `set_cuda_stream`.
[[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const
{
return m_cuda_stream;
}
/// Replaces the CUDA stream used when recording this state's measurements.
///
/// Use `nvbench::make_cuda_stream_view` to wrap an existing stream (e.g. the
/// default stream) without transferring ownership to this state.
void set_cuda_stream(nvbench::cuda_stream &&stream)
{
m_cuda_stream = std::move(stream);
}
/// The CUDA device associated with with this benchmark state. May be
/// nullopt for CPU-only benchmarks.
[[nodiscard]] const std::optional<nvbench::device_info> &get_device() const
@@ -259,11 +269,13 @@ struct state
/// @return True when any metric collected through CUPTI (L1/L2 hit rates,
/// store/load efficiency, DRAM throughput) was requested for this state.
[[nodiscard]] bool is_cupti_required() const
{
  // clang-format off
  return is_l2_hit_rate_collected() ||
         is_l1_hit_rate_collected() ||
         is_stores_efficiency_collected() ||
         is_loads_efficiency_collected() ||
         is_dram_throughput_collected();
  // clang-format on
}
summary &add_summary(std::string summary_tag);
@@ -303,6 +315,7 @@ private:
std::optional<nvbench::device_info> device,
std::size_t type_config_index);
nvbench::cuda_stream m_cuda_stream;
std::reference_wrapper<const nvbench::benchmark_base> m_benchmark;
nvbench::named_values m_axis_values;
std::optional<nvbench::device_info> m_device;

View File

@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
@@ -51,6 +51,24 @@ struct state_tester : public nvbench::state
using nvbench::detail::state_tester;
void test_streams()
{
dummy_bench bench;
state_tester state{bench};
// Test non-owning stream
cudaStream_t default_stream = 0;
state.set_cuda_stream(nvbench::cuda_stream{default_stream, false});
ASSERT(state.get_cuda_stream() == default_stream);
// Test owning stream
auto stream = nvbench::cuda_stream{};
auto gold = stream.get_stream();
state.set_cuda_stream(std::move(stream));
ASSERT(state.get_cuda_stream() == gold);
}
void test_params()
{
dummy_bench bench;
@@ -110,6 +128,7 @@ void test_defaults()
int main()
{
test_streams();
test_params();
test_summaries();
test_defaults();