diff --git a/README.md b/README.md index 3f1db1b..ed06981 100644 --- a/README.md +++ b/README.md @@ -51,13 +51,22 @@ command-line options for configuring benchmark execution at runtime. See the [CLI overview](docs/cli_help.md) and [CLI axis specification](docs/cli_help_axis.md) for more information. -## Example Sandbox Project +## Examples -To get started with NVBench, consider trying out -the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo). This -repository contains a simple CMake project that uses NVBench to build an example -benchmark. It's a great way to experiment with the library without a lot of -investment. +This repository provides a number of [examples](examples/) that demonstrate +various NVBench features and use cases: + +- [Runtime and compile-time parameter sweeps](examples/axes.cu) +- [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu) +- [Skipping benchmark configurations](examples/skip.cu) +- [Benchmarks that sync CUDA devices: `nvbench::exec_tag::sync`](examples/exec_tag_sync.cu) +- [Manual timing: `nvbench::exec_tag::timer`](examples/exec_tag_timer.cu) + +To get started using NVBench with your own kernels, consider trying out +the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo). `nvbench_demo` provides a simple CMake project that uses NVBench to build an +example benchmark. It's a great way to experiment with the library without a lot +of investment. # License diff --git a/docs/benchmarks.md b/docs/benchmarks.md index ee436f9..1741e57 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -223,8 +223,12 @@ state.add_global_memory_reads(size); state.add_global_memory_writes(size); ``` -For meaningful results, specify the input element count, and include all reads -and writes to global memory. +In general: +- Add only the input element count (no outputs). +- Add all reads and writes to global memory. 
+ +More examples can be found in [examples/throughput.cu](../examples/throughput.cu). + # Skip Uninteresting / Invalid Benchmarks @@ -263,6 +267,8 @@ using Us = nvbench::type_list<...>; NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(Ts, Us)); ``` +More examples can be found in [examples/skip.cu](../examples/skip.cu). + # Execution Tags For Special Cases By default, NVBench assumes that the entire execution time of the @@ -304,6 +310,9 @@ void sync_example(nvbench::state& state) NVBENCH_BENCH(sync_example); ``` +See [examples/exec_tag_sync.cu](../examples/exec_tag_sync.cu) for a complete +example. + ## Explicit timer mode: `nvbench::exec_tag::timer` For some kernels, the working data may need to be reset between launches. This @@ -342,6 +351,9 @@ void timer_example(nvbench::state& state) NVBENCH_BENCH(timer_example); ``` +See [examples/exec_tag_timer.cu](../examples/exec_tag_timer.cu) for a complete +example. + # Beware: Combinatorial Explosion Is Lurking Be very careful of how quickly the configuration space can grow. 
The following diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 64bd75d..d266671 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,5 +1,9 @@ set(example_srcs axes.cu + exec_tag_sync.cu + exec_tag_timer.cu + skip.cu + throughput.cu ) foreach(example_src IN LISTS example_srcs) diff --git a/examples/axes.cu b/examples/axes.cu index 610263c..70adb76 100644 --- a/examples/axes.cu +++ b/examples/axes.cu @@ -24,8 +24,6 @@ // Thrust vectors simplify memory management: #include -#include - //============================================================================== // Simple benchmark with no parameter axes: void simple(nvbench::state &state) diff --git a/examples/exec_tag_sync.cu b/examples/exec_tag_sync.cu new file mode 100644 index 0000000..2e262ee --- /dev/null +++ b/examples/exec_tag_sync.cu @@ -0,0 +1,58 @@ +/* + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +// Grab some testing kernels from NVBench: +#include + +// Thrust vectors simplify memory management: +#include + +// Used to initialize input data: +#include + +// `sequence_bench` measures the execution time of `thrust::sequence`. Since +// algorithms in `thrust::` implicitly sync the CUDA device, the +// `nvbench::exec_tag::sync` must be passed to `state.exec(...)`. +// +// By default, NVBench uses some tricks to improve the GPU timing stability. 
+// This provides more accurate results, but will cause a deadlock if the lambda +// passed to `state.exec(...)` synchronizes. The `nvbench::exec_tag::sync` tag +// tells NVBench to run the benchmark safely. +// +// This tag will also disable the batch measurements, since the synchronization +// will throw off the batch results. +void sequence_bench(nvbench::state &state) +{ + // Allocate input data: + const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t); + thrust::device_vector data(num_values); + + // Provide throughput information: + state.add_element_count(num_values); + state.add_global_memory_writes(num_values); + + // nvbench::exec_tag::sync indicates that this will implicitly sync: + state.exec(nvbench::exec_tag::sync, [&data](nvbench::launch &launch) { + thrust::sequence(thrust::device.on(launch.get_stream()), + data.begin(), + data.end()); + }); +} +NVBENCH_BENCH(sequence_bench); diff --git a/examples/exec_tag_timer.cu b/examples/exec_tag_timer.cu new file mode 100644 index 0000000..bf92665 --- /dev/null +++ b/examples/exec_tag_timer.cu @@ -0,0 +1,73 @@ +/* + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 */ + +#include + +// Grab some testing kernels from NVBench: +#include + +// Thrust vectors simplify memory management: +#include + +// Used to initialize input data: +#include + +// mod2_inplace performs an in-place mod2 over every element in `data`. `data` +// is reset to `input` each iteration. A manual timer is requested by passing +// `nvbench::exec_tag::timer` to `state.exec(...)`, which is used to only time +// the mod2, and not the reset. +// +// Note that this disables the batch timings, since the reset phase will throw +// off the batch results. + +void mod2_inplace(nvbench::state &state) +{ + // Allocate input data: + const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t); + thrust::device_vector input(num_values); + thrust::sequence(input.begin(), input.end()); + + // Working data buffer: + thrust::device_vector data(num_values); + + // Provide throughput information: + state.add_element_count(num_values); + state.add_global_memory_reads(num_values); + state.add_global_memory_writes(num_values); + + // Request timer with `nvbench::exec_tag::timer`: + state.exec(nvbench::exec_tag::timer, + // Lambda now takes a `timer` argument: + [&input, &data, num_values](nvbench::launch &launch, auto &timer) { + // Reset working data: + data = input; + + // Start timer: + timer.start(); + + // Run kernel of interest: + nvbench::mod2_kernel<<<256, 256, 0, launch.get_stream()>>>( + thrust::raw_pointer_cast(data.data()), + thrust::raw_pointer_cast(data.data()), + num_values); + + // Stop timer: + timer.stop(); + }); +} +NVBENCH_BENCH(mod2_inplace); diff --git a/examples/skip.cu b/examples/skip.cu new file mode 100644 index 0000000..fc96656 --- /dev/null +++ b/examples/skip.cu @@ -0,0 +1,128 @@ +/* + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. 
+ * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +// Grab some testing kernels from NVBench: +#include + +// Thrust vectors simplify memory management: +#include + +// std::enable_if_t +#include + +//============================================================================== +// `runtime_skip` demonstrates how to skip benchmarks at runtime. +// +// Two parameter axes are swept (see axes.cu), but some configurations are +// skipped by calling `state.skip` with a skip reason string. This reason +// is printed to the log and captured in JSON output. +void runtime_skip(nvbench::state &state) +{ + const auto duration = state.get_float64("Duration"); + const auto kramble = state.get_string("Kramble"); + + // Skip Baz benchmarks with < 0.8 ms duration. + if (kramble == "Baz" && duration < 0.8e-3) + { + state.skip("Short 'Baz' benchmarks are skipped."); + return; + } + + // Skip Foo benchmarks with > 0.3 ms duration. 
+ if (kramble == "Foo" && duration > 0.3e-3) + { + state.skip("Long 'Foo' benchmarks are skipped."); + return; + } + + // Run all others: + state.exec([duration](nvbench::launch &launch) { + nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration); + }); +} +NVBENCH_BENCH(runtime_skip) + // 0, 0.25, 0.5, 0.75, and 1.0 milliseconds + .add_float64_axis("Duration", + nvbench::range(0., + 1.1e-3, // .1e-3 slop for fp precision + 0.25e-3)) + .add_string_axis("Kramble", {"Foo", "Bar", "Baz"}); + +//============================================================================== +// `skip_overload` demonstrates how to skip benchmarks at compile-time via +// overload resolution. +// +// Two type axes are swept, but configurations where InputType == OutputType are +// skipped. +template +void skip_overload(nvbench::state &state, + nvbench::type_list) +{ + // This is a contrived example that focuses on the skip overloads, so this is + // just a sleep kernel: + state.exec([](nvbench::launch &launch) { + nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); + }); +} +// Overload of skip_overload that is called when InputType == OutputType. +template +void skip_overload(nvbench::state &state, nvbench::type_list) +{ + state.skip("InputType == OutputType."); +} +// The same type_list is used for both inputs/outputs. +using sst_types = nvbench::type_list; +// Setup benchmark: +NVBENCH_BENCH_TYPES(skip_overload, NVBENCH_TYPE_AXES(sst_types, sst_types)) + .set_type_axes_names({"In", "Out"}); + +//============================================================================== +// `skip_sfinae` demonstrates how to skip benchmarks at compile-time using +// SFINAE to handle more complex skip conditions. +// +// Two type axes are swept, but configurations where sizeof(InputType) > +// sizeof(OutputType) are skipped. 
+ +// Enable this overload if InputType is not larger than OutputType +template +std::enable_if_t<(sizeof(InputType) <= sizeof(OutputType)), void> +skip_sfinae(nvbench::state &state, nvbench::type_list) +{ + // This is a contrived example that focuses on the skip overloads, so this is + // just a sleep kernel: + state.exec([](nvbench::launch &launch) { + nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); + }); +} +// Enable this overload if InputType is larger than OutputType +template +std::enable_if_t<(sizeof(InputType) > sizeof(OutputType)), void> +skip_sfinae(nvbench::state &state, nvbench::type_list) +{ + state.skip("sizeof(InputType) > sizeof(OutputType)."); +} +// The same type_list is used for both inputs/outputs. +using sn_types = nvbench::type_list; +// Setup benchmark: +NVBENCH_BENCH_TYPES(skip_sfinae, NVBENCH_TYPE_AXES(sn_types, sn_types)) + .set_type_axes_names({"In", "Out"}); diff --git a/examples/throughput.cu b/examples/throughput.cu new file mode 100644 index 0000000..5621ebd --- /dev/null +++ b/examples/throughput.cu @@ -0,0 +1,60 @@ +/* + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +// Grab some testing kernels from NVBench: +#include + +// Thrust vectors simplify memory management: +#include + +// `throughput_bench` copies a 64 MiB buffer of int32_t, and reports throughput +// in a variety of ways. 
+// +// Calling `state.add_element_count(num_elements)` with the number of input +// items will report the item throughput rate in elements-per-second. +// +// Calling `state.add_global_memory_reads(num_elements)` and/or +// `state.add_global_memory_writes(num_elements)` will report global device +// memory throughput as a percentage of the current device's peak global memory +// bandwidth, and also in bytes-per-second. +// +// All of these methods take an optional second `column_name` argument, which +// will add a new column to the output with the reported element count / buffer +// size and column name. +void throughput_bench(nvbench::state &state) +{ + // Allocate input data: + const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t); + thrust::device_vector input(num_values); + thrust::device_vector output(num_values); + + // Provide throughput information: + state.add_element_count(num_values, "NumElements"); + state.add_global_memory_reads(num_values, "DataSize"); + state.add_global_memory_writes(num_values); + + state.exec([&input, &output, num_values](nvbench::launch &launch) { + nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>( + thrust::raw_pointer_cast(input.data()), + thrust::raw_pointer_cast(output.data()), + num_values); + }); +} +NVBENCH_BENCH(throughput_bench); diff --git a/nvbench/test_kernels.cuh b/nvbench/test_kernels.cuh index 83f3565..e08db31 100644 --- a/nvbench/test_kernels.cuh +++ b/nvbench/test_kernels.cuh @@ -25,6 +25,8 @@ /*! * @file test_kernels.cuh * A collection of simple kernels for testing purposes. + * + * Note that these kernels are written to be short and simple, not performant. */ namespace nvbench @@ -62,4 +64,19 @@ __global__ void copy_kernel(const T* in, U* out, std::size_t n) } } +/*! + * For `i <- [0,n)`, `out[i] = in[i] % 2`. 
+ */ +template +__global__ void mod2_kernel(const T* in, U* out, std::size_t n) +{ + const auto init = blockIdx.x * blockDim.x + threadIdx.x; + const auto step = blockDim.x * gridDim.x; + + for (auto i = init; i < n; i += step) + { + out[i] = static_cast(in[i] % 2); + } +} + }