From a6b26ef7be38cd8844f26464efa9eb2c8dbbdb72 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Wed, 3 Mar 2021 15:57:53 -0500 Subject: [PATCH] Add initial README.md. --- README.md | 405 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 405 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..5c2c181 --- /dev/null +++ b/README.md @@ -0,0 +1,405 @@ +# Overview + +This project is a work-in-progress. Everything is subject to change. + +NVBench is a C++17 library designed to simplify CUDA kernel benchmarking. +Simultaneous parameter sweeps across multiple axes are supported, including +template parameters. Various timings are reported, including "cold" execution +(clear device L2, single run per timed region) and "batch" execution (launch +multiple kernels within a single timed region). + +## Scope and Related Tools + +NVBench will measure the CPU and CUDA GPU execution time of a ***single +host-side critical region*** per benchmark. It is intended for regression +testing and parameter tuning of individual kernels. For in-depth analysis of +end-to-end performance of multiple applications, the NVIDIA Nsight tools are +more appropriate. + +NVBench is focused on evaluating the performance of CUDA kernels and is not +optimized for CPU microbenchmarks. This may change in the future, but for now, +consider using Google Benchmark for high resolution CPU benchmarks. + +# Minimal Benchmark + +A basic kernel benchmark can be created with just a few lines of CUDA C++: + +```cpp +void my_benchmark(nvbench::state& state) { + state.exec([](nvbench::launch& launch) { + my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>(); + }); +} +NVBENCH_CREATE(my_benchmark); +``` + +There are three main components in the definition of a benchmark: + +- A `KernelGenerator` callable (`my_benchmark` above) +- A `KernelLauncher` callable (the lambda passed to `state.exec`), and +- A `BenchmarkDeclaration` using `NVBENCH_CREATE` or similar macros.
+ +The `KernelGenerator` is called with an `nvbench::state` object that provides +configuration information, as shown in later sections. The generator is +responsible for configuring and instantiating a `KernelLauncher`, which is +(unsurprisingly) responsible for launching a kernel. The launcher should contain +only the minimum amount of code necessary to start the CUDA kernel, +since `state.exec` will execute it repeatedly to gather timing information. +An `nvbench::launch` object is provided to the launcher to specify kernel +execution details, such as the CUDA stream to use. `NVBENCH_CREATE` registers +the benchmark with NVBench and initializes various attributes, including its +name and parameter axes. + +# Benchmark Name + +By default, a benchmark is named by converting the first argument +of `NVBENCH_CREATE` into a string. + +This can be changed to something more descriptive if desired. +The `NVBENCH_CREATE` macro produces a customization object that allows such +attributes to be modified. + +```cpp +NVBENCH_CREATE(my_benchmark).set_name("my_kernel<<<num_blocks, 256>>>"); +``` + +# Parameter Axes + +Some kernels will be used with a variety of options, input data types/sizes, and +other factors that impact performance. NVBench explores these different +scenarios by sweeping through a set of user-defined parameter axes. + +A parameter axis defines a set of interesting values for a single kernel +parameter — for example, the size of the input, or the type of values being +processed. These parameter axes are used to customize a `KernelGenerator` with +static and runtime configurations. There are four supported types of parameters: +int64, float64, string, and type. + +## Int64 Axes + +A common example of a parameter axis is to vary the number of input values a +kernel should process during a benchmark measurement.
An `int64_axis` is ideal +for this: + +```cpp +void benchmark(nvbench::state& state) +{ + const auto num_inputs = state.get_int64("NumInputs"); + thrust::device_vector<int> data = generate_input(num_inputs); + + state.exec([&data](nvbench::launch& launch) { + my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>(data.begin(), data.end()); + }); +} +NVBENCH_CREATE(benchmark).add_int64_axis("NumInputs", {16, 64, 256, 1024, 4096}); +``` + +NVBench will run the `benchmark` kernel generator once for each specified value +in the "NumInputs" axis. The `state` object provides the current parameter value +to `benchmark`. + +### Int64 Power-Of-Two Axes + +Using powers-of-two is quite common for these sorts of axes. `int64_axis` has a +unique power-of-two mode that simplifies how such axes are defined and helps +provide more readable output. A power-of-two int64 axis is defined using the +integer exponents, but the benchmark will be run with the computed 2^N value. + +```cpp +// Equivalent to above, {16, 64, 256, 1024, 4096} = {2^4, 2^6, 2^8, 2^10, 2^12} +NVBENCH_CREATE(benchmark).add_int64_power_of_two_axis("NumInputs", + {4, 6, 8, 10, 12}); +// Or, as shown in a later section: +NVBENCH_CREATE(benchmark).add_int64_power_of_two_axis("NumInputs", + nvbench::range(4, 12, 2)); +``` + +## Float64 Axes + +For floating point numbers, a `float64_axis` is available: + +```cpp +void benchmark(nvbench::state& state) +{ + const auto quality = state.get_float64("Quality"); + + state.exec([&quality](nvbench::launch& launch) + { + my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>(quality); + }); +} +NVBENCH_CREATE(benchmark).add_float64_axis("Quality", {0.05, 0.1, 0.25, 0.5, 0.75, 1.}); +``` + +## String Axes + +For non-numeric data, an axis of arbitrary strings provides additional +flexibility: + +```cpp +void benchmark(nvbench::state& state) +{ + const auto rng_dist = state.get_string("RNG Distribution"); + thrust::device_vector<float> data = generate_input(rng_dist); + + state.exec([&data](nvbench::launch& launch) + { + my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>(data.begin(), data.end()); + }); +}
+NVBENCH_CREATE(benchmark).add_string_axis("RNG Distribution", {"Uniform", "Gaussian"}); +``` + +## Type Axes + +Another common situation involves benchmarking a templated kernel with multiple +compile-time configurations. NVBench strives to make such benchmarks as easy to +write as possible through the use of type axes. + +A `type_axis` is a list of types (`T1`, `T2`, `Ts`...) wrapped in +an `nvbench::type_list`. The kernel generator becomes a template +function and will be instantiated using types defined by the axis. The current +configuration's type is passed into the kernel generator using +an `nvbench::type_list`. + +```cpp +template <typename ValueType> +void my_benchmark(nvbench::state& state, nvbench::type_list<ValueType>) +{ + thrust::device_vector<ValueType> data = generate_input<ValueType>(); + + state.exec([&data](nvbench::launch& launch) + { + my_kernel<ValueType><<<num_blocks, 256, 0, launch.get_stream()>>>(data.begin(), data.end()); + }); +} +using my_types = nvbench::type_list<int, float, double>; +NVBENCH_CREATE_TEMPLATE(my_benchmark, NVBENCH_TYPE_AXES(my_types)) + .set_type_axes_names({"ValueType"}); +``` + +The `NVBENCH_TYPE_AXES` macro is unfortunately necessary to prevent commas in +the `type_list<...>` from breaking macro parsing. + +## `nvbench::range` + +Since parameter sweeps often explore a range of evenly-spaced numeric values, a +strided range can be generated using the `nvbench::range(start, end, stride=1)` +helper. + +```cpp +assert(nvbench::range(2, 5) == {2, 3, 4, 5}); +assert(nvbench::range(2.0, 5.0) == {2.0, 3.0, 4.0, 5.0}); +assert(nvbench::range(2, 12, 2) == {2, 4, 6, 8, 10, 12}); +assert(nvbench::range(2, 12, 5) == {2, 7, 12}); +assert(nvbench::range(2, 12, 6) == {2, 8}); +assert(nvbench::range(0.0, 10.0, 2.5) == { 0.0, 2.5, 5.0, 7.5, 10.0}); +``` + +Note that start and end are inclusive. This utility can be used to define axis +values for all numeric axes. + +## Multiple Parameter Axes + +If more than one axis is defined, the complete Cartesian product of all axes +will be benchmarked.
For example, consider a benchmark with two type axes, one +int64 axis, and one float64 axis: + +```cpp +// InputTypes: {char, int, unsigned int} +// OutputTypes: {float, double} +// NumInputs: {2^10, 2^20, 2^30} +// Quality: {0.5, 1.0} + +using input_types = nvbench::type_list<char, int, unsigned int>; +using output_types = nvbench::type_list<float, double>; +NVBENCH_CREATE_TEMPLATE(benchmark, NVBENCH_TYPE_AXES(input_types, output_types)) + .set_type_axes_names({"InputType", "OutputType"}) + .add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 10)) + .add_float64_axis("Quality", {0.5, 1.0}); +``` + +This would generate a total of 36 configurations and instantiate the benchmark 6 +times. Keep the rapid growth of these combinations in mind when choosing the +number of values in an axis. See the section about combinatorial explosion for +more examples and information. + +# Throughput Measurements + +In addition to raw timing information, NVBench can track a kernel's +throughput, reporting the amount of data processed as: + +- Number of items per second +- Number of bytes per second +- Percentage of the device's peak memory bandwidth utilized + +To enable throughput measurements, the kernel generator can specify the number +of items and/or bytes handled in a single kernel execution using +the `nvbench::state` API. + +```cpp +state.add_element_count(size); +state.add_global_memory_reads(size); +state.add_global_memory_writes(size); +``` + +For meaningful results, specify the input element count, and include all reads +and writes to global memory. + +# Skip Uninteresting / Invalid Benchmarks + +Sometimes particular combinations of parameters aren't useful or interesting — +or for type axes, some configurations may not even compile. + +The `nvbench::state` object provides a `skip("Reason")` method that can be used +to avoid running these benchmarks.
To skip uncompilable type axis +configurations, create an overload for the kernel generator that selects for the +invalid type combination: + +```cpp +template <typename T, typename U> +void my_benchmark(nvbench::state& state, nvbench::type_list<T, U>) +{ + // Skip benchmarks at runtime: + if (should_skip_this_config) + { + state.skip("Reason for skip."); + return; + } + + /* ... */ +} + +// Skip benchmarks at compile time -- for example, always skip when T == U +// (Note that the `type_list` argument defines the same type twice). +template <typename SameType> +void my_benchmark(nvbench::state& state, + nvbench::type_list<SameType, SameType>) +{ + state.skip("T must not be the same type as U."); +} +using Ts = nvbench::type_list<...>; +using Us = nvbench::type_list<...>; +NVBENCH_CREATE_TEMPLATE(my_benchmark, NVBENCH_TYPE_AXES(Ts, Us)); +``` + +# Execution Tags For Special Cases + +By default, NVBench assumes that the entire execution time of the +`KernelLauncher` should be measured, and that no syncs are performed ( +e.g. `cudaDeviceSynchronize`, `cudaStreamSynchronize`, `cudaEventSynchronize`, +etc). + +Execution tags may be passed to `state.exec` when these assumptions are not +true: + +- `nvbench::exec_tag::sync` tells NVBench that the kernel launcher will + synchronize internally, and +- `nvbench::exec_tag::timer` requests a timer object that can be used to + restrict the timed region. + +Multiple execution tags may be combined using `operator|`, e.g. + +```cpp +state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [](nvbench::launch &launch, auto& timer) { /*...*/ }); +``` + +The following sections provide more detail. + +## Benchmarks that sync: `nvbench::exec_tag::sync` + +If a `KernelLauncher` synchronizes the CUDA device internally without passing +this tag, the benchmark will deadlock at runtime. Passing the `sync` tag will +fix this. Note that the `sync` exec tag will disable batch measurements.
+ +```cpp +void sync_example(nvbench::state& state) +{ + // Pass the `sync` exec tag to tell NVBench that this benchmark will sync: + state.exec(nvbench::exec_tag::sync, [](nvbench::launch& launch) { + /* Benchmark that implicitly syncs here. */ + }); +} +NVBENCH_CREATE(sync_example); +``` + +## Explicit timer mode: `nvbench::exec_tag::timer` + +For some kernels, the working data may need to be reset between launches. This +is particularly common for kernels that modify their input in-place. + +Resetting the input data to prepare for a new trial shouldn't be included in the +benchmark's execution time. NVBench provides a manual timer mode that allows the +kernel launcher to specify the critical section to be measured and exclude any +per-trial reset operations. + +To enable the manual timer mode, pass the tag object `nvbench::exec_tag::timer` +to `state.exec`, and declare the kernel launcher with an +additional `auto& timer` argument. + +Note that using manual timer mode disables batch measurements. + +```cpp +void timer_example(nvbench::state& state) +{ + // Pass the `timer` exec tag to request a timer: + state.exec(nvbench::exec_tag::timer, + // Lambda now accepts a timer: + [](nvbench::launch& launch, auto& timer) + { + /* Reset code here, excluded from timing */ + + /* Timed region is explicitly marked. + * The timer handles any synchronization, flushes, etc when/if + * needed for the current measurement. + */ + timer.start(); + /* Launch kernel on `launch.get_stream()` here */ + timer.stop(); + }); +} +NVBENCH_CREATE(timer_example); +``` + +# Beware: Combinatorial Explosion Is Lurking + +Be very careful of how quickly the configuration space can grow. The following +example generates 960 total runtime benchmark configurations, and will compile +192 different static parametrizations of the kernel generator. This is likely +excessive, especially for routine regression testing.
+ +```cpp +using value_types = nvbench::type_list<nvbench::uint8_t, nvbench::int32_t, nvbench::float32_t, nvbench::float64_t>; +using op_types = nvbench::type_list<thrust::plus<>, + thrust::multiplies<>, + thrust::maximum<>>; + +NVBENCH_CREATE_TEMPLATE(my_benchmark, + NVBENCH_TYPE_AXES(value_types, + value_types, + value_types, + op_types)) + .set_type_axes_names({"T", "U", "V", "Op"}) + .add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 5)); +``` + +``` +960 total configs += 4 [T=(U8, I32, F32, F64)] +* 4 [U=(U8, I32, F32, F64)] +* 4 [V=(U8, I32, F32, F64)] +* 3 [Op=(plus, multiplies, max)] +* 5 [NumInputs=(2^10, 2^15, 2^20, 2^25, 2^30)] +``` + +For large configuration spaces like this, pruning some of the less useful +combinations (e.g. `sizeof(init_type) < sizeof(output)`) using the techniques +described in the "Skip Uninteresting / Invalid Benchmarks" section can help +immensely with keeping compile / run times manageable. + +Splitting a single large configuration space into multiple, more focused +benchmarks with reduced dimensionality will likely be worth the effort as well.