From 65bc2c1e3fc02d95873ab9c47f87b0efd2feebbf Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Thu, 4 Mar 2021 18:40:23 -0500 Subject: [PATCH] Documentation overhaul. Revamp README, split into multiple files. Add docs on CLI. Add `--help` and `--help-axis`. --- README.md | 431 ++++-------------------------- docs/benchmarks.md | 383 ++++++++++++++++++++++++++ docs/cli_help.md | 93 +++++++ docs/cli_help_axis.md | 69 +++++ nvbench/internal/help_strings.cuh | 194 ++++++++++++++ nvbench/option_parser.cu | 26 +- nvbench/option_parser.cuh | 2 + 7 files changed, 815 insertions(+), 383 deletions(-) create mode 100644 docs/benchmarks.md create mode 100644 docs/cli_help.md create mode 100644 docs/cli_help_axis.md create mode 100644 nvbench/internal/help_strings.cuh diff --git a/README.md b/README.md index a124500..3f1db1b 100644 --- a/README.md +++ b/README.md @@ -2,30 +2,33 @@ This project is a work-in-progress. Everything is subject to change. -NVBench is a C++17 library designed to simplify CUDA kernel benchmarking. -It allows simultaneous parameter sweeps across multiple axes, including template -parameters. Various timings are reported, including "cold" execution time -(single run per timed region) and "batch" execution time -(launch multiple kernels within a single timed region). +NVBench is a C++17 library designed to simplify CUDA kernel benchmarking. It +features: -# Example Sandbox Project +* [Parameter sweeps](docs/benchmarks.md#parameter-axes): a powerful and + flexible "axis" system explores a kernel's configuration space. Parameters may + be dynamic numbers/strings or [static types](docs/benchmarks.md#type-axes). +* [Runtime customization](docs/cli_help.md): A rich command-line interface + allows [redefinition of parameter axes](docs/cli_help_axis.md), CUDA device + selection, changing output formats, and more. 
+* [Throughput calculations](docs/benchmarks.md#throughput-measurements): Compute + and report: + * Item throughput (elements/second) + * Global memory bandwidth usage (bytes/second and per-device %-of-peak-bw) +* Multiple output formats: Currently supports markdown (default) and CSV output. +* [Manual timer mode](docs/benchmarks.md#explicit-timer-mode-nvbenchexec_tagtimer): + (optional) Explicitly start/stop timing in a benchmark implementation. +* Multiple measurement types: + * Cold Measurements: + * Each sample runs the benchmark once with a clean device L2 cache. + * GPU and CPU times are reported. + * Batch Measurements: + * Executes the benchmark multiple times back-to-back and records total time. + * Reports the average execution time (total time / number of executions). -See [NVBench Demo](https://github.com/allisonvacanti/nvbench_demo) for a simple -CMake project that builds an example benchmark using NVBench. +# Getting Started -## Scope and Related Tools - -NVBench will measure the CPU and CUDA GPU execution time of a ***single -host-side critical region*** per benchmark. It is intended for regression -testing and parameter tuning of individual kernels. For in-depth analysis of -end-to-end performance of multiple applications, the NVIDIA Nsight tools are -more appropriate. - -NVBench is focused on evaluating the performance of CUDA kernels and is not -optimized for CPU microbenchmarks. This may change in the future, but for now, -consider using Google Benchmark for high resolution CPU benchmarks. - -# Minimal Benchmark +## Minimal Benchmark A basic kernel benchmark can be created with just a few lines of CUDA C++: @@ -38,373 +41,37 @@ void my_benchmark(nvbench::state& state) { NVBENCH_BENCH(my_benchmark); ``` -There are three main components in the definition of a benchmark: +See [Benchmarks](docs/benchmarks.md) for information on customizing benchmarks +and implementing parameter sweeps. 
-- A `KernelGenerator` callable (`my_benchmark` above) -- A `KernelLauncher` callable (the lambda passed to `nvbench::exec`), and -- A `BenchmarkDeclaration` using `NVBENCH_BENCH` or similar macros. +## Command Line Interface -The `KernelGenerator` is called with an `nvbench::state` object that provides -configuration information, as shown in later sections. The generator is -responsible for configuring and instantiating a `KernelLauncher`, which is -(unsurprisingly) responsible for launching a kernel. The launcher should contain -only the minimum amount of code necessary to start the CUDA kernel, -since `nvbench::exec` will execute it repeatedly to gather timing information. -An `nvbench::launch` object is provided to the launcher to specify kernel -execution details, such as the CUDA stream to use. `NVBENCH_BENCH` registers -the benchmark with NVBench and initializes various attributes, including its -name and parameter axes. +Each benchmark executable produced by NVBench provides a rich set of +command-line options for configuring benchmark execution at runtime. See the +[CLI overview](docs/cli_help.md) +and [CLI axis specification](docs/cli_help_axis.md) for more information. -# Benchmark Name +## Example Sandbox Project -By default, a benchmark is named by converting the first argument -of `NVBENCH_BENCH` into a string. +To get started with NVBench, consider trying out +the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo). This +repository contains a simple CMake project that uses NVBench to build an example +benchmark. It's a great way to experiment with the library without a lot of +investment. -This can be changed to something more descriptive if desired. -The `NVBENCH_BENCH` macro produces a customization object that allows such -attributes to be modified. +# License -```cpp -NVBENCH_BENCH(my_benchmark).set_name("my_kernel<<>>"); -``` +NVBench is released under the Apache 2.0 License with LLVM exceptions. +See [LICENSE](./LICENSE). 
-# Parameter Axes +# Scope and Related Projects -Some kernels will be used with a variety of options, input data types/sizes, and -other factors that impact performance. NVBench explores these different -scenarios by sweeping through a set of user-defined parameter axes. +NVBench will measure the CPU and CUDA GPU execution time of a ***single +host-side critical region*** per benchmark. It is intended for regression +testing and parameter tuning of individual kernels. For in-depth analysis of +end-to-end performance of multiple applications, the NVIDIA Nsight tools are +more appropriate. -A parameter axis defines a set of interesting values for a single kernel -parameter — for example, the size of the input, or the type of values being -processed. These parameter axes are used to customize a `KernelGenerator` with -static and runtime configurations. There are four supported types of parameters: -int64, float64, string, and type. - -## Int64 Axes - -A common example of a parameter axis is to vary the number of input values a -kernel should process during a benchmark measurement. An `int64_axis` is ideal -for this: - -```cpp -void benchmark(nvbench::state& state) -{ - const auto num_inputs = state.get_int64("NumInputs"); - thrust::device_vector data = generate_input(num_inputs); - - state.exec([&data](nvbench::launch& launch) { - my_kernel<<>>(data.begin(), data.end()); - }); -} -NVBENCH_BENCH(benchmark).add_int64_axis("NumInputs", {16, 64, 256, 1024, 4096}); -``` - -NVBench will run the `benchmark` kernel generator once for each specified value -in the "NumInputs" axis. The `state` object provides the current parameter value -to `benchmark`. - -### Int64 Power-Of-Two Axes - -Using powers-of-two is quite common for these sorts of axes. `int64_axis` has a -unique power-of-two mode that simplifies how such axes are defined and helps -provide more readable output. 
A power-of-two int64 axis is defined using the -integer exponents, but the benchmark will be run with the computed 2^N value. - -```cpp -// Equivalent to above, {16, 64, 256, 1024, 4096} = {2^4, 2^6, 2^8, 2^10, 2^12} -NVBENCH_BENCH(benchmark).add_int64_power_of_two_axis("NumInputs", - {4, 6, 8, 10, 12}); -// Or, as shown in a later section: -NVBENCH_BENCH(benchmark).add_int64_power_of_two_axis("NumInputs", - nvbench::range(4, 12, 2}); -``` - -## Float64 Axes - -For floating point numbers, a `float64_axis` is available: - -```cpp -void benchmark(nvbench::state& state) -{ - const auto quality = state.get_float64("Quality"); - - state.exec([&quality](nvbench::launch& launch) - { - my_kernel<<>>(quality); - }); -} -NVBENCH_BENCH(benchmark).add_float64_axis("Quality", {0.05, 0.1, 0.25, 0.5, 0.75, 1.}); -``` - -## String Axes - -For non-numeric data, an axis of arbitrary strings provides additional -flexibility: - -```cpp -void benchmark(nvbench::state& state) -{ - const auto rng_dist = state.get_string("RNG Distribution"); - thrust::device_vector data = generate_input(rng_dist); - - state.exec([&data](nvbench::launch& launch) - { - my_kernel<<>>(data.begin(), data.end()); - }); -} -NVBENCH_BENCH(benchmark).add_string_axis("RNG Distribution", {"Uniform", "Gaussian"}); -``` - -## Type Axes - -Another common situation involves benchmarking a templated kernel with multiple -compile-time configurations. NVBench strives to make such benchmarks as easy to -write as possible through the use of type axes. - -A `type_axis` is a list of types (`T1`, `T2`, `Ts`...) wrapped in -a `nvbench::type_list`. The kernel generator becomes a template -function and will be instantiated using types defined by the axis. The current -configuration's type is passed into the kernel generator using -a `nvbench::type_list`. 
- -```cpp -template -void my_benchmark(nvbench::state& state, nvbench::type_list) -{ - thrust::device_vector data = generate_input(); - - state.exec([&data](nvbench::launch& launch) - { - my_kernel<<>>(data.begin(), data.end()); - }); -} -using my_types = nvbench::type_list; -NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(my_types)) - .set_type_axis_names({"ValueType"}); -``` - -The `NVBENCH_TYPE_AXES` macro is unfortunately necessary to prevent commas in -the `type_list<...>` from breaking macro parsing. - -## `nvbench::range` - -Since parameter sweeps often explore a range of evenly-spaced numeric values, a -strided range can be generated using the `nvbench::range(start, end, stride=1)` -helper. - -```cpp -assert(nvbench::range(2, 5) == {2, 3, 4, 5}); -assert(nvbench::range(2.0, 5.0) == {2.0, 3.0, 4.0, 5.0}); -assert(nvbench::range(2, 12, 2) == {2, 4, 6, 8, 10, 12}); -assert(nvbench::range(2, 12, 5) == {2, 7, 12}); -assert(nvbench::range(2, 12, 6) == {2, 8}); -assert(nvbench::range(0.0, 10.0, 2.5) == { 0.0, 2.5, 5.0, 7.5, 10.0}); -``` - -Note that start and end are inclusive. This utility can be used to define axis -values for all numeric axes. - -## Multiple Parameter Axes - -If more than one axis is defined, the complete cartesian product of all axes -will be benchmarked. For example, consider a benchmark with two type axes, one -int64 axis, and one float64 axis: - -```cpp -// InputTypes: {char, int, unsigned int} -// OutputTypes: {float, double} -// NumInputs: {2^10, 2^20, 2^30} -// Quality: {0.5, 1.0} - -using input_types = nvbench::type_list; -using output_types = nvbench::type_list; -NVBENCH_BENCH_TYPES(benchmark, NVBENCH_TYPE_AXES(input_types, output_types)) - .set_type_axes_names({"InputType", "OutputType"}) - .add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 10)) - .add_float64_axis("Quality", {0.5, 1.0}); -``` - -This would generate a total of 36 configurations and instantiate the benchmark 6 -times. 
Keep the rapid growth of these combinations in mind when choosing the -number of values in an axis. See the section about combinatorial explosion for -more example and information. - -# Throughput Measurements - -In additional to raw timing information, NVBench can track a kernel's -throughput, reporting the amount of data processed as: - -- Number of items per second -- Number of bytes per second -- Percentage of device's peak memory bandwidth utilized - -To enable throughput measurements, the kernel generator can specify the number -of items and/or bytes handled in a single kernel execution using -the `nvbench::state` API. - -```cpp -state.add_element_count(size); -state.add_global_memory_reads(size); -state.add_global_memory_writes(size); -``` - -For meaningful results, specify the input element count, and include all reads -and writes to global memory. - -# Skip Uninteresting / Invalid Benchmarks - -Sometimes particular combinations of parameters aren't useful or interesting — -or for type axes, some configurations may not even compile. - -The `nvbench::state` object provides a `skip("Reason")` method that can be used -to avoid running these benchmarks. To skip uncompilable type axis -configurations, create an overload for the kernel generator that selects for the -invalid type combination: - -```cpp -template -void my_benchmark(nvbench::state& state, nvbench::type_list) -{ - // Skip benchmarks at runtime: - if (should_skip_this_config) - { - state.skip("Reason for skip."); - return; - } - - /* ... */ -}; - -// Skip benchmarks are compile time -- for example, always skip when T == U -// (Note that the `type_list` argument defines the same type twice). 
-template -void my_benchmark(nvbench::state& state, - nvbench::type_list) -{ - state.skip("T must not be the same type as U."); -} -using Ts = nvbench::type_list<...>; -using Us = nvbench::type_list<...>; -NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(Ts, Us)); -``` - -# Execution Tags For Special Cases - -By default, NVBench assumes that the entire execution time of the -`KernelLauncher` should be measured, and that no syncs are performed -(e.g. `cudaDeviceSynchronize`, `cudaStreamSynchronize`, `cudaEventSynchronize`, -etc. are not called). - -Execution tags may be passed to `state.exec` when these assumptions are not -true: - -- `nvbench::exec_tag::sync` tells NVBench that the kernel launcher will - synchronize internally, and -- `nvbench::exec_tag::timer` requests a timer object that can be used to - restrict the timed region. - -Multiple execution tags may be combined using `operator|`, e.g. - -```cpp -state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [](nvbench::launch &launch, auto& timer) { /*...*/ }); -``` - -The following sections provide more details on these features. - -## Benchmarks that sync: `nvbench::exec_tag::sync` - -If a `KernelLauncher` synchronizes the CUDA device internally without passing -this tag, **the benchmark will deadlock at runtime**. Passing the `sync` tag -will fix this issue. Note that this disables batch measurements. - -```cpp -void sync_example(nvbench::state& state) -{ - // Pass the `sync` exec tag to tell NVBench that this benchmark will sync: - state.exec(nvbench::exec_tag::sync, [](nvbench::launch& launch) { - /* Benchmark that implicitly syncs here. */ - }); -} -NVBENCH_BENCH(sync_example); -``` - -## Explicit timer mode: `nvbench::exec_tag::timer` - -For some kernels, the working data may need to be reset between launches. This -is particularly common for kernels that modify their input in-place. 
- -Resetting the input data to prepare for a new trial shouldn't be included in the -benchmark's execution time. NVBench provides a manual timer mode that allows the -kernel launcher to specify the critical section to be measured and exclude any -per-trial reset operations. - -To enable the manual timer mode, pass the tag object `nvbench::exec_tag::timer` -to `state.exec`, and declare the kernel launcher with an -additional `auto& timer` argument. - -Note that using manual timer mode disables batch measurements. - -```cpp -void timer_example(nvbench::state& state) -{ - // Pass the `timer` exec tag to request a timer: - state.exec(nvbench::exec_tag::timer, - // Lambda now accepts a timer: - [](nvbench::launch& launch, auto& timer) - { - /* Reset code here, excluded from timing */ - - /* Timed region is explicitly marked. - * The timer handles any synchronization, flushes, etc when/if - * needed for the current measurement. - */ - timer.start(); - /* Launch kernel on `launch.get_stream()` here */ - timer.stop(); - }); -} -NVBENCH_BENCH(timer_example); -``` - -# Beware: Combinatorial Explosion Is Lurking - -Be very careful of how quickly the configuration space can grow. The following -example generates 960 total runtime benchmark configurations, and will compile -192 different static parametrizations of the kernel generator. This is likely -excessive, especially for routine regression testing. 
- -```cpp -using value_types = nvbench::type_list; -using op_types = nvbench::type_list, - thrust::multiplies<>, - thrust::maximum<>>; - -NVBENCH_BENCH_TYPES(my_benchmark, - NVBENCH_TYPE_AXES(value_types, - value_types, - value_types, - op_types>)) - .set_type_axes_names({"T", "U", "V", "Op"}) - .add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 5)); -``` - -``` -960 total configs -= 4 [T=(U8, I32, F32, F64)] -* 4 [U=(U8, I32, F32, F64)] -* 4 [V=(U8, I32, F32, F64)] -* 3 [Op=(plus, multiplies, max)] -* 5 [NumInputs=(2^10, 2^15, 2^20, 2^25, 2^30)] -``` - -For large configuration spaces like this, pruning some of the less useful -combinations (e.g. `sizeof(init_type) < sizeof(output)`) using the techniques -described in the "Skip Uninteresting / Invalid Benchmarks" section can help -immensely with keeping compile / run times manageable. - -Splitting a single large configuration space into multiple, more focused -benchmarks with reduced dimensionality will likely be worth the effort as well. +NVBench is focused on evaluating the performance of CUDA kernels and is not +optimized for CPU microbenchmarks. This may change in the future, but for now, +consider using Google Benchmark for high resolution CPU benchmarks. diff --git a/docs/benchmarks.md b/docs/benchmarks.md new file mode 100644 index 0000000..0a85278 --- /dev/null +++ b/docs/benchmarks.md @@ -0,0 +1,383 @@ +# Minimal Benchmark + +A basic kernel benchmark can be created with just a few lines of CUDA C++: + +```cpp +void my_benchmark(nvbench::state& state) { + state.exec([](nvbench::launch& launch) { + my_kernel<<>>(); + }); +} +NVBENCH_BENCH(my_benchmark); +``` + +There are three main components in the definition of a benchmark: + +- A `KernelGenerator` callable (`my_benchmark` above) +- A `KernelLauncher` callable (the lambda passed to `nvbench::exec`), and +- A `BenchmarkDeclaration` using `NVBENCH_BENCH` or similar macros. 
+ +The `KernelGenerator` is called with an `nvbench::state` object that provides +configuration information, as shown in later sections. The generator is +responsible for configuring and instantiating a `KernelLauncher`, which is +(unsurprisingly) responsible for launching a kernel. The launcher should contain +only the minimum amount of code necessary to start the CUDA kernel, +since `nvbench::exec` will execute it repeatedly to gather timing information. +An `nvbench::launch` object is provided to the launcher to specify kernel +execution details, such as the CUDA stream to use. `NVBENCH_BENCH` registers +the benchmark with NVBench and initializes various attributes, including its +name and parameter axes. + +# Benchmark Name + +By default, a benchmark is named by converting the first argument +of `NVBENCH_BENCH` into a string. + +This can be changed to something more descriptive if desired. +The `NVBENCH_BENCH` macro produces a customization object that allows such +attributes to be modified. + +```cpp +NVBENCH_BENCH(my_benchmark).set_name("my_kernel<<>>"); +``` + +# Parameter Axes + +Some kernels will be used with a variety of options, input data types/sizes, and +other factors that impact performance. NVBench explores these different +scenarios by sweeping through a set of user-defined parameter axes. + +A parameter axis defines a set of interesting values for a single kernel +parameter — for example, the size of the input, or the type of values being +processed. These parameter axes are used to customize a `KernelGenerator` with +static and runtime configurations. There are four supported types of parameters: +int64, float64, string, and type. + +## Int64 Axes + +A common example of a parameter axis is to vary the number of input values a +kernel should process during a benchmark measurement. 
An `int64_axis` is ideal
+for this:
+
+```cpp
+void benchmark(nvbench::state& state)
+{
+  const auto num_inputs = state.get_int64("NumInputs");
+  thrust::device_vector data = generate_input(num_inputs);
+
+  state.exec([&data](nvbench::launch& launch) {
+    my_kernel<<>>(data.begin(), data.end());
+  });
+}
+NVBENCH_BENCH(benchmark).add_int64_axis("NumInputs", {16, 64, 256, 1024, 4096});
+```
+
+NVBench will run the `benchmark` kernel generator once for each specified value
+in the "NumInputs" axis. The `state` object provides the current parameter value
+to `benchmark`.
+
+### Int64 Power-Of-Two Axes
+
+Using powers-of-two is quite common for these sorts of axes. `int64_axis` has a
+unique power-of-two mode that simplifies how such axes are defined and helps
+provide more readable output. A power-of-two int64 axis is defined using the
+integer exponents, but the benchmark will be run with the computed 2^N value.
+
+```cpp
+// Equivalent to above, {16, 64, 256, 1024, 4096} = {2^4, 2^6, 2^8, 2^10, 2^12}
+NVBENCH_BENCH(benchmark).add_int64_power_of_two_axis("NumInputs",
+                                                     {4, 6, 8, 10, 12});
+// Or, as shown in a later section:
+NVBENCH_BENCH(benchmark).add_int64_power_of_two_axis("NumInputs",
+                                                     nvbench::range(4, 12, 2));
+```
+
+## Float64 Axes
+
+For floating point numbers, a `float64_axis` is available:
+
+```cpp
+void benchmark(nvbench::state& state)
+{
+  const auto quality = state.get_float64("Quality");
+
+  state.exec([&quality](nvbench::launch& launch)
+  {
+    my_kernel<<>>(quality);
+  });
+}
+NVBENCH_BENCH(benchmark).add_float64_axis("Quality", {0.05, 0.1, 0.25, 0.5, 0.75, 1.});
+```
+
+## String Axes
+
+For non-numeric data, an axis of arbitrary strings provides additional
+flexibility:
+
+```cpp
+void benchmark(nvbench::state& state)
+{
+  const auto rng_dist = state.get_string("RNG Distribution");
+  thrust::device_vector data = generate_input(rng_dist);
+
+  state.exec([&data](nvbench::launch& launch)
+  {
+    my_kernel<<>>(data.begin(), data.end());
+  });
+}
+NVBENCH_BENCH(benchmark).add_string_axis("RNG Distribution", {"Uniform", "Gaussian"}); +``` + +## Type Axes + +Another common situation involves benchmarking a templated kernel with multiple +compile-time configurations. NVBench strives to make such benchmarks as easy to +write as possible through the use of type axes. + +A `type_axis` is a list of types (`T1`, `T2`, `Ts`...) wrapped in +a `nvbench::type_list`. The kernel generator becomes a template +function and will be instantiated using types defined by the axis. The current +configuration's type is passed into the kernel generator using +a `nvbench::type_list`. + +```cpp +template +void my_benchmark(nvbench::state& state, nvbench::type_list) +{ + thrust::device_vector data = generate_input(); + + state.exec([&data](nvbench::launch& launch) + { + my_kernel<<>>(data.begin(), data.end()); + }); +} +using my_types = nvbench::type_list; +NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(my_types)) + .set_type_axis_names({"ValueType"}); +``` + +The `NVBENCH_TYPE_AXES` macro is unfortunately necessary to prevent commas in +the `type_list<...>` from breaking macro parsing. + +## `nvbench::range` + +Since parameter sweeps often explore a range of evenly-spaced numeric values, a +strided range can be generated using the `nvbench::range(start, end, stride=1)` +helper. + +```cpp +assert(nvbench::range(2, 5) == {2, 3, 4, 5}); +assert(nvbench::range(2.0, 5.0) == {2.0, 3.0, 4.0, 5.0}); +assert(nvbench::range(2, 12, 2) == {2, 4, 6, 8, 10, 12}); +assert(nvbench::range(2, 12, 5) == {2, 7, 12}); +assert(nvbench::range(2, 12, 6) == {2, 8}); +assert(nvbench::range(0.0, 10.0, 2.5) == { 0.0, 2.5, 5.0, 7.5, 10.0}); +``` + +Note that start and end are inclusive. This utility can be used to define axis +values for all numeric axes. + +## Multiple Parameter Axes + +If more than one axis is defined, the complete cartesian product of all axes +will be benchmarked. 
For example, consider a benchmark with two type axes, one
+int64 axis, and one float64 axis:
+
+```cpp
+// InputTypes: {char, int, unsigned int}
+// OutputTypes: {float, double}
+// NumInputs: {2^10, 2^20, 2^30}
+// Quality: {0.5, 1.0}
+
+using input_types = nvbench::type_list;
+using output_types = nvbench::type_list;
+NVBENCH_BENCH_TYPES(benchmark, NVBENCH_TYPE_AXES(input_types, output_types))
+  .set_type_axes_names({"InputType", "OutputType"})
+  .add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 10))
+  .add_float64_axis("Quality", {0.5, 1.0});
+```
+
+This would generate a total of 36 configurations and instantiate the benchmark 6
+times. Keep the rapid growth of these combinations in mind when choosing the
+number of values in an axis. See the section about combinatorial explosion for
+more examples and information.
+
+# Throughput Measurements
+
+In addition to raw timing information, NVBench can track a kernel's
+throughput, reporting the amount of data processed as:
+
+- Number of items per second
+- Number of bytes per second
+- Percentage of device's peak memory bandwidth utilized
+
+To enable throughput measurements, the kernel generator can specify the number
+of items and/or bytes handled in a single kernel execution using
+the `nvbench::state` API.
+
+```cpp
+state.add_element_count(size);
+state.add_global_memory_reads(size);
+state.add_global_memory_writes(size);
+```
+
+For meaningful results, specify the input element count, and include all reads
+and writes to global memory.
+
+# Skip Uninteresting / Invalid Benchmarks
+
+Sometimes particular combinations of parameters aren't useful or interesting —
+or for type axes, some configurations may not even compile.
+
+The `nvbench::state` object provides a `skip("Reason")` method that can be used
+to avoid running these benchmarks.
To skip uncompilable type axis
+configurations, create an overload for the kernel generator that selects for the
+invalid type combination:
+
+```cpp
+template 
+void my_benchmark(nvbench::state& state, nvbench::type_list)
+{
+  // Skip benchmarks at runtime:
+  if (should_skip_this_config)
+  {
+    state.skip("Reason for skip.");
+    return;
+  }
+
+  /* ... */
+};
+
+// Skip benchmarks at compile time -- for example, always skip when T == U
+// (Note that the `type_list` argument defines the same type twice).
+template 
+void my_benchmark(nvbench::state& state,
+                  nvbench::type_list)
+{
+  state.skip("T must not be the same type as U.");
+}
+using Ts = nvbench::type_list<...>;
+using Us = nvbench::type_list<...>;
+NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(Ts, Us));
+```
+
+# Execution Tags For Special Cases
+
+By default, NVBench assumes that the entire execution time of the
+`KernelLauncher` should be measured, and that no syncs are performed
+(e.g. `cudaDeviceSynchronize`, `cudaStreamSynchronize`, `cudaEventSynchronize`,
+etc. are not called).
+
+Execution tags may be passed to `state.exec` when these assumptions are not
+true:
+
+- `nvbench::exec_tag::sync` tells NVBench that the kernel launcher will
+  synchronize internally, and
+- `nvbench::exec_tag::timer` requests a timer object that can be used to
+  restrict the timed region.
+
+Multiple execution tags may be combined using `operator|`, e.g.
+
+```cpp
+state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
+           [](nvbench::launch &launch, auto& timer) { /*...*/ });
+```
+
+The following sections provide more details on these features.
+
+## Benchmarks that sync: `nvbench::exec_tag::sync`
+
+If a `KernelLauncher` synchronizes the CUDA device internally without passing
+this tag, **the benchmark will deadlock at runtime**. Passing the `sync` tag
+will fix this issue. Note that this disables batch measurements.
+ +```cpp +void sync_example(nvbench::state& state) +{ + // Pass the `sync` exec tag to tell NVBench that this benchmark will sync: + state.exec(nvbench::exec_tag::sync, [](nvbench::launch& launch) { + /* Benchmark that implicitly syncs here. */ + }); +} +NVBENCH_BENCH(sync_example); +``` + +## Explicit timer mode: `nvbench::exec_tag::timer` + +For some kernels, the working data may need to be reset between launches. This +is particularly common for kernels that modify their input in-place. + +Resetting the input data to prepare for a new trial shouldn't be included in the +benchmark's execution time. NVBench provides a manual timer mode that allows the +kernel launcher to specify the critical section to be measured and exclude any +per-trial reset operations. + +To enable the manual timer mode, pass the tag object `nvbench::exec_tag::timer` +to `state.exec`, and declare the kernel launcher with an +additional `auto& timer` argument. + +Note that using manual timer mode disables batch measurements. + +```cpp +void timer_example(nvbench::state& state) +{ + // Pass the `timer` exec tag to request a timer: + state.exec(nvbench::exec_tag::timer, + // Lambda now accepts a timer: + [](nvbench::launch& launch, auto& timer) + { + /* Reset code here, excluded from timing */ + + /* Timed region is explicitly marked. + * The timer handles any synchronization, flushes, etc when/if + * needed for the current measurement. + */ + timer.start(); + /* Launch kernel on `launch.get_stream()` here */ + timer.stop(); + }); +} +NVBENCH_BENCH(timer_example); +``` + +# Beware: Combinatorial Explosion Is Lurking + +Be very careful of how quickly the configuration space can grow. The following +example generates 960 total runtime benchmark configurations, and will compile +192 different static parametrizations of the kernel generator. This is likely +excessive, especially for routine regression testing. 
+
+```cpp
+using value_types = nvbench::type_list;
+using op_types = nvbench::type_list,
+                                    thrust::multiplies<>,
+                                    thrust::maximum<>>;
+
+NVBENCH_BENCH_TYPES(my_benchmark,
+                    NVBENCH_TYPE_AXES(value_types,
+                                      value_types,
+                                      value_types,
+                                      op_types))
+  .set_type_axes_names({"T", "U", "V", "Op"})
+  .add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 5));
+```
+
+```
+960 total configs
+= 4 [T=(U8, I32, F32, F64)]
+* 4 [U=(U8, I32, F32, F64)]
+* 4 [V=(U8, I32, F32, F64)]
+* 3 [Op=(plus, multiplies, max)]
+* 5 [NumInputs=(2^10, 2^15, 2^20, 2^25, 2^30)]
+```
+
+For large configuration spaces like this, pruning some of the less useful
+combinations (e.g. `sizeof(init_type) < sizeof(output)`) using the techniques
+described in the "Skip Uninteresting / Invalid Benchmarks" section can help
+immensely with keeping compile / run times manageable.
+
+Splitting a single large configuration space into multiple, more focused
+benchmarks with reduced dimensionality will likely be worth the effort as well.
diff --git a/docs/cli_help.md b/docs/cli_help.md
new file mode 100644
index 0000000..02e3e25
--- /dev/null
+++ b/docs/cli_help.md
@@ -0,0 +1,93 @@
+# Queries
+
+* `--list`, `-l`
+  * List all devices and benchmarks without running them.
+
+* `--help`, `-h`
+  * Print usage information and exit.
+
+* `--help-axes`, `--help-axis`
+  * Print axis specification documentation and exit.
+
+# Output
+
+* `--csv `
+  * Write CSV output to a file, or "stdout" / "stderr".
+
+* `--markdown `, `--md `
+  * Write markdown output to a file, or "stdout" / "stderr".
+  * Markdown is written to "stdout" by default.
+
+* `--quiet`, `-q`
+  * Suppress output.
+
+* `--color`
+  * Use color in output (markdown + stdout only).
+
+# Benchmark / Axis Specification
+
+* `--benchmark `, `-b `
+  * Execute a specific benchmark.
+  * Argument is a benchmark name or index, taken from `--list`.
+  * If not specified, all benchmarks will run.
+  * `--benchmark` may be specified multiple times to run several benchmarks.
+  * The same benchmark may be specified multiple times with different
+    configurations.
+
+* `--axis <axis specification>`, `-a <axis specification>`
+  * Override an axis specification.
+  * See `--help-axis`
+    for [details on axis specifications](./cli_help_axis.md).
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+# Benchmark Properties
+
+* `--devices <device ids>`, `--device <device ids>`, `-d <device ids>`
+  * Limit execution to one or more devices.
+  * `<device ids>` is a single id, or a comma separated list.
+  * Device ids can be obtained from `--list`.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--min-samples <count>`
+  * Gather at least `<count>` samples per measurement.
+  * Default is 10 samples.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--min-time <seconds>`
+  * Accumulate at least `<seconds>` of execution time per measurement.
+  * Default is 0.5 seconds.
+  * If both GPU and CPU times are gathered, this applies to GPU time only.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--max-noise <value>`
+  * Gather samples until the error in the measurement drops below `<value>`.
+  * Noise is computed as the percent relative standard deviation.
+  * Default is 0.5%.
+  * Only applies to Cold measurements.
+  * If both GPU and CPU times are gathered, this applies to GPU noise only.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--skip-time <seconds>`
+  * Skip a measurement when a warmup run executes in less than `<seconds>`.
+  * Default is -1 seconds (disabled).
+  * Intended for testing / debugging only.
+  * Very fast kernels (<5us) often require an extremely large number of samples
+    to converge `max-noise`. This option allows them to be skipped to save time
+    during testing.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--timeout <seconds>`
+  * Measurements will timeout after `<seconds>` have elapsed.
+  * Default is 15 seconds.
+  * `<seconds>` is walltime, not accumulated sample time.
+  * If a measurement times out, the default markdown log will print a warning to
+    report any outstanding termination criteria (min samples, min time, max
+    noise).
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
diff --git a/docs/cli_help_axis.md b/docs/cli_help_axis.md
new file mode 100644
index 0000000..a2da259
--- /dev/null
+++ b/docs/cli_help_axis.md
@@ -0,0 +1,69 @@
+# Axis Specification
+
+The `--axis <axis spec>` option redefines the values in a benchmark's axis. It
+applies to the benchmark created by the most recent `--benchmark` argument, or
+all benchmarks if it precedes all `--benchmark` arguments (if any).
+
+Valid axis specifications follow the form:
+
+* `<axis_name>=<value>`
+* `<axis_name>=[<value1>,<value2>,...]`
+* `<axis_name>=[<start>:<stop>]`
+* `<axis_name>=[<start>:<stop>:<stride>]`
+* `<axis_name>[<flags>]=<value>`
+* `<axis_name>[<flags>]=[<value1>,<value2>,...]`
+* `<axis_name>[<flags>]=[<start>:<stop>]`
+* `<axis_name>[<flags>]=[<start>:<stop>:<stride>]`
+
+Whitespace is ignored if the argument is quoted.
+
+The axis type is taken from the benchmark definition. Some axes have additional
+restrictions:
+
+* Numeric axes:
+  * A single value, explicit list of values, or strided range may be specified.
+  * For `int64` axes, the `power_of_two` flag is specified by adding `[pow2]`
+    after the axis name.
+  * Values may differ from those defined in the benchmark.
+* String axes:
+  * A single value or explicit list of values may be specified.
+  * Values may differ from those defined in the benchmark.
+* Type axes:
+  * A single value or explicit list of values may be specified.
+  * Values **MUST** be a subset of the types defined in the benchmark.
+  * Values **MUST** match the input strings provided by `--list` (e.g. `I32`
+    for `int`).
+    * Provide a `nvbench::type_strings<T>` specialization to modify a custom
+      type's input string.
+
+# Examples
+
+## Single Value
+
+| Axis Type | Example                 | Example Result   |
+|-----------|-------------------------|------------------|
+| Int64     | `-a InputSize=12345`    | 12345            |
+| Int64Pow2 | `-a InputSize[pow2]=8`  | 256              |
+| Float64   | `-a Quality=0.5`        | 0.5              |
+| String    | `-a RNG=Uniform`        | "Uniform"        |
+| Type      | `-a ValueType=I32`      | `int32_t`        |
+
+## Explicit List
+
+| Axis Type | Example                         | Example Result                 |
+|-----------|---------------------------------|--------------------------------|
+| Int64     | `-a InputSize=[1,2,3,4,5]`      | 1, 2, 3, 4, 5                  |
+| Int64Pow2 | `-a InputSize[pow2]=[4,6,8,10]` | 16, 64, 256, 1024              |
+| Float64   | `-a Quality=[0.5,0.75,1.0]`     | 0.5, 0.75, 1.0                 |
+| String    | `-a RNG=[Uniform,Gaussian]`     | "Uniform", "Gaussian"          |
+| Type      | `-a ValueType=[U8,I32,F64]`     | `uint8_t`, `int32_t`, `double` |
+
+## Strided Range
+
+| Axis Type | Example                         | Example Result               |
+|-----------|---------------------------------|------------------------------|
+| Int64     | `-a InputSize=[2:10:2]`         | 2, 4, 6, 8, 10               |
+| Int64Pow2 | `-a InputSize[pow2]=[2:10:2]`   | 4, 16, 64, 256, 1024         |
+| Float64   | `-a Quality=[.5:1:.1]`          | 0.5, 0.6, 0.7, 0.8, 0.9, 1.0 |
+| String    | [Not supported]                 |                              |
+| Type      | [Not supported]                 |                              |
diff --git a/nvbench/internal/help_strings.cuh b/nvbench/internal/help_strings.cuh
new file mode 100644
index 0000000..bd16092
--- /dev/null
+++ b/nvbench/internal/help_strings.cuh
@@ -0,0 +1,194 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <string>
+
+// TODO These should exactly match the relevant files in <nvbench_root>/docs.
+// Eventually we should generate this file at configure-time with CMake magic.
+
+namespace nvbench::internal
+{
+
+static const std::string help_text =
+  R"string_bounds(# Queries
+
+* `--list`, `-l`
+  * List all devices and benchmarks without running them.
+
+* `--help`, `-h`
+  * Print usage information and exit.
+
+* `--help-axes`, `--help-axis`
+  * Print axis specification documentation and exit.
+
+# Output
+
+* `--csv <filename/stream>`
+  * Write CSV output to a file, or "stdout" / "stderr".
+
+* `--markdown <filename/stream>`, `--md <filename/stream>`
+  * Write markdown output to a file, or "stdout" / "stderr".
+  * Markdown is written to "stdout" by default.
+
+* `--quiet`, `-q`
+  * Suppress output.
+
+* `--color`
+  * Use color in output (markdown + stdout only).
+
+# Benchmark / Axis Specification
+
+* `--benchmark <benchmark name/index>`, `-b <benchmark name/index>`
+  * Execute a specific benchmark.
+  * Argument is a benchmark name or index, taken from `--list`.
+  * If not specified, all benchmarks will run.
+  * `--benchmark` may be specified multiple times to run several benchmarks.
+  * The same benchmark may be specified multiple times with different
+    configurations.
+
+* `--axis <axis specification>`, `-a <axis specification>`
+  * Override an axis specification.
+  * See `--help-axis` for details on axis specifications.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+# Benchmark Properties
+
+* `--devices <device ids>`, `--device <device ids>`, `-d <device ids>`
+  * Limit execution to one or more devices.
+  * `<device ids>` is a single id, or a comma separated list.
+  * Device ids can be obtained from `--list`.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--min-samples <count>`
+  * Gather at least `<count>` samples per measurement.
+  * Default is 10 samples.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--min-time <seconds>`
+  * Accumulate at least `<seconds>` of execution time per measurement.
+  * Default is 0.5 seconds.
+  * If both GPU and CPU times are gathered, this applies to GPU time only.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--max-noise <value>`
+  * Gather samples until the error in the measurement drops below `<value>`.
+  * Noise is computed as the percent relative standard deviation.
+  * Default is 0.5%.
+  * Only applies to Cold measurements.
+  * If both GPU and CPU times are gathered, this applies to GPU noise only.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--skip-time <seconds>`
+  * Skip a measurement when a warmup run executes in less than `<seconds>`.
+  * Default is -1 seconds (disabled).
+  * Intended for testing / debugging only.
+  * Very fast kernels (<5us) often require an extremely large number of samples
+    to converge `max-noise`. This option allows them to be skipped to save time
+    during testing.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--timeout <seconds>`
+  * Measurements will timeout after `<seconds>` have elapsed.
+  * Default is 15 seconds.
+  * `<seconds>` is walltime, not accumulated sample time.
+  * If a measurement times out, the default markdown log will print a warning to
+    report any outstanding termination criteria (min samples, min time, max
+    noise).
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+)string_bounds";
+
+static const std::string help_axis_text =
+  R"string_bounds(# Axis Specification
+
+The `--axis <axis spec>` option redefines the values in a benchmark's axis. It
+applies to the benchmark created by the most recent `--benchmark` argument, or
+all benchmarks if it precedes all `--benchmark` arguments (if any).
+
+Valid axis specifications follow the form:
+
+* `<axis_name>=<value>`
+* `<axis_name>=[<value1>,<value2>,...]`
+* `<axis_name>=[<start>:<stop>]`
+* `<axis_name>=[<start>:<stop>:<stride>]`
+* `<axis_name>[<flags>]=<value>`
+* `<axis_name>[<flags>]=[<value1>,<value2>,...]`
+* `<axis_name>[<flags>]=[<start>:<stop>]`
+* `<axis_name>[<flags>]=[<start>:<stop>:<stride>]`
+
+Whitespace is ignored if the argument is quoted.
+
+The axis type is taken from the benchmark definition. Some axes have additional
+restrictions:
+
+* Numeric axes:
+  * A single value, explicit list of values, or strided range may be specified.
+  * For `int64` axes, the `power_of_two` flag is specified by adding `[pow2]`
+    after the axis name.
+  * Values may differ from those defined in the benchmark.
+* String axes:
+  * A single value or explicit list of values may be specified.
+  * Values may differ from those defined in the benchmark.
+* Type axes:
+  * A single value or explicit list of values may be specified.
+  * Values **MUST** be a subset of the types defined in the benchmark.
+  * Values **MUST** match the input strings provided by `--list` (e.g. `I32`
+    for `int`).
+    * Provide a `nvbench::type_strings<T>` specialization to modify a custom
+      type's input string.
+
+# Examples
+
+## Single Value
+
+| Axis Type | Example                 | Example Result   |
+|-----------|-------------------------|------------------|
+| Int64     | `-a InputSize=12345`    | 12345            |
+| Int64Pow2 | `-a InputSize[pow2]=8`  | 256              |
+| Float64   | `-a Quality=0.5`        | 0.5              |
+| String    | `-a RNG=Uniform`        | "Uniform"        |
+| Type      | `-a ValueType=I32`      | `int32_t`        |
+
+## Explicit List
+
+| Axis Type | Example                         | Example Result                 |
+|-----------|---------------------------------|--------------------------------|
+| Int64     | `-a InputSize=[1,2,3,4,5]`      | 1, 2, 3, 4, 5                  |
+| Int64Pow2 | `-a InputSize[pow2]=[4,6,8,10]` | 16, 64, 256, 1024              |
+| Float64   | `-a Quality=[0.5,0.75,1.0]`     | 0.5, 0.75, 1.0                 |
+| String    | `-a RNG=[Uniform,Gaussian]`     | "Uniform", "Gaussian"          |
+| Type      | `-a ValueType=[U8,I32,F64]`     | `uint8_t`, `int32_t`, `double` |
+
+## Strided Range
+
+| Axis Type | Example                         | Example Result               |
+|-----------|---------------------------------|------------------------------|
+| Int64     | `-a InputSize=[2:10:2]`         | 2, 4, 6, 8, 10               |
+| Int64Pow2 | `-a InputSize[pow2]=[2:10:2]`   | 4, 16, 64, 256, 1024         |
+| Float64   | `-a Quality=[.5:1:.1]`          | 0.5, 0.6, 0.7, 0.8, 0.9, 1.0 |
+| String    | [Not supported]                 |                              |
+| Type      | [Not supported]                 |                              |
+)string_bounds";
+
+} // namespace nvbench::internal
diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu
index 61a34d0..fbc6db2 100644
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -27,6 +27,8 @@
 
 #include
 
+#include <nvbench/internal/help_strings.cuh>
+
 #include
 #include
 
@@ -365,7 +367,17 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
 {
   const auto &arg = *first;
 
-  if (arg == "--list" || arg == "-l")
+  if (arg == "--help" || arg == "-h")
+  {
+    this->print_help();
+    std::exit(0);
+  }
+  else if (arg == "--help-axes" || arg == "--help-axis")
+  {
+    this->print_help_axis();
+    std::exit(0);
+  }
+  else if (arg == "--list" || arg == "-l")
   {
     this->print_list();
     std::exit(0);
@@ -497,6 +509,18 @@ void option_parser::print_list() const
printer.print_benchmark_list(bench_mgr.get_benchmarks()); } +void option_parser::print_help() const +{ + fmt::print("{}\n{}\n", + nvbench::internal::help_text, + nvbench::internal::help_axis_text); +} + +void option_parser::print_help_axis() const +{ + fmt::print("{}\n", nvbench::internal::help_axis_text); +} + void option_parser::add_benchmark(const std::string &name) try { diff --git a/nvbench/option_parser.cuh b/nvbench/option_parser.cuh index 7a14ecf..7bba637 100644 --- a/nvbench/option_parser.cuh +++ b/nvbench/option_parser.cuh @@ -84,6 +84,8 @@ private: std::ostream &printer_spec_to_ostream(const std::string &spec); void print_list() const; + void print_help() const; + void print_help_axis() const; void add_benchmark(const std::string &name); void replay_global_args();