From 65bc2c1e3fc02d95873ab9c47f87b0efd2feebbf Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Thu, 4 Mar 2021 18:40:23 -0500 Subject: [PATCH] Documentation overhaul. Revamp README, split into multiple files. Add docs on CLI. Add `--help` and `--help-axis`. --- README.md | 431 ++++-------------------------- docs/benchmarks.md | 383 ++++++++++++++++++++++++++ docs/cli_help.md | 93 +++++++ docs/cli_help_axis.md | 69 +++++ nvbench/internal/help_strings.cuh | 194 ++++++++++++++ nvbench/option_parser.cu | 26 +- nvbench/option_parser.cuh | 2 + 7 files changed, 815 insertions(+), 383 deletions(-) create mode 100644 docs/benchmarks.md create mode 100644 docs/cli_help.md create mode 100644 docs/cli_help_axis.md create mode 100644 nvbench/internal/help_strings.cuh diff --git a/README.md b/README.md index a124500..3f1db1b 100644 --- a/README.md +++ b/README.md @@ -2,30 +2,33 @@ This project is a work-in-progress. Everything is subject to change. -NVBench is a C++17 library designed to simplify CUDA kernel benchmarking. -It allows simultaneous parameter sweeps across multiple axes, including template -parameters. Various timings are reported, including "cold" execution time -(single run per timed region) and "batch" execution time -(launch multiple kernels within a single timed region). +NVBench is a C++17 library designed to simplify CUDA kernel benchmarking. It +features: -# Example Sandbox Project +* [Parameter sweeps](docs/benchmarks.md#parameter-axes): a powerful and + flexible "axis" system explores a kernel's configuration space. Parameters may + be dynamic numbers/strings or [static types](docs/benchmarks.md#type-axes). +* [Runtime customization](docs/cli_help.md): A rich command-line interface + allows [redefinition of parameter axes](docs/cli_help_axis.md), CUDA device + selection, changing output formats, and more. 
+* [Throughput calculations](docs/benchmarks.md#throughput-measurements): Compute + and report: + * Item throughput (elements/second) + * Global memory bandwidth usage (bytes/second and per-device %-of-peak-bw) +* Multiple output formats: Currently supports markdown (default) and CSV output. +* [Manual timer mode](docs/benchmarks.md#explicit-timer-mode-nvbenchexec_tagtimer): + (optional) Explicitly start/stop timing in a benchmark implementation. +* Multiple measurement types: + * Cold Measurements: + * Each sample runs the benchmark once with a clean device L2 cache. + * GPU and CPU times are reported. + * Batch Measurements: + * Executes the benchmark multiple times back-to-back and records total time. + * Reports the average execution time (total time / number of executions). -See [NVBench Demo](https://github.com/allisonvacanti/nvbench_demo) for a simple -CMake project that builds an example benchmark using NVBench. +# Getting Started -## Scope and Related Tools - -NVBench will measure the CPU and CUDA GPU execution time of a ***single -host-side critical region*** per benchmark. It is intended for regression -testing and parameter tuning of individual kernels. For in-depth analysis of -end-to-end performance of multiple applications, the NVIDIA Nsight tools are -more appropriate. - -NVBench is focused on evaluating the performance of CUDA kernels and is not -optimized for CPU microbenchmarks. This may change in the future, but for now, -consider using Google Benchmark for high resolution CPU benchmarks. - -# Minimal Benchmark +## Minimal Benchmark A basic kernel benchmark can be created with just a few lines of CUDA C++: @@ -38,373 +41,37 @@ void my_benchmark(nvbench::state& state) { NVBENCH_BENCH(my_benchmark); ``` -There are three main components in the definition of a benchmark: +See [Benchmarks](docs/benchmarks.md) for information on customizing benchmarks +and implementing parameter sweeps. 
-- A `KernelGenerator` callable (`my_benchmark` above) -- A `KernelLauncher` callable (the lambda passed to `nvbench::exec`), and -- A `BenchmarkDeclaration` using `NVBENCH_BENCH` or similar macros. +## Command Line Interface -The `KernelGenerator` is called with an `nvbench::state` object that provides -configuration information, as shown in later sections. The generator is -responsible for configuring and instantiating a `KernelLauncher`, which is -(unsurprisingly) responsible for launching a kernel. The launcher should contain -only the minimum amount of code necessary to start the CUDA kernel, -since `nvbench::exec` will execute it repeatedly to gather timing information. -An `nvbench::launch` object is provided to the launcher to specify kernel -execution details, such as the CUDA stream to use. `NVBENCH_BENCH` registers -the benchmark with NVBench and initializes various attributes, including its -name and parameter axes. +Each benchmark executable produced by NVBench provides a rich set of +command-line options for configuring benchmark execution at runtime. See the +[CLI overview](docs/cli_help.md) +and [CLI axis specification](docs/cli_help_axis.md) for more information. -# Benchmark Name +## Example Sandbox Project -By default, a benchmark is named by converting the first argument -of `NVBENCH_BENCH` into a string. +To get started with NVBench, consider trying out +the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo). This +repository contains a simple CMake project that uses NVBench to build an example +benchmark. It's a great way to experiment with the library without a lot of +investment. -This can be changed to something more descriptive if desired. -The `NVBENCH_BENCH` macro produces a customization object that allows such -attributes to be modified. +# License -```cpp -NVBENCH_BENCH(my_benchmark).set_name("my_kernel<<>>"); -``` +NVBench is released under the Apache 2.0 License with LLVM exceptions. +See [LICENSE](./LICENSE). 
-# Parameter Axes +# Scope and Related Projects -Some kernels will be used with a variety of options, input data types/sizes, and -other factors that impact performance. NVBench explores these different -scenarios by sweeping through a set of user-defined parameter axes. +NVBench will measure the CPU and CUDA GPU execution time of a ***single +host-side critical region*** per benchmark. It is intended for regression +testing and parameter tuning of individual kernels. For in-depth analysis of +end-to-end performance of multiple applications, the NVIDIA Nsight tools are +more appropriate. -A parameter axis defines a set of interesting values for a single kernel -parameter — for example, the size of the input, or the type of values being -processed. These parameter axes are used to customize a `KernelGenerator` with -static and runtime configurations. There are four supported types of parameters: -int64, float64, string, and type. - -## Int64 Axes - -A common example of a parameter axis is to vary the number of input values a -kernel should process during a benchmark measurement. An `int64_axis` is ideal -for this: - -```cpp -void benchmark(nvbench::state& state) -{ - const auto num_inputs = state.get_int64("NumInputs"); - thrust::device_vector data = generate_input(num_inputs); - - state.exec([&data](nvbench::launch& launch) { - my_kernel<<>>(data.begin(), data.end()); - }); -} -NVBENCH_BENCH(benchmark).add_int64_axis("NumInputs", {16, 64, 256, 1024, 4096}); -``` - -NVBench will run the `benchmark` kernel generator once for each specified value -in the "NumInputs" axis. The `state` object provides the current parameter value -to `benchmark`. - -### Int64 Power-Of-Two Axes - -Using powers-of-two is quite common for these sorts of axes. `int64_axis` has a -unique power-of-two mode that simplifies how such axes are defined and helps -provide more readable output. 
A power-of-two int64 axis is defined using the -integer exponents, but the benchmark will be run with the computed 2^N value. - -```cpp -// Equivalent to above, {16, 64, 256, 1024, 4096} = {2^4, 2^6, 2^8, 2^10, 2^12} -NVBENCH_BENCH(benchmark).add_int64_power_of_two_axis("NumInputs", - {4, 6, 8, 10, 12}); -// Or, as shown in a later section: -NVBENCH_BENCH(benchmark).add_int64_power_of_two_axis("NumInputs", - nvbench::range(4, 12, 2}); -``` - -## Float64 Axes - -For floating point numbers, a `float64_axis` is available: - -```cpp -void benchmark(nvbench::state& state) -{ - const auto quality = state.get_float64("Quality"); - - state.exec([&quality](nvbench::launch& launch) - { - my_kernel<<>>(quality); - }); -} -NVBENCH_BENCH(benchmark).add_float64_axis("Quality", {0.05, 0.1, 0.25, 0.5, 0.75, 1.}); -``` - -## String Axes - -For non-numeric data, an axis of arbitrary strings provides additional -flexibility: - -```cpp -void benchmark(nvbench::state& state) -{ - const auto rng_dist = state.get_string("RNG Distribution"); - thrust::device_vector data = generate_input(rng_dist); - - state.exec([&data](nvbench::launch& launch) - { - my_kernel<<>>(data.begin(), data.end()); - }); -} -NVBENCH_BENCH(benchmark).add_string_axis("RNG Distribution", {"Uniform", "Gaussian"}); -``` - -## Type Axes - -Another common situation involves benchmarking a templated kernel with multiple -compile-time configurations. NVBench strives to make such benchmarks as easy to -write as possible through the use of type axes. - -A `type_axis` is a list of types (`T1`, `T2`, `Ts`...) wrapped in -a `nvbench::type_list`. The kernel generator becomes a template -function and will be instantiated using types defined by the axis. The current -configuration's type is passed into the kernel generator using -a `nvbench::type_list`. 
- -```cpp -template -void my_benchmark(nvbench::state& state, nvbench::type_list) -{ - thrust::device_vector data = generate_input(); - - state.exec([&data](nvbench::launch& launch) - { - my_kernel<<>>(data.begin(), data.end()); - }); -} -using my_types = nvbench::type_list; -NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(my_types)) - .set_type_axis_names({"ValueType"}); -``` - -The `NVBENCH_TYPE_AXES` macro is unfortunately necessary to prevent commas in -the `type_list<...>` from breaking macro parsing. - -## `nvbench::range` - -Since parameter sweeps often explore a range of evenly-spaced numeric values, a -strided range can be generated using the `nvbench::range(start, end, stride=1)` -helper. - -```cpp -assert(nvbench::range(2, 5) == {2, 3, 4, 5}); -assert(nvbench::range(2.0, 5.0) == {2.0, 3.0, 4.0, 5.0}); -assert(nvbench::range(2, 12, 2) == {2, 4, 6, 8, 10, 12}); -assert(nvbench::range(2, 12, 5) == {2, 7, 12}); -assert(nvbench::range(2, 12, 6) == {2, 8}); -assert(nvbench::range(0.0, 10.0, 2.5) == { 0.0, 2.5, 5.0, 7.5, 10.0}); -``` - -Note that start and end are inclusive. This utility can be used to define axis -values for all numeric axes. - -## Multiple Parameter Axes - -If more than one axis is defined, the complete cartesian product of all axes -will be benchmarked. For example, consider a benchmark with two type axes, one -int64 axis, and one float64 axis: - -```cpp -// InputTypes: {char, int, unsigned int} -// OutputTypes: {float, double} -// NumInputs: {2^10, 2^20, 2^30} -// Quality: {0.5, 1.0} - -using input_types = nvbench::type_list; -using output_types = nvbench::type_list; -NVBENCH_BENCH_TYPES(benchmark, NVBENCH_TYPE_AXES(input_types, output_types)) - .set_type_axes_names({"InputType", "OutputType"}) - .add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 10)) - .add_float64_axis("Quality", {0.5, 1.0}); -``` - -This would generate a total of 36 configurations and instantiate the benchmark 6 -times. 
Keep the rapid growth of these combinations in mind when choosing the -number of values in an axis. See the section about combinatorial explosion for -more example and information. - -# Throughput Measurements - -In additional to raw timing information, NVBench can track a kernel's -throughput, reporting the amount of data processed as: - -- Number of items per second -- Number of bytes per second -- Percentage of device's peak memory bandwidth utilized - -To enable throughput measurements, the kernel generator can specify the number -of items and/or bytes handled in a single kernel execution using -the `nvbench::state` API. - -```cpp -state.add_element_count(size); -state.add_global_memory_reads(size); -state.add_global_memory_writes(size); -``` - -For meaningful results, specify the input element count, and include all reads -and writes to global memory. - -# Skip Uninteresting / Invalid Benchmarks - -Sometimes particular combinations of parameters aren't useful or interesting — -or for type axes, some configurations may not even compile. - -The `nvbench::state` object provides a `skip("Reason")` method that can be used -to avoid running these benchmarks. To skip uncompilable type axis -configurations, create an overload for the kernel generator that selects for the -invalid type combination: - -```cpp -template -void my_benchmark(nvbench::state& state, nvbench::type_list) -{ - // Skip benchmarks at runtime: - if (should_skip_this_config) - { - state.skip("Reason for skip."); - return; - } - - /* ... */ -}; - -// Skip benchmarks are compile time -- for example, always skip when T == U -// (Note that the `type_list` argument defines the same type twice). 
-template -void my_benchmark(nvbench::state& state, - nvbench::type_list) -{ - state.skip("T must not be the same type as U."); -} -using Ts = nvbench::type_list<...>; -using Us = nvbench::type_list<...>; -NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(Ts, Us)); -``` - -# Execution Tags For Special Cases - -By default, NVBench assumes that the entire execution time of the -`KernelLauncher` should be measured, and that no syncs are performed -(e.g. `cudaDeviceSynchronize`, `cudaStreamSynchronize`, `cudaEventSynchronize`, -etc. are not called). - -Execution tags may be passed to `state.exec` when these assumptions are not -true: - -- `nvbench::exec_tag::sync` tells NVBench that the kernel launcher will - synchronize internally, and -- `nvbench::exec_tag::timer` requests a timer object that can be used to - restrict the timed region. - -Multiple execution tags may be combined using `operator|`, e.g. - -```cpp -state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [](nvbench::launch &launch, auto& timer) { /*...*/ }); -``` - -The following sections provide more details on these features. - -## Benchmarks that sync: `nvbench::exec_tag::sync` - -If a `KernelLauncher` synchronizes the CUDA device internally without passing -this tag, **the benchmark will deadlock at runtime**. Passing the `sync` tag -will fix this issue. Note that this disables batch measurements. - -```cpp -void sync_example(nvbench::state& state) -{ - // Pass the `sync` exec tag to tell NVBench that this benchmark will sync: - state.exec(nvbench::exec_tag::sync, [](nvbench::launch& launch) { - /* Benchmark that implicitly syncs here. */ - }); -} -NVBENCH_BENCH(sync_example); -``` - -## Explicit timer mode: `nvbench::exec_tag::timer` - -For some kernels, the working data may need to be reset between launches. This -is particularly common for kernels that modify their input in-place. 
- -Resetting the input data to prepare for a new trial shouldn't be included in the -benchmark's execution time. NVBench provides a manual timer mode that allows the -kernel launcher to specify the critical section to be measured and exclude any -per-trial reset operations. - -To enable the manual timer mode, pass the tag object `nvbench::exec_tag::timer` -to `state.exec`, and declare the kernel launcher with an -additional `auto& timer` argument. - -Note that using manual timer mode disables batch measurements. - -```cpp -void timer_example(nvbench::state& state) -{ - // Pass the `timer` exec tag to request a timer: - state.exec(nvbench::exec_tag::timer, - // Lambda now accepts a timer: - [](nvbench::launch& launch, auto& timer) - { - /* Reset code here, excluded from timing */ - - /* Timed region is explicitly marked. - * The timer handles any synchronization, flushes, etc when/if - * needed for the current measurement. - */ - timer.start(); - /* Launch kernel on `launch.get_stream()` here */ - timer.stop(); - }); -} -NVBENCH_BENCH(timer_example); -``` - -# Beware: Combinatorial Explosion Is Lurking - -Be very careful of how quickly the configuration space can grow. The following -example generates 960 total runtime benchmark configurations, and will compile -192 different static parametrizations of the kernel generator. This is likely -excessive, especially for routine regression testing. 
- -```cpp -using value_types = nvbench::type_list; -using op_types = nvbench::type_list, - thrust::multiplies<>, - thrust::maximum<>>; - -NVBENCH_BENCH_TYPES(my_benchmark, - NVBENCH_TYPE_AXES(value_types, - value_types, - value_types, - op_types>)) - .set_type_axes_names({"T", "U", "V", "Op"}) - .add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 5)); -``` - -``` -960 total configs -= 4 [T=(U8, I32, F32, F64)] -* 4 [U=(U8, I32, F32, F64)] -* 4 [V=(U8, I32, F32, F64)] -* 3 [Op=(plus, multiplies, max)] -* 5 [NumInputs=(2^10, 2^15, 2^20, 2^25, 2^30)] -``` - -For large configuration spaces like this, pruning some of the less useful -combinations (e.g. `sizeof(init_type) < sizeof(output)`) using the techniques -described in the "Skip Uninteresting / Invalid Benchmarks" section can help -immensely with keeping compile / run times manageable. - -Splitting a single large configuration space into multiple, more focused -benchmarks with reduced dimensionality will likely be worth the effort as well. +NVBench is focused on evaluating the performance of CUDA kernels and is not +optimized for CPU microbenchmarks. This may change in the future, but for now, +consider using Google Benchmark for high resolution CPU benchmarks. diff --git a/docs/benchmarks.md b/docs/benchmarks.md new file mode 100644 index 0000000..0a85278 --- /dev/null +++ b/docs/benchmarks.md @@ -0,0 +1,383 @@ +# Minimal Benchmark + +A basic kernel benchmark can be created with just a few lines of CUDA C++: + +```cpp +void my_benchmark(nvbench::state& state) { + state.exec([](nvbench::launch& launch) { + my_kernel<<>>(); + }); +} +NVBENCH_BENCH(my_benchmark); +``` + +There are three main components in the definition of a benchmark: + +- A `KernelGenerator` callable (`my_benchmark` above) +- A `KernelLauncher` callable (the lambda passed to `nvbench::exec`), and +- A `BenchmarkDeclaration` using `NVBENCH_BENCH` or similar macros. 
+ +The `KernelGenerator` is called with an `nvbench::state` object that provides +configuration information, as shown in later sections. The generator is +responsible for configuring and instantiating a `KernelLauncher`, which is +(unsurprisingly) responsible for launching a kernel. The launcher should contain +only the minimum amount of code necessary to start the CUDA kernel, +since `nvbench::exec` will execute it repeatedly to gather timing information. +An `nvbench::launch` object is provided to the launcher to specify kernel +execution details, such as the CUDA stream to use. `NVBENCH_BENCH` registers +the benchmark with NVBench and initializes various attributes, including its +name and parameter axes. + +# Benchmark Name + +By default, a benchmark is named by converting the first argument +of `NVBENCH_BENCH` into a string. + +This can be changed to something more descriptive if desired. +The `NVBENCH_BENCH` macro produces a customization object that allows such +attributes to be modified. + +```cpp +NVBENCH_BENCH(my_benchmark).set_name("my_kernel<<>>"); +``` + +# Parameter Axes + +Some kernels will be used with a variety of options, input data types/sizes, and +other factors that impact performance. NVBench explores these different +scenarios by sweeping through a set of user-defined parameter axes. + +A parameter axis defines a set of interesting values for a single kernel +parameter — for example, the size of the input, or the type of values being +processed. These parameter axes are used to customize a `KernelGenerator` with +static and runtime configurations. There are four supported types of parameters: +int64, float64, string, and type. + +## Int64 Axes + +A common example of a parameter axis is to vary the number of input values a +kernel should process during a benchmark measurement. 
An `int64_axis` is ideal
+for this:
+
+```cpp
+void benchmark(nvbench::state& state)
+{
+  const auto num_inputs = state.get_int64("NumInputs");
+  thrust::device_vector data = generate_input(num_inputs);
+
+  state.exec([&data](nvbench::launch& launch) {
+    my_kernel<<>>(data.begin(), data.end());
+  });
+}
+NVBENCH_BENCH(benchmark).add_int64_axis("NumInputs", {16, 64, 256, 1024, 4096});
+```
+
+NVBench will run the `benchmark` kernel generator once for each specified value
+in the "NumInputs" axis. The `state` object provides the current parameter value
+to `benchmark`.
+
+### Int64 Power-Of-Two Axes
+
+Using powers-of-two is quite common for these sorts of axes. `int64_axis` has a
+unique power-of-two mode that simplifies how such axes are defined and helps
+provide more readable output. A power-of-two int64 axis is defined using the
+integer exponents, but the benchmark will be run with the computed 2^N value.
+
+```cpp
+// Equivalent to above, {16, 64, 256, 1024, 4096} = {2^4, 2^6, 2^8, 2^10, 2^12}
+NVBENCH_BENCH(benchmark).add_int64_power_of_two_axis("NumInputs",
+                                                     {4, 6, 8, 10, 12});
+// Or, as shown in a later section:
+NVBENCH_BENCH(benchmark).add_int64_power_of_two_axis("NumInputs",
+                                                     nvbench::range(4, 12, 2));
+```
+
+## Float64 Axes
+
+For floating point numbers, a `float64_axis` is available:
+
+```cpp
+void benchmark(nvbench::state& state)
+{
+  const auto quality = state.get_float64("Quality");
+
+  state.exec([&quality](nvbench::launch& launch)
+  {
+    my_kernel<<>>(quality);
+  });
+}
+NVBENCH_BENCH(benchmark).add_float64_axis("Quality", {0.05, 0.1, 0.25, 0.5, 0.75, 1.});
+```
+
+## String Axes
+
+For non-numeric data, an axis of arbitrary strings provides additional
+flexibility:
+
+```cpp
+void benchmark(nvbench::state& state)
+{
+  const auto rng_dist = state.get_string("RNG Distribution");
+  thrust::device_vector data = generate_input(rng_dist);
+
+  state.exec([&data](nvbench::launch& launch)
+  {
+    my_kernel<<>>(data.begin(), data.end());
+  });
+}
+NVBENCH_BENCH(benchmark).add_string_axis("RNG Distribution", {"Uniform", "Gaussian"}); +``` + +## Type Axes + +Another common situation involves benchmarking a templated kernel with multiple +compile-time configurations. NVBench strives to make such benchmarks as easy to +write as possible through the use of type axes. + +A `type_axis` is a list of types (`T1`, `T2`, `Ts`...) wrapped in +a `nvbench::type_list`. The kernel generator becomes a template +function and will be instantiated using types defined by the axis. The current +configuration's type is passed into the kernel generator using +a `nvbench::type_list`. + +```cpp +template +void my_benchmark(nvbench::state& state, nvbench::type_list) +{ + thrust::device_vector data = generate_input(); + + state.exec([&data](nvbench::launch& launch) + { + my_kernel<<>>(data.begin(), data.end()); + }); +} +using my_types = nvbench::type_list; +NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(my_types)) + .set_type_axis_names({"ValueType"}); +``` + +The `NVBENCH_TYPE_AXES` macro is unfortunately necessary to prevent commas in +the `type_list<...>` from breaking macro parsing. + +## `nvbench::range` + +Since parameter sweeps often explore a range of evenly-spaced numeric values, a +strided range can be generated using the `nvbench::range(start, end, stride=1)` +helper. + +```cpp +assert(nvbench::range(2, 5) == {2, 3, 4, 5}); +assert(nvbench::range(2.0, 5.0) == {2.0, 3.0, 4.0, 5.0}); +assert(nvbench::range(2, 12, 2) == {2, 4, 6, 8, 10, 12}); +assert(nvbench::range(2, 12, 5) == {2, 7, 12}); +assert(nvbench::range(2, 12, 6) == {2, 8}); +assert(nvbench::range(0.0, 10.0, 2.5) == { 0.0, 2.5, 5.0, 7.5, 10.0}); +``` + +Note that start and end are inclusive. This utility can be used to define axis +values for all numeric axes. + +## Multiple Parameter Axes + +If more than one axis is defined, the complete cartesian product of all axes +will be benchmarked. 
For example, consider a benchmark with two type axes, one
+int64 axis, and one float64 axis:
+
+```cpp
+// InputTypes: {char, int, unsigned int}
+// OutputTypes: {float, double}
+// NumInputs: {2^10, 2^20, 2^30}
+// Quality: {0.5, 1.0}
+
+using input_types = nvbench::type_list;
+using output_types = nvbench::type_list;
+NVBENCH_BENCH_TYPES(benchmark, NVBENCH_TYPE_AXES(input_types, output_types))
+  .set_type_axes_names({"InputType", "OutputType"})
+  .add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 10))
+  .add_float64_axis("Quality", {0.5, 1.0});
+```
+
+This would generate a total of 36 configurations and instantiate the benchmark 6
+times. Keep the rapid growth of these combinations in mind when choosing the
+number of values in an axis. See the section about combinatorial explosion for
+more examples and information.
+
+# Throughput Measurements
+
+In addition to raw timing information, NVBench can track a kernel's
+throughput, reporting the amount of data processed as:
+
+- Number of items per second
+- Number of bytes per second
+- Percentage of device's peak memory bandwidth utilized
+
+To enable throughput measurements, the kernel generator can specify the number
+of items and/or bytes handled in a single kernel execution using
+the `nvbench::state` API.
+
+```cpp
+state.add_element_count(size);
+state.add_global_memory_reads(size);
+state.add_global_memory_writes(size);
+```
+
+For meaningful results, specify the input element count, and include all reads
+and writes to global memory.
+
+# Skip Uninteresting / Invalid Benchmarks
+
+Sometimes particular combinations of parameters aren't useful or interesting —
+or for type axes, some configurations may not even compile.
+
+The `nvbench::state` object provides a `skip("Reason")` method that can be used
+to avoid running these benchmarks.
To skip uncompilable type axis
+configurations, create an overload for the kernel generator that selects for the
+invalid type combination:
+
+```cpp
+template 
+void my_benchmark(nvbench::state& state, nvbench::type_list)
+{
+  // Skip benchmarks at runtime:
+  if (should_skip_this_config)
+  {
+    state.skip("Reason for skip.");
+    return;
+  }
+
+  /* ... */
+};
+
+// Skip benchmarks at compile time -- for example, always skip when T == U
+// (Note that the `type_list` argument defines the same type twice).
+template 
+void my_benchmark(nvbench::state& state,
+                  nvbench::type_list)
+{
+  state.skip("T must not be the same type as U.");
+}
+using Ts = nvbench::type_list<...>;
+using Us = nvbench::type_list<...>;
+NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(Ts, Us));
+```
+
+# Execution Tags For Special Cases
+
+By default, NVBench assumes that the entire execution time of the
+`KernelLauncher` should be measured, and that no syncs are performed
+(e.g. `cudaDeviceSynchronize`, `cudaStreamSynchronize`, `cudaEventSynchronize`,
+etc. are not called).
+
+Execution tags may be passed to `state.exec` when these assumptions are not
+true:
+
+- `nvbench::exec_tag::sync` tells NVBench that the kernel launcher will
+  synchronize internally, and
+- `nvbench::exec_tag::timer` requests a timer object that can be used to
+  restrict the timed region.
+
+Multiple execution tags may be combined using `operator|`, e.g.
+
+```cpp
+state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
+           [](nvbench::launch &launch, auto& timer) { /*...*/ });
+```
+
+The following sections provide more details on these features.
+
+## Benchmarks that sync: `nvbench::exec_tag::sync`
+
+If a `KernelLauncher` synchronizes the CUDA device internally without passing
+this tag, **the benchmark will deadlock at runtime**. Passing the `sync` tag
+will fix this issue. Note that this disables batch measurements.
+ +```cpp +void sync_example(nvbench::state& state) +{ + // Pass the `sync` exec tag to tell NVBench that this benchmark will sync: + state.exec(nvbench::exec_tag::sync, [](nvbench::launch& launch) { + /* Benchmark that implicitly syncs here. */ + }); +} +NVBENCH_BENCH(sync_example); +``` + +## Explicit timer mode: `nvbench::exec_tag::timer` + +For some kernels, the working data may need to be reset between launches. This +is particularly common for kernels that modify their input in-place. + +Resetting the input data to prepare for a new trial shouldn't be included in the +benchmark's execution time. NVBench provides a manual timer mode that allows the +kernel launcher to specify the critical section to be measured and exclude any +per-trial reset operations. + +To enable the manual timer mode, pass the tag object `nvbench::exec_tag::timer` +to `state.exec`, and declare the kernel launcher with an +additional `auto& timer` argument. + +Note that using manual timer mode disables batch measurements. + +```cpp +void timer_example(nvbench::state& state) +{ + // Pass the `timer` exec tag to request a timer: + state.exec(nvbench::exec_tag::timer, + // Lambda now accepts a timer: + [](nvbench::launch& launch, auto& timer) + { + /* Reset code here, excluded from timing */ + + /* Timed region is explicitly marked. + * The timer handles any synchronization, flushes, etc when/if + * needed for the current measurement. + */ + timer.start(); + /* Launch kernel on `launch.get_stream()` here */ + timer.stop(); + }); +} +NVBENCH_BENCH(timer_example); +``` + +# Beware: Combinatorial Explosion Is Lurking + +Be very careful of how quickly the configuration space can grow. The following +example generates 960 total runtime benchmark configurations, and will compile +192 different static parametrizations of the kernel generator. This is likely +excessive, especially for routine regression testing. 
+
+```cpp
+using value_types = nvbench::type_list;
+using op_types = nvbench::type_list,
+                                    thrust::multiplies<>,
+                                    thrust::maximum<>>;
+
+NVBENCH_BENCH_TYPES(my_benchmark,
+                    NVBENCH_TYPE_AXES(value_types,
+                                      value_types,
+                                      value_types,
+                                      op_types))
+  .set_type_axes_names({"T", "U", "V", "Op"})
+  .add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 5));
+```
+
+```
+960 total configs
+= 4 [T=(U8, I32, F32, F64)]
+* 4 [U=(U8, I32, F32, F64)]
+* 4 [V=(U8, I32, F32, F64)]
+* 3 [Op=(plus, multiplies, max)]
+* 5 [NumInputs=(2^10, 2^15, 2^20, 2^25, 2^30)]
+```
+
+For large configuration spaces like this, pruning some of the less useful
+combinations (e.g. `sizeof(init_type) < sizeof(output)`) using the techniques
+described in the "Skip Uninteresting / Invalid Benchmarks" section can help
+immensely with keeping compile / run times manageable.
+
+Splitting a single large configuration space into multiple, more focused
+benchmarks with reduced dimensionality will likely be worth the effort as well.
diff --git a/docs/cli_help.md b/docs/cli_help.md
new file mode 100644
index 0000000..02e3e25
--- /dev/null
+++ b/docs/cli_help.md
@@ -0,0 +1,93 @@
+# Queries
+
+* `--list`, `-l`
+  * List all devices and benchmarks without running them.
+
+* `--help`, `-h`
+  * Print usage information and exit.
+
+* `--help-axes`, `--help-axis`
+  * Print axis specification documentation and exit.
+
+# Output
+
+* `--csv `
+  * Write CSV output to a file, or "stdout" / "stderr".
+
+* `--markdown `, `--md `
+  * Write markdown output to a file, or "stdout" / "stderr".
+  * Markdown is written to "stdout" by default.
+
+* `--quiet`, `-q`
+  * Suppress output.
+
+* `--color`
+  * Use color in output (markdown + stdout only).
+
+# Benchmark / Axis Specification
+
+* `--benchmark `, `-b `
+  * Execute a specific benchmark.
+  * Argument is a benchmark name or index, taken from `--list`.
+  * If not specified, all benchmarks will run.
+  * `--benchmark` may be specified multiple times to run several benchmarks.
+  * The same benchmark may be specified multiple times with different
+    configurations.
+
+* `--axis <axis specification>`, `-a <axis specification>`
+  * Override an axis specification.
+  * See `--help-axis`
+    for [details on axis specifications](./cli_help_axis.md).
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+# Benchmark Properties
+
+* `--devices <device ids>`, `--device <device ids>`, `-d <device ids>`
+  * Limit execution to one or more devices.
+  * `<device ids>` is a single id, or a comma separated list.
+  * Device ids can be obtained from `--list`.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--min-samples <count>`
+  * Gather at least `<count>` samples per measurement.
+  * Default is 10 samples.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--min-time <seconds>`
+  * Accumulate at least `<seconds>` of execution time per measurement.
+  * Default is 0.5 seconds.
+  * If both GPU and CPU times are gathered, this applies to GPU time only.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--max-noise <value>`
+  * Gather samples until the error in the measurement drops below `<value>`.
+  * Noise is computed as the percent relative standard deviation.
+  * Default is 0.5%.
+  * Only applies to Cold measurements.
+  * If both GPU and CPU times are gathered, this applies to GPU noise only.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--skip-time <seconds>`
+  * Skip a measurement when a warmup run executes in less than `<seconds>`.
+  * Default is -1 seconds (disabled).
+  * Intended for testing / debugging only.
+  * Very fast kernels (<5us) often require an extremely large number of samples
+    to converge `max-noise`. This option allows them to be skipped to save time
+    during testing.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--timeout <seconds>`
+  * Measurements will timeout after `<seconds>` have elapsed.
+  * Default is 15 seconds.
+  * `<seconds>` is walltime, not accumulated sample time.
+  * If a measurement times out, the default markdown log will print a warning to
+    report any outstanding termination criteria (min samples, min time, max
+    noise).
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
diff --git a/docs/cli_help_axis.md b/docs/cli_help_axis.md
new file mode 100644
index 0000000..a2da259
--- /dev/null
+++ b/docs/cli_help_axis.md
@@ -0,0 +1,69 @@
+# Axis Specification
+
+The `--axis <axis spec>` option redefines the values in a benchmark's axis. It
+applies to the benchmark created by the most recent `--benchmark` argument, or
+all benchmarks if it precedes all `--benchmark` arguments (if any).
+
+Valid axis specifications follow the form:
+
+* `<axis_name>=<value>`
+* `<axis_name>=[<value1>,<value2>,...]`
+* `<axis_name>=[<start>:<stop>]`
+* `<axis_name>=[<start>:<stop>:<stride>]`
+* `<axis_name>[<flags>]=<value>`
+* `<axis_name>[<flags>]=[<value1>,<value2>,...]`
+* `<axis_name>[<flags>]=[<start>:<stop>]`
+* `<axis_name>[<flags>]=[<start>:<stop>:<stride>]`
+
+Whitespace is ignored if the argument is quoted.
+
+The axis type is taken from the benchmark definition. Some axes have additional
+restrictions:
+
+* Numeric axes:
+  * A single value, explicit list of values, or strided range may be specified.
+  * For `int64` axes, the `power_of_two` flag is specified by adding `[pow2]`
+    after the axis name.
+  * Values may differ from those defined in the benchmark.
+* String axes:
+  * A single value or explicit list of values may be specified.
+  * Values may differ from those defined in the benchmark.
+* Type axes:
+  * A single value or explicit list of values may be specified.
+  * Values **MUST** be a subset of the types defined in the benchmark.
+  * Values **MUST** match the input strings provided by `--list` (e.g. `I32`
+    for `int`).
+    * Provide a `nvbench::type_strings<T>` specialization to modify a custom
+      type's input string.
+
+# Examples
+
+## Single Value
+
+| Axis Type | Example                 | Example Result   |
+|-----------|-------------------------|------------------|
+| Int64     | `-a InputSize=12345`    | 12345            |
+| Int64Pow2 | `-a InputSize[pow2]=8`  | 256              |
+| Float64   | `-a Quality=0.5`        | 0.5              |
+| String    | `-a RNG=Uniform`        | "Uniform"        |
+| Type      | `-a ValueType=I32`      | `int32_t`        |
+
+## Explicit List
+
+| Axis Type | Example                         | Example Result                 |
+|-----------|---------------------------------|--------------------------------|
+| Int64     | `-a InputSize=[1,2,3,4,5]`      | 1, 2, 3, 4, 5                  |
+| Int64Pow2 | `-a InputSize[pow2]=[4,6,8,10]` | 16, 64, 256, 1024              |
+| Float64   | `-a Quality=[0.5,0.75,1.0]`     | 0.5, 0.75, 1.0                 |
+| String    | `-a RNG=[Uniform,Gaussian]`     | "Uniform", "Gaussian"          |
+| Type      | `-a ValueType=[U8,I32,F64]`     | `uint8_t`, `int32_t`, `double` |
+
+## Strided Range
+
+| Axis Type | Example                         | Example Result               |
+|-----------|---------------------------------|------------------------------|
+| Int64     | `-a InputSize=[2:10:2]`         | 2, 4, 6, 8, 10               |
+| Int64Pow2 | `-a InputSize[pow2]=[2:10:2]`   | 4, 16, 64, 256, 1024         |
+| Float64   | `-a Quality=[.5:1:.1]`          | 0.5, 0.6, 0.7, 0.8, 0.9, 1.0 |
+| String    | [Not supported]                 |                              |
+| Type      | [Not supported]                 |                              |
diff --git a/nvbench/internal/help_strings.cuh b/nvbench/internal/help_strings.cuh
new file mode 100644
index 0000000..bd16092
--- /dev/null
+++ b/nvbench/internal/help_strings.cuh
@@ -0,0 +1,194 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <string>
+
+// TODO These should exactly match the relevant files in <nvbench_root>/docs.
+// Eventually we should generate this file at configure-time with CMake magic.
+
+namespace nvbench::internal
+{
+
+static const std::string help_text =
+  R"string_bounds(# Queries
+
+* `--list`, `-l`
+  * List all devices and benchmarks without running them.
+
+* `--help`, `-h`
+  * Print usage information and exit.
+
+* `--help-axes`, `--help-axis`
+  * Print axis specification documentation and exit.
+
+# Output
+
+* `--csv <filename/stream>`
+  * Write CSV output to a file, or "stdout" / "stderr".
+
+* `--markdown <filename/stream>`, `--md <filename/stream>`
+  * Write markdown output to a file, or "stdout" / "stderr".
+  * Markdown is written to "stdout" by default.
+
+* `--quiet`, `-q`
+  * Suppress output.
+
+* `--color`
+  * Use color in output (markdown + stdout only).
+
+# Benchmark / Axis Specification
+
+* `--benchmark <benchmark name/index>`, `-b <benchmark name/index>`
+  * Execute a specific benchmark.
+  * Argument is a benchmark name or index, taken from `--list`.
+  * If not specified, all benchmarks will run.
+  * `--benchmark` may be specified multiple times to run several benchmarks.
+  * The same benchmark may be specified multiple times with different
+    configurations.
+
+* `--axis <axis specification>`, `-a <axis specification>`
+  * Override an axis specification.
+  * See `--help-axis` for details on axis specifications.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+# Benchmark Properties
+
+* `--devices <device ids>`, `--device <device ids>`, `-d <device ids>`
+  * Limit execution to one or more devices.
+  * `<device ids>` is a single id, or a comma separated list.
+  * Device ids can be obtained from `--list`.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--min-samples <count>`
+  * Gather at least `<count>` samples per measurement.
+  * Default is 10 samples.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--min-time <seconds>`
+  * Accumulate at least `<seconds>` of execution time per measurement.
+  * Default is 0.5 seconds.
+  * If both GPU and CPU times are gathered, this applies to GPU time only.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--max-noise <value>`
+  * Gather samples until the error in the measurement drops below `<value>`.
+  * Noise is computed as the percent relative standard deviation.
+  * Default is 0.5%.
+  * Only applies to Cold measurements.
+  * If both GPU and CPU times are gathered, this applies to GPU noise only.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--skip-time <seconds>`
+  * Skip a measurement when a warmup run executes in less than `<seconds>`.
+  * Default is -1 seconds (disabled).
+  * Intended for testing / debugging only.
+  * Very fast kernels (<5us) often require an extremely large number of samples
+    to converge `max-noise`. This option allows them to be skipped to save time
+    during testing.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--timeout <seconds>`
+  * Measurements will timeout after `<seconds>` have elapsed.
+  * Default is 15 seconds.
+  * `<seconds>` is walltime, not accumulated sample time.
+  * If a measurement times out, the default markdown log will print a warning to
+    report any outstanding termination criteria (min samples, min time, max
+    noise).
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+)string_bounds";
+
+static const std::string help_axis_text =
+  R"string_bounds(# Axis Specification
+
+The `--axis <axis spec>` option redefines the values in a benchmark's axis. It
+applies to the benchmark created by the most recent `--benchmark` argument, or
+all benchmarks if it precedes all `--benchmark` arguments (if any).
+
+Valid axis specifications follow the form:
+
+* `<axis_name>=<value>`
+* `<axis_name>=[<value1>,<value2>,...]`
+* `<axis_name>=[<start>:<stop>]`
+* `<axis_name>=[<start>:<stop>:<stride>]`
+* `<axis_name>[<flags>]=<value>`
+* `<axis_name>[<flags>]=[<value1>,<value2>,...]`
+* `<axis_name>[<flags>]=[<start>:<stop>]`
+* `<axis_name>[<flags>]=[<start>:<stop>:<stride>]`
+
+Whitespace is ignored if the argument is quoted.
+
+The axis type is taken from the benchmark definition. Some axes have additional
+restrictions:
+
+* Numeric axes:
+  * A single value, explicit list of values, or strided range may be specified.
+  * For `int64` axes, the `power_of_two` flag is specified by adding `[pow2]`
+    after the axis name.
+  * Values may differ from those defined in the benchmark.
+* String axes:
+  * A single value or explicit list of values may be specified.
+  * Values may differ from those defined in the benchmark.
+* Type axes:
+  * A single value or explicit list of values may be specified.
+  * Values **MUST** be a subset of the types defined in the benchmark.
+  * Values **MUST** match the input strings provided by `--list` (e.g. `I32`
+    for `int`).
+    * Provide a `nvbench::type_strings<T>` specialization to modify a custom
+      type's input string.
+
+# Examples
+
+## Single Value
+
+| Axis Type | Example                 | Example Result   |
+|-----------|-------------------------|------------------|
+| Int64     | `-a InputSize=12345`    | 12345            |
+| Int64Pow2 | `-a InputSize[pow2]=8`  | 256              |
+| Float64   | `-a Quality=0.5`        | 0.5              |
+| String    | `-a RNG=Uniform`        | "Uniform"        |
+| Type      | `-a ValueType=I32`      | `int32_t`        |
+
+## Explicit List
+
+| Axis Type | Example                         | Example Result                 |
+|-----------|---------------------------------|--------------------------------|
+| Int64     | `-a InputSize=[1,2,3,4,5]`      | 1, 2, 3, 4, 5                  |
+| Int64Pow2 | `-a InputSize[pow2]=[4,6,8,10]` | 16, 64, 256, 1024              |
+| Float64   | `-a Quality=[0.5,0.75,1.0]`     | 0.5, 0.75, 1.0                 |
+| String    | `-a RNG=[Uniform,Gaussian]`     | "Uniform", "Gaussian"          |
+| Type      | `-a ValueType=[U8,I32,F64]`     | `uint8_t`, `int32_t`, `double` |
+
+## Strided Range
+
+| Axis Type | Example                         | Example Result               |
+|-----------|---------------------------------|------------------------------|
+| Int64     | `-a InputSize=[2:10:2]`         | 2, 4, 6, 8, 10               |
+| Int64Pow2 | `-a InputSize[pow2]=[2:10:2]`   | 4, 16, 64, 256, 1024         |
+| Float64   | `-a Quality=[.5:1:.1]`          | 0.5, 0.6, 0.7, 0.8, 0.9, 1.0 |
+| String    | [Not supported]                 |                              |
+| Type      | [Not supported]                 |                              |
+)string_bounds";
+
+} // namespace nvbench::internal
diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu
index 61a34d0..fbc6db2 100644
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -27,6 +27,8 @@
 
 #include
 
+#include <nvbench/internal/help_strings.cuh>
+
 #include
 #include
 
@@ -365,7 +367,17 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
 {
   const auto &arg = *first;
 
-  if (arg == "--list" || arg == "-l")
+  if (arg == "--help" || arg == "-h")
+  {
+    this->print_help();
+    std::exit(0);
+  }
+  else if (arg == "--help-axes" || arg == "--help-axis")
+  {
+    this->print_help_axis();
+    std::exit(0);
+  }
+  else if (arg == "--list" || arg == "-l")
   {
     this->print_list();
     std::exit(0);
@@ -497,6 +509,18 @@ void option_parser::print_list() const
printer.print_benchmark_list(bench_mgr.get_benchmarks()); } +void option_parser::print_help() const +{ + fmt::print("{}\n{}\n", + nvbench::internal::help_text, + nvbench::internal::help_axis_text); +} + +void option_parser::print_help_axis() const +{ + fmt::print("{}\n", nvbench::internal::help_axis_text); +} + void option_parser::add_benchmark(const std::string &name) try { diff --git a/nvbench/option_parser.cuh b/nvbench/option_parser.cuh index 7a14ecf..7bba637 100644 --- a/nvbench/option_parser.cuh +++ b/nvbench/option_parser.cuh @@ -84,6 +84,8 @@ private: std::ostream &printer_spec_to_ostream(const std::string &spec); void print_list() const; + void print_help() const; + void print_help_axis() const; void add_benchmark(const std::string &name); void replay_global_args();