// nvbench/nvbench/benchmark_base.cuh
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nvbench/axes_metadata.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/state.cuh>
#include <nvbench/stopping_criterion.cuh>

#include <functional> // reference_wrapper, ref
#include <memory>
#include <optional>
#include <string>
#include <utility> // move
#include <vector>
namespace nvbench
{
struct printer_base;
struct runner_base;
template <typename BenchmarkType>
struct runner;
/**
* Holds runtime benchmark information and provides the public customization
* API for the `NVBENCH_BENCH` macros.
*
* Delegates responsibility to the following classes:
* - nvbench::axes_metadata: Axis specifications.
*/
struct benchmark_base
{
  /// Shorthand for an optional, rebindable reference.
  template <typename T>
  using optional_ref = std::optional<std::reference_wrapper<T>>;

  /// Construct from the benchmark's type-axis specification.
  ///
  /// `type_axes` is taken by value and moved into `m_axes` to avoid an
  /// extra copy of the axes metadata.
  template <typename TypeAxes>
  explicit benchmark_base(TypeAxes type_axes)
      : m_axes(std::move(type_axes))
  {
    // Install the library-wide default criterion so a freshly constructed
    // benchmark is always runnable without explicit configuration.
    this->set_stopping_criterion(nvbench::detail::default_stopping_criterion());
  }

  virtual ~benchmark_base();

  /**
   * Returns a pointer to a new instance of the concrete benchmark<...>
   * subclass.
   *
   * The result will have the same name and axes as the source benchmark.
   * The `get_states()` vector of the result will always be empty.
   */
  [[nodiscard]] std::unique_ptr<benchmark_base> clone() const;

  /// Set/get the benchmark's display name. @{
  benchmark_base &set_name(std::string name)
  {
    m_name = std::move(name);
    return *this;
  }
  [[nodiscard]] const std::string &get_name() const { return m_name; }
  /// @}

  /// Rename the type axes; delegated to the typed subclass via
  /// `do_set_type_axes_names`, which knows the axis count.
  benchmark_base &set_type_axes_names(std::vector<std::string> names)
  {
    this->do_set_type_axes_names(std::move(names));
    return *this;
  }

  /// Add a floating-point value axis.
  benchmark_base &add_float64_axis(std::string name, std::vector<nvbench::float64_t> data)
  {
    m_axes.add_float64_axis(std::move(name), std::move(data));
    return *this;
  }

  /// Add an integer value axis; `flags` adjusts how `data` is interpreted.
  benchmark_base &add_int64_axis(std::string name,
                                 std::vector<nvbench::int64_t> data,
                                 nvbench::int64_axis_flags flags = nvbench::int64_axis_flags::none)
  {
    m_axes.add_int64_axis(std::move(name), std::move(data), flags);
    return *this;
  }

  /// Convenience wrapper for `add_int64_axis` with the
  /// `int64_axis_flags::power_of_two` flag.
  benchmark_base &add_int64_power_of_two_axis(std::string name, std::vector<nvbench::int64_t> data)
  {
    return this->add_int64_axis(std::move(name),
                                std::move(data),
                                nvbench::int64_axis_flags::power_of_two);
  }

  /// Add a string value axis.
  benchmark_base &add_string_axis(std::string name, std::vector<std::string> data)
  {
    m_axes.add_string_axis(std::move(name), std::move(data));
    return *this;
  }

  /// Select the devices this benchmark runs on. The id-based overloads are
  /// defined out-of-line (they need device lookup). @{
  benchmark_base &set_devices(std::vector<int> device_ids);
  benchmark_base &set_devices(std::vector<nvbench::device_info> devices)
  {
    m_devices = std::move(devices);
    return *this;
  }
  benchmark_base &clear_devices()
  {
    m_devices.clear();
    return *this;
  }
  benchmark_base &add_device(int device_id);
  benchmark_base &add_device(nvbench::device_info device)
  {
    m_devices.push_back(std::move(device));
    return *this;
  }
  [[nodiscard]] const std::vector<nvbench::device_info> &get_devices() const { return m_devices; }
  /// @}

  /// Access the axis specifications. @{
  [[nodiscard]] nvbench::axes_metadata &get_axes() { return m_axes; }
  [[nodiscard]] const nvbench::axes_metadata &get_axes() const { return m_axes; }
  /// @}

  // Computes the number of configs in the benchmark.
  // Unlike get_states().size(), this method may be used prior to calling run().
  [[nodiscard]] std::size_t get_config_count() const;

  // Is empty until run() is called.
  [[nodiscard]] const std::vector<nvbench::state> &get_states() const { return m_states; }
  [[nodiscard]] std::vector<nvbench::state> &get_states() { return m_states; }

  /// Execute the benchmark; dispatches to the typed subclass.
  void run() { this->do_run(); }
  /// Like `run()`, but threads the shared `skip_remaining` flag through a
  /// sequence of benchmarks (semantics defined by `do_run_or_skip`).
  void run_or_skip(bool &skip_remaining) { this->do_run_or_skip(skip_remaining); }

  /// Attach/detach/query the output printer. The benchmark does not own the
  /// printer; the caller must keep it alive while attached. @{
  void set_printer(nvbench::printer_base &printer) { m_printer = std::ref(printer); }
  void clear_printer() { m_printer = std::nullopt; }
  [[nodiscard]] optional_ref<nvbench::printer_base> get_printer() const { return m_printer; }
  /// @}

  /// Execute at least this many trials per measurement. Default is 10. @{
  [[nodiscard]] nvbench::int64_t get_min_samples() const { return m_min_samples; }
  benchmark_base &set_min_samples(nvbench::int64_t min_samples)
  {
    m_min_samples = min_samples;
    return *this;
  }
  /// @}

  /// If true, the benchmark measurements only record CPU time and assume no GPU work is performed.
  /// @{
  [[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; }
  benchmark_base &set_is_cpu_only(bool is_cpu_only)
  {
    m_is_cpu_only = is_cpu_only;
    return *this;
  }
  /// @}

  /// If true, the benchmark is only run once, skipping all warmup runs and only
  /// executing a single non-batched measurement. This is intended for use with
  /// external profiling tools. @{
  [[nodiscard]] bool get_run_once() const { return m_run_once; }
  benchmark_base &set_run_once(bool v)
  {
    m_run_once = v;
    return *this;
  }
  /// @}

  /// If true, the batched measurements for benchmark are not run. This is intended for use to
  /// save resources when only non-batched measurements are of interest, although batched
  /// measurements are meaningful and code to exercise them is compiled. This option has no
  /// effect for CPU only benchmarks and for benchmarks tagged with no_batch tag. @{
  [[nodiscard]] bool get_skip_batched() const { return m_skip_batched; }
  benchmark_base &set_skip_batched(bool v)
  {
    m_skip_batched = v;
    return *this;
  }
  /// @}

  /// If true, the benchmark does not use the blocking_kernel. This is intended
  /// for use with external profiling tools. @{
  [[nodiscard]] bool get_disable_blocking_kernel() const { return m_disable_blocking_kernel; }
  benchmark_base &set_disable_blocking_kernel(bool v)
  {
    m_disable_blocking_kernel = v;
    return *this;
  }
  /// @}

  /// If a warmup run finishes in less than `skip_time`, the measurement will
  /// be skipped.
  /// Extremely fast kernels (< 5000 ns) often timeout before they can
  /// accumulate `min_time` measurements, and are often uninteresting. Setting
  /// this value can help improve performance by skipping time consuming
  /// measurement that don't provide much information.
  /// Default value is -1., which disables the feature.
  /// @{
  [[nodiscard]] nvbench::float64_t get_skip_time() const { return m_skip_time; }
  benchmark_base &set_skip_time(nvbench::float64_t skip_time)
  {
    m_skip_time = skip_time;
    return *this;
  }
  /// @}

  /// If a measurement take more than `timeout` seconds to complete, stop the
  /// measurement early. A warning should be printed if this happens.
  /// This setting overrides all other termination criteria.
  /// Note that this is measured in CPU walltime, not sample time.
  /// Default is 15 seconds.
  /// @{
  [[nodiscard]] nvbench::float64_t get_timeout() const { return m_timeout; }
  benchmark_base &set_timeout(nvbench::float64_t timeout)
  {
    m_timeout = timeout;
    return *this;
  }
  /// @}

  /// Throttle detection threshold, as a fraction of the device's default SM
  /// clock rate. Default is 0.75. @{
  [[nodiscard]] nvbench::float32_t get_throttle_threshold() const { return m_throttle_threshold; }
  benchmark_base &set_throttle_threshold(nvbench::float32_t throttle_threshold)
  {
    m_throttle_threshold = throttle_threshold;
    return *this;
  }
  /// @}

  /// Delay, in seconds, used when recovering from a throttled state.
  /// Default is 0.05 s. @{
  [[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const
  {
    return m_throttle_recovery_delay;
  }
  benchmark_base &set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay)
  {
    m_throttle_recovery_delay = throttle_recovery_delay;
    return *this;
  }
  /// @}

  /// Control the stopping criterion for the measurement loop.
  /// @{
  [[nodiscard]] const std::string &get_stopping_criterion() const { return m_stopping_criterion; }
  benchmark_base &set_stopping_criterion(std::string criterion);
  /// @}

  /// Typed access to the stopping criterion's parameters. The getters forward
  /// to `m_criterion_params` and share its lookup semantics. @{
  [[nodiscard]] bool has_criterion_param(const std::string &name) const
  {
    return m_criterion_params.has_value(name);
  }
  [[nodiscard]] nvbench::int64_t get_criterion_param_int64(const std::string &name) const
  {
    return m_criterion_params.get_int64(name);
  }
  benchmark_base &set_criterion_param_int64(const std::string &name, nvbench::int64_t value)
  {
    m_criterion_params.set_int64(name, value);
    return *this;
  }
  [[nodiscard]] nvbench::float64_t get_criterion_param_float64(const std::string &name) const
  {
    return m_criterion_params.get_float64(name);
  }
  benchmark_base &set_criterion_param_float64(const std::string &name, nvbench::float64_t value)
  {
    m_criterion_params.set_float64(name, value);
    return *this;
  }
  [[nodiscard]] std::string get_criterion_param_string(const std::string &name) const
  {
    return m_criterion_params.get_string(name);
  }
  benchmark_base &set_criterion_param_string(const std::string &name, std::string value)
  {
    m_criterion_params.set_string(name, std::move(value));
    return *this;
  }
  [[nodiscard]] nvbench::criterion_params &get_criterion_params() { return m_criterion_params; }
  [[nodiscard]] const nvbench::criterion_params &get_criterion_params() const
  {
    return m_criterion_params;
  }
  /// @}

protected:
  friend struct nvbench::runner_base;

  template <typename BenchmarkType>
  friend struct nvbench::runner;

  std::string m_name;
  nvbench::axes_metadata m_axes;
  std::vector<nvbench::device_info> m_devices;
  std::vector<nvbench::state> m_states;           // populated by run()
  optional_ref<nvbench::printer_base> m_printer;  // non-owning

  bool m_is_cpu_only{false};
  bool m_run_once{false};
  bool m_disable_blocking_kernel{false};
  bool m_skip_batched{false};

  nvbench::int64_t m_min_samples{10};
  nvbench::float64_t m_skip_time{-1.}; // negative disables skipping
  nvbench::float64_t m_timeout{15.};   // [seconds]

  nvbench::float32_t m_throttle_threshold{0.75f};      // [% of default SM clock rate]
  nvbench::float32_t m_throttle_recovery_delay{0.05f}; // [seconds]

  nvbench::criterion_params m_criterion_params;
  std::string m_stopping_criterion{};

private:
  // route these through virtuals so the templated subclass can inject type info
  virtual std::unique_ptr<benchmark_base> do_clone() const            = 0;
  virtual void do_set_type_axes_names(std::vector<std::string> names) = 0;
  virtual void do_run()                                               = 0;
  virtual void do_run_or_skip(bool &skip_remaining)                   = 0;
};
} // namespace nvbench