From 92cc3b1189f90c0bbd37492727395a0b5898d476 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Fri, 12 Feb 2021 20:02:20 -0500 Subject: [PATCH] Execute benchmarks on all devices. --- nvbench/benchmark_base.cu | 21 +++ nvbench/benchmark_base.cuh | 33 +++-- nvbench/detail/markdown_format.cu | 211 ++++++++++++++++------------- nvbench/detail/measure_cold.cu | 27 +++- nvbench/detail/measure_cold.cuh | 11 +- nvbench/detail/measure_hot.cu | 50 ++++--- nvbench/detail/measure_hot.cuh | 8 +- nvbench/detail/state_generator.cu | 46 +++++-- nvbench/detail/state_generator.cuh | 10 +- nvbench/device_info.cuh | 33 ++++- nvbench/runner.cuh | 53 ++++++-- nvbench/state.cuh | 28 +++- testing/create.cu | 13 +- testing/option_parser.cu | 55 +++----- testing/runner.cu | 58 +++----- testing/state.cu | 9 +- testing/state_generator.cu | 191 ++++++++++++++++---------- 17 files changed, 534 insertions(+), 323 deletions(-) diff --git a/nvbench/benchmark_base.cu b/nvbench/benchmark_base.cu index e5d1e64..ff7110f 100644 --- a/nvbench/benchmark_base.cu +++ b/nvbench/benchmark_base.cu @@ -1,8 +1,14 @@ #include +#include + namespace nvbench { +benchmark_base::benchmark_base() + : m_devices(nvbench::device_manager::get().get_devices()) +{} + benchmark_base::~benchmark_base() = default; std::unique_ptr benchmark_base::clone() const @@ -16,5 +22,20 @@ std::unique_ptr benchmark_base::clone() const return std::move(result); } +void benchmark_base::set_devices(std::vector device_ids) +{ + std::vector devices; + devices.reserve(device_ids.size()); + for (int dev_id : device_ids) + { + devices.emplace_back(dev_id); + } + this->set_devices(std::move(devices)); +} + +void benchmark_base::add_device(int device_id) +{ + this->add_device(device_info{device_id}); +} } // namespace nvbench diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh index 7be67d5..a1239ac 100644 --- a/nvbench/benchmark_base.cuh +++ b/nvbench/benchmark_base.cuh @@ -1,6 +1,7 @@ #pragma once #include +#include #include 
#include @@ -21,6 +22,7 @@ struct runner; */ struct benchmark_base { + benchmark_base(); virtual ~benchmark_base(); /** @@ -77,25 +79,37 @@ struct benchmark_base return *this; } - [[nodiscard]] nvbench::axes_metadata &get_axes() + void set_devices(std::vector device_ids); + + void set_devices(std::vector devices) { - return m_axes; + m_devices = std::move(devices); } + void add_device(int device_id); + + void add_device(nvbench::device_info device) + { + m_devices.push_back(std::move(device)); + } + + [[nodiscard]] const std::vector &get_devices() const + { + return m_devices; + } + + [[nodiscard]] nvbench::axes_metadata &get_axes() { return m_axes; } + [[nodiscard]] const nvbench::axes_metadata &get_axes() const { return m_axes; } - [[nodiscard]] const std::vector> & - get_states() const - { - return m_states; - } - [[nodiscard]] std::vector> &get_states() + [[nodiscard]] const std::vector &get_states() const { return m_states; } + [[nodiscard]] std::vector &get_states() { return m_states; } void run() { this->do_run(); } @@ -105,7 +119,8 @@ protected: std::string m_name; nvbench::axes_metadata m_axes; - std::vector> m_states; + std::vector m_devices; + std::vector m_states; private: // route these through virtuals so the templated subclass can inject type info diff --git a/nvbench/detail/markdown_format.cu b/nvbench/detail/markdown_format.cu index 6097876..ad4544e 100644 --- a/nvbench/detail/markdown_format.cu +++ b/nvbench/detail/markdown_format.cu @@ -218,6 +218,8 @@ void markdown_format::print_benchmark_summaries( void markdown_format::print_benchmark_results(const benchmark_vector &benchmarks) { + // This needs to be refactored and cleaned up (someday....) 
but here's a + // buncha functors that do various string formatting stuff: auto format_visitor = [](const auto &v) { using T = std::decay_t; if constexpr (std::is_same_v) @@ -312,110 +314,129 @@ void markdown_format::print_benchmark_results(const benchmark_vector &benchmarks return fmt::format("{:.2f}%", percentage); }; - fmt::print("# Benchmark Summaries\n"); + // Start printing benchmarks + fmt::print("# Benchmark Results\n"); for (const auto &bench_ptr : benchmarks) { - const benchmark_base &bench = *bench_ptr; - const axes_metadata &axes = bench.get_axes(); + const auto &bench = *bench_ptr; + const auto &devices = bench.get_devices(); + const auto &axes = bench.get_axes(); - fmt::print("\n## {}\n\n", bench.get_name()); + fmt::print("\n## {}\n", bench.get_name()); - std::size_t row = 0; - table_builder table; - - for (const auto &inner_states : bench.get_states()) + // Do a single pass when no devices are specified. This happens for + // benchmarks with `cpu` exec_tags. + const std::size_t num_device_passes = devices.empty() ? 1 : devices.size(); + for (std::size_t device_pass = 0; device_pass < num_device_passes; + ++device_pass) { - for (const nvbench::state &state : inner_states) + std::optional device = + devices.empty() ? 
std::nullopt + : std::make_optional(devices[device_pass]); + + if (device) { - const auto &axis_values = state.get_axis_values(); - for (const auto &name : axis_values.get_names()) - { - // Handle power-of-two int64 axes differently: - if (axis_values.get_type(name) == named_values::type::int64 && - axes.get_int64_axis(name).is_power_of_two()) - { - const nvbench::uint64_t value = axis_values.get_int64(name); - const nvbench::uint64_t exponent = int64_axis::compute_log2(value); - table.add_cell(row, - name + "_axis_pretty", - name, - fmt::format("2^{}", exponent)); - table.add_cell(row, - name + "_axis_descriptive", - fmt::format("({})", name), - fmt::to_string(value)); - } - else - { - std::string value = std::visit(format_visitor, - axis_values.get_value(name)); - table.add_cell(row, name + "_axis", name, std::move(value)); - } - } - - for (const auto &summ : state.get_summaries()) - { - if (summ.has_value("hide")) - { - continue; - } - const std::string &key = summ.get_name(); - const std::string &header = summ.has_value("short_name") - ? summ.get_string("short_name") - : key; - - std::string hint = summ.has_value("hint") ? 
summ.get_string("hint") - : std::string{}; - if (hint == "duration") - { - table.add_cell(row, - key, - header, - format_duration(summ.get_float64("value"))); - } - else if (hint == "item_rate") - { - table.add_cell(row, - key, - header, - format_item_rate(summ.get_float64("value"))); - } - else if (hint == "bytes") - { - table.add_cell(row, - key, - header, - format_bytes(summ.get_int64("value"))); - } - else if (hint == "byte_rate") - { - table.add_cell(row, - key, - header, - format_byte_rate(summ.get_float64("value"))); - } - else if (hint == "percentage") - { - table.add_cell(row, - key, - header, - format_percentage(summ.get_float64("value"))); - } - else - { - table.add_cell(row, - key, - header, - std::visit(format_visitor, summ.get_value("value"))); - } - } - row++; + fmt::print("\n### [{}] {}\n\n", device->get_id(), device->get_name()); } - } - fmt::print("{}", table.to_string()); - } // end foreach benchmark + std::size_t row = 0; + table_builder table; + + for (const auto &cur_state : bench.get_states()) + { + if (cur_state.get_device() == device) + { + const auto &axis_values = cur_state.get_axis_values(); + for (const auto &name : axis_values.get_names()) + { + // Handle power-of-two int64 axes differently: + if (axis_values.get_type(name) == named_values::type::int64 && + axes.get_int64_axis(name).is_power_of_two()) + { + const nvbench::int64_t value = axis_values.get_int64(name); + const nvbench::int64_t exponent = int64_axis::compute_log2(value); + table.add_cell(row, + name + "_axis_pretty", + name, + fmt::format("2^{}", exponent)); + table.add_cell(row, + name + "_axis_descriptive", + fmt::format("({})", name), + fmt::to_string(value)); + } + else + { + std::string value = std::visit(format_visitor, + axis_values.get_value(name)); + table.add_cell(row, name + "_axis", name, std::move(value)); + } + } + + for (const auto &summ : cur_state.get_summaries()) + { + if (summ.has_value("hide")) + { + continue; + } + const std::string &key = 
summ.get_name(); + const std::string &header = summ.has_value("short_name") + ? summ.get_string("short_name") + : key; + + std::string hint = summ.has_value("hint") ? summ.get_string("hint") + : std::string{}; + if (hint == "duration") + { + table.add_cell(row, + key, + header, + format_duration(summ.get_float64("value"))); + } + else if (hint == "item_rate") + { + table.add_cell(row, + key, + header, + format_item_rate(summ.get_float64("value"))); + } + else if (hint == "bytes") + { + table.add_cell(row, + key, + header, + format_bytes(summ.get_int64("value"))); + } + else if (hint == "byte_rate") + { + table.add_cell(row, + key, + header, + format_byte_rate(summ.get_float64("value"))); + } + else if (hint == "percentage") + { + table.add_cell(row, + key, + header, + format_percentage(summ.get_float64("value"))); + } + else + { + table.add_cell(row, + key, + header, + std::visit(format_visitor, + summ.get_value("value"))); + } + } + row++; + } + } + + fmt::print("{}", table.to_string()); + } // end foreach device_pass + } } } // namespace detail diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index b3def99..5e24208 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -8,6 +8,7 @@ #include #include +#include #include namespace nvbench @@ -16,9 +17,29 @@ namespace nvbench namespace detail { +void measure_cold_base::check() +{ + const auto device = m_state.get_device(); + if (!device) + { + throw std::runtime_error(fmt::format("{}:{}: Device required for `cold` " + "measurement.", + __FILE__, + __LINE__)); + } + if (!device->is_active()) + { // This means something went wrong higher up. Throw an error. 
+ throw std::runtime_error(fmt::format("{}:{}: Internal error: Current " + "device is not active.", + __FILE__, + __LINE__)); + } +} + void measure_cold_base::generate_summaries() { - const auto avg_cuda_time = m_total_cuda_time / m_total_iters; + const auto d_iters = static_cast(m_total_iters); + const auto avg_cuda_time = m_total_cuda_time / d_iters; { auto &summ = m_state.add_summary("Average GPU Time (Cold)"); summ.set_string("hint", "duration"); @@ -39,7 +60,7 @@ void measure_cold_base::generate_summaries() summ.set_float64("value", m_cuda_noise); } - const auto avg_cpu_time = m_total_cpu_time / m_total_iters; + const auto avg_cpu_time = m_total_cpu_time / d_iters; { auto &summ = m_state.add_summary("Average CPU Time (Cold)"); summ.set_string("hint", "duration"); @@ -70,7 +91,7 @@ void measure_cold_base::generate_summaries() // Log to stdout: fmt::memory_buffer param_buffer; - fmt::format_to(param_buffer, ""); + fmt::format_to(param_buffer, "Device={}", m_state.get_device()->get_id()); const axes_metadata &axes = m_state.get_benchmark().get_axes(); const auto &axis_values = m_state.get_axis_values(); for (const auto &name : axis_values.get_names()) diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index c15ad29..fcbe1ed 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,9 @@ struct measure_cold_base measure_cold_base &operator=(measure_cold_base &&) = delete; protected: + + void check(); + void initialize() { m_total_cuda_time = 0.; @@ -54,15 +58,15 @@ protected: nvbench::cpu_timer m_cpu_timer; nvbench::detail::l2flush m_l2flush; - nvbench::int64_t m_min_iters{100}; + nvbench::int64_t m_min_iters{10}; nvbench::int64_t m_total_iters{}; - nvbench::float64_t m_max_noise{1.0}; // % rel stdev + nvbench::float64_t m_max_noise{0.5}; // % rel stdev nvbench::float64_t m_cuda_noise{}; // % rel stdev nvbench::float64_t 
m_cpu_noise{}; // % rel stdev nvbench::float64_t m_min_time{0.5}; - nvbench::float64_t m_max_time{1.0}; + nvbench::float64_t m_max_time{3.0}; nvbench::float64_t m_total_cuda_time{}; nvbench::float64_t m_total_cpu_time{}; @@ -83,6 +87,7 @@ struct measure_cold : public measure_cold_base void operator()() { + this->check(); this->initialize(); this->run_warmup(); this->run_trials(); diff --git a/nvbench/detail/measure_hot.cu b/nvbench/detail/measure_hot.cu index 35923f0..eed0fef 100644 --- a/nvbench/detail/measure_hot.cu +++ b/nvbench/detail/measure_hot.cu @@ -10,16 +10,31 @@ #include #include -// TODO these can be removed once there's a device_manager or some such: -#include -#include - namespace nvbench { namespace detail { +void measure_hot_base::check() +{ + const auto device = m_state.get_device(); + if (!device) + { + throw std::runtime_error(fmt::format("{}:{}: Device required for `hot` " + "measurement.", + __FILE__, + __LINE__)); + } + if (!device->is_active()) + { // This means something went wrong higher up. Throw an error. 
+ throw std::runtime_error(fmt::format("{}:{}: Internal error: Current " + "device is not active.", + __FILE__, + __LINE__)); + } +} + measure_hot_base::measure_hot_base(state &exec_state) : m_state(exec_state) { @@ -48,7 +63,8 @@ measure_hot_base::measure_hot_base(state &exec_state) void measure_hot_base::generate_summaries() { - const auto avg_cuda_time = m_total_cuda_time / m_total_iters; + const auto d_iters = static_cast(m_total_iters); + const auto avg_cuda_time = m_total_cuda_time / d_iters; { auto &summ = m_state.add_summary("Average GPU Time (Hot)"); summ.set_string("hint", "duration"); @@ -59,7 +75,7 @@ void measure_hot_base::generate_summaries() summ.set_float64("value", avg_cuda_time); } - const auto avg_cpu_time = m_total_cpu_time / m_total_iters; + const auto avg_cpu_time = m_total_cpu_time / d_iters; { auto &summ = m_state.add_summary("Average CPU Time (Hot)"); summ.set_string("hide", @@ -86,13 +102,13 @@ void measure_hot_base::generate_summaries() summ.set_string("hint", "item_rate"); summ.set_string("short_name", "Item Rate"); summ.set_string("description", "Number of input items handled per second."); - summ.set_float64("value", items / avg_cuda_time); + summ.set_float64("value", static_cast(items) / avg_cuda_time); } if (const auto bytes = m_state.get_global_bytes_accessed_per_launch(); bytes != 0) { - const auto avg_used_gmem_bw = bytes / avg_cuda_time; + const auto avg_used_gmem_bw = static_cast(bytes) / avg_cuda_time; { auto &summ = m_state.add_summary("Average Global Memory Throughput"); summ.set_string("hint", "byte_rate"); @@ -103,16 +119,10 @@ void measure_hot_base::generate_summaries() summ.set_float64("value", avg_used_gmem_bw); } - // TODO cache this in a singleton somewhere. - int dev_id{}; - cudaDeviceProp prop{}; - NVBENCH_CUDA_CALL(cudaGetDevice(&dev_id)); - NVBENCH_CUDA_CALL(cudaGetDeviceProperties(&prop, dev_id)); - // clock rate in khz, width in bits. Result in bytes/sec. - const auto peak_gmem_bw = 2 * 1000. 
* prop.memoryClockRate * // (sec^-1) - prop.memoryBusWidth / CHAR_BIT; // bytes - { + const auto peak_gmem_bw = static_cast( + m_state.get_device()->get_global_memory_bus_bandwidth()); + auto &summ = m_state.add_summary("Percent Peak Global Memory Throughput"); summ.set_string("hint", "percentage"); summ.set_string("short_name", "PeakGMem"); @@ -125,7 +135,7 @@ void measure_hot_base::generate_summaries() // Log to stdout: fmt::memory_buffer param_buffer; - fmt::format_to(param_buffer, ""); + fmt::format_to(param_buffer, "Device={}", m_state.get_device()->get_id()); const axes_metadata &axes = m_state.get_benchmark().get_axes(); const auto &axis_values = m_state.get_axis_values(); for (const auto &name : axis_values.get_names()) @@ -140,8 +150,8 @@ void measure_hot_base::generate_summaries() if (axis_values.get_type(name) == named_values::type::int64 && axes.get_int64_axis(name).is_power_of_two()) { - const nvbench::uint64_t value = axis_values.get_int64(name); - const nvbench::uint64_t exponent = int64_axis::compute_log2(value); + const nvbench::int64_t value = axis_values.get_int64(name); + const nvbench::int64_t exponent = int64_axis::compute_log2(value); fmt::format_to(param_buffer, "2^{}", exponent); } else diff --git a/nvbench/detail/measure_hot.cuh b/nvbench/detail/measure_hot.cuh index 69c4c5f..6d3c0f3 100644 --- a/nvbench/detail/measure_hot.cuh +++ b/nvbench/detail/measure_hot.cuh @@ -26,6 +26,9 @@ struct measure_hot_base measure_hot_base &operator=(measure_hot_base &&) = delete; protected: + + void check(); + void initialize() { m_total_cpu_time = 0.; @@ -43,10 +46,10 @@ protected: nvbench::cpu_timer m_cpu_timer; nvbench::int64_t m_total_iters{}; - nvbench::int64_t m_min_iters{100}; + nvbench::int64_t m_min_iters{10}; nvbench::float64_t m_min_time{0.5}; - nvbench::float64_t m_max_time{1.0}; + nvbench::float64_t m_max_time{3.0}; nvbench::float64_t m_total_cuda_time{}; nvbench::float64_t m_total_cpu_time{}; @@ -64,6 +67,7 @@ struct measure_hot : public 
measure_hot_base void operator()() { + this->check(); this->initialize(); this->run_warmup(); this->run_trials(); diff --git a/nvbench/detail/state_generator.cu b/nvbench/detail/state_generator.cu index 7b2b583..27c19d6 100644 --- a/nvbench/detail/state_generator.cu +++ b/nvbench/detail/state_generator.cu @@ -1,6 +1,7 @@ #include #include +#include #include #include @@ -207,36 +208,53 @@ void state_generator::build_axis_configs() void state_generator::build_states() { - // Assemble states into a std::vector>, where the - // outer vector has one inner vector per type_config, and all configs in an - // inner vector use the same type config. This should probably be wrapped up - // into a nicer data structure, but organizing states in this way makes - // matching up states to kernel_generator instantiations much easier during - // dispatch. - m_states.clear(); - m_states.reserve(m_type_axis_configs.size()); - for (const auto &[type_config, axis_mask] : m_type_axis_configs) + + const auto &devices = m_benchmark.get_devices(); + if (devices.empty()) { - auto &inner_states = m_states.emplace_back(); + this->add_states_for_device(std::nullopt); + } + else + { + for (const auto &device : devices) + { + this->add_states_for_device(device); + } + } +} + +void state_generator::add_states_for_device( + const std::optional &device) +{ + const auto num_type_configs = m_type_axis_configs.size(); + for (std::size_t type_config_index = 0; type_config_index < num_type_configs; + ++type_config_index) + { + const auto &[type_config, + axis_mask] = m_type_axis_configs[type_config_index]; if (!axis_mask) { // Don't generate inner vector if the type config is masked out. 
continue; } - inner_states.reserve(m_non_type_axis_configs.size()); for (const auto &non_type_config : m_non_type_axis_configs) { + // Concatenate the type + non_type configurations: nvbench::named_values config = type_config; config.append(non_type_config); - inner_states.push_back(nvbench::state{m_benchmark, config}); + + // Create state: + m_states.push_back(nvbench::state{m_benchmark, + std::move(config), + device, + type_config_index}); } } } -std::vector> -state_generator::create(const benchmark_base &bench) +std::vector state_generator::create(const benchmark_base &bench) { state_generator sg{bench}; sg.build_axis_configs(); diff --git a/nvbench/detail/state_generator.cuh b/nvbench/detail/state_generator.cuh index 90c4e43..b092b57 100644 --- a/nvbench/detail/state_generator.cuh +++ b/nvbench/detail/state_generator.cuh @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -11,25 +12,27 @@ namespace nvbench { struct benchmark_base; +struct device_info; + namespace detail { struct state_generator { - static std::vector> - create(const benchmark_base &bench); + static std::vector create(const benchmark_base &bench); private: explicit state_generator(const benchmark_base &bench); void build_axis_configs(); void build_states(); + void add_states_for_device(const std::optional &device); const benchmark_base &m_benchmark; // bool is a mask value; true if the config is used. std::vector> m_type_axis_configs; std::vector m_non_type_axis_configs; - std::vector> m_states; + std::vector m_states; }; // Detail class; Generates a cartesian product of axis indices. 
@@ -73,6 +76,5 @@ struct state_iterator std::size_t m_total{}; }; - } // namespace detail } // namespace nvbench diff --git a/nvbench/device_info.cuh b/nvbench/device_info.cuh index b4959d3..e3dc345 100644 --- a/nvbench/device_info.cuh +++ b/nvbench/device_info.cuh @@ -15,11 +15,17 @@ namespace nvbench namespace detail { int get_ptx_version(int); -} +} // namespace detail struct device_info { - explicit device_info(int id); + explicit device_info(int device_id); + + // Mainly used by unit tests: + device_info(int device_id, cudaDeviceProp prop) + : m_id{device_id} + , m_prop{prop} + {} /// @return The device's id on the current system. [[nodiscard]] int get_id() const { return m_id; } @@ -30,6 +36,18 @@ struct device_info return std::string_view(m_prop.name); } + [[nodiscard]] bool is_active() const + { + int id{-1}; + NVBENCH_CUDA_CALL(cudaGetDevice(&id)); + return id == m_id; + } + + void set_active() const + { + NVBENCH_CUDA_CALL(cudaSetDevice(m_id)); + } + /// @return The SM version of the current device as (major*100) + (minor*10). [[nodiscard]] int get_sm_version() const { @@ -145,6 +163,15 @@ struct device_info return m_prop; } + [[nodiscard]] bool operator==(const device_info &o) const + { + return m_id == o.m_id; + } + [[nodiscard]] bool operator!=(const device_info &o) const + { + return m_id != o.m_id; + } + private: int m_id; cudaDeviceProp m_prop; @@ -152,6 +179,8 @@ private: // get_ptx_version implementation; this needs to stay in the header so it will // pick up the downstream project's compilation settings. +// TODO this is fragile and will break when called from any library +// translation unit. 
namespace detail { // Templated to workaround ODR issues since __global__functions cannot be marked diff --git a/nvbench/runner.cuh b/nvbench/runner.cuh index 58aa0d1..ee5683c 100644 --- a/nvbench/runner.cuh +++ b/nvbench/runner.cuh @@ -29,24 +29,51 @@ struct runner void run() { - auto states_iter = m_benchmark.m_states.begin(); - if (states_iter + num_type_configs != m_benchmark.m_states.end()) + if (m_benchmark.m_devices.empty()) { - throw std::runtime_error("State vector doesn't match type_configs."); + this->run_device(std::nullopt); + } + else + { + for (const auto &device : m_benchmark.m_devices) + { + this->run_device(device); + } } - - nvbench::tl::foreach( - [&states_iter](auto type_config_wrapper) { - using type_config = typename decltype(type_config_wrapper)::type; - for (nvbench::state &cur_state : *states_iter) - { - kernel_generator{}(cur_state, type_config{}); - } - states_iter++; - }); } private: + + void run_device(const std::optional &device) + { + if (device) + { + device->set_active(); + } + + // Iterate through type_configs: + std::size_t type_config_index = 0; + nvbench::tl::foreach([&states = m_benchmark.m_states, + &type_config_index, + &device](auto type_config_wrapper) { + + // Get current type_config: + using type_config = typename decltype(type_config_wrapper)::type; + + // Find states with the current device / type_config + for (nvbench::state &cur_state : states) + { + if (cur_state.get_device() == device && + cur_state.get_type_config_index() == type_config_index) + { + kernel_generator{}(cur_state, type_config{}); + } + } + + ++type_config_index; + }); + } + benchmark_type &m_benchmark; }; diff --git a/nvbench/state.cuh b/nvbench/state.cuh index edbad6b..c4150cc 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -1,10 +1,12 @@ #pragma once +#include #include #include #include #include +#include #include #include @@ -17,7 +19,7 @@ namespace detail { struct state_generator; struct state_tester; -} +} // namespace detail /** * 
Stores all information about a particular benchmark configuration. @@ -41,6 +43,20 @@ struct state state &operator=(const state &) = delete; state &operator=(state &&) = default; + /// The CUDA device associated with this benchmark state. May be + /// nullopt for CPU-only benchmarks. + [[nodiscard]] const std::optional &get_device() const + { + return m_device; + } + + /// An index into a benchmark::type_configs type_list. Returns 0 if no type + /// axes in the associated benchmark. + [[nodiscard]] std::size_t get_type_config_index() const + { + return m_type_config_index; + } + [[nodiscard]] nvbench::int64_t get_int64(const std::string &axis_name) const; [[nodiscard]] nvbench::float64_t @@ -99,13 +115,21 @@ private: : m_benchmark{bench} {} - state(const benchmark_base &bench, nvbench::named_values values) + state(const benchmark_base &bench, + nvbench::named_values values, + std::optional device, + std::size_t type_config_index) : m_benchmark{bench} , m_axis_values{std::move(values)} + , m_device{std::move(device)} + , m_type_config_index{type_config_index} {} std::reference_wrapper m_benchmark; nvbench::named_values m_axis_values; + std::optional m_device; + std::size_t m_type_config_index{}; + std::vector m_summaries; std::string m_skip_reason; nvbench::int64_t m_items_processed_per_launch{}; diff --git a/testing/create.cu b/testing/create.cu index 5d6ab4e..4708007 100644 --- a/testing/create.cu +++ b/testing/create.cu @@ -83,18 +83,15 @@ std::string run_and_get_state_string(nvbench::benchmark_base &bench, std::size_t num_type_configs, std::size_t states_per_type_config) { + bench.set_devices(std::vector{}); bench.run(); fmt::memory_buffer buffer; const auto &states = bench.get_states(); - ASSERT(states.size() == num_type_configs); - for (const auto &inner_states : states) + ASSERT(states.size() == num_type_configs * states_per_type_config); + for (const auto &state : states) { - ASSERT(inner_states.size() == states_per_type_config); - for (const auto 
&state : inner_states) - { - ASSERT(state.is_skipped()); - fmt::format_to(buffer, "{}\n", state.get_skip_reason()); - } + ASSERT(state.is_skipped()); + fmt::format_to(buffer, "{}\n", state.get_skip_reason()); } return fmt::to_string(buffer); } diff --git a/testing/option_parser.cu b/testing/option_parser.cu index 0b9a958..7b0c825 100644 --- a/testing/option_parser.cu +++ b/testing/option_parser.cu @@ -32,7 +32,7 @@ namespace { [[nodiscard]] std::string -states_to_string(const std::vector> &states) +states_to_string(const std::vector &states) { fmt::memory_buffer buffer; std::string table_format = "| {:^5} | {:^10} | {:^4} | {:^4} | {:^4} " @@ -50,24 +50,19 @@ states_to_string(const std::vector> &states) "Floats", "Strings"); - std::size_t type_config = 0; - std::size_t config = 0; - for (const auto &inner_states : states) + std::size_t config = 0; + for (const auto &state : states) { - for (const nvbench::state &state : inner_states) - { - fmt::format_to(buffer, - table_format, - config++, - type_config, - state.get_string("T"), - state.get_string("U"), - state.get_int64("Ints"), - state.get_int64("PO2s"), - state.get_float64("Floats"), - std::string{"\'"} + state.get_string("Strings") + "'"); - } - type_config++; + fmt::format_to(buffer, + table_format, + config++, + state.get_type_config_index(), + state.get_string("T"), + state.get_string("U"), + state.get_int64("Ints"), + state.get_int64("PO2s"), + state.get_float64("Floats"), + std::string{"\'"} + state.get_string("Strings") + "'"); } return fmt::to_string(buffer); } @@ -333,8 +328,7 @@ void test_int64_axis_pow2_single() { nvbench::option_parser parser; - parser.parse( - {"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = 7 "}); + parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = 7 "}); const auto test = parser_to_state_string(parser); ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test); } @@ -451,8 +445,7 @@ void test_int64_axis_none_to_pow2_single() { 
nvbench::option_parser parser; - parser.parse( - {"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = 7 "}); + parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = 7 "}); const auto test = parser_to_state_string(parser); ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test); } @@ -569,8 +562,7 @@ void test_int64_axis_pow2_to_none_single() { nvbench::option_parser parser; - parser.parse( - {"--benchmark", "TestBench", "--axis", " PO2s [ ] = 2 "}); + parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ ] = 2 "}); const auto test = parser_to_state_string(parser); ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test); } @@ -584,8 +576,7 @@ void test_int64_axis_pow2_to_none_single() { nvbench::option_parser parser; - parser.parse( - {"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 ] "}); + parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 ] "}); const auto test = parser_to_state_string(parser); ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test); } @@ -687,8 +678,7 @@ void test_float64_axis_single() { nvbench::option_parser parser; - parser.parse( - {"--benchmark", "TestBench", "--axis", " Floats [ ] = 3.5 "}); + parser.parse({"--benchmark", "TestBench", "--axis", " Floats [ ] = 3.5 "}); const auto test = parser_to_state_string(parser); ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test); } @@ -727,8 +717,7 @@ void test_float64_axis_single() { nvbench::option_parser parser; - parser.parse( - {"--benchmark", "TestBench", "--axis", "Floats=[3.5:3.6]"}); + parser.parse({"--benchmark", "TestBench", "--axis", "Floats=[3.5:3.6]"}); const auto test = parser_to_state_string(parser); ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test); } @@ -893,8 +882,7 @@ void test_type_axis_single() { nvbench::option_parser parser; - parser.parse( - {"--benchmark", "TestBench", "--axis", " T [ ] = U8 "}); + 
parser.parse({"--benchmark", "TestBench", "--axis", " T [ ] = U8 "}); const auto test = parser_to_state_string(parser); ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test); } @@ -908,8 +896,7 @@ void test_type_axis_single() { nvbench::option_parser parser; - parser.parse( - {"--benchmark", "TestBench", "--axis", " T [ ] = [ U8 ] "}); + parser.parse({"--benchmark", "TestBench", "--axis", " T [ ] = [ U8 ] "}); const auto test = parser_to_state_string(parser); ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test); } diff --git a/testing/runner.cu b/testing/runner.cu index ebfc800..e61d0a3 100644 --- a/testing/runner.cu +++ b/testing/runner.cu @@ -73,12 +73,10 @@ void test_empty() runner.generate_states(); ASSERT(bench.get_states().size() == 1); - ASSERT(bench.get_states().front().size() == 1); - ASSERT(bench.get_states().front().front().is_skipped() == false); + ASSERT(bench.get_states().front().is_skipped() == false); runner.run(); ASSERT(bench.get_states().size() == 1); - ASSERT(bench.get_states().front().size() == 1); - ASSERT(bench.get_states().front().front().is_skipped() == true); + ASSERT(bench.get_states().front().is_skipped() == true); } void test_non_types() @@ -94,18 +92,16 @@ void test_non_types() runner_type runner{bench}; runner.generate_states(); - ASSERT(bench.get_states().size() == 1); - ASSERT(bench.get_states().front().size() == 27); - for (const auto &state : bench.get_states().front()) + ASSERT(bench.get_states().size() == 27); + for (const auto &state : bench.get_states()) { ASSERT(state.is_skipped() == false); } fmt::memory_buffer buffer; runner.run(); - ASSERT(bench.get_states().size() == 1); - ASSERT(bench.get_states().front().size() == 27); - for (const auto &state : bench.get_states().front()) + ASSERT(bench.get_states().size() == 27); + for (const auto &state : bench.get_states()) { ASSERT(state.is_skipped() == true); fmt::format_to(buffer, "{}\n", state.get_skip_reason()); @@ -150,32 +146,25 @@ 
void test_types() using runner_type = nvbench::runner; benchmark_type bench; + bench.set_devices(std::vector{}); bench.set_type_axes_names({"FloatT", "IntT", "MiscT"}); runner_type runner{bench}; runner.generate_states(); ASSERT(bench.get_states().size() == 8); - for (const auto &inner_states : bench.get_states()) + for (const auto &state : bench.get_states()) { - ASSERT(inner_states.size() == 1); - for (const auto &state : inner_states) - { - ASSERT(state.is_skipped() == false); - } + ASSERT(state.is_skipped() == false); } fmt::memory_buffer buffer; runner.run(); ASSERT(bench.get_states().size() == 8); - for (const auto &inner_states : bench.get_states()) + for (const auto &state : bench.get_states()) { - ASSERT(inner_states.size() == 1); - for (const auto &state : inner_states) - { - ASSERT(state.is_skipped() == true); - fmt::format_to(buffer, "{}\n", state.get_skip_reason()); - } + ASSERT(state.is_skipped() == true); + fmt::format_to(buffer, "{}\n", state.get_skip_reason()); } const std::string ref = R"expected(Params: FloatT: F32 IntT: I32 MiscT: bool @@ -198,6 +187,7 @@ void test_both() using runner_type = nvbench::runner; benchmark_type bench; + bench.set_devices(std::vector{}); bench.set_type_axes_names({"FloatT", "IntT", "MiscT"}); bench.add_int64_axis("Int", {1, 2, 3}); bench.add_float64_axis("Float", {11.0, 12.0, 13.0}); @@ -206,27 +196,19 @@ void test_both() runner_type runner{bench}; runner.generate_states(); - ASSERT(bench.get_states().size() == 8); - for (const auto &inner_states : bench.get_states()) + ASSERT(bench.get_states().size() == 8 * 27); + for (const auto &state : bench.get_states()) { - ASSERT(inner_states.size() == 27); - for (const auto &state : inner_states) - { - ASSERT(state.is_skipped() == false); - } + ASSERT(state.is_skipped() == false); } fmt::memory_buffer buffer; runner.run(); - ASSERT(bench.get_states().size() == 8); - for (const auto &inner_states : bench.get_states()) + ASSERT(bench.get_states().size() == 8 * 27); + for (const 
auto &state : bench.get_states()) { - ASSERT(inner_states.size() == 27); - for (const auto &state : inner_states) - { - ASSERT(state.is_skipped() == true); - fmt::format_to(buffer, "{}\n", state.get_skip_reason()); - } + ASSERT(state.is_skipped() == true); + fmt::format_to(buffer, "{}\n", state.get_skip_reason()); } const std::string ref = diff --git a/testing/state.cu b/testing/state.cu index 05861c3..18e0fc0 100644 --- a/testing/state.cu +++ b/testing/state.cu @@ -13,9 +13,11 @@ NVBENCH_DEFINE_CALLABLE(dummy_generator, dummy_callable); using dummy_bench = nvbench::benchmark; // Subclass to gain access to protected members for testing: +namespace nvbench::detail +{ struct state_tester : public nvbench::state { - state_tester(const nvbench::benchmark_base& bench) + state_tester(const nvbench::benchmark_base &bench) : nvbench::state{bench} {} @@ -27,6 +29,9 @@ struct state_tester : public nvbench::state std::forward(value)}); } }; +} // namespace nvbench::detail + +using nvbench::detail::state_tester; void test_params() { @@ -50,7 +55,7 @@ void test_summaries() ASSERT(state.get_summaries().size() == 0); { - nvbench::summary& summary = state.add_summary("Test Summary1"); + nvbench::summary &summary = state.add_summary("Test Summary1"); summary.set_float64("Float", 3.14); summary.set_int64("Int", 128); summary.set_string("String", "str"); diff --git a/testing/state_generator.cu b/testing/state_generator.cu index 805fba0..b3a2099 100644 --- a/testing/state_generator.cu +++ b/testing/state_generator.cu @@ -130,6 +130,7 @@ void test_basic() void test_create() { dummy_bench bench; + bench.set_devices(std::vector{}); bench.add_float64_axis("Radians", {3.14, 6.28}); bench.add_int64_axis("VecSize", {2, 3, 4}, nvbench::int64_axis_flags::none); bench.add_int64_axis("NumInputs", @@ -137,22 +138,14 @@ void test_create() nvbench::int64_axis_flags::power_of_two); bench.add_string_axis("Strategy", {"Recursive", "Iterative"}); - const std::vector> states = + const std::vector states 
= nvbench::detail::state_generator::create(bench); - // Outer vector has one entry per type_config. There are no type axes, so - // there's only one type_config: - ASSERT(states.size() == 1); - - // Inner vectors have one entry per non-type config: // 2 (Radians) * 3 (VecSize) * 3 (NumInputs) * 2 (Strategy) = 36 - for (const auto &inner_states : states) - { - ASSERT(inner_states.size() == 36); - } + ASSERT(states.size() == 36); fmt::memory_buffer buffer; - std::string table_format = + const std::string table_format = "| {:^5} | {:^10} | {:^7} | {:^7} | {:^9} | {:^9} |\n"; fmt::format_to(buffer, "\n"); @@ -165,22 +158,17 @@ void test_create() "NumInputs", "Strategy"); - std::size_t type_config = 0; - std::size_t config = 0; - for (const auto &inner_states : states) + std::size_t config = 0; + for (const auto &state : states) { - for (const nvbench::state &state : inner_states) - { - fmt::format_to(buffer, - table_format, - config++, - type_config, - state.get_float64("Radians"), - state.get_int64("VecSize"), - state.get_int64("NumInputs"), - state.get_string("Strategy")); - } - type_config++; + fmt::format_to(buffer, + table_format, + config++, + state.get_type_config_index(), + state.get_float64("Radians"), + state.get_int64("VecSize"), + state.get_int64("NumInputs"), + state.get_string("Strategy")); } const std::string ref = @@ -231,6 +219,7 @@ void test_create() void test_create_with_types() { template_bench bench; + bench.set_devices(std::vector{}); bench.set_type_axes_names({"Floats", "Ints", "Misc"}); bench.add_float64_axis("Radians", {3.14, 6.28}); bench.add_int64_axis("VecSize", {2, 3, 4}, nvbench::int64_axis_flags::none); @@ -239,19 +228,13 @@ void test_create_with_types() nvbench::int64_axis_flags::power_of_two); bench.add_string_axis("Strategy", {"Recursive", "Iterative"}); - const std::vector> states = + const std::vector states = nvbench::detail::state_generator::create(bench); - // Outer vector has one entry per type_config - // 2 (Floats) * 2 (Ints) * 
2 (Misc) = 8 total type_configs - ASSERT(states.size() == 8); - - // Inner vectors have one entry per non-type config: - // 2 (Radians) * 3 (VecSize) * 3 (NumInputs) * 2 (Strategy) = 36 - for (const auto &inner_states : states) - { - ASSERT(inner_states.size() == 36); - } + // - 2 (Floats) * 2 (Ints) * 2 (Misc) = 8 total type_configs + // - 2 (Radians) * 3 (VecSize) * 3 (NumInputs) * 2 (Strategy) = 36 non_type + // configs + ASSERT(states.size() == 8 * 36); fmt::memory_buffer buffer; std::string table_format = "| {:^5} | {:^10} | {:^6} | {:^4} | {:^4} | {:^7} " @@ -270,25 +253,20 @@ void test_create_with_types() "NumInputs", "Strategy"); - std::size_t type_config = 0; - std::size_t config = 0; - for (const auto &inner_states : states) + std::size_t config = 0; + for (const auto &state : states) { - for (const nvbench::state &state : inner_states) - { - fmt::format_to(buffer, - table_format, - config++, - type_config, - state.get_string("Floats"), - state.get_string("Ints"), - state.get_string("Misc"), - state.get_float64("Radians"), - state.get_int64("VecSize"), - state.get_int64("NumInputs"), - state.get_string("Strategy")); - } - type_config++; + fmt::format_to(buffer, + table_format, + config++, + state.get_type_config_index(), + state.get_string("Floats"), + state.get_string("Ints"), + state.get_string("Misc"), + state.get_float64("Radians"), + state.get_int64("VecSize"), + state.get_int64("NumInputs"), + state.get_string("Strategy")); } const std::string ref = @@ -591,6 +569,7 @@ void test_create_with_types() void test_create_with_masked_types() { template_bench bench; + bench.set_devices(std::vector{}); bench.set_type_axes_names({"Floats", "Ints", "Misc"}); bench.add_float64_axis("Radians", {3.14, 6.28}); bench.add_int64_axis("VecSize", {2, 3, 4}, nvbench::int64_axis_flags::none); @@ -603,7 +582,7 @@ void test_create_with_masked_types() bench.get_axes().get_type_axis("Floats").set_active_inputs({"F32"}); 
bench.get_axes().get_type_axis("Ints").set_active_inputs({"I64"}); - const std::vector> states = + const std::vector states = nvbench::detail::state_generator::create(bench); fmt::memory_buffer buffer; @@ -623,25 +602,20 @@ void test_create_with_masked_types() "NumInputs", "Strategy"); - std::size_t type_config = 0; - std::size_t config = 0; - for (const auto &inner_states : states) + std::size_t config = 0; + for (const auto &state : states) { - for (const nvbench::state &state : inner_states) - { - fmt::format_to(buffer, - table_format, - config++, - type_config, - state.get_string("Floats"), - state.get_string("Ints"), - state.get_string("Misc"), - state.get_float64("Radians"), - state.get_int64("VecSize"), - state.get_int64("NumInputs"), - state.get_string("Strategy")); - } - type_config++; + fmt::format_to(buffer, + table_format, + config++, + state.get_type_config_index(), + state.get_string("Floats"), + state.get_string("Ints"), + state.get_string("Misc"), + state.get_float64("Radians"), + state.get_int64("VecSize"), + state.get_int64("NumInputs"), + state.get_string("Strategy")); } const std::string ref = @@ -725,7 +699,69 @@ void test_create_with_masked_types() ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test); } +void test_devices() +{ + const auto device_0 = nvbench::device_info{0, {}}; + const auto device_1 = nvbench::device_info{1, {}}; + const auto device_2 = nvbench::device_info{2, {}}; + + dummy_bench bench; + bench.set_devices({device_0, device_1, device_2}); + bench.add_string_axis("S", {"foo", "bar"}); + bench.add_int64_axis("I", {2, 4}); + + const std::vector states = + nvbench::detail::state_generator::create(bench); + + // 3 devices * 4 axis configs = 12 total states + ASSERT(states.size() == 12); + + fmt::memory_buffer buffer; + const std::string table_format = + "| {:^5} | {:^6} | {:^5} | {:^3} |\n"; + + fmt::format_to(buffer, "\n"); + fmt::format_to(buffer, + table_format, + "State", + "Device", + "S", + "I"); + + 
std::size_t config = 0; + for (const auto &state : states) + { + fmt::format_to(buffer, + table_format, + config++, + state.get_device()->get_id(), + state.get_string("S"), + state.get_int64("I")); + } + + const std::string ref = + R"expected( +| State | Device | S | I | +| 0 | 0 | foo | 2 | +| 1 | 0 | bar | 2 | +| 2 | 0 | foo | 4 | +| 3 | 0 | bar | 4 | +| 4 | 1 | foo | 2 | +| 5 | 1 | bar | 2 | +| 6 | 1 | foo | 4 | +| 7 | 1 | bar | 4 | +| 8 | 2 | foo | 2 | +| 9 | 2 | bar | 2 | +| 10 | 2 | foo | 4 | +| 11 | 2 | bar | 4 | +)expected"; + + const std::string test = fmt::to_string(buffer); + ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test); +} + int main() +try { test_empty(); test_single_state(); @@ -733,4 +769,11 @@ int main() test_create(); test_create_with_types(); test_create_with_masked_types(); + test_devices(); + return 0; +} +catch (std::exception& e) +{ + fmt::print("{}\n", e.what()); + return 1; }