mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-05-01 12:11:14 +00:00
commit4b309e6ad8Author: Allison Piper <alliepiper16@gmail.com> Date: Sat Apr 6 13:19:14 2024 +0000 Minor cleanups commit476ed2ceaeAuthor: Allison Piper <alliepiper16@gmail.com> Date: Sat Apr 6 12:53:37 2024 +0000 WAR compiler ice in nlohmann json. Only seeing this on GCC 9 + CTK 11.1. Seems to be having trouble with the `[[no_unique_address]]` optimization. commita9bf1d3e42Author: Allison Piper <alliepiper16@gmail.com> Date: Sat Apr 6 00:24:47 2024 +0000 Bump nlohmann json. commit80980fe373Author: Allison Piper <alliepiper16@gmail.com> Date: Sat Apr 6 00:22:07 2024 +0000 Fix llvm filesystem support commitf6099e6311Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 23:18:44 2024 +0000 Drop MSVC 2017 testing. commit5ae50a8ef5Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 23:02:32 2024 +0000 Add mroe missing headers. commitb2a9ae04d9Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 22:37:56 2024 +0000 Remove old CUDA+MSVC builds and make windows build-only. commit5b18c26a28Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 22:37:07 2024 +0000 Fix header for std::min/max. Why do I always think it's utility instead of algorithm.... commit6a409efa2dAuthor: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 22:18:18 2024 +0000 Temporarily disable CUPTI on all windows builds. commitf432f88866Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 21:42:52 2024 +0000 Fix warnings on MSVC. commit829787649bAuthor: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 21:03:16 2024 +0000 More flailing about in powershell. commit21742e6beaAuthor: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 20:36:08 2024 +0000 Cleanup filesystem header handling. commitde3d202635Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 20:09:00 2024 +0000 Windows CI debugging. commita4151667ffAuthor: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 19:45:40 2024 +0000 Quotation mark madness commitdd04f3befeAuthor: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 19:27:27 2024 +0000 Temporarily disable NVML on windows CI until new containers are ready. commitf3952848c4Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 19:25:22 2024 +0000 WAR issues on gcc-7. commit198986875eAuthor: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 19:25:04 2024 +0000 More matrix/devcontainer updates. commitb9712f8696Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 18:30:35 2024 +0000 Fix windows build scripts. commit943f268280Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 18:18:33 2024 +0000 Fix warnings with clang host compiler. commit7063e1d60aAuthor: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 18:14:28 2024 +0000 More devcontainer hijinks. commit06532fde81Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 17:51:25 2024 +0000 More matrix updates. commit78a265ea55Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 17:34:00 2024 +0000 Support CLI CMake options for windows ci scripts. commit670895c867Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 17:31:59 2024 +0000 Add missing devcontainers. commitb121823e74Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 17:22:54 2024 +0000 Build for `all-major` architectures in presets. We can get away with this because we require CMake 3.23.1. This was added in 3.23. commitfccfd44685Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 17:22:08 2024 +0000 Update matrix file. commite7d43ba90eAuthor: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 16:23:48 2024 +0000 Consolidate build/test jobs. commitc4044056ecAuthor: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 16:04:11 2024 +0000 Add missing build script.
215 lines
5.5 KiB
Plaintext
215 lines
5.5 KiB
Plaintext
/*
|
|
* Copyright 2021 NVIDIA Corporation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
|
* (the "License"); you may not use this file except in compliance with
|
|
* the License.
|
|
*
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include <nvbench/blocking_kernel.cuh>
|
|
#include <nvbench/cpu_timer.cuh>
|
|
#include <nvbench/cuda_call.cuh>
|
|
#include <nvbench/cuda_timer.cuh>
|
|
#include <nvbench/exec_tag.cuh>
|
|
#include <nvbench/launch.cuh>
|
|
|
|
#include <cuda_runtime.h>
|
|
|
|
#include <algorithm>
|
|
|
|
namespace nvbench
|
|
{
|
|
|
|
struct state;
|
|
|
|
namespace detail
|
|
{
|
|
|
|
// non-templated code goes here to keep instantiation cost down:
|
|
struct measure_hot_base
|
|
{
|
|
explicit measure_hot_base(nvbench::state &exec_state);
|
|
measure_hot_base(const measure_hot_base &) = delete;
|
|
measure_hot_base(measure_hot_base &&) = delete;
|
|
measure_hot_base &operator=(const measure_hot_base &) = delete;
|
|
measure_hot_base &operator=(measure_hot_base &&) = delete;
|
|
|
|
protected:
|
|
void check();
|
|
|
|
void initialize()
|
|
{
|
|
m_total_cuda_time = 0.;
|
|
m_total_samples = 0;
|
|
m_max_time_exceeded = false;
|
|
}
|
|
|
|
void generate_summaries();
|
|
|
|
void check_skip_time(nvbench::float64_t warmup_time);
|
|
|
|
void block_stream();
|
|
|
|
__forceinline__ void unblock_stream() { m_blocker.unblock(); }
|
|
|
|
nvbench::state &m_state;
|
|
|
|
nvbench::launch m_launch;
|
|
nvbench::cuda_timer m_cuda_timer;
|
|
nvbench::cpu_timer m_walltime_timer;
|
|
nvbench::blocking_kernel m_blocker;
|
|
|
|
nvbench::int64_t m_min_samples{};
|
|
nvbench::float64_t m_min_time{};
|
|
|
|
nvbench::float64_t m_skip_time{};
|
|
nvbench::float64_t m_timeout{};
|
|
|
|
nvbench::int64_t m_total_samples{};
|
|
nvbench::float64_t m_total_cuda_time{};
|
|
|
|
bool m_max_time_exceeded{false};
|
|
};
|
|
|
|
template <typename KernelLauncher, bool use_blocking_kernel>
|
|
struct measure_hot : public measure_hot_base
|
|
{
|
|
measure_hot(nvbench::state &state, KernelLauncher &kernel_launcher)
|
|
: measure_hot_base(state)
|
|
, m_kernel_launcher{kernel_launcher}
|
|
{}
|
|
|
|
void operator()()
|
|
{
|
|
this->check();
|
|
this->initialize();
|
|
this->run_warmup();
|
|
this->run_trials();
|
|
this->generate_summaries();
|
|
}
|
|
|
|
private:
|
|
// Run the kernel once, measuring the GPU time. If under skip_time, skip the
|
|
// measurement.
|
|
void run_warmup()
|
|
{
|
|
if constexpr (use_blocking_kernel)
|
|
{
|
|
this->block_stream();
|
|
}
|
|
|
|
m_cuda_timer.start(m_launch.get_stream());
|
|
this->launch_kernel();
|
|
m_cuda_timer.stop(m_launch.get_stream());
|
|
|
|
if constexpr (use_blocking_kernel)
|
|
{
|
|
this->unblock_stream();
|
|
}
|
|
this->sync_stream();
|
|
|
|
this->check_skip_time(m_cuda_timer.get_duration());
|
|
}
|
|
|
|
void run_trials()
|
|
{
|
|
m_walltime_timer.start();
|
|
|
|
// Use warmup results to estimate the number of iterations to run.
|
|
// The .95 factor here pads the batch_size a bit to avoid needing a second
|
|
// batch due to noise.
|
|
const auto time_estimate = m_cuda_timer.get_duration() * 0.95;
|
|
auto batch_size = static_cast<nvbench::int64_t>(m_min_time / time_estimate);
|
|
|
|
do
|
|
{
|
|
batch_size = std::max(batch_size, nvbench::int64_t{1});
|
|
|
|
if constexpr (use_blocking_kernel)
|
|
{
|
|
// Block stream until some work is queued.
|
|
// Limit the number of kernel executions while blocked to prevent
|
|
// deadlocks. See warnings on blocking_kernel.
|
|
const auto blocked_launches = std::min(batch_size, nvbench::int64_t{2});
|
|
const auto unblocked_launches = batch_size - blocked_launches;
|
|
|
|
this->block_stream();
|
|
m_cuda_timer.start(m_launch.get_stream());
|
|
|
|
for (nvbench::int64_t i = 0; i < blocked_launches; ++i)
|
|
{
|
|
// If your benchmark deadlocks in the next launch, reduce the size of
|
|
// blocked_launches. See note above.
|
|
this->launch_kernel();
|
|
}
|
|
|
|
this->unblock_stream(); // Start executing earlier launches
|
|
|
|
for (nvbench::int64_t i = 0; i < unblocked_launches; ++i)
|
|
{
|
|
this->launch_kernel();
|
|
}
|
|
}
|
|
else
|
|
{
|
|
m_cuda_timer.start(m_launch.get_stream());
|
|
|
|
for (nvbench::int64_t i = 0; i < batch_size; ++i)
|
|
{
|
|
this->launch_kernel();
|
|
}
|
|
}
|
|
|
|
m_cuda_timer.stop(m_launch.get_stream());
|
|
this->sync_stream();
|
|
|
|
m_total_cuda_time += m_cuda_timer.get_duration();
|
|
m_total_samples += batch_size;
|
|
|
|
// Predict number of remaining iterations:
|
|
batch_size = static_cast<nvbench::int64_t>(
|
|
(m_min_time - m_total_cuda_time) /
|
|
(m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples)));
|
|
|
|
if (m_total_cuda_time > m_min_time && // min time okay
|
|
m_total_samples > m_min_samples) // min samples okay
|
|
{
|
|
break; // Stop iterating
|
|
}
|
|
|
|
m_walltime_timer.stop();
|
|
if (m_walltime_timer.get_duration() > m_timeout)
|
|
{
|
|
m_max_time_exceeded = true;
|
|
break;
|
|
}
|
|
} while (true);
|
|
|
|
m_walltime_timer.stop();
|
|
}
|
|
|
|
__forceinline__ void launch_kernel() { m_kernel_launcher(m_launch); }
|
|
|
|
__forceinline__ void sync_stream() const
|
|
{
|
|
NVBENCH_CUDA_CALL(cudaStreamSynchronize(m_launch.get_stream()));
|
|
}
|
|
|
|
KernelLauncher &m_kernel_launcher;
|
|
};
|
|
|
|
} // namespace detail
|
|
} // namespace nvbench
|