Files
nvbench/nvbench/detail/measure_hot.cuh
Allison Piper e8c8877d36 Squashed commit of the following:
commit 4b309e6ad8
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Sat Apr 6 13:19:14 2024 +0000

    Minor cleanups

commit 476ed2ceae
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Sat Apr 6 12:53:37 2024 +0000

    WAR compiler ice in nlohmann json.

    Only seeing this on GCC 9 + CTK 11.1. Seems to be
    having trouble with the `[[no_unique_address]]` optimization.

commit a9bf1d3e42
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Sat Apr 6 00:24:47 2024 +0000

    Bump nlohmann json.

commit 80980fe373
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Sat Apr 6 00:22:07 2024 +0000

    Fix llvm filesystem support

commit f6099e6311
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 23:18:44 2024 +0000

    Drop MSVC 2017 testing.

commit 5ae50a8ef5
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 23:02:32 2024 +0000

    Add more missing headers.

commit b2a9ae04d9
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 22:37:56 2024 +0000

    Remove old CUDA+MSVC builds and make windows build-only.

commit 5b18c26a28
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 22:37:07 2024 +0000

    Fix header for std::min/max.

    Why do I always think it's utility instead of algorithm....

commit 6a409efa2d
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 22:18:18 2024 +0000

    Temporarily disable CUPTI on all windows builds.

commit f432f88866
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 21:42:52 2024 +0000

    Fix warnings on MSVC.

commit 829787649b
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 21:03:16 2024 +0000

    More flailing about in powershell.

commit 21742e6bea
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 20:36:08 2024 +0000

    Cleanup filesystem header handling.

commit de3d202635
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 20:09:00 2024 +0000

    Windows CI debugging.

commit a4151667ff
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 19:45:40 2024 +0000

    Quotation mark madness

commit dd04f3befe
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 19:27:27 2024 +0000

    Temporarily disable NVML on windows CI until new containers are ready.

commit f3952848c4
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 19:25:22 2024 +0000

    WAR issues on gcc-7.

commit 198986875e
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 19:25:04 2024 +0000

    More matrix/devcontainer updates.

commit b9712f8696
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 18:30:35 2024 +0000

    Fix windows build scripts.

commit 943f268280
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 18:18:33 2024 +0000

    Fix warnings with clang host compiler.

commit 7063e1d60a
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 18:14:28 2024 +0000

    More devcontainer hijinks.

commit 06532fde81
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 17:51:25 2024 +0000

    More matrix updates.

commit 78a265ea55
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 17:34:00 2024 +0000

    Support CLI CMake options for windows ci scripts.

commit 670895c867
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 17:31:59 2024 +0000

    Add missing devcontainers.

commit b121823e74
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 17:22:54 2024 +0000

    Build for `all-major` architectures in presets.

    We can get away with this because we require CMake 3.23.1.
    This was added in 3.23.

commit fccfd44685
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 17:22:08 2024 +0000

    Update matrix file.

commit e7d43ba90e
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 16:23:48 2024 +0000

    Consolidate build/test jobs.

commit c4044056ec
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Fri Apr 5 16:04:11 2024 +0000

    Add missing build script.
2024-04-06 13:56:10 +00:00

215 lines
5.5 KiB
Plaintext

/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nvbench/blocking_kernel.cuh>
#include <nvbench/cpu_timer.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/cuda_timer.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/launch.cuh>
#include <cuda_runtime.h>
#include <algorithm>
namespace nvbench
{
struct state;
namespace detail
{
// non-templated code goes here to keep instantiation cost down:
struct measure_hot_base
{
explicit measure_hot_base(nvbench::state &exec_state);
// Non-copyable / non-movable: holds a reference to the benchmark state plus
// live timer/blocker objects.
measure_hot_base(const measure_hot_base &) = delete;
measure_hot_base(measure_hot_base &&) = delete;
measure_hot_base &operator=(const measure_hot_base &) = delete;
measure_hot_base &operator=(measure_hot_base &&) = delete;
protected:
// Validate the measurement configuration before running (defined
// out-of-line; presumably throws/skips on invalid state -- see the
// implementation file).
void check();
// Reset the per-measurement accumulators so a new run starts clean.
void initialize()
{
m_total_cuda_time = 0.;
m_total_samples = 0;
m_max_time_exceeded = false;
}
// Record the collected results into m_state (defined out-of-line).
void generate_summaries();
// Decide whether to skip the whole measurement based on the warmup
// duration (defined out-of-line; presumably compares against m_skip_time
// -- confirm in the implementation file).
void check_skip_time(nvbench::float64_t warmup_time);
// Block the measurement stream using m_blocker (defined out-of-line).
void block_stream();
// Release a previously blocked stream so the queued launches may execute.
__forceinline__ void unblock_stream() { m_blocker.unblock(); }
nvbench::state &m_state;
// Provides the CUDA stream all launches and timing calls target.
nvbench::launch m_launch;
// GPU-side timer started/stopped on the launch stream (assumed
// CUDA-event-based -- see cuda_timer.cuh).
nvbench::cuda_timer m_cuda_timer;
// Host wall-clock timer used to enforce m_timeout.
nvbench::cpu_timer m_walltime_timer;
nvbench::blocking_kernel m_blocker;
// Stopping criteria; assumed to be initialized from exec_state in the
// out-of-line constructor -- confirm there.
nvbench::int64_t m_min_samples{};
nvbench::float64_t m_min_time{};
nvbench::float64_t m_skip_time{};
nvbench::float64_t m_timeout{};
// Accumulated results across all batches of the hot loop:
nvbench::int64_t m_total_samples{};
nvbench::float64_t m_total_cuda_time{};
// Set when the wall-clock timeout fired before min time/samples were met.
bool m_max_time_exceeded{false};
};
// "Hot" measurement loop: the kernel is launched back-to-back in batches on
// a single stream, and the event-timed GPU duration is accumulated until the
// configured minimum time and sample counts are both satisfied (or the
// wall-clock timeout fires).
//
// KernelLauncher: callable invoked as m_kernel_launcher(m_launch); expected
//   to enqueue exactly one kernel execution on m_launch.get_stream().
// use_blocking_kernel: when true, a blocking kernel holds the stream so the
//   first launches of each batch are queued before any of them executes.
template <typename KernelLauncher, bool use_blocking_kernel>
struct measure_hot : public measure_hot_base
{
measure_hot(nvbench::state &state, KernelLauncher &kernel_launcher)
: measure_hot_base(state)
, m_kernel_launcher{kernel_launcher}
{}
// Full measurement pipeline: validate config, reset accumulators, time one
// warmup launch, run the batched trials, then record summaries into m_state.
void operator()()
{
this->check();
this->initialize();
this->run_warmup();
this->run_trials();
this->generate_summaries();
}
private:
// Run the kernel once, measuring the GPU time. If under skip_time, skip the
// measurement.
void run_warmup()
{
if constexpr (use_blocking_kernel)
{
// Hold the stream so the timed launch is fully queued before it runs.
this->block_stream();
}
m_cuda_timer.start(m_launch.get_stream());
this->launch_kernel();
m_cuda_timer.stop(m_launch.get_stream());
if constexpr (use_blocking_kernel)
{
this->unblock_stream();
}
this->sync_stream();
// May mark the measurement as skipped based on this warmup duration.
this->check_skip_time(m_cuda_timer.get_duration());
}
// Repeatedly launch the kernel in predicted-size batches, accumulating GPU
// time and sample counts until min time AND min samples are reached, or the
// wall-clock timeout is exceeded.
void run_trials()
{
m_walltime_timer.start();
// Use warmup results to estimate the number of iterations to run.
// The .95 factor here pads the batch_size a bit to avoid needing a second
// batch due to noise.
const auto time_estimate = m_cuda_timer.get_duration() * 0.95;
// NOTE(review): if the warmup duration were ~0 this division would blow up
// and the int64 cast would be UB; presumably the timer's resolution makes a
// zero duration impossible -- confirm.
auto batch_size = static_cast<nvbench::int64_t>(m_min_time / time_estimate);
do
{
// Always launch at least one kernel per batch.
batch_size = std::max(batch_size, nvbench::int64_t{1});
if constexpr (use_blocking_kernel)
{
// Block stream until some work is queued.
// Limit the number of kernel executions while blocked to prevent
// deadlocks. See warnings on blocking_kernel.
const auto blocked_launches = std::min(batch_size, nvbench::int64_t{2});
const auto unblocked_launches = batch_size - blocked_launches;
this->block_stream();
m_cuda_timer.start(m_launch.get_stream());
for (nvbench::int64_t i = 0; i < blocked_launches; ++i)
{
// If your benchmark deadlocks in the next launch, reduce the size of
// blocked_launches. See note above.
this->launch_kernel();
}
this->unblock_stream(); // Start executing earlier launches
// Remaining launches are queued while the stream is already draining.
for (nvbench::int64_t i = 0; i < unblocked_launches; ++i)
{
this->launch_kernel();
}
}
else
{
m_cuda_timer.start(m_launch.get_stream());
for (nvbench::int64_t i = 0; i < batch_size; ++i)
{
this->launch_kernel();
}
}
m_cuda_timer.stop(m_launch.get_stream());
this->sync_stream();
// The timer covers the whole batch; samples are per-launch.
m_total_cuda_time += m_cuda_timer.get_duration();
m_total_samples += batch_size;
// Predict number of remaining iterations:
// remaining time / mean time-per-sample so far.
batch_size = static_cast<nvbench::int64_t>(
(m_min_time - m_total_cuda_time) /
(m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples)));
if (m_total_cuda_time > m_min_time && // min time okay
m_total_samples > m_min_samples) // min samples okay
{
break; // Stop iterating
}
// Check the wall-clock budget; stop() here presumably records elapsed
// time since start() so get_duration() reflects total wall time so far.
m_walltime_timer.stop();
if (m_walltime_timer.get_duration() > m_timeout)
{
m_max_time_exceeded = true;
break;
}
} while (true);
m_walltime_timer.stop();
}
// Enqueue one kernel execution via the user-provided launcher.
__forceinline__ void launch_kernel() { m_kernel_launcher(m_launch); }
// Wait for all queued work on the measurement stream to finish.
__forceinline__ void sync_stream() const
{
NVBENCH_CUDA_CALL(cudaStreamSynchronize(m_launch.get_stream()));
}
KernelLauncher &m_kernel_launcher;
};
} // namespace detail
} // namespace nvbench