mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-04-19 22:38:52 +00:00
Draft of Python API for NVBench
The prototype is based on pybind11 to minimize boiler-plate code needed to deal with move-only semantics of many nvbench classes.
This commit is contained in:
4
python/.gitignore
vendored
Normal file
4
python/.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
build
|
||||
nvbench_build
|
||||
nvbench_install
|
||||
__pycache__
|
||||
29
python/CMakeLists.txt
Normal file
29
python/CMakeLists.txt
Normal file
@@ -0,0 +1,29 @@
|
||||
# Build script for the `_nvbench` Python extension module.
# Driven by scikit-build-core (see pyproject.toml), which defines
# SKBUILD_PROJECT_NAME when invoking CMake.
cmake_minimum_required(VERSION 3.30...4.0)

project(${SKBUILD_PROJECT_NAME} LANGUAGES CXX)

# C++20 is required by the nvbench headers; PIC is mandatory for a
# shared Python extension module.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Development.Module: only the pieces needed to build an extension module
# (no embedding libraries required).
find_package(Python REQUIRED COMPONENTS Development.Module)
find_package(CUDAToolkit REQUIRED)

include(FetchContent)

# Prefer an installed pybind11 (FIND_PACKAGE_ARGS); otherwise download the
# pinned release tarball and verify its checksum.
FetchContent_Declare(
  pybind11
  URL https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.tar.gz
  URL_HASH SHA256=e08cb87f4773da97fa7b5f035de8763abc656d87d5773e62f6da0587d1f0ec20
  FIND_PACKAGE_ARGS NAMES pybind11
)
FetchContent_MakeAvailable(pybind11)

# An nvbench build/install must be discoverable (e.g. via nvbench_DIR,
# see python/README.md).
find_package(nvbench CONFIG REQUIRED)

pybind11_add_module(_nvbench MODULE src/py_nvbench.cpp)
target_link_libraries(_nvbench PUBLIC nvbench::nvbench)
target_link_libraries(_nvbench PRIVATE CUDA::cudart_static)
# libnvbench is installed next to the module (see install() below), so the
# module resolves it relative to its own location.
set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN")

install(TARGETS _nvbench DESTINATION cuda/nvbench)
install(IMPORTED_RUNTIME_ARTIFACTS nvbench::nvbench DESTINATION cuda/nvbench)
|
||||
21
python/README.md
Normal file
21
python/README.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# CUDA Kernel Benchmarking Package
|
||||
|
||||
This package provides a Python API to the CUDA Kernel Benchmarking Library `NVBench`.
|
||||
|
||||
## Building
|
||||
|
||||
### Build `NVBench` project
|
||||
|
||||
```
|
||||
cd nvbench/python
|
||||
cmake -B nvbench_build --preset nvbench-ci -S $(pwd)/.. -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DNVBench_ENABLE_EXAMPLES=OFF -DCMAKE_INSTALL_PREFIX=$(pwd)/nvbench_install
|
||||
cmake --build nvbench_build/ --config Release --target install
|
||||
|
||||
nvbench_DIR=$(pwd)/nvbench_install/lib/cmake CUDACXX=/usr/local/cuda/bin/nvcc pip install -e .
|
||||
```
|
||||
|
||||
### Verify that the package works
|
||||
|
||||
```
|
||||
python test/run_1.py
|
||||
```
|
||||
18
python/cuda/nvbench/__init__.py
Normal file
18
python/cuda/nvbench/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""Public entry point of the ``cuda.nvbench`` package.

Import ordering here is load-bearing: the CUPTI/NVPERF shared libraries
are loaded *before* the compiled ``_nvbench`` extension module is
imported — presumably so the dynamic linker can resolve them when
``_nvbench`` loads (see the dependency comments in pyproject.toml);
TODO confirm.
"""

import importlib.metadata

from cuda.bindings.path_finder import (  # type: ignore[import-not-found]
    _load_nvidia_dynamic_library,
)

try:
    __version__ = importlib.metadata.version("pynvbench")
except Exception:
    # Best-effort fallback: metadata is unavailable, e.g. when running
    # from a source checkout that was never installed.
    __version__ = "0.0.0dev"

# Preload the NVIDIA profiling libraries the extension expects to find.
for libname in ("cupti", "nvperf_target", "nvperf_host"):
    _load_nvidia_dynamic_library(libname)

from ._nvbench import *  # noqa: E402, F403
from ._nvbench import register, run_all_benchmarks  # noqa: E402

# Explicit public API; everything else from _nvbench is incidental.
__all__ = ["register", "run_all_benchmarks"]
|
||||
58
python/pyproject.toml
Normal file
58
python/pyproject.toml
Normal file
@@ -0,0 +1,58 @@
|
||||
# Packaging metadata for the `pynvbench` wheel.
# scikit-build-core drives the CMake build described in CMakeLists.txt.
[build-system]
requires = ["scikit-build-core>=0.10", "setuptools_scm"]
build-backend = "scikit_build_core.build"

[project]
name = "pynvbench"
description = "CUDA Kernel Benchmarking Package"
authors = [{ name = "NVIDIA Corporation" }]
classifiers = [
  "Programming Language :: Python :: 3 :: Only",
  "Environment :: GPU :: NVIDIA CUDA",
  "License :: OSI Approved :: Apache Software License",
]
requires-python = ">=3.9"
dependencies = [
  # pathfinder
  "cuda-bindings",

  # Library expects to find shared libraries
  # libcupti, libnvperf_target, libnvperf_host
  # pathfinder is used to find it in the Python layout
  "nvidia-cuda-cupti-cu12",

  # The shared library
  # libnvidia-ml must be installed system-wide
  # (Debian package provider: libnvidia-compute)
]
# Version is derived from git tags via setuptools_scm (see tool sections).
dynamic = ["version"]
readme = { file = "README.md", content-type = "text/markdown" }

[project.optional-dependencies]
test = ["pytest", "cupy-cuda12x", "numba"]

[project.urls]
Homepage = "https://developer.nvidia.com/"

[tool.scikit-build]
minimum-version = "build-system.requires"
build-dir = "build/{wheel_tag}"

[tool.scikit-build.cmake]
version = ">=3.30.4"
args = []
build-type = "Release"
source-dir = "."

[tool.scikit-build.ninja]
version = ">=1.11"
make-fallback = true

[tool.scikit-build.metadata.version]
provider = "scikit_build_core.metadata.setuptools_scm"

[tool.setuptools_scm]
# The git repository root is the parent directory (nvbench/), not python/.
root = ".."

[tool.scikit-build.wheel.packages]
"cuda/nvbench" = "cuda/nvbench"
|
||||
17
python/src/README.md
Normal file
17
python/src/README.md
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
Manual build command for the extension module (development reference only; the CMake build in `python/CMakeLists.txt` is the supported path):

```
|
||||
g++ py_nvbench.cpp \
|
||||
-shared -fPIC \
|
||||
-I ${HOME}/repos/pybind11/include \
|
||||
-I ${HOME}/repos/pynvbench/nvbench_dir/include \
|
||||
-I /usr/local/cuda/include \
|
||||
$(python3-config --includes) \
|
||||
$(python3-config --libs) \
|
||||
-L ${HOME}/repos/pynvbench/nvbench_dir/lib/ \
|
||||
-lnvbench \
|
||||
-Wl,-rpath,${HOME}/repos/pynvbench/nvbench_dir/lib \
|
||||
-L /usr/local/cuda/lib64/ \
|
||||
-lcudart \
|
||||
-Wl,-rpath,/usr/local/cuda/lib64 \
|
||||
-o _nvbench$(python3-config --extension-suffix)
|
||||
```
|
||||
451
python/src/py_nvbench.cpp
Normal file
451
python/src/py_nvbench.cpp
Normal file
@@ -0,0 +1,451 @@
|
||||
// TODO: Copyright header
|
||||
|
||||
#include <nvbench/nvbench.cuh>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include <pybind11/pybind11.h>
|
||||
#include <pybind11/stl.h>
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
// Portable environment-variable setter: always overwrites an existing value.
inline void set_env(const char *name, const char *value)
{
#ifndef _MSC_VER
  ::setenv(name, value, /*overwrite=*/1);
#else
  ::_putenv_s(name, value);
#endif
}
|
||||
|
||||
// Deleter for py::object instances owned via std::shared_ptr.
//
// Destroying a py::object decrements a Python reference count, which is only
// legal while the interpreter is alive. If the interpreter is gone (or going),
// the object is leaked on purpose and the OS reclaims the memory at process
// exit.
struct PyObjectDeleter
{
  void operator()(py::object *p)
  {
    const bool initialized = Py_IsInitialized();

#if PY_VERSION_HEX < 0x30d0000
    // Py_IsFinalizing() is public API only since CPython 3.13; older
    // versions expose the same check under the private name.
    const bool finalizing = _Py_IsFinalizing();
#else
    const bool finalizing = Py_IsFinalizing();
#endif
    const bool guard = initialized && !finalizing;

    // Only run ~object if the interpreter is active and not shutting down;
    // otherwise let the OS clean up resources after interpreter tear-down.
    if (guard)
    {
      delete p;
    }
  }
};
|
||||
|
||||
// Adapts a Python callable to the functor signature nvbench::benchmark
// expects: void(nvbench::state &, nvbench::type_list<>).
struct benchmark_wrapper_t
{

  benchmark_wrapper_t()
      : m_fn() {};
  explicit benchmark_wrapper_t(py::object o)
      : m_fn{std::shared_ptr<py::object>(new py::object(o), PyObjectDeleter{})}
  {}

  // Copy construction must remain available and const-correct:
  // benchmark::do_clone is a const member that copies this functor.
  // Moves are disabled so ownership stays shared, never transferred.
  benchmark_wrapper_t(const benchmark_wrapper_t &other)
      : m_fn{other.m_fn}
  {}
  benchmark_wrapper_t &operator=(const benchmark_wrapper_t &other) = delete;
  benchmark_wrapper_t(benchmark_wrapper_t &&) noexcept = delete;
  benchmark_wrapper_t &operator=(benchmark_wrapper_t &&) noexcept = delete;

  // Invoke the Python callable with the state boxed by reference (the
  // Python side must not outlive or copy the underlying nvbench::state).
  // NOTE(review): assumes the GIL is held when nvbench invokes this —
  // confirm benchmarks always run on the calling Python thread.
  void operator()(nvbench::state &state, nvbench::type_list<>)
  {
    // box as Python object, using reference semantics
    auto arg = py::cast(std::ref(state), py::return_value_policy::reference);

    // Execute Python callable
    (*m_fn)(arg);
  }

private:
  // Important to use shared pointer here rather than py::object directly,
  // since copy constructor must be const (benchmark::do_clone is const member method)
  std::shared_ptr<py::object> m_fn;
};
|
||||
|
||||
class GlobalBenchmarkRegistry
|
||||
{
|
||||
bool m_finalized;
|
||||
|
||||
public:
|
||||
GlobalBenchmarkRegistry()
|
||||
: m_finalized(false) {};
|
||||
|
||||
GlobalBenchmarkRegistry(const GlobalBenchmarkRegistry &) = delete;
|
||||
GlobalBenchmarkRegistry &operator=(const GlobalBenchmarkRegistry &) = delete;
|
||||
|
||||
GlobalBenchmarkRegistry(GlobalBenchmarkRegistry &&) = delete;
|
||||
GlobalBenchmarkRegistry &operator=(GlobalBenchmarkRegistry &&) = delete;
|
||||
|
||||
bool is_finalized() const { return m_finalized; }
|
||||
|
||||
nvbench::benchmark_base &add_bench(py::object fn)
|
||||
{
|
||||
if (m_finalized)
|
||||
{
|
||||
throw std::runtime_error("Can not register more benchmarks after benchmark was run");
|
||||
}
|
||||
if (!PyCallable_Check(fn.ptr()))
|
||||
{
|
||||
throw py::value_error("Benchmark should be a callable object");
|
||||
}
|
||||
std::string name;
|
||||
if (py::hasattr(fn, "__name__"))
|
||||
{
|
||||
py::str py_name = fn.attr("__name__");
|
||||
name = py::cast<std::string>(py_name);
|
||||
}
|
||||
else
|
||||
{
|
||||
py::str py_name = py::repr(fn);
|
||||
name = py::cast<std::string>(py_name);
|
||||
}
|
||||
benchmark_wrapper_t executor(fn);
|
||||
|
||||
return nvbench::benchmark_manager::get()
|
||||
.add(std::make_unique<nvbench::benchmark<benchmark_wrapper_t>>(executor))
|
||||
.set_name(std::move(name));
|
||||
}
|
||||
|
||||
void run(const std::vector<std::string> &argv)
|
||||
{
|
||||
if (nvbench::benchmark_manager::get().get_benchmarks().empty())
|
||||
{
|
||||
throw std::runtime_error("No benchmarks had been registered yet");
|
||||
}
|
||||
if (m_finalized)
|
||||
{
|
||||
throw std::runtime_error("Benchmarks were already executed");
|
||||
}
|
||||
m_finalized = true;
|
||||
|
||||
try
|
||||
{
|
||||
// TODO: This line is mandatory for correctness
|
||||
// Q: Why is initializing at module init not enough?
|
||||
nvbench::benchmark_manager::get().initialize();
|
||||
{
|
||||
nvbench::option_parser parser{};
|
||||
parser.parse(argv);
|
||||
|
||||
NVBENCH_MAIN_PRINT_PREAMBLE(parser);
|
||||
NVBENCH_MAIN_RUN_BENCHMARKS(parser);
|
||||
NVBENCH_MAIN_PRINT_EPILOGUE(parser);
|
||||
|
||||
NVBENCH_MAIN_PRINT_RESULTS(parser);
|
||||
} /* Tear down parser before finalization */
|
||||
}
|
||||
catch (const std::exception &e)
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << "Caught exception while running benchmakrs: ";
|
||||
ss << e.what();
|
||||
ss << "\n";
|
||||
py::print(py::cast(ss.str(), py::return_value_policy::move));
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
py::print("Caught exception in nvbench_main\n");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// essentially a global variable, but allocated on the heap during module initialization
|
||||
constinit std::unique_ptr<GlobalBenchmarkRegistry, py::nodelete> global_registry{};
|
||||
|
||||
} // end of anonymous namespace
|
||||
|
||||
// Module initialization: prepares the CUDA environment, binds nvbench's
// CudaStream/Launch/Benchmark/State types, and exposes the module-level
// register()/run_all_benchmarks() entry points.
PYBIND11_MODULE(_nvbench, m)
{
  // == STEP 1
  // Set environment variable CUDA_MODULE_LOADING=EAGER

  // See NVIDIA/NVBench#136 for CUDA_MODULE_LOADING
  set_env("CUDA_MODULE_LOADING", "EAGER");

  NVBENCH_DRIVER_API_CALL(cuInit(0));

  nvbench::benchmark_manager::get().initialize();

  // TODO: Use cuModuleGetLoadingMode(&mode) to confirm that (mode == CU_MODULE_EAGER_LOADING)
  // and issue warning otherwise

  // == STEP 2
  // Define CudaStream class
  // ATTN: nvbench::cuda_stream is move-only class
  // Methods:
  //   Constructors, based on device, or on existing stream
  //   nvbench::cuda_stream::get_stream

  auto py_cuda_stream_cls = py::class_<nvbench::cuda_stream>(m, "CudaStream");

  // __cuda_stream__ protocol: returns a (version, stream-handle) pair.
  py_cuda_stream_cls.def("__cuda_stream__",
                         [](const nvbench::cuda_stream &s) -> std::pair<std::size_t, std::size_t> {
                           return std::make_pair(std::size_t{0},
                                                 reinterpret_cast<std::size_t>(s.get_stream()));
                         });
  // Raw cudaStream_t handle as an integer (e.g. for numba's external_stream).
  py_cuda_stream_cls.def("addressof", [](const nvbench::cuda_stream &s) -> std::size_t {
    return reinterpret_cast<std::size_t>(s.get_stream());
  });

  // == STEP 3
  // Define Launch class
  // ATTN: nvbench::launch is move-only class
  // Methods:
  //   nvbench::launch::get_stream -> nvbench::cuda_stream

  auto py_launch_cls = py::class_<nvbench::launch>(m, "Launch");

  py_launch_cls.def(
    "getStream",
    [](nvbench::launch &launch) { return std::ref(launch.get_stream()); },
    py::return_value_policy::reference);

  // == STEP 4
  // Define Benchmark class

  auto py_benchmark_cls = py::class_<nvbench::benchmark_base>(m, "Benchmark");
  py_benchmark_cls.def("getName", &nvbench::benchmark_base::get_name);
  // Each axis setter returns self by reference so Python callers can chain.
  py_benchmark_cls.def(
    "addInt64Axis",
    [](nvbench::benchmark_base &self, std::string name, const std::vector<nvbench::int64_t> &data) {
      self.add_int64_axis(name, data);
      return std::ref(self);
    },
    py::return_value_policy::reference);
  py_benchmark_cls.def(
    "addFloat64Axis",
    [](nvbench::benchmark_base &self,
       std::string name,
       const std::vector<nvbench::float64_t> &data) {
      self.add_float64_axis(name, data);
      return std::ref(self);
    },
    py::return_value_policy::reference);
  py_benchmark_cls.def(
    "addStringAxis",
    [](nvbench::benchmark_base &self, std::string name, const std::vector<std::string> &data) {
      self.add_string_axis(name, data);
      return std::ref(self);
    },
    py::return_value_policy::reference);

  // == STEP 5
  // Define PyState class
  // ATTN: nvbench::state is move-only class
  // Only a subset of nvbench::state's interface is bound below (axis value
  // getters, element/memory counters, skip handling, sampling/timeout
  // controls, CUPTI metric collection, and exec); see nvbench/state.cuh for
  // the remaining candidates (summaries, stopping criteria, device queries,
  // throughput collection flags, ...).
  // NOTE:
  // PyState wraps std::reference_wrapper<nvbench::state>

  using state_ref_t = std::reference_wrapper<nvbench::state>;  // currently unused
  auto pystate_cls  = py::class_<nvbench::state>(m, "State");

  pystate_cls.def("hasDevice", [](nvbench::state &state) -> bool {
    return static_cast<bool>(state.get_device());
  });
  pystate_cls.def("hasPrinters", [](nvbench::state &state) -> bool {
    return state.get_benchmark().get_printer().has_value();
  });

  pystate_cls.def("getStream", &nvbench::state::get_cuda_stream);

  // Each axis-value getter is overloaded: (name) and (name, default).
  pystate_cls.def("getInt64", &nvbench::state::get_int64);
  pystate_cls.def("getInt64", &nvbench::state::get_int64_or_default);

  pystate_cls.def("getFloat64", &nvbench::state::get_float64);
  pystate_cls.def("getFloat64", &nvbench::state::get_float64_or_default);

  pystate_cls.def("getString", &nvbench::state::get_string);
  pystate_cls.def("getString", &nvbench::state::get_string_or_default);

  pystate_cls.def("addElementCount", &nvbench::state::add_element_count);
  pystate_cls.def("setElementCount", &nvbench::state::set_element_count);
  pystate_cls.def("getElementCount", &nvbench::state::get_element_count);

  pystate_cls.def("skip", &nvbench::state::skip);
  pystate_cls.def("isSkipped", &nvbench::state::is_skipped);
  pystate_cls.def("getSkipReason", &nvbench::state::get_skip_reason);

  pystate_cls.def(
    "addGlobalMemoryReads",
    [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void {
      state.add_global_memory_reads(nbytes, column_name);
    },
    "Add size, in bytes, of global memory reads",
    py::arg("nbytes"),
    py::pos_only{},
    py::arg("column_name") = py::str(""));
  pystate_cls.def(
    "addGlobalMemoryWrites",
    [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void {
      state.add_global_memory_writes(nbytes, column_name);
    },
    "Add size, in bytes, of global memory writes",
    py::arg("nbytes"),
    py::pos_only{},
    py::arg("column_name") = py::str(""));
  pystate_cls.def(
    "getBenchmark",
    [](nvbench::state &state) { return std::ref(state.get_benchmark()); },
    py::return_value_policy::reference);
  pystate_cls.def("getThrottleThreshold", &nvbench::state::get_throttle_threshold);

  pystate_cls.def("getMinSamples", &nvbench::state::get_min_samples);
  pystate_cls.def("setMinSamples", &nvbench::state::set_min_samples);

  pystate_cls.def("getDisableBlockingKernel", &nvbench::state::get_disable_blocking_kernel);
  pystate_cls.def("setDisableBlockingKernel", &nvbench::state::set_disable_blocking_kernel);

  pystate_cls.def("getRunOnce", &nvbench::state::get_run_once);
  pystate_cls.def("setRunOnce", &nvbench::state::set_run_once);

  pystate_cls.def("getTimeout", &nvbench::state::get_timeout);
  pystate_cls.def("setTimeout", &nvbench::state::set_timeout);

  pystate_cls.def("getBlockingKernel", &nvbench::state::get_blocking_kernel_timeout);
  pystate_cls.def("setBlockingKernel", &nvbench::state::set_blocking_kernel_timeout);

  pystate_cls.def("collectCUPTIMetrics", &nvbench::state::collect_cupti_metrics);
  pystate_cls.def("isCUPTIRequired", &nvbench::state::is_cupti_required);

  pystate_cls.def(
    "exec",
    [](nvbench::state &state, py::object fn, bool batched, bool sync) {
      // Bridge the Python launcher into nvbench's launch callback; the
      // Launch object is passed by reference and must not be stored.
      auto launcher_fn = [fn](nvbench::launch &launch_descr) -> void {
        fn(py::cast(std::ref(launch_descr), py::return_value_policy::reference));
      };

      // Map the (sync, batched) flag pair onto nvbench exec tags.
      if (sync)
      {
        if (batched)
        {
          state.exec(nvbench::exec_tag::sync, launcher_fn);
        }
        else
        {
          state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::no_batch, launcher_fn);
        }
      }
      else
      {
        if (batched)
        {
          state.exec(nvbench::exec_tag::none, launcher_fn);
        }
        else
        {
          state.exec(nvbench::exec_tag::no_batch, launcher_fn);
        }
      }
    },
    "Executor for given callable fn(state : Launch)",
    py::arg("fn"),
    py::pos_only{},
    py::arg("batched") = true,
    py::arg("sync") = false);

  // == STEP 6
  // ATTN: nvbench::benchmark_manager is a singleton

  global_registry =
    std::unique_ptr<GlobalBenchmarkRegistry, py::nodelete>(new GlobalBenchmarkRegistry(),
                                                           py::nodelete{});

  m.def(
    "register",
    [&](py::object fn) { return std::ref(global_registry->add_bench(fn)); },
    py::return_value_policy::reference);

  m.def(
    "run_all_benchmarks",
    [&](py::object argv) -> void {
      if (!py::isinstance<py::list>(argv))
      {
        throw py::type_error("run_all_benchmarks expects a list of command-line arguments");
      }
      std::vector<std::string> args = py::cast<std::vector<std::string>>(argv);
      global_registry->run(args);
    },
    "Run all benchmarks",
    py::arg("argv") = py::list());
}
|
||||
108
python/test/run_1.py
Executable file
108
python/test/run_1.py
Executable file
@@ -0,0 +1,108 @@
|
||||
import sys
|
||||
|
||||
import cuda.nvbench as nvbench
|
||||
import numpy as np
|
||||
from numba import cuda
|
||||
|
||||
|
||||
@cuda.jit()
def kernel(a, b, c):
    # Elementwise c = a + b over 1-D device arrays; one thread per element,
    # guarded against the tail of the final block.
    tid = cuda.grid(1)
    size = len(a)

    if tid < size:
        c[tid] = a[tid] + b[tid]
|
||||
|
||||
|
||||
def getNumbaStream(launch):
    """Wrap the nvbench launch's CUDA stream as a Numba external stream."""
    stream_address = launch.getStream().addressof()
    return cuda.external_stream(stream_address)
|
||||
|
||||
|
||||
def add_two(state):
    """Benchmark c = a + a over N random doubles (one input array, reused)."""
    # state.skip("Skipping this benchmark for no reason")
    N = state.getInt64("elements")
    a = cuda.to_device(np.random.random(N))
    c = cuda.device_array_like(a)

    # Report bytes read/written so nvbench can derive bandwidth columns.
    state.addGlobalMemoryReads(a.nbytes)
    state.addGlobalMemoryWrites(c.nbytes)

    nthreads = 256
    nblocks = (len(a) + nthreads - 1) // nthreads

    # Warm-up launch outside the timed region (also triggers numba JIT).
    # First call locks, can't use async benchmarks until sync tag is supported
    kernel[nblocks, nthreads](a, a, c)
    cuda.synchronize()

    def kernel_launcher(launch):
        # Launch on the stream nvbench provides for timing.
        stream = getNumbaStream(launch)
        kernel[nblocks, nthreads, stream](a, a, c)

    state.exec(kernel_launcher, batched=True, sync=True)
|
||||
|
||||
|
||||
def add_float(state):
    """Benchmark c = a + b over N random float32 values.

    The "v" and "name" axes are read but not used by the kernel —
    presumably they exist to exercise float64/string axes producing
    multiple benchmark configurations; confirm with the axis setup in
    register_benchmarks().
    """
    N = state.getInt64("elements")
    v = state.getFloat64("v")
    name = state.getString("name")
    a = cuda.to_device(np.random.random(N).astype(np.float32))
    b = cuda.to_device(np.random.random(N).astype(np.float32))
    c = cuda.device_array_like(a)

    # Report bytes read/written so nvbench can derive bandwidth columns.
    state.addGlobalMemoryReads(a.nbytes + b.nbytes)
    state.addGlobalMemoryWrites(c.nbytes)

    nthreads = 64
    nblocks = (len(a) + nthreads - 1) // nthreads

    def kernel_launcher(launch):
        # Touch the axis values so the closure captures them.
        _ = v
        _ = name
        stream = getNumbaStream(launch)
        kernel[nblocks, nthreads, stream](a, b, c)

    state.exec(kernel_launcher, batched=True, sync=True)
|
||||
|
||||
|
||||
def add_three(state):
    """Benchmark c = a + b over N random float32 values (no extra axes)."""
    N = state.getInt64("elements")
    a = cuda.to_device(np.random.random(N).astype(np.float32))
    b = cuda.to_device(np.random.random(N).astype(np.float32))
    c = cuda.device_array_like(a)

    # Report bytes read/written so nvbench can derive bandwidth columns.
    state.addGlobalMemoryReads(a.nbytes + b.nbytes)
    state.addGlobalMemoryWrites(c.nbytes)

    nthreads = 256
    nblocks = (len(a) + nthreads - 1) // nthreads

    def kernel_launcher(launch):
        stream = getNumbaStream(launch)
        kernel[nblocks, nthreads, stream](a, b, c)

    state.exec(kernel_launcher, batched=True, sync=True)
    # Drain any outstanding work before the benchmark returns.
    cuda.synchronize()
|
||||
|
||||
|
||||
def register_benchmarks():
    """Register the three example benchmarks and configure their axes."""
    elements_large = [2**p for p in range(20, 23)]
    elements_small = [2**p for p in range(20, 22)]

    nvbench.register(add_two).addInt64Axis("elements", elements_large)

    bench = nvbench.register(add_float)
    bench.addFloat64Axis("v", [0.1, 0.3])
    bench.addStringAxis("name", ["Anne", "Lynda"])
    bench.addInt64Axis("elements", elements_large)

    nvbench.register(add_three).addInt64Axis("elements", elements_small)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Register everything, then hand control (and the CLI args) to
    # nvbench's driver.
    register_benchmarks()
    nvbench.run_all_benchmarks(sys.argv)
|
||||
Reference in New Issue
Block a user