mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-04-20 14:58:54 +00:00
Add the first example.
- New NVBench_ENABLE_EXAMPLES CMake option.
- examples/axes.cu provides examples of parameter sweeps.
- Moves testing/sleep_kernel.cuh -> nvbench/test_kernels.cuh
- Accessible to examples and provides some built-in kernels for users
to experiment with.
- Not included with `<nvbench/nvbench.cuh>`.
This commit is contained in:
@@ -5,6 +5,7 @@ cmake_minimum_required(VERSION 3.18.3)
|
||||
project(NVBench CUDA CXX)
|
||||
|
||||
option(NVBench_ENABLE_TESTING "Build NVBench testing suite." OFF)
|
||||
option(NVBench_ENABLE_EXAMPLES "Build NVBench examples." OFF)
|
||||
|
||||
# Setup some vars for CPM packages:
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/")
|
||||
@@ -44,7 +45,14 @@ endif()
|
||||
|
||||
add_subdirectory(nvbench)
|
||||
|
||||
if (NVBench_ENABLE_TESTING)
|
||||
if (NVBench_ENABLE_EXAMPLES OR NVBench_ENABLE_TESTING)
|
||||
enable_testing()
|
||||
endif()
|
||||
|
||||
if (NVBench_ENABLE_EXAMPLES)
|
||||
add_subdirectory(examples)
|
||||
endif()
|
||||
|
||||
if (NVBench_ENABLE_TESTING)
|
||||
add_subdirectory(testing)
|
||||
endif()
|
||||
|
||||
18
examples/CMakeLists.txt
Normal file
18
examples/CMakeLists.txt
Normal file
@@ -0,0 +1,18 @@
|
||||
# Sources of the example programs. Every entry listed here is built into its
# own executable and registered as a CTest test by the loop below.
set(example_srcs
  axes.cu
)

foreach(example_src IN LISTS example_srcs)
  # Derive the target name from the file name without its last extension,
  # e.g. axes.cu -> nvbench.example.axes
  get_filename_component(example_name "${example_src}" NAME_WLE)
  string(PREPEND example_name "nvbench.example.")

  add_executable(${example_name} "${example_src}")
  target_include_directories(${example_name}
    PRIVATE "${CMAKE_CURRENT_LIST_DIR}"
  )
  target_link_libraries(${example_name} PRIVATE nvbench::main)

  # Request CUDA C++17 through the dedicated command rather than by writing
  # the COMPILE_FEATURES property directly: target_compile_features() appends
  # to the property instead of overwriting it.
  target_compile_features(${example_name} PRIVATE cuda_std_17)

  # Place build artifacts in the project-wide output directories.
  set_target_properties(${example_name} PROPERTIES
    ARCHIVE_OUTPUT_DIRECTORY "${NVBench_LIBRARY_OUTPUT_DIR}"
    LIBRARY_OUTPUT_DIRECTORY "${NVBench_LIBRARY_OUTPUT_DIR}"
    RUNTIME_OUTPUT_DIRECTORY "${NVBench_EXECUTABLE_OUTPUT_DIR}"
  )

  # Each example doubles as a test: running the executable is the test.
  add_test(NAME ${example_name} COMMAND "$<TARGET_FILE:${example_name}>")
endforeach()
|
||||
177
examples/axes.cu
Normal file
177
examples/axes.cu
Normal file
@@ -0,0 +1,177 @@
|
||||
/*
|
||||
* Copyright 2020 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <nvbench/nvbench.cuh>
|
||||
|
||||
// Grab some testing kernels from NVBench:
|
||||
#include <nvbench/test_kernels.cuh>
|
||||
|
||||
// Thrust vectors simplify memory management:
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
//==============================================================================
|
||||
// Minimal benchmark: no parameter axes, one fixed kernel launch.
void simple(nvbench::state &state)
{
  // Time the sleep kernel is asked to spin for, in seconds (1 millisecond).
  const double sleep_seconds = 1e-3;

  // The launch is issued on the stream handed to us through `launch`.
  state.exec([sleep_seconds](nvbench::launch &launch) {
    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(sleep_seconds);
  });
}
NVBENCH_BENCH(simple);
|
||||
|
||||
//==============================================================================
|
||||
// Sweep over a single float64 axis:
// The "Duration" axis supplies the number of seconds the kernel sleeps.
void single_float64_axis(nvbench::state &state)
{
  // Value of the "Duration" axis for the configuration currently being run.
  const auto sleep_seconds = state.get_float64("Duration");

  state.exec([sleep_seconds](nvbench::launch &launch) {
    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(sleep_seconds);
  });
}
NVBENCH_BENCH(single_float64_axis)
  // 0 -> 1 ms in 100 us increments.
  .add_float64_axis("Duration", nvbench::range(0., 1e-3, 1e-4));
|
||||
|
||||
//==============================================================================
|
||||
// Multiple parameters:
// Varies block_size and num_blocks while invoking a naive copy of 256 MiB worth
// of int32_t.
void copy_sweep_grid_shape(nvbench::state &state)
{
  // Get current parameters:
  const int block_size = static_cast<int>(state.get_int64("BlockSize"));
  const int num_blocks = static_cast<int>(state.get_int64("NumBlocks"));

  // Number of int32's in 256 MiB:
  const std::size_t num_values = 256 * 1024 * 1024 / sizeof(nvbench::int32_t);

  // Report throughput stats:
  state.add_element_count(num_values);
  state.add_global_memory_reads<nvbench::int32_t>(num_values);
  state.add_global_memory_writes<nvbench::int32_t>(num_values);

  // Allocate device memory:
  thrust::device_vector<nvbench::int32_t> in(num_values, 0);
  thrust::device_vector<nvbench::int32_t> out(num_values, 0);

  // Only raw pointers are captured; the vectors above own the memory and
  // outlive the call to state.exec().
  state.exec(
    [block_size,
     num_blocks,
     num_values,
     in_ptr  = thrust::raw_pointer_cast(in.data()),
     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
      // Launch on the stream provided by `launch` rather than the default
      // stream, matching how the other examples enqueue their kernels.
      nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(
        in_ptr,
        out_ptr,
        num_values);
    });
}
NVBENCH_BENCH(copy_sweep_grid_shape)
  // Alternating powers of two between 64->1024
  .add_int64_power_of_two_axis("BlockSize", nvbench::range(6, 10, 2))
  .add_int64_power_of_two_axis("NumBlocks", nvbench::range(6, 10, 2));
|
||||
|
||||
//==============================================================================
|
||||
// Type parameter sweep:
// Copy 256 MiB of data, represented with various value_types.
// ValueType is deduced from the nvbench::type_list tag argument.
template <typename ValueType>
void copy_type_sweep(nvbench::state &state, nvbench::type_list<ValueType>)
{
  // Number of ValueType's in 256 MiB:
  const std::size_t num_values = 256 * 1024 * 1024 / sizeof(ValueType);

  // Report throughput stats:
  state.add_element_count(num_values);
  state.add_global_memory_reads<ValueType>(num_values);
  state.add_global_memory_writes<ValueType>(num_values);

  // Allocate device memory:
  thrust::device_vector<ValueType> in(num_values, 0);
  thrust::device_vector<ValueType> out(num_values, 0);

  // Only raw pointers are captured; the vectors above own the memory and
  // outlive the call to state.exec().
  state.exec(
    [num_values,
     in_ptr  = thrust::raw_pointer_cast(in.data()),
     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
      // Launch on the stream provided by `launch` rather than the default
      // stream, matching how the other examples enqueue their kernels.
      nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
                                                                 out_ptr,
                                                                 num_values);
    });
}
// Define a type_list to use for the type axis:
using cts_types = nvbench::type_list<nvbench::uint8_t,
                                     nvbench::uint16_t,
                                     nvbench::uint32_t,
                                     nvbench::uint64_t,
                                     nvbench::float32_t,
                                     nvbench::float64_t>;
NVBENCH_BENCH_TYPES(copy_type_sweep, NVBENCH_TYPE_AXES(cts_types));
|
||||
|
||||
//==============================================================================
|
||||
// Multiple type parameter sweep:
// Convert 64 MiB of InputTypes to OutputTypes, represented with various
// value_types.
template <typename InputType, typename OutputType>
void copy_type_conversion_sweep(nvbench::state &state,
                                nvbench::type_list<InputType, OutputType>)
{
  // Optional: Skip narrowing conversions.
  // - Still run for lossy same-size int->float.
  // - This could be done at compile-time with SFINAE to avoid instantiation.
  if (sizeof(InputType) > sizeof(OutputType))
  {
    state.skip("Narrowing conversion: sizeof(InputType) > sizeof(OutputType).");
    // Return immediately so that a skipped configuration does not go on to
    // allocate device buffers and launch the kernel below.
    return;
  }

  // Number of InputType's in 64 MiB:
  const std::size_t num_values = 64 * 1024 * 1024 / sizeof(InputType);

  // Report throughput stats:
  state.add_element_count(num_values);
  state.add_global_memory_reads<InputType>(num_values);
  state.add_global_memory_writes<OutputType>(num_values);

  // Allocate device memory:
  thrust::device_vector<InputType> in(num_values, 0);
  thrust::device_vector<OutputType> out(num_values, 0);

  // Only raw pointers are captured; the vectors above own the memory and
  // outlive the call to state.exec().
  state.exec(
    [num_values,
     in_ptr  = thrust::raw_pointer_cast(in.data()),
     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
      // Launch on the stream provided by `launch` rather than the default
      // stream, matching how the other examples enqueue their kernels.
      nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
                                                                 out_ptr,
                                                                 num_values);
    });
}
// Optional: Skip when InputType == OutputType. This overload is selected for
// identical types, so the general benchmark body above is not instantiated
// for those combinations.
template <typename T>
void copy_type_conversion_sweep(nvbench::state &state, nvbench::type_list<T, T>)
{
  state.skip("Not a conversion: InputType == OutputType.");
}
// The same type_list is used for both inputs/outputs.
using ctcs_types = nvbench::type_list<nvbench::int8_t,
                                      nvbench::int16_t,
                                      nvbench::int32_t,
                                      nvbench::float32_t,
                                      nvbench::int64_t,
                                      nvbench::float64_t>;
NVBENCH_BENCH_TYPES(copy_type_conversion_sweep,
                    NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
  .set_type_axes_names({"In", "Out"});
|
||||
@@ -22,6 +22,17 @@
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
/*!
|
||||
* @file test_kernels.cuh
|
||||
* A collection of simple kernels for testing purposes.
|
||||
*/
|
||||
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
/*!
|
||||
* Each launched thread just sleeps for `seconds`.
|
||||
*/
|
||||
__global__ void sleep_kernel(double seconds)
|
||||
{
|
||||
const auto start = cuda::std::chrono::high_resolution_clock::now();
|
||||
@@ -35,3 +46,20 @@ __global__ void sleep_kernel(double seconds)
|
||||
now = cuda::std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
}
|
||||
|
||||
/*!
 * Naive copy of `n` values from `in` -> `out`.
 *
 * Each element is converted with `static_cast<U>`. The kernel uses a
 * grid-stride loop over the x dimension, so any 1D launch configuration
 * covers all `n` elements.
 *
 * @param in  Pointer to `n` readable input values.
 * @param out Pointer to `n` writable output values.
 * @param n   Number of elements to copy.
 */
template <typename T, typename U>
__global__ void copy_kernel(const T* in, U* out, std::size_t n)
{
  // The built-in index variables are 32-bit. Widen before multiplying so the
  // index and stride cannot wrap, which would otherwise prevent the loop from
  // terminating once `n` exceeds the 32-bit range.
  const std::size_t init =
    static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const std::size_t step = static_cast<std::size_t>(blockDim.x) * gridDim.x;

  for (std::size_t i = init; i < n; i += step)
  {
    out[i] = static_cast<U>(in[i]);
  }
}
|
||||
|
||||
}
|
||||
@@ -19,9 +19,9 @@
|
||||
#include <nvbench/cuda_timer.cuh>
|
||||
|
||||
#include <nvbench/cuda_stream.cuh>
|
||||
#include <nvbench/test_kernels.cuh>
|
||||
#include <nvbench/types.cuh>
|
||||
|
||||
#include "sleep_kernel.cuh"
|
||||
#include "test_asserts.cuh"
|
||||
|
||||
#include <fmt/format.h>
|
||||
@@ -35,7 +35,7 @@ void test_basic(cudaStream_t time_stream,
|
||||
NVBENCH_CUDA_CALL(cudaDeviceSynchronize());
|
||||
|
||||
timer.start(time_stream);
|
||||
sleep_kernel<<<1, 1, 0, exec_stream>>>(0.25);
|
||||
nvbench::sleep_kernel<<<1, 1, 0, exec_stream>>>(0.25);
|
||||
timer.stop(time_stream);
|
||||
|
||||
NVBENCH_CUDA_CALL(cudaDeviceSynchronize());
|
||||
|
||||
Reference in New Issue
Block a user