// Mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-03-29).
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#pragma once
#include <algorithm>
#include <array>
#include <cstddef>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "ck_tile/host/device_prop.hpp"
#include "ck_tile/host/reference/reference_pool.hpp"
#include "ck_tile/ops/pooling.hpp"

#include "pool_benchmark.hpp"
|
class PoolProfiler
|
|
{
|
|
public:
|
|
static PoolProfiler& instance(Setting setting)
|
|
{
|
|
static PoolProfiler instance{setting};
|
|
return instance;
|
|
}
|
|
|
|
// Overload for single kernel benchmarking
|
|
template <typename TensorShape, typename WindowShape>
|
|
void benchmark(PoolProblem& pool_problem,
|
|
std::function<float(const ck_tile::PoolHostArgs<TensorShape, WindowShape>&,
|
|
const ck_tile::stream_config&)> kernel_func)
|
|
{
|
|
// Create a vector with a single callable that returns both name and time
|
|
std::vector<std::function<std::tuple<std::string, float>(
|
|
ck_tile::PoolHostArgs<TensorShape, WindowShape>&, const ck_tile::stream_config&)>>
|
|
callables;
|
|
|
|
callables.push_back([kernel_func](ck_tile::PoolHostArgs<TensorShape, WindowShape>& args,
|
|
const ck_tile::stream_config& stream) {
|
|
float time = kernel_func(args, stream);
|
|
return std::make_tuple(std::string(KERNEL_NAME), time);
|
|
});
|
|
|
|
benchmark(pool_problem, callables);
|
|
}
|
|
|
|
template <typename TensorShape, typename WindowShape>
|
|
void benchmark(
|
|
PoolProblem& pool_problem,
|
|
std::vector<std::function<std::tuple<std::string, float>(
|
|
ck_tile::PoolHostArgs<TensorShape, WindowShape>&, const ck_tile::stream_config&)>>&
|
|
callables)
|
|
{
|
|
// Calculate output dimensions based on pool dimension
|
|
const ck_tile::index_t N = pool_problem.N;
|
|
const ck_tile::index_t D = pool_problem.D;
|
|
const ck_tile::index_t H = pool_problem.H;
|
|
const ck_tile::index_t W = pool_problem.W;
|
|
const ck_tile::index_t C = pool_problem.C;
|
|
|
|
const ck_tile::index_t Z = pool_problem.windowZ;
|
|
const ck_tile::index_t Y = pool_problem.windowY;
|
|
const ck_tile::index_t X = pool_problem.windowX;
|
|
|
|
const ck_tile::index_t Sz = pool_problem.strideZ;
|
|
const ck_tile::index_t Sy = pool_problem.strideY;
|
|
const ck_tile::index_t Sx = pool_problem.strideX;
|
|
|
|
const ck_tile::index_t Dz = pool_problem.dilationZ;
|
|
const ck_tile::index_t Dy = pool_problem.dilationY;
|
|
const ck_tile::index_t Dx = pool_problem.dilationX;
|
|
|
|
const ck_tile::index_t LeftPz = pool_problem.leftPadZ;
|
|
const ck_tile::index_t LeftPy = pool_problem.leftPadY;
|
|
const ck_tile::index_t LeftPx = pool_problem.leftPadX;
|
|
const ck_tile::index_t RightPz = pool_problem.rightPadZ;
|
|
const ck_tile::index_t RightPy = pool_problem.rightPadY;
|
|
const ck_tile::index_t RightPx = pool_problem.rightPadX;
|
|
|
|
// Calculate effective window sizes
|
|
const ck_tile::index_t Zs = (Z - 1) * Dz + 1;
|
|
const ck_tile::index_t Ys = (Y - 1) * Dy + 1;
|
|
const ck_tile::index_t Xs = (X - 1) * Dx + 1;
|
|
|
|
// Calculate output dimensions
|
|
const ck_tile::index_t Do = (D + LeftPz + RightPz - Zs) / Sz + 1;
|
|
const ck_tile::index_t Ho = (H + LeftPy + RightPy - Ys) / Sy + 1;
|
|
const ck_tile::index_t Wo = (W + LeftPx + RightPx - Xs) / Sx + 1;
|
|
|
|
// Create input/output tensors based on pool dimension (3D: NDHWC, 2D: NHWC)
|
|
ck_tile::HostTensor<InDataType> in_tensor(
|
|
pool_problem.poolDim == 3 ? std::vector<std::size_t>{static_cast<std::size_t>(N),
|
|
static_cast<std::size_t>(D),
|
|
static_cast<std::size_t>(H),
|
|
static_cast<std::size_t>(W),
|
|
static_cast<std::size_t>(C)}
|
|
: std::vector<std::size_t>{static_cast<std::size_t>(N),
|
|
static_cast<std::size_t>(H),
|
|
static_cast<std::size_t>(W),
|
|
static_cast<std::size_t>(C)});
|
|
|
|
ck_tile::HostTensor<OutDataType> out_tensor(
|
|
pool_problem.poolDim == 3 ? std::vector<std::size_t>{static_cast<std::size_t>(N),
|
|
static_cast<std::size_t>(Do),
|
|
static_cast<std::size_t>(Ho),
|
|
static_cast<std::size_t>(Wo),
|
|
static_cast<std::size_t>(C)}
|
|
: std::vector<std::size_t>{static_cast<std::size_t>(N),
|
|
static_cast<std::size_t>(Ho),
|
|
static_cast<std::size_t>(Wo),
|
|
static_cast<std::size_t>(C)});
|
|
|
|
ck_tile::HostTensor<OutDataType> out_host_result(
|
|
pool_problem.poolDim == 3 ? std::vector<std::size_t>{static_cast<std::size_t>(N),
|
|
static_cast<std::size_t>(Do),
|
|
static_cast<std::size_t>(Ho),
|
|
static_cast<std::size_t>(Wo),
|
|
static_cast<std::size_t>(C)}
|
|
: std::vector<std::size_t>{static_cast<std::size_t>(N),
|
|
static_cast<std::size_t>(Ho),
|
|
static_cast<std::size_t>(Wo),
|
|
static_cast<std::size_t>(C)});
|
|
|
|
ck_tile::HostTensor<IndexDataType> out_index_tensor(
|
|
pool_problem.outputIndex ? (pool_problem.poolDim == 3
|
|
? std::vector<std::size_t>{static_cast<std::size_t>(N),
|
|
static_cast<std::size_t>(Do),
|
|
static_cast<std::size_t>(Ho),
|
|
static_cast<std::size_t>(Wo),
|
|
static_cast<std::size_t>(C)}
|
|
: std::vector<std::size_t>{static_cast<std::size_t>(N),
|
|
static_cast<std::size_t>(Ho),
|
|
static_cast<std::size_t>(Wo),
|
|
static_cast<std::size_t>(C)})
|
|
: std::vector<std::size_t>{1});
|
|
|
|
ck_tile::HostTensor<IndexDataType> out_index_host_result(
|
|
pool_problem.outputIndex ? (pool_problem.poolDim == 3
|
|
? std::vector<std::size_t>{static_cast<std::size_t>(N),
|
|
static_cast<std::size_t>(Do),
|
|
static_cast<std::size_t>(Ho),
|
|
static_cast<std::size_t>(Wo),
|
|
static_cast<std::size_t>(C)}
|
|
: std::vector<std::size_t>{static_cast<std::size_t>(N),
|
|
static_cast<std::size_t>(Ho),
|
|
static_cast<std::size_t>(Wo),
|
|
static_cast<std::size_t>(C)})
|
|
: std::vector<std::size_t>{1});
|
|
|
|
// Initialize input tensor
|
|
if(setting_.init_method_ == 0)
|
|
{
|
|
ck_tile::FillUniformDistribution<InDataType>{-5.f, 5.f}(in_tensor);
|
|
}
|
|
else if(setting_.init_method_ == 1)
|
|
{
|
|
ck_tile::FillMonotonicSeq<InDataType>{}(in_tensor);
|
|
}
|
|
else if(setting_.init_method_ == 2)
|
|
{
|
|
ck_tile::FillConstant<InDataType>{static_cast<InDataType>(1)}(in_tensor);
|
|
}
|
|
else
|
|
{
|
|
in_tensor.SetZero();
|
|
}
|
|
|
|
// Allocate device memory
|
|
ck_tile::DeviceMem in_dev_buf(in_tensor.get_element_space_size_in_bytes());
|
|
ck_tile::DeviceMem out_dev_buf(out_tensor.get_element_space_size_in_bytes());
|
|
ck_tile::DeviceMem out_index_dev_buf(
|
|
pool_problem.outputIndex ? out_index_tensor.get_element_space_size_in_bytes() : 0);
|
|
|
|
in_dev_buf.ToDevice(in_tensor.data());
|
|
out_dev_buf.SetZero();
|
|
if(pool_problem.outputIndex)
|
|
{
|
|
out_index_dev_buf.SetZero();
|
|
}
|
|
|
|
// Create shapes for host args
|
|
TensorShape input_shape, output_shape, input_strides, output_strides;
|
|
WindowShape window_lengths, window_strides, window_dilations, input_left_pads,
|
|
input_right_pads;
|
|
|
|
// Create host arguments
|
|
ck_tile::PoolHostArgs<TensorShape, WindowShape> pool_args{
|
|
in_dev_buf.GetDeviceBuffer(),
|
|
out_dev_buf.GetDeviceBuffer(),
|
|
pool_problem.outputIndex ? out_index_dev_buf.GetDeviceBuffer() : nullptr,
|
|
input_shape,
|
|
output_shape,
|
|
input_strides,
|
|
output_strides,
|
|
window_lengths,
|
|
window_strides,
|
|
window_dilations,
|
|
input_left_pads,
|
|
input_right_pads};
|
|
|
|
// Run reference if verification is enabled
|
|
//
|
|
// Note:
|
|
// A host-side reference implementation is not invoked here. Although
|
|
// `ck_tile/host/reference/reference_pool.hpp` is available, this
|
|
// profiler operates directly on device buffers with generic
|
|
// TensorShape/WindowShape types, and a matching reference path has
|
|
// not yet been wired up for this configuration. Verification of
|
|
// pooling results is therefore expected to be performed by the
|
|
// caller or by a higher-level test harness.
|
|
//
|
|
// To make this limitation visible, emit a one-time warning if
|
|
// verification was requested for this profiler.
|
|
if(setting_.verify_)
|
|
{
|
|
static bool warned = false;
|
|
if(!warned)
|
|
{
|
|
warned = true;
|
|
std::cerr << "Warning: PoolProfiler verification requested, but no "
|
|
"host reference pooling implementation is invoked in "
|
|
"this configuration. Results are not being verified "
|
|
"by PoolProfiler."
|
|
<< std::endl;
|
|
}
|
|
}
|
|
|
|
for(auto& callable : callables)
|
|
{
|
|
auto kernel_run_result = callable(pool_args,
|
|
ck_tile::stream_config{nullptr,
|
|
true,
|
|
setting_.log_,
|
|
setting_.n_warmup_,
|
|
setting_.n_repeat_,
|
|
setting_.is_gpu_timer_,
|
|
setting_.flush_cache_,
|
|
setting_.rotating_count_});
|
|
process_result(pool_problem,
|
|
out_dev_buf,
|
|
out_host_result,
|
|
out_tensor,
|
|
out_index_dev_buf,
|
|
out_index_host_result,
|
|
out_index_tensor,
|
|
kernel_run_result);
|
|
}
|
|
}
|
|
|
|
void process_result(const PoolProblem& pool_problem,
|
|
ck_tile::DeviceMem& out_dev_buf,
|
|
ck_tile::HostTensor<OutDataType>& out_host_result,
|
|
ck_tile::HostTensor<OutDataType>& out_dev_result,
|
|
ck_tile::DeviceMem& out_index_dev_buf,
|
|
ck_tile::HostTensor<IndexDataType>& out_index_host_result,
|
|
ck_tile::HostTensor<IndexDataType>& out_index_dev_result,
|
|
const std::tuple<std::string, float>& kernel_run_result)
|
|
{
|
|
auto [name, avg_time] = kernel_run_result;
|
|
|
|
KernelInstance kernel_instance{name, pool_problem, {-1.0f, -1.0f, -1.0f}};
|
|
|
|
// Compute performance metrics
|
|
const ck_tile::index_t N = pool_problem.N;
|
|
const ck_tile::index_t D = pool_problem.D;
|
|
const ck_tile::index_t H = pool_problem.H;
|
|
const ck_tile::index_t W = pool_problem.W;
|
|
const ck_tile::index_t C = pool_problem.C;
|
|
const ck_tile::index_t Z = pool_problem.windowZ;
|
|
const ck_tile::index_t Y = pool_problem.windowY;
|
|
const ck_tile::index_t X = pool_problem.windowX;
|
|
const ck_tile::index_t Sz = pool_problem.strideZ;
|
|
const ck_tile::index_t Sy = pool_problem.strideY;
|
|
const ck_tile::index_t Sx = pool_problem.strideX;
|
|
const ck_tile::index_t Dz = pool_problem.dilationZ;
|
|
const ck_tile::index_t Dy = pool_problem.dilationY;
|
|
const ck_tile::index_t Dx = pool_problem.dilationX;
|
|
|
|
const ck_tile::index_t Zs = (Z - 1) * Dz + 1;
|
|
const ck_tile::index_t Ys = (Y - 1) * Dy + 1;
|
|
const ck_tile::index_t Xs = (X - 1) * Dx + 1;
|
|
|
|
const ck_tile::index_t Do =
|
|
(D + pool_problem.leftPadZ + pool_problem.rightPadZ - Zs) / Sz + 1;
|
|
const ck_tile::index_t Ho =
|
|
(H + pool_problem.leftPadY + pool_problem.rightPadY - Ys) / Sy + 1;
|
|
const ck_tile::index_t Wo =
|
|
(W + pool_problem.leftPadX + pool_problem.rightPadX - Xs) / Sx + 1;
|
|
|
|
// Calculate FLOPs: for pooling, we count one compare/add per window element per output
|
|
// element
|
|
std::size_t window_size =
|
|
static_cast<std::size_t>(Z) * static_cast<std::size_t>(Y) * static_cast<std::size_t>(X);
|
|
std::size_t output_elements = static_cast<std::size_t>(N) * static_cast<std::size_t>(Do) *
|
|
static_cast<std::size_t>(Ho) * static_cast<std::size_t>(Wo) *
|
|
static_cast<std::size_t>(C);
|
|
std::size_t flop = output_elements * window_size;
|
|
|
|
// Calculate memory bandwidth
|
|
std::size_t num_byte =
|
|
sizeof(InDataType) * N * D * H * W * C + sizeof(OutDataType) * N * Do * Ho * Wo * C;
|
|
|
|
// Update performance results
|
|
kernel_instance.perf_result_.latency_ = avg_time;
|
|
kernel_instance.perf_result_.tflops_ = static_cast<float>(flop) / 1.E9 / avg_time;
|
|
kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time;
|
|
|
|
if(setting_.log_ > 0 && !setting_.json_output_)
|
|
{
|
|
std::cout << kernel_instance << std::endl;
|
|
}
|
|
|
|
// Verify result
|
|
out_dev_buf.FromDevice(out_dev_result.data());
|
|
|
|
bool verified_correct = true;
|
|
if(setting_.verify_)
|
|
{
|
|
verified_correct = compare_pool_results(name, out_dev_result, out_host_result);
|
|
if(pool_problem.outputIndex)
|
|
{
|
|
out_index_dev_buf.FromDevice(out_index_dev_result.data());
|
|
verified_correct =
|
|
verified_correct &&
|
|
compare_pool_index_results(name, out_index_dev_result, out_index_host_result);
|
|
}
|
|
}
|
|
|
|
if(verified_correct)
|
|
{
|
|
kernel_instances_.emplace_back(kernel_instance);
|
|
}
|
|
else
|
|
{
|
|
std::cout << "Verification failed, skip kernel: " << name << std::endl;
|
|
}
|
|
|
|
// Clear tensors
|
|
out_dev_buf.SetZero();
|
|
out_dev_result.SetZero();
|
|
}
|
|
|
|
KernelInstance select_best_instance(Metric metric)
|
|
{
|
|
if(kernel_instances_.empty())
|
|
throw std::runtime_error("Empty instances");
|
|
|
|
auto kernel_instance = *std::max_element(kernel_instances_.begin(),
|
|
kernel_instances_.end(),
|
|
[metric](const auto& a, const auto& b) {
|
|
return PerformanceResult::compare(
|
|
b.perf_result_, a.perf_result_, metric);
|
|
});
|
|
|
|
if(setting_.json_output_)
|
|
{
|
|
// Output clean JSON only
|
|
std::cout << kernel_instance << std::endl;
|
|
}
|
|
else
|
|
{
|
|
std::cout << "**********************************" << std::endl;
|
|
std::cout << "According to given metrics: " << get_metric_name(metric) << "\n"
|
|
<< "Current kernel performance is: " << kernel_instance << std::endl;
|
|
std::cout << "**********************************" << std::endl;
|
|
}
|
|
|
|
if(!setting_.csv_filename_.empty())
|
|
{
|
|
std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app);
|
|
|
|
if(!file.is_open())
|
|
{
|
|
std::cerr << "Warning: Failed to open CSV file for writing." << std::endl;
|
|
}
|
|
else
|
|
{
|
|
if(file.tellp() == 0)
|
|
{
|
|
file << "rocm_version,device_name,"
|
|
<< "in_dtype,out_dtype,compute_dtype,index_dtype,"
|
|
<< "block_shape,reduce_op,pool_dim," << "N,D,H,W,C,"
|
|
<< "window_z,window_y,window_x," << "stride_z,stride_y,stride_x,"
|
|
<< "dilation_z,dilation_y,dilation_x,"
|
|
<< "left_pad_z,left_pad_y,left_pad_x,"
|
|
<< "right_pad_z,right_pad_y,right_pad_x," << "output_index,propagate_nan,"
|
|
<< "name," << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n";
|
|
}
|
|
|
|
const auto& problem = kernel_instance.problem_;
|
|
const auto& name = kernel_instance.name_;
|
|
const auto& perf = kernel_instance.perf_result_;
|
|
|
|
file << get_rocm_version() << "," << ck_tile::get_device_name() << ","
|
|
<< problem.inDType << "," << problem.outDType << "," << problem.computeDType
|
|
<< "," << problem.indexDType << "," << problem.blockShape << ","
|
|
<< problem.reduceOp << "," << problem.poolDim << "," << problem.N << ","
|
|
<< problem.D << "," << problem.H << "," << problem.W << "," << problem.C << ","
|
|
<< problem.windowZ << "," << problem.windowY << "," << problem.windowX << ","
|
|
<< problem.strideZ << "," << problem.strideY << "," << problem.strideX << ","
|
|
<< problem.dilationZ << "," << problem.dilationY << "," << problem.dilationX
|
|
<< "," << problem.leftPadZ << "," << problem.leftPadY << ","
|
|
<< problem.leftPadX << "," << problem.rightPadZ << "," << problem.rightPadY
|
|
<< "," << problem.rightPadX << "," << problem.outputIndex << ","
|
|
<< problem.propagateNan << "," << name << "," << std::fixed
|
|
<< std::setprecision(4) << perf.latency_ << "," << std::fixed
|
|
<< std::setprecision(4) << perf.tflops_ << "," << std::fixed
|
|
<< std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric)
|
|
<< "\n";
|
|
|
|
if(!file)
|
|
{
|
|
std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl;
|
|
}
|
|
}
|
|
}
|
|
|
|
return kernel_instance;
|
|
}
|
|
|
|
PoolProfiler(const PoolProfiler&) = delete;
|
|
PoolProfiler& operator=(const PoolProfiler&) = delete;
|
|
|
|
private:
|
|
~PoolProfiler() { kernel_instances_.clear(); }
|
|
PoolProfiler(Setting setting) : setting_(setting) {}
|
|
|
|
Setting setting_;
|
|
|
|
std::vector<KernelInstance> kernel_instances_;
|
|
};
|