mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 09:16:52 +00:00
[CK_TILE] Restructure Tile Engine's benchmarking and profiling (#4769)

## Motivation

This PR restructures the benchmarking and profiling parts of CK Tile's Tile Engine, building on the groundwork from the previous PR https://github.com/ROCm/composable_kernel/pull/3434 and outlined in this [design document](https://amdcloud-my.sharepoint.com/:w:/r/personal/astharai_amd_com/Documents/Restructuring%20Tile%20Engine.docx?d=w14ea28a30718416988ed5ebb759bd3b2&csf=1&web=1&e=l3VBuX).

In PR 3434, to reduce repeated code, we implemented:
- A base class that centralizes common functionality and provides a default implementation (Universal GEMM)
- Child classes for GEMM variants that override virtual functions to handle variant-specific behavior

The refactoring in this PR follows the same process; it should greatly reduce the duplicated code present in Tile Engine and make it simpler to add new operations, increasing scalability.

## Technical Details

The files have been refactored around new base structs for benchmarks, profiling and problem descriptions. The new base structs are:
- GemmProblem
- GemmBenchmark
- GemmProfiler

Universal GEMM, Preshuffle GEMM, and Multi-D GEMM all have child classes that inherit from these base structs, overriding only what differs per variant.

All functions common to the benchmarking and profiling files have been moved into newly added common utility files under the commons/ directory. The new utility files are:
- utils.hpp: common functions for the benchmarking and profiling process
- benchmark_utils.py: common utility functions for the benchmark generation

## Test Plan

I tested using the existing tests for Tile Engine.

## Test Result

All tests passed.

## Submission Checklist

- [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
167 lines
5.1 KiB
C++
167 lines
5.1 KiB
C++
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#pragma once
|
|
#include <hip/hip_version.h>

#include <algorithm>
#include <cstdlib>
#include <exception>
#include <functional>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <type_traits>
#include <vector>

#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
|
|
|
|
// Helper function to determine if a layout is row-major
|
|
template <typename Layout>
|
|
constexpr auto is_row_major(Layout)
|
|
{
|
|
return ck_tile::bool_constant<std::is_same_v<Layout, ck_tile::tensor_layout::gemm::RowMajor>>{};
|
|
}
|
|
|
|
// Criteria by which two performance results can be ranked.
// The numeric values are pinned explicitly so they stay stable if
// enumerators are ever reordered.
enum class Metric
{
    LATENCY   = 0, // PerformanceResult::compare treats the smaller value as better
    TFLOPS    = 1, // PerformanceResult::compare treats the larger value as better
    BANDWIDTH = 2  // PerformanceResult::compare treats the larger value as better
};
|
|
|
|
inline constexpr auto get_metric_name(Metric m)
|
|
{
|
|
switch(m)
|
|
{
|
|
case Metric::LATENCY: return "latency";
|
|
case Metric::TFLOPS: return "tflops";
|
|
case Metric::BANDWIDTH: return "bandwidth";
|
|
default: throw std::invalid_argument("Unsupported metric type");
|
|
}
|
|
}
|
|
|
|
struct PerformanceResult
|
|
{
|
|
double latency_;
|
|
double tflops_;
|
|
double bandwidth_;
|
|
|
|
static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m)
|
|
{
|
|
switch(m)
|
|
{
|
|
case Metric::LATENCY: return a.latency_ < b.latency_;
|
|
case Metric::TFLOPS: return a.tflops_ > b.tflops_;
|
|
case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_;
|
|
default: throw std::invalid_argument("Unsupported metric type");
|
|
}
|
|
}
|
|
};
|
|
|
|
template <typename Problem>
|
|
struct KernelInstance
|
|
{
|
|
std::string name_;
|
|
Problem problem_;
|
|
PerformanceResult perf_result_;
|
|
|
|
static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m)
|
|
{
|
|
return PerformanceResult::compare(a.perf_result_, b.perf_result_, m);
|
|
}
|
|
};
|
|
|
|
template <typename Problem>
|
|
std::ostream& operator<<(std::ostream& os, const KernelInstance<Problem>& obj)
|
|
{
|
|
os << "{\n"
|
|
<< " \"name\": \"" << obj.name_ << "\",\n"
|
|
<< " \"problem\": " << obj.problem_ << ",\n"
|
|
<< " \"perf_result\": " << obj.perf_result_ << "\n"
|
|
<< "}";
|
|
return os;
|
|
}
|
|
|
|
std::ostream& operator<<(std::ostream& os, const PerformanceResult& result)
|
|
{
|
|
os << "{\n"
|
|
<< " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ << ",\n"
|
|
<< " \"tflops(TFlops)\": " << result.tflops_ << ",\n"
|
|
<< " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n"
|
|
<< "}";
|
|
return os;
|
|
}
|
|
|
|
// Run-time options for a benchmark run.
//
// Every scalar member has a default member initializer so that a
// default-initialized `Settings` never holds indeterminate values. The struct
// is still an aggregate (C++14 and later), so positional brace-initialization
// in member order keeps working unchanged.
//
// NOTE(review): the per-field meanings below are inferred from the member
// names only — confirm against the code that fills and consumes this struct.
struct Settings
{
    int n_warmup         = 0;     // presumably the number of warm-up iterations
    int n_repeat         = 0;     // presumably the number of timed iterations
    bool is_gpu_timer    = false; // presumably selects GPU-side timing
    int verify           = 0;     // presumably a verification mode selector
    int init_method      = 0;     // presumably a tensor initialization selector
    bool log             = false; // presumably enables logging
    std::string csv_filename;     // empty by default
    bool flush_cache     = false; // presumably flushes caches between runs
    int rotating_count   = 0;     // presumably a rotating-buffer count
    bool json_output     = false; // presumably enables JSON output
};
|
|
|
|
inline std::string get_rocm_version()
|
|
{
|
|
return std::to_string(HIP_VERSION_MAJOR) + "." + std::to_string(HIP_VERSION_MINOR);
|
|
}
|
|
|
|
template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
|
|
auto calculate_rtol_atol(const ck_tile::index_t K,
|
|
const ck_tile::index_t kbatch,
|
|
const float max_accumulated_value)
|
|
{
|
|
using ComputeType =
|
|
std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
|
|
// Calculate thresholds
|
|
const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
|
|
ck_tile::integer_divide_ceil(K, kbatch));
|
|
const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
|
|
max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
|
|
// Calculate error due to split_k accumulation
|
|
const auto rtol_split_k =
|
|
ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
|
|
const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
|
|
max_accumulated_value, kbatch);
|
|
// Use higher threshold
|
|
return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
|
|
}
|
|
|
|
template <typename ADataType,
|
|
typename BDataType,
|
|
typename D0DataType,
|
|
typename AccDataType,
|
|
typename CDataType>
|
|
auto calculate_rtol_atol(const ck_tile::index_t K,
|
|
const ck_tile::index_t kbatch,
|
|
const float max_accumulated_value)
|
|
{
|
|
using ComputeTypeAB =
|
|
std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
|
|
|
|
using ComputeType =
|
|
std::conditional_t<sizeof(ComputeTypeAB) < sizeof(D0DataType), ComputeTypeAB, D0DataType>;
|
|
|
|
// Calculate thresholds
|
|
const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
|
|
ck_tile::integer_divide_ceil(K, kbatch));
|
|
|
|
const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
|
|
max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
|
|
|
|
// Calculate error due to split_k accumulation
|
|
const auto rtol_split_k =
|
|
ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
|
|
|
|
const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
|
|
max_accumulated_value, kbatch);
|
|
|
|
// Use higher threshold
|
|
return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
|
|
}
|