mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 17:26:00 +00:00
[CK_TILE] Restructure Tile Engine's benchmarking and profiling (#4769) ## Motivation This PR introduces a restructure for the benchmarking and profiling aspects of CK Tile's Tile Engine, expanding on the groundwork from this previous https://github.com/ROCm/composable_kernel/pull/3434 and outlined in this [design document](https://amdcloud-my.sharepoint.com/:w:/r/personal/astharai_amd_com/Documents/Restructuring%20Tile%20Engine.docx?d=w14ea28a30718416988ed5ebb759bd3b2&csf=1&web=1&e=l3VBuX). In PR 3434, to reduce repeated code we implemented: - Base class that centralizes common functionality and provides a default implementation (Universal GEMM) - Child classes for GEMM variants override virtual functions to handle variant-specific behavior This refactoring in this PR follows the same process and should greatly reduce the duplicated code present in Tile Engine and make it simpler to add in new operations, increasing scalability. ## Technical Details The files have been refactored around new base structs for benchmarks, profiling and problem descriptions. The new base structs are: - GemmProblem - GemmBenchmark - GemmProfiler Universal GEMM, Preshuffle GEMM, and Multi-D GEMM all have child classes that will inherit from these base structs overriding only what differs per variant. All common functions across the benchmarking and profiling files have been moved into newly added common utility files under the commons/ directory. The new utility files are: - utils.hpp: common functions for the benchmarking and profiling process - benchmark_utils.py: common utility functions for the benchmark generation ## Test Plan I tested using the existing tests for Tile Engine. ## Test Result All tests passed. ## Submission Checklist - [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
97 lines
3.7 KiB
C++
97 lines
3.7 KiB
C++
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#pragma once
|
|
#include <iostream>
|
|
#include <functional>
|
|
#include <tuple>
|
|
#include <exception>
|
|
#include <sstream>
|
|
#include <vector>
|
|
#include <string>
|
|
|
|
#include "ck_tile/core.hpp"
|
|
#include "ck_tile/host.hpp"
|
|
|
|
// Structure to hold kernel traits for dispatcher
|
|
struct KernelTraits
|
|
{
|
|
std::string pipeline; // compv3, compv4, mem
|
|
std::string scheduler; // intrawave, interwave
|
|
std::string epilogue; // cshuffle, default
|
|
bool pad_m;
|
|
bool pad_n;
|
|
bool pad_k;
|
|
bool persistent;
|
|
|
|
// Constructor with defaults
|
|
KernelTraits()
|
|
: pipeline("compv3"),
|
|
scheduler("intrawave"),
|
|
epilogue("cshuffle"),
|
|
pad_m(false),
|
|
pad_n(false),
|
|
pad_k(false),
|
|
persistent(false)
|
|
{
|
|
}
|
|
};
|
|
|
|
// Create argument parser
|
|
inline auto create_args(int argc, char* argv[])
|
|
{
|
|
ck_tile::ArgParser arg_parser;
|
|
arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
|
|
.insert("n", "4096", "The value for n dimension. Default is 4096.")
|
|
.insert("k", "2048", "The value for k dimension. Default is 2048.")
|
|
.insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
|
|
.insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
|
|
.insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.")
|
|
.insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
|
|
.insert("split_k", "1", "The split value for k dimension. Default is 1.")
|
|
.insert("verify",
|
|
"2",
|
|
"The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
|
|
"for validation on GPU. Default is 2, GPU validation.")
|
|
.insert("log",
|
|
"false",
|
|
"Whether output kernel instance information or not. Possible values are true or "
|
|
"false. Default is false")
|
|
.insert(
|
|
"warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
|
|
.insert(
|
|
"repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
|
|
.insert("timer",
|
|
"true",
|
|
"Whether if the timer is gpu timer or not. Possible values are false or true. "
|
|
"Default is true.")
|
|
.insert("init",
|
|
"0",
|
|
"The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
|
|
"for constant(1). Default is 0, random.")
|
|
.insert("flush_cache",
|
|
"true",
|
|
"To flush cache, possible values are true or false. "
|
|
"Default is false.")
|
|
.insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.")
|
|
.insert("metric",
|
|
"0",
|
|
"Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
|
|
"tflops, or 2 for bandwidth. Default is 0, latency.")
|
|
.insert("csv_filename",
|
|
"",
|
|
"The filename of benchmark result. Default is empty (no CSV output).")
|
|
.insert("structured_sparsity",
|
|
"false",
|
|
"Whether use sparsity kernel or not. Possible values are true or false. Default is "
|
|
"false")
|
|
.insert("json_output",
|
|
"false",
|
|
"Whether to output results in JSON format only. Possible values are true or false. "
|
|
"Default is "
|
|
"false");
|
|
|
|
bool result = arg_parser.parse(argc, argv);
|
|
return std::make_tuple(result, arg_parser);
|
|
}
|