mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 09:16:52 +00:00
* Format * Format * Format * Remove const * Use the right template * Format * Format * add row/col instances * Add missing file * fixed * fixing block to etile error * Format * Updates * Format * fixed rrr layout * generating a sample JSON file: currently contains includes, prologue/epilogue and instances * version where the json is passed into the instances to generate a key * updated run function to just launch kernel * updated run function: only contains kernel object, json file is updated but still needs to be cleaned up, added front-end API to parse JSON into character buffer * adding in testing files * cleaned up comments, still need to work on including header files * removed unneeded files * removed/commented out JSON implementation * added fusion(prologue/epilogue) into instance generation * working on instance selection * added instance selection, need to fix instance validation * removed block2etile map validity check for testing purposes * test running: failing due to incorrect files/input * all grid descs/ptrs completed, but device file not found * Update test and embed modules * Restore older version * added convolution operation, written test, debugging generated code for compilation * attempting to include CK in host directory: _Float16 error * CK header file issues * slight fix * don't crash when hip can't report total memory * dump generated code to a file * changing sizes * creating tensor descriptors using CK methods: set up grid desc manually, also trying to set up an argument pointer - this needs to be fixed * some fixes to call the device code * separating test files for conv and gemm * completed arg ptr, now have linking errors * clang format fix * resolved linker issues in conv test * remove dependency on libutility from ck * resolved num dim error * properly passing arg ptr, errors with passing typenames: redefinition/redeclaration * undo the commenting of device function * hand created kernel code to find rtc issues * dump the full src to 
file * resolved redeclaration errors, cleaned up errors for Amber's kernel code * debugging purposes: redeclaration error * config files * resolved errors for NumTensor and redeclaration, formatted version.h * resolved most errors in manually added kernel and my own. error with calling kernel object: overloaded function type * WIP: close to getting kernel compiled * WIP: fixing rtc errors * fixed sequence errors, formatting, still one error with run fcn * yay: kernel compiles and runs * updated templated/generated version to run and compile * minor fixes * working generated example, resolved memory access error due to padding * adding in reference kernel, validation failing against reference * debugging: printing kernel argsz * reduced error in results * debugged reference kernel and output errors, added to generated version, currently debugging prologue function issues * working validation (using reference convolution) with prologue function for both hard-coded and generated version * WIP: create an alt version that creates Argument on the device * wip: added new duplicate files, fixed fusion templating errors from working example, setting up kernel arguments * wip: making necessary methods device code * added grid descs, working on grid pointers, errors with stl numerics * wip: updating kernel args - issue, replacing some std functions * replaced std::accumulate call with temp hardcoded version * wip: args causing memory issue * Construct Argument object inside the kernel and use it to call convolution device function. 
Code runs and verification passes * adding object file dump * temporary hardcoding of grid size, can remove device op inst + arg ptr * minor fix for grid size * added modified example where arg ptr is created on the device for generated version as well * removed device op instance and arg ptr from modified examples * moving device op file for testing purposes and to properly build CK * commenting out print-outs * adjust compiler args to produce a valid ELF file * temporary removal of validation * reverting compiler args back for working example * retrieve necessary arguments from generated template parameters in correct format * calculating grid size on host-side, still need to clean up process, pass parameters to host functions properly * scaled up factory functions/wrapper structs to implement host-side launch parameter calculations using CK host side functions - in hard-coded example * temporary change to generate ELF format binary object file * removed unecessary code, added comments * formatting fix * cleaned up code, added new tests, restructured library: move helper into CK * refactored launch parameter calculation to be more concise * renamed files and variables for more clarity/uniformity * more code cleaning, removed debug statements * moved majority of my files into codegen directory, running properly * updated Embed.cmake(string_view) in codegen directory * updated host directory to match Embed.cmake as well * added old tests in * updated instance generation methods to be more concise * removed layout from launch parameter calculation * working test * fixed issue with verification, all instances working * updated verification in other tests * removed duplicate matrix padder file, removed code dumps * removed old hard-coded tests * removed old host directory, all files in codegen directory now * fixed copyright in files * commenting out validation * renamed files * made changes for review: fixed copyright, renamed files for clarity, removed comments, 
refactored code * updated headers * removing duplicate file for fwd conv to gemm, merging with original file * fix building codegen with clang++ directly * resolving build error from conv_fwd_to_gemm * fix for previous error * renaming tests * created common test file * cleaned up code, added comments * renamed device op * fixed typos in comments * removed extra space * code cleanup: resolving Amber's comments * removed wrapper struct for matrix padder, fixed template * cleaned up if statements for better readability --------- Co-authored-by: Paul <pfultz2@yahoo.com> Co-authored-by: Jing Zhang <jizha@amd.com> Co-authored-by: M. Amber Hassaan <amber_474@yahoo.com> Co-authored-by: illsilin <Illia.Silin@amd.com> Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
106 lines
3.4 KiB
C++
106 lines
3.4 KiB
C++
|
|
#include <algorithm>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

#include "ck/host/device_gemm_multiple_d/operation.hpp"
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
#include "ck/host/stringutils.hpp"
|
|
|
|
using ck::host::Transform;
|
|
|
|
struct Emitters
|
|
{
|
|
// retrieve the hard-coded instances provided, template them, and then store them in a map
|
|
std::unordered_map<std::string, std::function<std::vector<std::string>()>> m;
|
|
|
|
template <class T>
|
|
void Register(const std::string& name, const std::string& prologue, const std::string& epilogue)
|
|
{
|
|
m[name] = [&] {
|
|
auto configs = T::CreateOperations(prologue, epilogue);
|
|
|
|
return Transform(configs, [](const auto& ops) { return ToTuple(ops); });
|
|
};
|
|
}
|
|
|
|
// takes in an operation instance and uses it to substitute the correct values into the template
|
|
template <class T>
|
|
static std::string ToTuple(const T& ops)
|
|
{
|
|
auto templates = Transform(
|
|
ops, [](const auto& op) { return " " + op.ToSolution().ToTemplateString(); });
|
|
return "std::tuple<\n" + ck::host::JoinStrings(templates, ",\n") + ">";
|
|
}
|
|
|
|
// Join together all the strings in the map
|
|
std::string Emit(const std::string& name) { return ck::host::JoinStrings(m.at(name)(), "\n"); }
|
|
|
|
std::vector<std::string> List() const
|
|
{
|
|
return Transform(m, [](auto&& p) { return p.first; });
|
|
}
|
|
};
|
|
|
|
int main(int argc, const char* argv[])
|
|
{
|
|
std::string prog = argv[0];
|
|
std::vector<std::string> args(argv + 1, argv + argc);
|
|
|
|
// Specify problem type and problem size
|
|
ck::host::device_gemm_multiple_d::Problem prob;
|
|
prob.M = 1024;
|
|
prob.N = 1024;
|
|
prob.K = 1024;
|
|
|
|
// user provided fusion
|
|
std::string prologue = "";
|
|
std::string epilogue = R"(
|
|
struct Epilogue
|
|
{
|
|
__host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
|
|
|
template <typename E, typename D>
|
|
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
|
|
|
|
template <>
|
|
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
|
|
const ck::half_t& d) const
|
|
{
|
|
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
|
|
}
|
|
|
|
float alpha_;
|
|
float beta_;
|
|
};)";
|
|
|
|
// Load in operations into the Register
|
|
Emitters e;
|
|
e.Register<ck::host::device_gemm_multiple_d::Operation_Xdl_CShuffle>(
|
|
"DeviceGemmMultipleD_Xdl_CShuffle", prologue, epilogue);
|
|
|
|
if(args.empty() or std::any_of(args.begin(), args.end(), [](auto arg) {
|
|
return arg == "-h" or arg == "--help";
|
|
}))
|
|
{
|
|
std::cout << "USAGE:" << std::endl;
|
|
std::cout << " " << prog << " [TEMPLATE]" << std::endl;
|
|
std::cout << std::endl;
|
|
std::cout << "FLAGS:" << std::endl;
|
|
std::cout << " -h, --help Show help" << std::endl;
|
|
std::cout << std::endl;
|
|
std::cout << "TEMPLATES:" << std::endl;
|
|
for(auto x : e.List())
|
|
std::cout << " " << x << std::endl;
|
|
std::cout << std::endl;
|
|
return 0;
|
|
}
|
|
|
|
// print out all the instances for the operation that was chosen at the command line
|
|
for(auto name : args)
|
|
std::cout << e.Emit(name) << std::endl;
|
|
|
|
return 0;
|
|
}
|