Files
composable_kernel/codegen/test/gemm_multiple_d.cpp
arai713 3e9711f0cb CK Instance Gen (#1145)
* Format

* Format

* Format

* Remove const

* Use the right template

* Format

* Format

* add row/col instances

* Add missing file

* fixed

* fixing block to etile error

* Format

* Updates

* Format

* fixed rrr layout

* generating a sample JSON file: currently contains includes, prologue/epilogue and instances

* version where the json is passed into the instances to generate a key

* updated run function to just launch kernel

* updated run function: only contains kernel object, json file is updated but still needs to be cleaned up, added front-end API to parse JSON into character buffer

* adding in testing files

* cleaned up comments, still need to work on including header files

* removed unneeded files

* removed/commented out JSON implementation

* added fusion(prologue/epilogue) into instance generation

* working on instance selection

* added instance selection, need to fix instance validation

* removed block2etile map validity check for testing purposes

* test running: failing due to incorrect files/input

* all grid descs/ptrs completed, but device file not found

* Update test and embed modules

* Restore older version

* added convolution operation, written test, debugging generated code for compilation

* attempting to include CK in host directory: _Float16 error

* CK header file issues

* slight fix

* don't crash when hip can't report total memory

* dump generated code to a file

* changing sizes

* creating tensor descriptors using CK methods: set up grid desc manually, also trying to set up an argument pointer - this needs to be fixed

* some fixes to call the device code

* separating test files for conv and gemm

* completed arg ptr, now have linking errors

* clang format fix

* resolved linker issues in conv test

* remove dependency on libutility from ck

* resolved num dim error

* properly passing arg ptr, errors with passing typenames: redefinition/redeclaration

* undo the commenting of device function

* hand created kernel code to find rtc issues

* dump the full src to file

* resolved redeclaration errors, cleaned up errors for Amber's kernel code

* debugging purposes: redeclaration error

* config files

* resolved errors for NumTensor and redeclaration, formatted version.h

* resolved most errors in manually added kernel and my own. error with calling kernel object: overloaded function type

* WIP: close to getting kernel compiled

* WIP: fixing rtc errors

* fixed sequence errors, formatting, still one error with run fcn

* yay: kernel compiles and runs

* updated templated/generated version to run and compile

* minor fixes

* working generated example, resolved memory access error due to padding

* adding in reference kernel, validation failing against reference

* debugging: printing kernel argsz

* reduced error in results

* debugged reference kernel and output errors, added to generated version, currently debugging prologue function issues

* working validation (using reference convolution) with prologue function for both hard-coded and generated version

* WIP: create an alt version that creates Argument on the device

* wip: added new duplicate files, fixed fusion templating errors from working example, setting up kernel arguments

* wip: making necessary methods device code

* added grid descs, working on grid pointers, errors with stl numerics

* wip: updating kernel args - issue, replacing some std functions

* replaced std::accumulate call with temp hardcoded version

* wip: args causing memory issue

* Construct Argument object inside the kernel and use it to call convolution device function. Code runs and verification passes

* adding object file dump

* temporary hardcoding of grid size, can remove device op inst + arg ptr

* minor fix for grid size

* added modified example where arg ptr is created on the device for generated version as well

* removed device op instance and arg ptr from modified examples

* moving device op file for testing purposes and to properly build CK

* commenting out print-outs

* adjust compiler args to produce a valid ELF file

* temporary removal of validation

* reverting compiler args back for working example

* retrieve necessary arguments from generated template parameters in correct format

* calculating grid size on host-side, still need to clean up process, pass parameters to host functions properly

* scaled up factory functions/wrapper structs to implement host-side launch parameter calculations using CK host side functions - in hard-coded example

* temporary change to generate ELF format binary object file

* removed unecessary code, added comments

* formatting fix

* cleaned up code, added new tests, restructured library: move helper into CK

* refactored launch parameter calculation to be more concise

* renamed files and variables for more clarity/uniformity

* more code cleaning, removed debug statements

* moved majority of my files into codegen directory, running properly

* updated Embed.cmake(string_view) in codegen directory

* updated host directory to match Embed.cmake as well

* added old tests in

* updated instance generation methods to be more concise

* removed layout from launch parameter calculation

* working test

* fixed issue with verification, all instances working

* updated verification in other tests

* removed duplicate matrix padder file, removed code dumps

* removed old hard-coded tests

* removed old host directory, all files in codegen directory now

* fixed copyright in files

* commenting out validation

* renamed files

* made changes for review: fixed copyright, renamed files for clarity, removed comments, refactored code

* updated headers

* removing duplicate file for fwd conv to gemm, merging with original file

* fix building codegen with clang++ directly

* resolving build error from conv_fwd_to_gemm

* fix for previous error

* renaming tests

* created common test file

* cleaned up code, added comments

* renamed device op

* fixed typos in comments

* removed extra space

* code cleanup: resolving Amber's comments

* removed wrapper struct for matrix padder, fixed template

* cleaned up if statements for better readability

---------

Co-authored-by: Paul <pfultz2@yahoo.com>
Co-authored-by: Jing Zhang <jizha@amd.com>
Co-authored-by: M. Amber Hassaan <amber_474@yahoo.com>
Co-authored-by: illsilin <Illia.Silin@amd.com>
Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
2024-06-25 16:37:35 -05:00

191 lines
6.1 KiB
C++

#include "ck/host/device_gemm_multiple_d/problem.hpp"
#include "ck/host/device_gemm_multiple_d/operation.hpp"
#include "ck/host/headers.hpp"
#include "ck/host/stringutils.hpp"
#include "ck/host/utils.hpp"
#include <algorithm>
#include <cmath>
#include <iterator>
#include <random>
#include <test.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/hip.hpp>
#include <fstream>
using half = _Float16;
// using half = __fp16;
std::vector<rtc::src_file> get_headers_for_test()
{
std::vector<rtc::src_file> result;
auto hs = ck::host::GetHeaders();
std::transform(
hs.begin(), hs.end(), std::back_inserter(result), [&](const auto& p) -> rtc::src_file {
return {p.first, p.second};
});
return result;
}
template <class T>
rtc::buffer<T> generate_buffer(std::size_t n, std::size_t seed = 0)
{
rtc::buffer<T> result(n);
std::mt19937 gen(seed);
std::uniform_real_distribution<double> dis(-1.0);
std::generate(result.begin(), result.end(), [&] { return dis(gen); });
return result;
}
template <class T, class U>
bool allclose(const T& a, const U& b, double atol = 0.01, double rtol = 0.01)
{
return std::equal(a.begin(), a.end(), b.begin(), b.end(), [&](double x, double y) {
return fabs(x - y) < atol + rtol * fabs(y);
});
}
std::string classify(double x)
{
switch(std::fpclassify(x))
{
case FP_INFINITE: return "inf";
case FP_NAN: return "nan";
case FP_NORMAL: return "normal";
case FP_SUBNORMAL: return "subnormal";
case FP_ZERO: return "zero";
default: return "unknown";
}
}
template <class Buffer>
void print_classification(const Buffer& x)
{
std::unordered_set<std::string> result;
for(const auto& i : x)
result.insert(classify(i));
for(const auto& c : result)
std::cout << c << ", ";
std::cout << std::endl;
}
template <class Buffer>
void print_statistics(const Buffer& x)
{
std::cout << "Min value: " << *std::min_element(x.begin(), x.end()) << ", ";
std::cout << "Max value: " << *std::max_element(x.begin(), x.end()) << ", ";
double num_elements = x.size();
auto mean =
std::accumulate(x.begin(), x.end(), double{0.0}, std::plus<double>{}) / num_elements;
auto stddev = std::sqrt(
std::accumulate(x.begin(),
x.end(),
double{0.0},
[&](double r, double v) { return r + std::pow((v - mean), 2.0); }) /
num_elements);
std::cout << "Mean: " << mean << ", ";
std::cout << "StdDev: " << stddev << "\n";
}
template <class Buffer>
void print_preview(const Buffer& x)
{
if(x.size() <= 10)
{
std::for_each(x.begin(), x.end(), [&](double i) { std::cout << i << ", "; });
}
else
{
std::for_each(x.begin(), x.begin() + 5, [&](double i) { std::cout << i << ", "; });
std::cout << "..., ";
std::for_each(x.end() - 5, x.end(), [&](double i) { std::cout << i << ", "; });
}
std::cout << std::endl;
}
template <class T>
struct check_all
{
rtc::buffer<T> data{};
bool operator()(const rtc::buffer<T>& x)
{
if(data.empty())
{
data = x;
return true;
}
if(std::any_of(x.begin(), x.end(), [](double y) { return std::isnan(y); }))
return false;
return allclose(data, x);
}
};
template <class Solution>
auto report(const Solution& solution, bool pass)
{
return test::make_predicate(solution.ToTemplateString(), [=] { return pass; });
}
const std::string gemm_compile_check = R"__ck__(
#include <${include}>
extern "C" __global__ void f(const ck::half_t* a, const ck::half_t* b, ck::half_t* c) {
using G = ${template};
constexpr auto desc = ${template}::make_descriptor(ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m}, ${k})),
ck::make_naive_tensor_descriptor(ck::make_tuple(${n}, ${k}), ck::make_tuple(1, ${n})),
ck::make_tuple(),
ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m}, ${n})));
static_assert(desc.IsValid(), "Invalid ck gemm.");
if constexpr(desc.IsValid())
{
${template}::Run(desc,
a,
b,
ck::make_tuple(),
c);
}
}
)__ck__";
TEST_CASE(test_problem_kernel)
{
ck::host::device_gemm_multiple_d::Problem prob;
prob.M = 1024;
prob.N = 1024;
prob.K = 1024;
check_all<half> check;
auto a = to_gpu(generate_buffer<half>(1024 * 1024, 0));
auto b = to_gpu(generate_buffer<half>(1024 * 1024, 1));
auto c = to_gpu(generate_buffer<half>(1024 * 1024, 2));
std::string epilogue = "";
std::string prologue = "";
for(auto solution : prob.GetSolutions("gfx90a", prologue, epilogue))
{
auto src = ck::host::InterpolateString(gemm_compile_check,
{{"include", prob.GetIncludeHeader()},
{"template", solution.ToTemplateString()},
{"m", std::to_string(prob.M)},
{"n", std::to_string(prob.N)},
{"k", std::to_string(prob.K)}});
auto srcs = get_headers_for_test();
srcs.push_back({"main.cpp", src});
rtc::compile_options options;
options.kernel_name = "f";
auto k = rtc::compile_kernel(srcs, options);
auto block_size = solution.GetTemplateParameter<std::size_t>("BlockSize");
auto m_per_block = solution.GetTemplateParameter<std::size_t>("MPerBlock");
auto n_per_block = solution.GetTemplateParameter<std::size_t>("NPerBlock");
auto grid_size = ck::host::integer_divide_ceil(prob.M, m_per_block) *
ck::host::integer_divide_ceil(prob.N, n_per_block);
k.launch(nullptr, grid_size * block_size, block_size)(a.data(), b.data(), c.data());
CHECK(report(solution, check(rtc::from_gpu(c))));
}
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }