mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
* Format
* Format
* Format
* Remove const
* Use the right template
* Format
* Format
* add row/col instances
* Add missing file
* fixed
* fixing block to etile error
* Format
* Updates
* Format
* fixed rrr layout
* generating a sample JSON file: currently contains includes, prologue/epilogue and instances
* version where the json is passed into the instances to generate a key
* updated run function to just launch kernel
* updated run function: only contains kernel object, json file is updated but still needs to be cleaned up, added front-end API to parse JSON into character buffer
* adding in testing files
* cleaned up comments, still need to work on including header files
* removed unneeded files
* removed/commented out JSON implementation
* added fusion(prologue/epilogue) into instance generation
* working on instance selection
* added instance selection, need to fix instance validation
* removed block2etile map validity check for testing purposes
* test running: failing due to incorrect files/input
* all grid descs/ptrs completed, but device file not found
* Update test and embed modules
* Restore older version
* added convolution operation, written test, debugging generated code for compilation
* attempting to include CK in host directory: _Float16 error
* CK header file issues
* slight fix
* don't crash when hip can't report total memory
* dump generated code to a file
* changing sizes
* creating tensor descriptors using CK methods: set up grid desc manually, also trying to set up an argument pointer - this needs to be fixed
* some fixes to call the device code
* separating test files for conv and gemm
* completed arg ptr, now have linking errors
* clang format fix
* resolved linker issues in conv test
* remove dependency on libutility from ck
* resolved num dim error
* properly passing arg ptr, errors with passing typenames: redefinition/redeclaration
* undo the commenting of device function
* hand created kernel code to find rtc issues
* dump the full src to file
* resolved redeclaration errors, cleaned up errors for Amber's kernel code
* debugging purposes: redeclaration error
* config files
* resolved errors for NumTensor and redeclaration, formatted version.h
* resolved most errors in manually added kernel and my own. error with calling kernel object: overloaded function type
* WIP: close to getting kernel compiled
* WIP: fixing rtc errors
* fixed sequence errors, formatting, still one error with run fcn
* yay: kernel compiles and runs
* updated templated/generated version to run and compile
* minor fixes
* working generated example, resolved memory access error due to padding
* adding in reference kernel, validation failing against reference
* debugging: printing kernel argsz
* reduced error in results
* debugged reference kernel and output errors, added to generated version, currently debugging prologue function issues
* working validation (using reference convolution) with prologue function for both hard-coded and generated version
* WIP: create an alt version that creates Argument on the device
* wip: added new duplicate files, fixed fusion templating errors from working example, setting up kernel arguments
* wip: making necessary methods device code
* added grid descs, working on grid pointers, errors with stl numerics
* wip: updating kernel args - issue, replacing some std functions
* replaced std::accumulate call with temp hardcoded version
* wip: args causing memory issue
* Construct Argument object inside the kernel and use it to call convolution device function. Code runs and verification passes
* adding object file dump
* temporary hardcoding of grid size, can remove device op inst + arg ptr
* minor fix for grid size
* added modified example where arg ptr is created on the device for generated version as well
* removed device op instance and arg ptr from modified examples
* moving device op file for testing purposes and to properly build CK
* commenting out print-outs
* adjust compiler args to produce a valid ELF file
* temporary removal of validation
* reverting compiler args back for working example
* retrieve necessary arguments from generated template parameters in correct format
* calculating grid size on host-side, still need to clean up process, pass parameters to host functions properly
* scaled up factory functions/wrapper structs to implement host-side launch parameter calculations using CK host side functions - in hard-coded example
* temporary change to generate ELF format binary object file
* removed unnecessary code, added comments
* formatting fix
* cleaned up code, added new tests, restructured library: move helper into CK
* refactored launch parameter calculation to be more concise
* renamed files and variables for more clarity/uniformity
* more code cleaning, removed debug statements
* moved majority of my files into codegen directory, running properly
* updated Embed.cmake(string_view) in codegen directory
* updated host directory to match Embed.cmake as well
* added old tests in
* updated instance generation methods to be more concise
* removed layout from launch parameter calculation
* working test
* fixed issue with verification, all instances working
* updated verification in other tests
* removed duplicate matrix padder file, removed code dumps
* removed old hard-coded tests
* removed old host directory, all files in codegen directory now
* fixed copyright in files
* commenting out validation
* renamed files
* made changes for review: fixed copyright, renamed files for clarity, removed comments, refactored code
* updated headers
* removing duplicate file for fwd conv to gemm, merging with original file
* fix building codegen with clang++ directly
* resolving build error from conv_fwd_to_gemm
* fix for previous error
* renaming tests
* created common test file
* cleaned up code, added comments
* renamed device op
* fixed typos in comments
* removed extra space
* code cleanup: resolving Amber's comments
* removed wrapper struct for matrix padder, fixed template
* cleaned up if statements for better readability
---------
Co-authored-by: Paul <pfultz2@yahoo.com>
Co-authored-by: Jing Zhang <jizha@amd.com>
Co-authored-by: M. Amber Hassaan <amber_474@yahoo.com>
Co-authored-by: illsilin <Illia.Silin@amd.com>
Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
[ROCm/composable_kernel commit: 3e9711f0cb]
191 lines
6.1 KiB
C++
191 lines
6.1 KiB
C++
#include "ck/host/device_gemm_multiple_d/problem.hpp"
#include "ck/host/device_gemm_multiple_d/operation.hpp"
#include "ck/host/headers.hpp"
#include "ck/host/stringutils.hpp"
#include "ck/host/utils.hpp"

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <numeric>
#include <random>
#include <string>
#include <unordered_set>
#include <vector>

#include <test.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/hip.hpp>
|
|
|
|
using half = _Float16;
|
|
// using half = __fp16;
|
|
|
|
std::vector<rtc::src_file> get_headers_for_test()
|
|
{
|
|
std::vector<rtc::src_file> result;
|
|
auto hs = ck::host::GetHeaders();
|
|
std::transform(
|
|
hs.begin(), hs.end(), std::back_inserter(result), [&](const auto& p) -> rtc::src_file {
|
|
return {p.first, p.second};
|
|
});
|
|
return result;
|
|
}
|
|
|
|
template <class T>
|
|
rtc::buffer<T> generate_buffer(std::size_t n, std::size_t seed = 0)
|
|
{
|
|
rtc::buffer<T> result(n);
|
|
std::mt19937 gen(seed);
|
|
std::uniform_real_distribution<double> dis(-1.0);
|
|
std::generate(result.begin(), result.end(), [&] { return dis(gen); });
|
|
return result;
|
|
}
|
|
|
|
// Element-wise approximate equality in the numpy.isclose sense:
// |x - y| < atol + rtol * |y| for every pair of elements. Ranges of
// different length compare unequal (std::equal's four-iterator overload).
template <class T, class U>
bool allclose(const T& a, const U& b, double atol = 0.01, double rtol = 0.01)
{
    return std::equal(a.begin(), a.end(), b.begin(), b.end(), [&](double x, double y) {
        // Qualify std::fabs: the unqualified call depended on <cmath>
        // leaking the C name into the global namespace.
        return std::fabs(x - y) < atol + rtol * std::fabs(y);
    });
}
|
|
|
|
// Human-readable name for the IEEE-754 category of a double.
std::string classify(double x)
{
    const int category = std::fpclassify(x);
    if(category == FP_INFINITE)
        return "inf";
    if(category == FP_NAN)
        return "nan";
    if(category == FP_NORMAL)
        return "normal";
    if(category == FP_SUBNORMAL)
        return "subnormal";
    if(category == FP_ZERO)
        return "zero";
    return "unknown";
}
|
|
|
|
// Print the set of distinct floating-point categories present in a buffer
// (e.g. "normal, zero, ") -- useful for spotting NaN/Inf contamination.
template <class Buffer>
void print_classification(const Buffer& x)
{
    std::unordered_set<std::string> seen;
    for(const auto& value : x)
        seen.insert(classify(value));
    for(const auto& name : seen)
        std::cout << name << ", ";
    std::cout << std::endl;
}
|
|
|
|
// Print min/max/mean/stddev of a buffer on one line. Statistics are
// accumulated in double regardless of the buffer's element type.
template <class Buffer>
void print_statistics(const Buffer& x)
{
    if(x.size() == 0)
    {
        // Guard: min_element/max_element on an empty range would
        // dereference end(), and the mean would divide by zero.
        std::cout << "Min value: n/a, Max value: n/a, Mean: n/a, StdDev: n/a\n";
        return;
    }
    std::cout << "Min value: " << *std::min_element(x.begin(), x.end()) << ", ";
    std::cout << "Max value: " << *std::max_element(x.begin(), x.end()) << ", ";
    const double num_elements = x.size();
    const auto mean =
        std::accumulate(x.begin(), x.end(), double{0.0}, std::plus<double>{}) / num_elements;
    // Population standard deviation (divide by N, not N-1).
    const auto stddev = std::sqrt(
        std::accumulate(x.begin(),
                        x.end(),
                        double{0.0},
                        [&](double r, double v) { return r + std::pow((v - mean), 2.0); }) /
        num_elements);
    std::cout << "Mean: " << mean << ", ";
    std::cout << "StdDev: " << stddev << "\n";
}
|
|
|
|
// Print the buffer contents: every element when there are at most 10,
// otherwise the first five and last five separated by "...".
template <class Buffer>
void print_preview(const Buffer& x)
{
    auto emit = [](double v) { std::cout << v << ", "; };
    if(x.size() > 10)
    {
        std::for_each(x.begin(), x.begin() + 5, emit);
        std::cout << "..., ";
        std::for_each(x.end() - 5, x.end(), emit);
    }
    else
    {
        std::for_each(x.begin(), x.end(), emit);
    }
    std::cout << std::endl;
}
|
|
|
|
template <class T>
|
|
struct check_all
|
|
{
|
|
rtc::buffer<T> data{};
|
|
bool operator()(const rtc::buffer<T>& x)
|
|
{
|
|
if(data.empty())
|
|
{
|
|
data = x;
|
|
return true;
|
|
}
|
|
if(std::any_of(x.begin(), x.end(), [](double y) { return std::isnan(y); }))
|
|
return false;
|
|
return allclose(data, x);
|
|
}
|
|
};
|
|
|
|
template <class Solution>
|
|
auto report(const Solution& solution, bool pass)
|
|
{
|
|
return test::make_predicate(solution.ToTemplateString(), [=] { return pass; });
|
|
}
|
|
|
|
// Source skeleton for the JIT-compiled check kernel. The ${...} placeholders
// (include, template, m, n, k) are substituted via ck::host::InterpolateString
// before compilation. The descriptor encodes A as a packed (m, k) tensor, B as
// an (n, k) tensor with strides {1, n}, no D tensors (empty tuple), and the
// output E as a packed (m, n) tensor.
const std::string gemm_compile_check = R"__ck__(
#include <${include}>

extern "C" __global__ void f(const ck::half_t* a, const ck::half_t* b, ck::half_t* c) {
using G = ${template};
constexpr auto desc = ${template}::make_descriptor(ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m}, ${k})),
ck::make_naive_tensor_descriptor(ck::make_tuple(${n}, ${k}), ck::make_tuple(1, ${n})),
ck::make_tuple(),
ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m}, ${n})));

static_assert(desc.IsValid(), "Invalid ck gemm.");

if constexpr(desc.IsValid())
{
${template}::Run(desc,
a,
b,
ck::make_tuple(),
c);
}
}

)__ck__";
|
|
|
|
// End-to-end check of the generated GEMM instances: for each solution CK
// offers for a 1024x1024x1024 half-precision GEMM on gfx90a, interpolate the
// kernel source skeleton, JIT-compile it, launch it, and verify that all
// instances produce matching results.
TEST_CASE(test_problem_kernel)
{
    ck::host::device_gemm_multiple_d::Problem prob;
    prob.M = 1024;
    prob.N = 1024;
    prob.K = 1024;
    // check_all stores the first instance's output as the reference and
    // compares every later instance's output against it.
    check_all<half> check;
    // Distinct seeds so a, b, and the initial contents of c all differ.
    auto a = to_gpu(generate_buffer<half>(1024 * 1024, 0));
    auto b = to_gpu(generate_buffer<half>(1024 * 1024, 1));
    auto c = to_gpu(generate_buffer<half>(1024 * 1024, 2));

    // No fusion in this test: empty prologue/epilogue strings.
    std::string epilogue = "";
    std::string prologue = "";

    for(auto solution : prob.GetSolutions("gfx90a", prologue, epilogue))
    {
        // Substitute this instance's template string and the problem sizes
        // into the kernel source skeleton.
        auto src = ck::host::InterpolateString(gemm_compile_check,
                                               {{"include", prob.GetIncludeHeader()},
                                                {"template", solution.ToTemplateString()},
                                                {"m", std::to_string(prob.M)},
                                                {"n", std::to_string(prob.N)},
                                                {"k", std::to_string(prob.K)}});
        auto srcs = get_headers_for_test();
        srcs.push_back({"main.cpp", src});
        rtc::compile_options options;
        options.kernel_name = "f";
        auto k = rtc::compile_kernel(srcs, options);
        // Launch geometry is derived from the instance's own tile
        // parameters: one workgroup per MPerBlock x NPerBlock output tile.
        auto block_size  = solution.GetTemplateParameter<std::size_t>("BlockSize");
        auto m_per_block = solution.GetTemplateParameter<std::size_t>("MPerBlock");
        auto n_per_block = solution.GetTemplateParameter<std::size_t>("NPerBlock");
        auto grid_size   = ck::host::integer_divide_ceil(prob.M, m_per_block) *
                         ck::host::integer_divide_ceil(prob.N, n_per_block);
        k.launch(nullptr, grid_size * block_size, block_size)(a.data(), b.data(), c.data());

        // Label failures with the instance's template string via report().
        CHECK(report(solution, check(rtc::from_gpu(c))));
    }
}
|
|
|
|
// Entry point: delegate argument parsing and test selection to the
// test framework's runner.
int main(int argc, const char* argv[])
{
    test::run(argc, argv);
}
|