mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-20 04:49:54 +00:00
CK Instance Gen (#1145)
* Format
* Format
* Format
* Remove const
* Use the right template
* Format
* Format
* add row/col instances
* Add missing file
* fixed
* fixing block to etile error
* Format
* Updates
* Format
* fixed rrr layout
* generating a sample JSON file: currently contains includes, prologue/epilogue and instances
* version where the json is passed into the instances to generate a key
* updated run function to just launch kernel
* updated run function: only contains kernel object, json file is updated but still needs to be cleaned up, added front-end API to parse JSON into character buffer
* adding in testing files
* cleaned up comments, still need to work on including header files
* removed unneeded files
* removed/commented out JSON implementation
* added fusion(prologue/epilogue) into instance generation
* working on instance selection
* added instance selection, need to fix instance validation
* removed block2etile map validity check for testing purposes
* test running: failing due to incorrect files/input
* all grid descs/ptrs completed, but device file not found
* Update test and embed modules
* Restore older version
* added convolution operation, written test, debugging generated code for compilation
* attempting to include CK in host directory: _Float16 error
* CK header file issues
* slight fix
* don't crash when hip can't report total memory
* dump generated code to a file
* changing sizes
* creating tensor descriptors using CK methods: set up grid desc manually, also trying to set up an argument pointer - this needs to be fixed
* some fixes to call the device code
* separating test files for conv and gemm
* completed arg ptr, now have linking errors
* clang format fix
* resolved linker issues in conv test
* remove dependency on libutility from ck
* resolved num dim error
* properly passing arg ptr, errors with passing typenames: redefinition/redeclaration
* undo the commenting of device function
* hand created kernel code to find rtc issues
* dump the full src to file
* resolved redeclaration errors, cleaned up errors for Amber's kernel code
* debugging purposes: redeclaration error
* config files
* resolved errors for NumTensor and redeclaration, formatted version.h
* resolved most errors in manually added kernel and my own. error with calling kernel object: overloaded function type
* WIP: close to getting kernel compiled
* WIP: fixing rtc errors
* fixed sequence errors, formatting, still one error with run fcn
* yay: kernel compiles and runs
* updated templated/generated version to run and compile
* minor fixes
* working generated example, resolved memory access error due to padding
* adding in reference kernel, validation failing against reference
* debugging: printing kernel argsz
* reduced error in results
* debugged reference kernel and output errors, added to generated version, currently debugging prologue function issues
* working validation (using reference convolution) with prologue function for both hard-coded and generated version
* WIP: create an alt version that creates Argument on the device
* wip: added new duplicate files, fixed fusion templating errors from working example, setting up kernel arguments
* wip: making necessary methods device code
* added grid descs, working on grid pointers, errors with stl numerics
* wip: updating kernel args - issue, replacing some std functions
* replaced std::accumulate call with temp hardcoded version
* wip: args causing memory issue
* Construct Argument object inside the kernel and use it to call convolution device function. Code runs and verification passes
* adding object file dump
* temporary hardcoding of grid size, can remove device op inst + arg ptr
* minor fix for grid size
* added modified example where arg ptr is created on the device for generated version as well
* removed device op instance and arg ptr from modified examples
* moving device op file for testing purposes and to properly build CK
* commenting out print-outs
* adjust compiler args to produce a valid ELF file
* temporary removal of validation
* reverting compiler args back for working example
* retrieve necessary arguments from generated template parameters in correct format
* calculating grid size on host-side, still need to clean up process, pass parameters to host functions properly
* scaled up factory functions/wrapper structs to implement host-side launch parameter calculations using CK host side functions - in hard-coded example
* temporary change to generate ELF format binary object file
* removed unecessary code, added comments
* formatting fix
* cleaned up code, added new tests, restructured library: move helper into CK
* refactored launch parameter calculation to be more concise
* renamed files and variables for more clarity/uniformity
* more code cleaning, removed debug statements
* moved majority of my files into codegen directory, running properly
* updated Embed.cmake(string_view) in codegen directory
* updated host directory to match Embed.cmake as well
* added old tests in
* updated instance generation methods to be more concise
* removed layout from launch parameter calculation
* working test
* fixed issue with verification, all instances working
* updated verification in other tests
* removed duplicate matrix padder file, removed code dumps
* removed old hard-coded tests
* removed old host directory, all files in codegen directory now
* fixed copyright in files
* commenting out validation
* renamed files
* made changes for review: fixed copyright, renamed files for clarity, removed comments, refactored code
* updated headers
* removing duplicate file for fwd conv to gemm, merging with original file
* fix building codegen with clang++ directly
* resolving build error from conv_fwd_to_gemm
* fix for previous error
* renaming tests
* created common test file
* cleaned up code, added comments
* renamed device op
* fixed typos in comments
* removed extra space
* code cleanup: resolving Amber's comments
* removed wrapper struct for matrix padder, fixed template
* cleaned up if statements for better readability
---------
Co-authored-by: Paul <pfultz2@yahoo.com>
Co-authored-by: Jing Zhang <jizha@amd.com>
Co-authored-by: M. Amber Hassaan <amber_474@yahoo.com>
Co-authored-by: illsilin <Illia.Silin@amd.com>
Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
[ROCm/composable_kernel commit: 3e9711f0cb]
This commit is contained in:
@@ -1,11 +1,13 @@
|
||||
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
|
||||
add_subdirectory(rtc)
|
||||
|
||||
file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)
|
||||
foreach(TEST_SRC ${TEST_SRCS})
|
||||
get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
|
||||
rocm_add_test_executable(test_host_${BASE_NAME} ${TEST_SRC})
|
||||
target_link_libraries(test_host_${BASE_NAME} ck_rtc ck_host)
|
||||
target_include_directories(test_host_${BASE_NAME} PUBLIC include())
|
||||
set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
|
||||
get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
|
||||
rocm_add_test_executable(test_host_${BASE_NAME} ${TEST_SRC})
|
||||
target_link_libraries(test_host_${BASE_NAME} ck_rtc ck_host)
|
||||
# target_link_libraries(test_host_${BASE_NAME} ${CK_ROOT}/build/lib/libutility.a)
|
||||
target_include_directories(test_host_${BASE_NAME} PUBLIC include())
|
||||
target_include_directories(test_host_${BASE_NAME} PUBLIC ${CK_ROOT}/include)
|
||||
target_include_directories(test_host_${BASE_NAME} PUBLIC ${CK_ROOT}/library/include)
|
||||
endforeach()
|
||||
|
||||
134
codegen/test/common.hpp
Normal file
134
codegen/test/common.hpp
Normal file
@@ -0,0 +1,134 @@
|
||||
#pragma once
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <random>
|
||||
#include <test.hpp>
|
||||
#include <rtc/compile_kernel.hpp>
|
||||
#include <rtc/hip.hpp>
|
||||
#include <fstream>
|
||||
|
||||
std::vector<rtc::src_file> get_headers_for_test()
|
||||
{
|
||||
std::vector<rtc::src_file> result;
|
||||
auto hs = ck::host::GetHeaders();
|
||||
std::transform(
|
||||
hs.begin(), hs.end(), std::back_inserter(result), [&](const auto& p) -> rtc::src_file {
|
||||
return {p.first, p.second};
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename V>
|
||||
std::size_t GetSize(V mLens, V mStrides)
|
||||
{
|
||||
std::size_t space = 1;
|
||||
for(std::size_t i = 0; i < mLens.Size(); ++i)
|
||||
{
|
||||
if(mLens[i] == 0)
|
||||
continue;
|
||||
|
||||
space += (mLens[i] - 1) * mStrides[i];
|
||||
}
|
||||
return space;
|
||||
}
|
||||
|
||||
template <class T, typename V>
|
||||
rtc::buffer<T> generate_buffer(V mLens, V mStrides, std::size_t seed = 0)
|
||||
{
|
||||
std::size_t space = GetSize(mLens, mStrides);
|
||||
rtc::buffer<T> result(space);
|
||||
std::mt19937 gen(seed);
|
||||
std::uniform_real_distribution<double> dis(-1.0);
|
||||
std::generate(result.begin(), result.end(), [&] { return dis(gen); });
|
||||
// std::fill(result.begin(), result.end(), 1);
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class T, class U>
|
||||
bool allclose(const T& a, const U& b, double atol = 0.01, double rtol = 0.01)
|
||||
{
|
||||
return std::equal(a.begin(), a.end(), b.begin(), b.end(), [&](double x, double y) {
|
||||
return fabs(x - y) < atol + rtol * fabs(y);
|
||||
});
|
||||
}
|
||||
|
||||
std::string classify(double x)
|
||||
{
|
||||
switch(std::fpclassify(x))
|
||||
{
|
||||
case FP_INFINITE: return "inf";
|
||||
case FP_NAN: return "nan";
|
||||
case FP_NORMAL: return "normal";
|
||||
case FP_SUBNORMAL: return "subnormal";
|
||||
case FP_ZERO: return "zero";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
template <class Buffer>
|
||||
void print_classification(const Buffer& x)
|
||||
{
|
||||
std::unordered_set<std::string> result;
|
||||
for(const auto& i : x)
|
||||
result.insert(classify(i));
|
||||
for(const auto& c : result)
|
||||
std::cout << c << ", ";
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
template <class Buffer>
|
||||
void print_statistics(const Buffer& x)
|
||||
{
|
||||
std::cout << "Min value: " << *std::min_element(x.begin(), x.end()) << ", ";
|
||||
std::cout << "Max value: " << *std::max_element(x.begin(), x.end()) << ", ";
|
||||
double num_elements = x.size();
|
||||
auto mean =
|
||||
std::accumulate(x.begin(), x.end(), double{0.0}, std::plus<double>{}) / num_elements;
|
||||
auto stddev = std::sqrt(
|
||||
std::accumulate(x.begin(),
|
||||
x.end(),
|
||||
double{0.0},
|
||||
[&](double r, double v) { return r + std::pow((v - mean), 2.0); }) /
|
||||
num_elements);
|
||||
std::cout << "Mean: " << mean << ", ";
|
||||
std::cout << "StdDev: " << stddev << "\n";
|
||||
}
|
||||
|
||||
template <class Buffer>
|
||||
void print_preview(const Buffer& x)
|
||||
{
|
||||
if(x.size() <= 10)
|
||||
{
|
||||
std::for_each(x.begin(), x.end(), [&](double i) { std::cout << i << ", "; });
|
||||
}
|
||||
else
|
||||
{
|
||||
std::for_each(x.begin(), x.begin() + 5, [&](double i) { std::cout << i << ", "; });
|
||||
std::cout << "..., ";
|
||||
std::for_each(x.end() - 5, x.end(), [&](double i) { std::cout << i << ", "; });
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
struct check_all
|
||||
{
|
||||
rtc::buffer<T> data{};
|
||||
bool operator()(const rtc::buffer<T>& x)
|
||||
{
|
||||
if(data.empty())
|
||||
{
|
||||
data = x;
|
||||
return true;
|
||||
}
|
||||
return allclose(data, x);
|
||||
}
|
||||
};
|
||||
|
||||
template <class Solution>
|
||||
auto report(const Solution& solution, bool pass)
|
||||
{
|
||||
return test::make_predicate(solution.ToTemplateString(), [=] { return pass; });
|
||||
}
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <test.hpp>
|
||||
#include <rtc/compile_kernel.hpp>
|
||||
#include <rtc/hip.hpp>
|
||||
#include <fstream>
|
||||
|
||||
using half = _Float16;
|
||||
// using half = __fp16;
|
||||
@@ -159,7 +160,10 @@ TEST_CASE(test_problem_kernel)
|
||||
auto b = to_gpu(generate_buffer<half>(1024 * 1024, 1));
|
||||
auto c = to_gpu(generate_buffer<half>(1024 * 1024, 2));
|
||||
|
||||
for(auto solution : prob.GetSolutions("gfx90a"))
|
||||
std::string epilogue = "";
|
||||
std::string prologue = "";
|
||||
|
||||
for(auto solution : prob.GetSolutions("gfx90a", prologue, epilogue))
|
||||
{
|
||||
auto src = ck::host::InterpolateString(gemm_compile_check,
|
||||
{{"include", prob.GetIncludeHeader()},
|
||||
@@ -178,6 +182,7 @@ TEST_CASE(test_problem_kernel)
|
||||
auto grid_size = ck::host::integer_divide_ceil(prob.M, m_per_block) *
|
||||
ck::host::integer_divide_ceil(prob.N, n_per_block);
|
||||
k.launch(nullptr, grid_size * block_size, block_size)(a.data(), b.data(), c.data());
|
||||
|
||||
CHECK(report(solution, check(rtc::from_gpu(c))));
|
||||
}
|
||||
}
|
||||
|
||||
209
codegen/test/grouped_conv_fwd_multiple_d_v1.cpp
Normal file
209
codegen/test/grouped_conv_fwd_multiple_d_v1.cpp
Normal file
@@ -0,0 +1,209 @@
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
|
||||
#include "ck/host/headers.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/helper.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include <test.hpp>
|
||||
#include <rtc/compile_kernel.hpp>
|
||||
#include <rtc/hip.hpp>
|
||||
#include "common.hpp"
|
||||
#include <fstream>
|
||||
|
||||
// Need this for verification
|
||||
/**struct Epilogue
|
||||
{
|
||||
Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
||||
|
||||
template <typename E, typename D>
|
||||
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
|
||||
const ck::half_t& d) const
|
||||
{
|
||||
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
|
||||
}
|
||||
|
||||
float alpha_;
|
||||
float beta_;
|
||||
};**/
|
||||
const std::string conv_compile_check = R"__ck__(
|
||||
#include <${include}>
|
||||
|
||||
${template};
|
||||
|
||||
)__ck__";
|
||||
|
||||
TEST_CASE(test_problem_kernel)
|
||||
{
|
||||
// set up problem specification
|
||||
ck::host::conv::Problem_Conv_Fwd prob;
|
||||
prob.NumDim = 2;
|
||||
prob.G = 32;
|
||||
prob.N = 256;
|
||||
prob.C = 32;
|
||||
prob.K = 64;
|
||||
prob.Y = 3;
|
||||
prob.X = 3;
|
||||
prob.Hi = 28;
|
||||
prob.Wi = 28;
|
||||
prob.Ho = 28;
|
||||
prob.Wo = 28;
|
||||
check_all<ck::half_t> check;
|
||||
|
||||
// user provided fusion operations
|
||||
std::string epilogue = R"(
|
||||
struct Epilogue
|
||||
{
|
||||
__host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
||||
|
||||
template <typename E, typename D>
|
||||
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
|
||||
const ck::half_t& d) const
|
||||
{
|
||||
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
|
||||
}
|
||||
|
||||
float alpha_;
|
||||
float beta_;
|
||||
};
|
||||
)";
|
||||
std::string prologue = "";
|
||||
|
||||
// length+stride arrays
|
||||
ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
|
||||
static_cast<int>(prob.N),
|
||||
static_cast<int>(prob.C),
|
||||
static_cast<int>(prob.Hi),
|
||||
static_cast<int>(prob.Wi)};
|
||||
ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
|
||||
static_cast<int>(prob.N),
|
||||
static_cast<int>(prob.K),
|
||||
static_cast<int>(prob.Ho),
|
||||
static_cast<int>(prob.Wo)};
|
||||
ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
|
||||
static_cast<int>(prob.K),
|
||||
static_cast<int>(prob.C),
|
||||
static_cast<int>(prob.Y),
|
||||
static_cast<int>(prob.X)};
|
||||
ck::Array<ck::index_t, 5> d_lengths = {};
|
||||
|
||||
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
|
||||
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
|
||||
1,
|
||||
static_cast<int>(prob.Wi * prob.G * prob.C),
|
||||
static_cast<int>(prob.G * prob.C)};
|
||||
ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
|
||||
static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
|
||||
1,
|
||||
static_cast<int>(prob.Wo * prob.G * prob.K),
|
||||
static_cast<int>(prob.G * prob.K)};
|
||||
ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
|
||||
static_cast<int>(prob.Y * prob.X * prob.C),
|
||||
1,
|
||||
static_cast<int>(prob.X * prob.C),
|
||||
static_cast<int>(prob.C)};
|
||||
ck::Array<ck::index_t, 5> d_strides = {};
|
||||
|
||||
ck::Array<ck::index_t, 2> conv_filter_strides = {2, 2};
|
||||
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
|
||||
ck::Array<ck::index_t, 2> input_left_pads = {1, 1};
|
||||
ck::Array<ck::index_t, 2> input_right_pads = {1, 1};
|
||||
|
||||
// move the data onto the device
|
||||
auto in_dev =
|
||||
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
|
||||
auto wei_dev =
|
||||
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
|
||||
auto out_dev =
|
||||
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));
|
||||
|
||||
// CK Verficiation: Reference Kernel
|
||||
/**bool pass = true;
|
||||
Tensor<ck::half_t> in_host(in_lengths, in_strides);
|
||||
in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
|
||||
Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
|
||||
wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
|
||||
Tensor<ck::half_t> out_host(out_lengths, out_strides);
|
||||
|
||||
std::vector<ck::index_t> conv_filter_strides_ = {2, 2};
|
||||
std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
|
||||
std::vector<ck::index_t> input_left_pads_ = {1, 1};
|
||||
std::vector<ck::index_t> input_right_pads_ = {1, 1};
|
||||
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
|
||||
2,
|
||||
ck::half_t,
|
||||
ck::half_t,
|
||||
ck::half_t,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
Epilogue>();
|
||||
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(in_host,
|
||||
wei_host,
|
||||
out_host,
|
||||
conv_filter_strides_,
|
||||
conv_filter_dilations_,
|
||||
input_left_pads_,
|
||||
input_right_pads_,
|
||||
ck::tensor_operation::element_wise::PassThrough{},
|
||||
ck::tensor_operation::element_wise::PassThrough{},
|
||||
Epilogue{1.0f, 1.0f});
|
||||
out_host.SetZero();
|
||||
ref_invoker.Run(ref_argument);**/
|
||||
|
||||
for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
|
||||
{
|
||||
// substitute instance values into the template
|
||||
auto src = ck::host::InterpolateString(
|
||||
conv_compile_check,
|
||||
{{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});
|
||||
|
||||
auto srcs = get_headers_for_test();
|
||||
srcs.push_back({"main.cpp", src});
|
||||
rtc::compile_options options;
|
||||
auto name = solution.GetTemplateParameter<std::string>("name");
|
||||
options.kernel_name = "run_" + name;
|
||||
auto k = rtc::compile_kernel(srcs, options);
|
||||
|
||||
// Grid size calculation
|
||||
auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
|
||||
|
||||
auto tmp = get_launch_params(solution, out_lengths, out_strides);
|
||||
|
||||
auto grid_size = tmp * in_lengths[1];
|
||||
|
||||
// launch the kernel with arguments needed for the argument pointer
|
||||
k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
|
||||
wei_dev.data(),
|
||||
out_dev.data(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads);
|
||||
|
||||
// auto res = rtc::from_gpu(out_dev);
|
||||
// pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
|
||||
// assert(pass);
|
||||
|
||||
// Simple check: this checks that the output from each instance matches the output from the
|
||||
// first instance
|
||||
CHECK(report(solution, check(rtc::from_gpu(out_dev))));
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, const char* argv[]) { test::run(argc, argv); }
|
||||
209
codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
Normal file
209
codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
Normal file
@@ -0,0 +1,209 @@
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
|
||||
#include "ck/host/headers.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/helper.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include <test.hpp>
|
||||
#include <rtc/compile_kernel.hpp>
|
||||
#include <rtc/hip.hpp>
|
||||
#include <fstream>
|
||||
|
||||
// need this for validation
|
||||
/**struct Epilogue
|
||||
{
|
||||
Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
||||
|
||||
template <typename E, typename D>
|
||||
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
|
||||
const ck::half_t& d) const
|
||||
{
|
||||
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
|
||||
}
|
||||
|
||||
float alpha_;
|
||||
float beta_;
|
||||
};**/
|
||||
const std::string conv_compile_check = R"__ck__(
|
||||
#include <${include}>
|
||||
|
||||
${template};
|
||||
|
||||
)__ck__";
|
||||
|
||||
TEST_CASE(test_problem_kernel)
|
||||
{
|
||||
// set up problem specification
|
||||
ck::host::conv::Problem_Conv_Fwd prob;
|
||||
prob.NumDim = 2;
|
||||
prob.G = 32;
|
||||
prob.N = 256;
|
||||
prob.C = 32;
|
||||
prob.K = 64;
|
||||
prob.Y = 3;
|
||||
prob.X = 3;
|
||||
prob.Hi = 28;
|
||||
prob.Wi = 28;
|
||||
prob.Ho = 28;
|
||||
prob.Wo = 28;
|
||||
check_all<ck::half_t> check;
|
||||
|
||||
// user provided fusion operations
|
||||
std::string epilogue = R"(
|
||||
struct Epilogue
|
||||
{
|
||||
__host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
||||
|
||||
template <typename E, typename D>
|
||||
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
|
||||
const ck::half_t& d) const
|
||||
{
|
||||
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
|
||||
}
|
||||
|
||||
float alpha_;
|
||||
float beta_;
|
||||
};
|
||||
)";
|
||||
std::string prologue = "";
|
||||
|
||||
// length+stride arrays
|
||||
ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
|
||||
static_cast<int>(prob.N),
|
||||
static_cast<int>(prob.C),
|
||||
static_cast<int>(prob.Hi),
|
||||
static_cast<int>(prob.Wi)};
|
||||
ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
|
||||
static_cast<int>(prob.N),
|
||||
static_cast<int>(prob.K),
|
||||
static_cast<int>(prob.Ho),
|
||||
static_cast<int>(prob.Wo)};
|
||||
ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
|
||||
static_cast<int>(prob.K),
|
||||
static_cast<int>(prob.C),
|
||||
static_cast<int>(prob.Y),
|
||||
static_cast<int>(prob.X)};
|
||||
ck::Array<ck::index_t, 5> d_lengths = {};
|
||||
|
||||
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
|
||||
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
|
||||
1,
|
||||
static_cast<int>(prob.Wi * prob.G * prob.C),
|
||||
static_cast<int>(prob.G * prob.C)};
|
||||
ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
|
||||
static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
|
||||
1,
|
||||
static_cast<int>(prob.Wo * prob.G * prob.K),
|
||||
static_cast<int>(prob.G * prob.K)};
|
||||
ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
|
||||
static_cast<int>(prob.Y * prob.X * prob.C),
|
||||
1,
|
||||
static_cast<int>(prob.X * prob.C),
|
||||
static_cast<int>(prob.C)};
|
||||
ck::Array<ck::index_t, 5> d_strides = {};
|
||||
|
||||
ck::Array<ck::index_t, 2> conv_filter_strides = {1, 1};
|
||||
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
|
||||
ck::Array<ck::index_t, 2> input_left_pads = {0, 0};
|
||||
ck::Array<ck::index_t, 2> input_right_pads = {0, 0};
|
||||
|
||||
// move the data onto the device
|
||||
auto in_dev =
|
||||
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
|
||||
auto wei_dev =
|
||||
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
|
||||
auto out_dev =
|
||||
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));
|
||||
|
||||
// CK Verficiation: Reference Kernel
|
||||
/**bool pass = true;
|
||||
Tensor<ck::half_t> in_host(in_lengths, in_strides);
|
||||
in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
|
||||
Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
|
||||
wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
|
||||
Tensor<ck::half_t> out_host(out_lengths, out_strides);
|
||||
|
||||
std::vector<ck::index_t> conv_filter_strides_ = {1, 1};
|
||||
std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
|
||||
std::vector<ck::index_t> input_left_pads_ = {0, 0};
|
||||
std::vector<ck::index_t> input_right_pads_ = {0, 0};
|
||||
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
|
||||
2,
|
||||
ck::half_t,
|
||||
ck::half_t,
|
||||
ck::half_t,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
Epilogue>();
|
||||
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(in_host,
|
||||
wei_host,
|
||||
out_host,
|
||||
conv_filter_strides_,
|
||||
conv_filter_dilations_,
|
||||
input_left_pads_,
|
||||
input_right_pads_,
|
||||
ck::tensor_operation::element_wise::PassThrough{},
|
||||
ck::tensor_operation::element_wise::PassThrough{},
|
||||
Epilogue{1.0f, 1.0f});
|
||||
out_host.SetZero();
|
||||
ref_invoker.Run(ref_argument);**/
|
||||
|
||||
for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
|
||||
{
|
||||
// substitute instance values into the template
|
||||
auto src = ck::host::InterpolateString(
|
||||
conv_compile_check,
|
||||
{{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});
|
||||
|
||||
auto srcs = get_headers_for_test();
|
||||
srcs.push_back({"main.cpp", src});
|
||||
rtc::compile_options options;
|
||||
auto name = solution.GetTemplateParameter<std::string>("name");
|
||||
options.kernel_name = "run_" + name;
|
||||
auto k = rtc::compile_kernel(srcs, options);
|
||||
|
||||
// Grid size calculation
|
||||
auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
|
||||
|
||||
auto tmp = get_launch_params(solution, out_lengths, out_strides);
|
||||
|
||||
auto grid_size = tmp * in_lengths[1];
|
||||
|
||||
// launch the kernel with arguments needed for the argument pointer
|
||||
k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
|
||||
wei_dev.data(),
|
||||
out_dev.data(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads);
|
||||
|
||||
// auto res = rtc::from_gpu(out_dev);
|
||||
// pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
|
||||
// assert(pass);
|
||||
|
||||
// Simple check: this checks that the output from each instance matches the output from the
|
||||
// first instance
|
||||
CHECK(report(solution, check(rtc::from_gpu(out_dev))));
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, const char* argv[]) { test::run(argc, argv); }
|
||||
209
codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
Normal file
209
codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
Normal file
@@ -0,0 +1,209 @@
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
|
||||
#include "ck/host/headers.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/helper.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include "common.hpp"
|
||||
#include <test.hpp>
|
||||
#include <rtc/compile_kernel.hpp>
|
||||
#include <rtc/hip.hpp>
|
||||
#include <fstream>
|
||||
|
||||
// need this for verification
|
||||
/**struct Epilogue
|
||||
{
|
||||
Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
||||
|
||||
template <typename E, typename D>
|
||||
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
|
||||
const ck::half_t& d) const
|
||||
{
|
||||
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
|
||||
}
|
||||
|
||||
float alpha_;
|
||||
float beta_;
|
||||
};**/
|
||||
const std::string conv_compile_check = R"__ck__(
|
||||
#include <${include}>
|
||||
|
||||
${template};
|
||||
|
||||
)__ck__";
|
||||
|
||||
TEST_CASE(test_problem_kernel)
|
||||
{
|
||||
// set up problem specification
|
||||
ck::host::conv::Problem_Conv_Fwd prob;
|
||||
prob.NumDim = 2;
|
||||
prob.G = 32;
|
||||
prob.N = 256;
|
||||
prob.C = 32;
|
||||
prob.K = 64;
|
||||
prob.Y = 3;
|
||||
prob.X = 3;
|
||||
prob.Hi = 28;
|
||||
prob.Wi = 28;
|
||||
prob.Ho = 28;
|
||||
prob.Wo = 28;
|
||||
check_all<ck::half_t> check;
|
||||
|
||||
// user provided fusion operations
|
||||
std::string epilogue = R"(
|
||||
struct Epilogue
|
||||
{
|
||||
__host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
||||
|
||||
template <typename E, typename D>
|
||||
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
|
||||
const ck::half_t& d) const
|
||||
{
|
||||
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
|
||||
}
|
||||
|
||||
float alpha_;
|
||||
float beta_;
|
||||
};
|
||||
)";
|
||||
std::string prologue = "";
|
||||
|
||||
// length+stride arrays
|
||||
ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
|
||||
static_cast<int>(prob.N),
|
||||
static_cast<int>(prob.C),
|
||||
static_cast<int>(prob.Hi),
|
||||
static_cast<int>(prob.Wi)};
|
||||
ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
|
||||
static_cast<int>(prob.N),
|
||||
static_cast<int>(prob.K),
|
||||
static_cast<int>(prob.Ho),
|
||||
static_cast<int>(prob.Wo)};
|
||||
ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
|
||||
static_cast<int>(prob.K),
|
||||
static_cast<int>(prob.C),
|
||||
static_cast<int>(prob.Y),
|
||||
static_cast<int>(prob.X)};
|
||||
ck::Array<ck::index_t, 5> d_lengths = {};
|
||||
|
||||
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
|
||||
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
|
||||
1,
|
||||
static_cast<int>(prob.Wi * prob.G * prob.C),
|
||||
static_cast<int>(prob.G * prob.C)};
|
||||
ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
|
||||
static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
|
||||
1,
|
||||
static_cast<int>(prob.Wo * prob.G * prob.K),
|
||||
static_cast<int>(prob.G * prob.K)};
|
||||
ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
|
||||
static_cast<int>(prob.Y * prob.X * prob.C),
|
||||
1,
|
||||
static_cast<int>(prob.X * prob.C),
|
||||
static_cast<int>(prob.C)};
|
||||
ck::Array<ck::index_t, 5> d_strides = {};
|
||||
|
||||
ck::Array<ck::index_t, 2> conv_filter_strides = {2, 2};
|
||||
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
|
||||
ck::Array<ck::index_t, 2> input_left_pads = {0, 0};
|
||||
ck::Array<ck::index_t, 2> input_right_pads = {0, 0};
|
||||
|
||||
// move the data onto the device
|
||||
auto in_dev =
|
||||
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
|
||||
auto wei_dev =
|
||||
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
|
||||
auto out_dev =
|
||||
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));
|
||||
|
||||
// CK Verficiation: Reference Kernel
|
||||
/**bool pass = true;
|
||||
Tensor<ck::half_t> in_host(in_lengths, in_strides);
|
||||
in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
|
||||
Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
|
||||
wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
|
||||
Tensor<ck::half_t> out_host(out_lengths, out_strides);
|
||||
|
||||
std::vector<ck::index_t> conv_filter_strides_ = {2, 2};
|
||||
std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
|
||||
std::vector<ck::index_t> input_left_pads_ = {0, 0};
|
||||
std::vector<ck::index_t> input_right_pads_ = {0, 0};
|
||||
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
|
||||
2,
|
||||
ck::half_t,
|
||||
ck::half_t,
|
||||
ck::half_t,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
Epilogue>();
|
||||
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(in_host,
|
||||
wei_host,
|
||||
out_host,
|
||||
conv_filter_strides_,
|
||||
conv_filter_dilations_,
|
||||
input_left_pads_,
|
||||
input_right_pads_,
|
||||
ck::tensor_operation::element_wise::PassThrough{},
|
||||
ck::tensor_operation::element_wise::PassThrough{},
|
||||
Epilogue{1.0f, 1.0f});
|
||||
out_host.SetZero();
|
||||
ref_invoker.Run(ref_argument);**/
|
||||
|
||||
for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
|
||||
{
|
||||
// substitute instance values into the template
|
||||
auto src = ck::host::InterpolateString(
|
||||
conv_compile_check,
|
||||
{{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});
|
||||
|
||||
auto srcs = get_headers_for_test();
|
||||
srcs.push_back({"main.cpp", src});
|
||||
rtc::compile_options options;
|
||||
auto name = solution.GetTemplateParameter<std::string>("name");
|
||||
options.kernel_name = "run_" + name;
|
||||
auto k = rtc::compile_kernel(srcs, options);
|
||||
|
||||
// Grid size calculation
|
||||
auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
|
||||
|
||||
auto tmp = get_launch_params(solution, out_lengths, out_strides);
|
||||
|
||||
auto grid_size = tmp * in_lengths[1];
|
||||
|
||||
// launch the kernel with arguments needed for the argument pointer
|
||||
k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
|
||||
wei_dev.data(),
|
||||
out_dev.data(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads);
|
||||
|
||||
// auto res = rtc::from_gpu(out_dev);
|
||||
// pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
|
||||
// assert(pass);
|
||||
|
||||
// Simple check: this checks that the output from each instance matches the output from the
|
||||
// first instance
|
||||
CHECK(report(solution, check(rtc::from_gpu(out_dev))));
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, const char* argv[]) { test::run(argc, argv); }
|
||||
209
codegen/test/grouped_conv_fwd_multiple_d_v4.cpp
Normal file
209
codegen/test/grouped_conv_fwd_multiple_d_v4.cpp
Normal file
@@ -0,0 +1,209 @@
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
|
||||
#include "ck/host/headers.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/helper.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include "common.hpp"
|
||||
#include <test.hpp>
|
||||
#include <rtc/compile_kernel.hpp>
|
||||
#include <rtc/hip.hpp>
|
||||
#include <fstream>
|
||||
|
||||
// need this for verification
|
||||
/**struct Epilogue
|
||||
{
|
||||
Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
||||
|
||||
template <typename E, typename D>
|
||||
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
|
||||
const ck::half_t& d) const
|
||||
{
|
||||
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
|
||||
}
|
||||
|
||||
float alpha_;
|
||||
float beta_;
|
||||
};**/
|
||||
const std::string conv_compile_check = R"__ck__(
|
||||
#include <${include}>
|
||||
|
||||
${template};
|
||||
|
||||
)__ck__";
|
||||
|
||||
TEST_CASE(test_problem_kernel)
|
||||
{
|
||||
// set up problem specification
|
||||
ck::host::conv::Problem_Conv_Fwd prob;
|
||||
prob.NumDim = 2;
|
||||
prob.G = 32;
|
||||
prob.N = 256;
|
||||
prob.C = 32;
|
||||
prob.K = 64;
|
||||
prob.Y = 3;
|
||||
prob.X = 3;
|
||||
prob.Hi = 28;
|
||||
prob.Wi = 28;
|
||||
prob.Ho = 28;
|
||||
prob.Wo = 28;
|
||||
check_all<ck::half_t> check;
|
||||
|
||||
// user provided fusion operations
|
||||
std::string epilogue = R"(
|
||||
struct Epilogue
|
||||
{
|
||||
__host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
||||
|
||||
template <typename E, typename D>
|
||||
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
|
||||
const ck::half_t& d) const
|
||||
{
|
||||
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
|
||||
}
|
||||
|
||||
float alpha_;
|
||||
float beta_;
|
||||
};
|
||||
)";
|
||||
std::string prologue = "";
|
||||
|
||||
// length+stride arrays
|
||||
ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
|
||||
static_cast<int>(prob.N),
|
||||
static_cast<int>(prob.C),
|
||||
static_cast<int>(prob.Hi),
|
||||
static_cast<int>(prob.Wi)};
|
||||
ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
|
||||
static_cast<int>(prob.N),
|
||||
static_cast<int>(prob.K),
|
||||
static_cast<int>(prob.Ho),
|
||||
static_cast<int>(prob.Wo)};
|
||||
ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
|
||||
static_cast<int>(prob.K),
|
||||
static_cast<int>(prob.C),
|
||||
static_cast<int>(prob.Y),
|
||||
static_cast<int>(prob.X)};
|
||||
ck::Array<ck::index_t, 5> d_lengths = {};
|
||||
|
||||
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
|
||||
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
|
||||
1,
|
||||
static_cast<int>(prob.Wi * prob.G * prob.C),
|
||||
static_cast<int>(prob.G * prob.C)};
|
||||
ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
|
||||
static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
|
||||
1,
|
||||
static_cast<int>(prob.Wo * prob.G * prob.K),
|
||||
static_cast<int>(prob.G * prob.K)};
|
||||
ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
|
||||
static_cast<int>(prob.Y * prob.X * prob.C),
|
||||
1,
|
||||
static_cast<int>(prob.X * prob.C),
|
||||
static_cast<int>(prob.C)};
|
||||
ck::Array<ck::index_t, 5> d_strides = {};
|
||||
|
||||
ck::Array<ck::index_t, 2> conv_filter_strides = {1, 1};
|
||||
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
|
||||
ck::Array<ck::index_t, 2> input_left_pads = {1, 1};
|
||||
ck::Array<ck::index_t, 2> input_right_pads = {1, 1};
|
||||
|
||||
// move the data onto the device
|
||||
auto in_dev =
|
||||
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
|
||||
auto wei_dev =
|
||||
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
|
||||
auto out_dev =
|
||||
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));
|
||||
|
||||
// CK Verficiation: Reference Kernel
|
||||
/**bool pass = true;
|
||||
Tensor<ck::half_t> in_host(in_lengths, in_strides);
|
||||
in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
|
||||
Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
|
||||
wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
|
||||
Tensor<ck::half_t> out_host(out_lengths, out_strides);
|
||||
|
||||
std::vector<ck::index_t> conv_filter_strides_ = {1, 1};
|
||||
std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
|
||||
std::vector<ck::index_t> input_left_pads_ = {1, 1};
|
||||
std::vector<ck::index_t> input_right_pads_ = {1, 1};
|
||||
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
|
||||
2,
|
||||
ck::half_t,
|
||||
ck::half_t,
|
||||
ck::half_t,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
Epilogue>();
|
||||
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(in_host,
|
||||
wei_host,
|
||||
out_host,
|
||||
conv_filter_strides_,
|
||||
conv_filter_dilations_,
|
||||
input_left_pads_,
|
||||
input_right_pads_,
|
||||
ck::tensor_operation::element_wise::PassThrough{},
|
||||
ck::tensor_operation::element_wise::PassThrough{},
|
||||
Epilogue{1.0f, 1.0f});
|
||||
out_host.SetZero();
|
||||
ref_invoker.Run(ref_argument);**/
|
||||
|
||||
for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
|
||||
{
|
||||
// substitute instance values into the template
|
||||
auto src = ck::host::InterpolateString(
|
||||
conv_compile_check,
|
||||
{{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});
|
||||
|
||||
auto srcs = get_headers_for_test();
|
||||
srcs.push_back({"main.cpp", src});
|
||||
rtc::compile_options options;
|
||||
auto name = solution.GetTemplateParameter<std::string>("name");
|
||||
options.kernel_name = "run_" + name;
|
||||
auto k = rtc::compile_kernel(srcs, options);
|
||||
|
||||
// Grid size calculation
|
||||
auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
|
||||
|
||||
auto tmp = get_launch_params(solution, out_lengths, out_strides);
|
||||
|
||||
auto grid_size = tmp * in_lengths[1];
|
||||
|
||||
// launch the kernel with arguments needed for the argument pointer
|
||||
k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
|
||||
wei_dev.data(),
|
||||
out_dev.data(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads);
|
||||
|
||||
// auto res = rtc::from_gpu(out_dev);
|
||||
// pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
|
||||
// assert(pass);
|
||||
|
||||
// Simple check: this checks that the output from each instance matches the output from the
|
||||
// first instance
|
||||
CHECK(report(solution, check(rtc::from_gpu(out_dev))));
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, const char* argv[]) { test::run(argc, argv); }
|
||||
@@ -56,6 +56,8 @@ void write_string(const std::string& filename, const std::string_view& buffer)
|
||||
}
|
||||
|
||||
std::string compiler() { return "/opt/rocm/llvm/bin/clang++ -x hip --cuda-device-only"; }
|
||||
// TODO: undo after extracting the codeobj
|
||||
// std::string compiler() { return "/opt/rocm/llvm/bin/clang++ -x hip"; }
|
||||
|
||||
kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options)
|
||||
{
|
||||
@@ -89,6 +91,12 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
|
||||
|
||||
auto obj = read_buffer(out_path.string());
|
||||
|
||||
std::ofstream ofh("obj.o", std::ios::binary);
|
||||
for(auto i : obj)
|
||||
ofh << i;
|
||||
ofh.close();
|
||||
// int s = std::system(("/usr/bin/cp " + out_path.string() + " codeobj.bin").c_str());
|
||||
// assert(s == 0);
|
||||
return kernel{obj.data(), options.kernel_name};
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#include <rtc/manage_ptr.hpp>
|
||||
#include <stdexcept>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
namespace rtc {
|
||||
|
||||
@@ -49,7 +50,10 @@ std::size_t get_available_gpu_memory()
|
||||
size_t total;
|
||||
auto status = hipMemGetInfo(&free, &total);
|
||||
if(status != hipSuccess)
|
||||
throw std::runtime_error("Failed getting available memory: " + hip_error(status));
|
||||
{
|
||||
std::cerr << "Failed getting available memory: " + hip_error(status) << std::endl;
|
||||
return (8ull * 1024ull * 1024ull * 1024ull);
|
||||
}
|
||||
return free;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user