mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 17:26:00 +00:00
* create files for xdlops * working on blockwise_gemm_xdlops * add KReduction * add m/n repeats * add 2x2 pipeline * added 128x128 wavegemm * use StaticBuffer of vector_type * break vector type to blk_size * add kpack into xldops_gemm and blockwise_gemm * abroadcast only * add fp32 mfma instructions * adding fp16 mfma * pack half4_t * rename kperwave to kpack * add 32x32x8fp16 * add fp16 mfma * clean code * clean code * V4r4 xdlops kpack (#35) * add kpack with incorrect results * bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2 * add 1x1 kernel * add gridwise_gemm_v2 - single_buffer * enabled dwordx4 for fp16 Co-authored-by: Chao Liu <chao.liu2@amd.com> * refactor fwd-v4r4-xdlops * add v4r4-nhwc-xdlop * improve some perf of nhwc and nchw by tuning parameters, and change scheuduling in gridwise-gemm loop * tweak scheduling in gridwise gemm * add v4r3 with a single output copy * init commit: output with slice win * adding sliceWin * add multiple repeats pattern * starting adding bwd-v4r1-xdlops * use tuple as SrcBuffer * adding bwd-data v4r1 nhwc xdlops * fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2() * fix bug in host bwd-data conv * initial implementation of bwd-data v4r1 nhwc xdlops * add launch bound flags * enable launch bound * add m/nrepeat=4 * tweak bwd-data v4r1 nhwc xdlops * added bwd-data v4r1 nhwc xlops with output A and weight B * add fwd-v4r4 nhwc xdlops, A input, B weight, C output Co-authored-by: Chao Liu <chao.liu2@amd.com>
59 lines
1.3 KiB
C++
59 lines
1.3 KiB
C++
#ifndef HOST_TENSOR_GENERATOR_HPP
|
|
#define HOST_TENSOR_GENERATOR_HPP
|
|
|
|
#include <array>
#include <cmath>
#include <cstdlib>
#include <numeric>

#include "config.hpp"
|
|
|
|
// Constant generator: produces the same number no matter which indices it is
// called with.
struct GeneratorTensor_1
{
    // Number returned by every call; defaults to 1.
    int value = 1;

    // Returns `value` converted to double.
    //
    // The arguments (presumably the coordinates of a tensor element) are
    // accepted in any number and of any type, and are ignored.
    //
    // The call does not modify the generator, so it is declared const and can
    // be invoked on a const object or through a const reference.
    template <typename... Is>
    double operator()(Is...) const
    {
        return value;
    }
};
|
|
|
|
// Pseudo-random integer generator built on std::rand().
//
// For max_value > min_value every call returns an integer in the half-open
// interval [min_value, max_value), converted to double.
struct GeneratorTensor_2
{
    int min_value = 0; // inclusive lower bound
    int max_value = 1; // exclusive upper bound

    // Returns the next pseudo-random value.
    //
    // The arguments (presumably the coordinates of a tensor element) are
    // accepted in any number and of any type, and are ignored; the result
    // depends only on the std::rand() sequence and the two bounds.
    //
    // The call does not modify the generator object itself, so it is declared
    // const and can be invoked on a const object or through a const reference.
    template <typename... Is>
    double operator()(Is...) const
    {
        const int range = max_value - min_value;

        // An empty interval would make the `%` below a modulo by zero, which
        // is undefined behavior. Return the lower bound instead.
        if(range == 0)
        {
            return min_value;
        }

        return (std::rand() % range) + min_value;
    }
};
|
|
|
|
struct GeneratorTensor_3
|
|
{
|
|
template <typename... Is>
|
|
double operator()(Is... is)
|
|
{
|
|
std::array<ck::index_t, sizeof...(Is)> dims = {{static_cast<ck::index_t>(is)...}};
|
|
|
|
auto f_acc = [](auto a, auto b) { return 10 * a + b; };
|
|
|
|
return std::accumulate(dims.begin(), dims.end(), ck::index_t(0), f_acc);
|
|
}
|
|
};
|
|
|
|
struct GeneratorTensor_Checkboard
|
|
{
|
|
template <typename... Ts>
|
|
double operator()(Ts... Xs) const
|
|
{
|
|
std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
|
|
return std::accumulate(dims.begin(),
|
|
dims.end(),
|
|
true,
|
|
[](bool init, ck::index_t x) -> int { return init != (x % 2); })
|
|
? 1
|
|
: -1;
|
|
}
|
|
};
|
|
|
|
#endif
|