mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 09:16:52 +00:00
* create files for xdlops * working on blockwise_gemm_xdlops * add KReduction * add m/n repeats * add 2x2 pipeline * added 128x128 wavegemm * use StaticBuffer of vector_type * break vector type to blk_size * add kpack into xldops_gemm and blockwise_gemm * abroadcast only * add fp32 mfma instructions * adding fp16 mfma * pack half4_t * rename kperwave to kpack * add 32x32x8fp16 * add fp16 mfma * clean code * clean code * V4r4 xdlops kpack (#35) * add kpack with incorrect results * bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2 * add 1x1 kernel * add gridwise_gemm_v2 - single_buffer * enabled dwordx4 for fp16 Co-authored-by: Chao Liu <chao.liu2@amd.com> * refactor fwd-v4r4-xdlops * add v4r4-nhwc-xdlop * improve some perf of nhwc and nchw by tuning parameters, and change scheuduling in gridwise-gemm loop * tweak scheduling in gridwise gemm * add v4r3 with a single output copy * init commit: output with slice win * adding sliceWin * add multiple repeats pattern * starting adding bwd-v4r1-xdlops * use tuple as SrcBuffer * adding bwd-data v4r1 nhwc xdlops * fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2() * fix bug in host bwd-data conv * initial implementation of bwd-data v4r1 nhwc xdlops * add launch bound flags * enable launch bound * add m/nrepeat=4 * tweak bwd-data v4r1 nhwc xdlops * added bwd-data v4r1 nhwc xlops with output A and weight B * add fwd-v4r4 nhwc xdlops, A input, B weight, C output Co-authored-by: Chao Liu <chao.liu2@amd.com>
144 lines
4.9 KiB
C++
#pragma once
#include "host_tensor.hpp"
template <typename TIn,
|
|
typename TWei,
|
|
typename TOut,
|
|
typename ConvStrides,
|
|
typename ConvDilations,
|
|
typename InLeftPads,
|
|
typename InRightPads>
|
|
void host_direct_convolution_backward_data(Tensor<TIn>& in,
|
|
const Tensor<TWei>& wei,
|
|
const Tensor<TOut>& out,
|
|
const ConvStrides& conv_strides,
|
|
const ConvDilations& conv_dilations,
|
|
const InLeftPads& in_left_pads,
|
|
const InRightPads& in_right_pads,
|
|
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
|
|
{
|
|
using namespace ck;
|
|
|
|
constexpr auto I0 = Number<0>{};
|
|
constexpr auto I1 = Number<1>{};
|
|
constexpr auto I2 = Number<2>{};
|
|
constexpr auto I3 = Number<3>{};
|
|
|
|
auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
|
|
std::size_t N = in.mDesc.GetLengths()[I0];
|
|
std::size_t C = in.mDesc.GetLengths()[I1];
|
|
std::size_t Hi = in.mDesc.GetLengths()[I2];
|
|
std::size_t Wi = in.mDesc.GetLengths()[I3];
|
|
|
|
std::size_t K = wei.mDesc.GetLengths()[I0];
|
|
std::size_t Y = wei.mDesc.GetLengths()[I2];
|
|
std::size_t X = wei.mDesc.GetLengths()[I3];
|
|
|
|
std::size_t Ho = out.mDesc.GetLengths()[I2];
|
|
std::size_t Wo = out.mDesc.GetLengths()[I3];
|
|
|
|
double v = 0;
|
|
|
|
for(int y = 0; y < Y; ++y)
|
|
{
|
|
int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
|
|
|
|
if(h_tmp % conv_strides[I0] == 0)
|
|
{
|
|
int ho = h_tmp / conv_strides[I0];
|
|
|
|
if(ho >= 0 && ho < Ho)
|
|
{
|
|
for(int x = 0; x < X; ++x)
|
|
{
|
|
int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
|
|
|
|
if(w_tmp % conv_strides[I1] == 0)
|
|
{
|
|
int wo = w_tmp / conv_strides[I1];
|
|
|
|
if(wo >= 0 && wo < Wo)
|
|
{
|
|
for(int k = 0; k < K; ++k)
|
|
{
|
|
v += out(n, k, ho, wo) * wei(k, c, y, x);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
in(n, c, hi, wi) = v;
|
|
};
|
|
|
|
auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
|
|
std::size_t N = in.mDesc.GetLengths()[I0];
|
|
std::size_t Hi = in.mDesc.GetLengths()[I1];
|
|
std::size_t Wi = in.mDesc.GetLengths()[I2];
|
|
std::size_t C = in.mDesc.GetLengths()[I3];
|
|
|
|
std::size_t K = wei.mDesc.GetLengths()[I0];
|
|
std::size_t Y = wei.mDesc.GetLengths()[I1];
|
|
std::size_t X = wei.mDesc.GetLengths()[I2];
|
|
|
|
std::size_t Ho = out.mDesc.GetLengths()[I1];
|
|
std::size_t Wo = out.mDesc.GetLengths()[I2];
|
|
|
|
double v = 0;
|
|
|
|
for(int y = 0; y < Y; ++y)
|
|
{
|
|
int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
|
|
|
|
if(h_tmp % conv_strides[I0] == 0)
|
|
{
|
|
int ho = h_tmp / conv_strides[I0];
|
|
|
|
if(ho >= 0 && ho < Ho)
|
|
{
|
|
for(int x = 0; x < X; ++x)
|
|
{
|
|
int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
|
|
|
|
if(w_tmp % conv_strides[I1] == 0)
|
|
{
|
|
int wo = w_tmp / conv_strides[I1];
|
|
|
|
if(wo >= 0 && wo < Wo)
|
|
{
|
|
for(int k = 0; k < K; ++k)
|
|
{
|
|
v += out(n, ho, wo, k) * wei(k, y, x, c);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
in(n, hi, wi, c) = v;
|
|
};
|
|
|
|
switch(layout)
|
|
{
|
|
case ConvTensorLayout::NCHW:
|
|
make_ParallelTensorFunctor(f_nchw,
|
|
in.mDesc.GetLengths()[0],
|
|
in.mDesc.GetLengths()[1],
|
|
in.mDesc.GetLengths()[2],
|
|
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
|
break;
|
|
case ConvTensorLayout::NHWC:
|
|
make_ParallelTensorFunctor(f_nhwc,
|
|
in.mDesc.GetLengths()[0],
|
|
in.mDesc.GetLengths()[1],
|
|
in.mDesc.GetLengths()[2],
|
|
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
|
break;
|
|
default: throw std::runtime_error("wrong! not supported layout");
|
|
}
|
|
}
|