mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
* create files for xdlops
* working on blockwise_gemm_xdlops
* add KReduction
* add m/n repeats
* add 2x2 pipeline
* added 128x128 wavegemm
* use StaticBuffer of vector_type
* break vector type to blk_size
* add kpack into xdlops_gemm and blockwise_gemm
* abroadcast only
* add fp32 mfma instructions
* adding fp16 mfma
* pack half4_t
* rename kperwave to kpack
* add 32x32x8fp16
* add fp16 mfma
* clean code
* clean code
* V4r4 xdlops kpack (#35)
* add kpack with incorrect results
* bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2
* add 1x1 kernel
* add gridwise_gemm_v2 - single_buffer
* enabled dwordx4 for fp16
Co-authored-by: Chao Liu <chao.liu2@amd.com>
* refactor fwd-v4r4-xdlops
* add v4r4-nhwc-xdlop
* improve some perf of nhwc and nchw by tuning parameters, and change scheduling in gridwise-gemm loop
* tweak scheduling in gridwise gemm
* add v4r3 with a single output copy
* init commit: output with slice win
* adding sliceWin
* add multiple repeats pattern
* starting adding bwd-v4r1-xdlops
* use tuple as SrcBuffer
* adding bwd-data v4r1 nhwc xdlops
* fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2()
* fix bug in host bwd-data conv
* initial implementation of bwd-data v4r1 nhwc xdlops
* add launch bound flags
* enable launch bound
* add m/nrepeat=4
* tweak bwd-data v4r1 nhwc xdlops
* added bwd-data v4r1 nhwc xdlops with output A and weight B
* add fwd-v4r4 nhwc xdlops, A input, B weight, C output
Co-authored-by: Chao Liu <chao.liu2@amd.com>
[ROCm/composable_kernel commit: 3835318cc3]
144 lines
4.9 KiB
C++
144 lines
4.9 KiB
C++
#pragma once
|
|
#include "host_tensor.hpp"
|
|
|
|
template <typename TIn,
|
|
typename TWei,
|
|
typename TOut,
|
|
typename ConvStrides,
|
|
typename ConvDilations,
|
|
typename InLeftPads,
|
|
typename InRightPads>
|
|
void host_direct_convolution_backward_data(Tensor<TIn>& in,
|
|
const Tensor<TWei>& wei,
|
|
const Tensor<TOut>& out,
|
|
const ConvStrides& conv_strides,
|
|
const ConvDilations& conv_dilations,
|
|
const InLeftPads& in_left_pads,
|
|
const InRightPads& in_right_pads,
|
|
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
|
|
{
|
|
using namespace ck;
|
|
|
|
constexpr auto I0 = Number<0>{};
|
|
constexpr auto I1 = Number<1>{};
|
|
constexpr auto I2 = Number<2>{};
|
|
constexpr auto I3 = Number<3>{};
|
|
|
|
auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
|
|
std::size_t N = in.mDesc.GetLengths()[I0];
|
|
std::size_t C = in.mDesc.GetLengths()[I1];
|
|
std::size_t Hi = in.mDesc.GetLengths()[I2];
|
|
std::size_t Wi = in.mDesc.GetLengths()[I3];
|
|
|
|
std::size_t K = wei.mDesc.GetLengths()[I0];
|
|
std::size_t Y = wei.mDesc.GetLengths()[I2];
|
|
std::size_t X = wei.mDesc.GetLengths()[I3];
|
|
|
|
std::size_t Ho = out.mDesc.GetLengths()[I2];
|
|
std::size_t Wo = out.mDesc.GetLengths()[I3];
|
|
|
|
double v = 0;
|
|
|
|
for(int y = 0; y < Y; ++y)
|
|
{
|
|
int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
|
|
|
|
if(h_tmp % conv_strides[I0] == 0)
|
|
{
|
|
int ho = h_tmp / conv_strides[I0];
|
|
|
|
if(ho >= 0 && ho < Ho)
|
|
{
|
|
for(int x = 0; x < X; ++x)
|
|
{
|
|
int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
|
|
|
|
if(w_tmp % conv_strides[I1] == 0)
|
|
{
|
|
int wo = w_tmp / conv_strides[I1];
|
|
|
|
if(wo >= 0 && wo < Wo)
|
|
{
|
|
for(int k = 0; k < K; ++k)
|
|
{
|
|
v += out(n, k, ho, wo) * wei(k, c, y, x);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
in(n, c, hi, wi) = v;
|
|
};
|
|
|
|
auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
|
|
std::size_t N = in.mDesc.GetLengths()[I0];
|
|
std::size_t Hi = in.mDesc.GetLengths()[I1];
|
|
std::size_t Wi = in.mDesc.GetLengths()[I2];
|
|
std::size_t C = in.mDesc.GetLengths()[I3];
|
|
|
|
std::size_t K = wei.mDesc.GetLengths()[I0];
|
|
std::size_t Y = wei.mDesc.GetLengths()[I1];
|
|
std::size_t X = wei.mDesc.GetLengths()[I2];
|
|
|
|
std::size_t Ho = out.mDesc.GetLengths()[I1];
|
|
std::size_t Wo = out.mDesc.GetLengths()[I2];
|
|
|
|
double v = 0;
|
|
|
|
for(int y = 0; y < Y; ++y)
|
|
{
|
|
int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
|
|
|
|
if(h_tmp % conv_strides[I0] == 0)
|
|
{
|
|
int ho = h_tmp / conv_strides[I0];
|
|
|
|
if(ho >= 0 && ho < Ho)
|
|
{
|
|
for(int x = 0; x < X; ++x)
|
|
{
|
|
int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
|
|
|
|
if(w_tmp % conv_strides[I1] == 0)
|
|
{
|
|
int wo = w_tmp / conv_strides[I1];
|
|
|
|
if(wo >= 0 && wo < Wo)
|
|
{
|
|
for(int k = 0; k < K; ++k)
|
|
{
|
|
v += out(n, ho, wo, k) * wei(k, y, x, c);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
in(n, hi, wi, c) = v;
|
|
};
|
|
|
|
switch(layout)
|
|
{
|
|
case ConvTensorLayout::NCHW:
|
|
make_ParallelTensorFunctor(f_nchw,
|
|
in.mDesc.GetLengths()[0],
|
|
in.mDesc.GetLengths()[1],
|
|
in.mDesc.GetLengths()[2],
|
|
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
|
break;
|
|
case ConvTensorLayout::NHWC:
|
|
make_ParallelTensorFunctor(f_nhwc,
|
|
in.mDesc.GetLengths()[0],
|
|
in.mDesc.GetLengths()[1],
|
|
in.mDesc.GetLengths()[2],
|
|
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
|
break;
|
|
default: throw std::runtime_error("wrong! not supported layout");
|
|
}
|
|
}
|