mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 01:10:17 +00:00
* create files for xdlops * working on blockwise_gemm_xdlops * add KReduction * add m/n repeats * add 2x2 pipeline * added 128x128 wavegemm * use StaticBuffer of vector_type * break vector type to blk_size * add kpack into xldops_gemm and blockwise_gemm * abroadcast only * add fp32 mfma instructions * adding fp16 mfma * pack half4_t * rename kperwave to kpack * add 32x32x8fp16 * add fp16 mfma * clean code * clean code * V4r4 xdlops kpack (#35) * add kpack with incorrect results * bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2 * add 1x1 kernel * add gridwise_gemm_v2 - single_buffer * enabled dwordx4 for fp16 Co-authored-by: Chao Liu <chao.liu2@amd.com> * refactor fwd-v4r4-xdlops * add v4r4-nhwc-xdlop * improve some perf of nhwc and nchw by tuning parameters, and change scheuduling in gridwise-gemm loop * tweak scheduling in gridwise gemm * add v4r3 with a single output copy * init commit: output with slice win * adding sliceWin * add multiple repeats pattern * starting adding bwd-v4r1-xdlops * use tuple as SrcBuffer * adding bwd-data v4r1 nhwc xdlops * fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2() * fix bug in host bwd-data conv * initial implementation of bwd-data v4r1 nhwc xdlops * add launch bound flags * enable launch bound * add m/nrepeat=4 * tweak bwd-data v4r1 nhwc xdlops * added bwd-data v4r1 nhwc xlops with output A and weight B * add fwd-v4r4 nhwc xdlops, A input, B weight, C output Co-authored-by: Chao Liu <chao.liu2@amd.com>
300 lines
9.0 KiB
C++
300 lines
9.0 KiB
C++
#include <iostream>
|
|
#include <numeric>
|
|
#include <initializer_list>
|
|
#include <cstdlib>
|
|
#include <stdlib.h>
|
|
#include "config.hpp"
|
|
#include "print.hpp"
|
|
#include "device.hpp"
|
|
#include "host_tensor_generator.hpp"
|
|
#include "device_tensor.hpp"
|
|
#include "conv_common.hpp"
|
|
#include "host_conv_bwd_data.hpp"
|
|
#include "device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp"
|
|
#include "device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp"
|
|
#include "device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
|
|
#include "device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk.hpp"
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
using namespace launcher;
|
|
|
|
#if 1
|
|
// 1x1 filter, 14x14 image
|
|
constexpr index_t N = 1;
|
|
constexpr index_t C = 256;
|
|
constexpr index_t HI = 1;
|
|
constexpr index_t WI = 128;
|
|
constexpr index_t K = 16;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 0
|
|
constexpr index_t N = 64;
|
|
constexpr index_t C = 256;
|
|
constexpr index_t HI = 56;
|
|
constexpr index_t WI = 56;
|
|
constexpr index_t K = 256;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 0
|
|
// 3x3, 34x34
|
|
constexpr index_t N = 64;
|
|
constexpr index_t C = 256;
|
|
constexpr index_t HI = 34;
|
|
constexpr index_t WI = 34;
|
|
constexpr index_t K = 256;
|
|
constexpr index_t Y = 3;
|
|
constexpr index_t X = 3;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 0
|
|
// 3x3, 28x28
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 128;
|
|
constexpr index_t HI = 28;
|
|
constexpr index_t WI = 28;
|
|
constexpr index_t K = 128;
|
|
constexpr index_t Y = 3;
|
|
constexpr index_t X = 3;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<1, 1>;
|
|
using RightPads = Sequence<1, 1>;
|
|
#elif 0
|
|
// 1x1 filter, 8x8 image
|
|
constexpr index_t N = 256;
|
|
constexpr index_t C = 1024;
|
|
constexpr index_t HI = 8;
|
|
constexpr index_t WI = 8;
|
|
constexpr index_t K = 1024;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 0
|
|
// 1x1 filter, 7x7 image
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 1024;
|
|
constexpr index_t HI = 7;
|
|
constexpr index_t WI = 7;
|
|
constexpr index_t K = 1024;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 1
|
|
// 1x1 filter, 14x14 image
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 512;
|
|
constexpr index_t HI = 14;
|
|
constexpr index_t WI = 14;
|
|
constexpr index_t K = 128;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 0
|
|
// 1x1 filter, 28x28 image
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 128;
|
|
constexpr index_t HI = 28;
|
|
constexpr index_t WI = 28;
|
|
constexpr index_t K = 128;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 0
|
|
// 1x1 filter, 17x17 input
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 1024;
|
|
constexpr index_t HI = 17;
|
|
constexpr index_t WI = 17;
|
|
constexpr index_t K = 1024;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 0
|
|
// 5x5 filter, 2x2 pad, 7x7 input
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 1024;
|
|
constexpr index_t HI = 7;
|
|
constexpr index_t WI = 7;
|
|
constexpr index_t K = 1024;
|
|
constexpr index_t Y = 5;
|
|
constexpr index_t X = 5;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<2, 2>;
|
|
using RightPads = Sequence<2, 2>;
|
|
#elif 0
|
|
// 1x7 filter, 0x3 pad, 17x17 input
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 128;
|
|
constexpr index_t HI = 17;
|
|
constexpr index_t WI = 17;
|
|
constexpr index_t K = 128;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 7;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 3>;
|
|
using RightPads = Sequence<0, 3>;
|
|
#elif 0
|
|
// 7x1 filter, 3x0 pad, 17x17 input
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 256;
|
|
constexpr index_t HI = 17;
|
|
constexpr index_t WI = 17;
|
|
constexpr index_t K = 1024;
|
|
constexpr index_t Y = 7;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<3, 0>;
|
|
using RightPads = Sequence<3, 0>;
|
|
#elif 1
|
|
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 256;
|
|
constexpr index_t HI = 35;
|
|
constexpr index_t WI = 35;
|
|
constexpr index_t K = 1280;
|
|
constexpr index_t Y = 3;
|
|
constexpr index_t X = 3;
|
|
|
|
using ConvStrides = Sequence<2, 2>;
|
|
using ConvDilations = Sequence<2, 2>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#endif
|
|
|
|
constexpr auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
|
|
constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
|
|
constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
|
|
in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
|
|
|
|
ostream_tensor_descriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
|
|
ostream_tensor_descriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
|
|
ostream_tensor_descriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
|
|
print_array("LeftPads", LeftPads{});
|
|
print_array("LeftPads", LeftPads{});
|
|
print_array("RightPads", RightPads{});
|
|
print_array("ConvStrides", ConvStrides{});
|
|
print_array("ConvDilations", ConvDilations{});
|
|
|
|
Tensor<float> in_nchw_device(make_HostTensorDescriptor(in_nchw_desc));
|
|
Tensor<float> in_nchw_host(make_HostTensorDescriptor(in_nchw_desc));
|
|
Tensor<float> wei_kcyx(make_HostTensorDescriptor(wei_kcyx_desc));
|
|
Tensor<float> out_nkhw(make_HostTensorDescriptor(out_nkhw_desc));
|
|
|
|
std::size_t num_thread = std::thread::hardware_concurrency();
|
|
|
|
if(argc != 3)
|
|
{
|
|
printf("arg1: do_verification, arg2: nrepeat\n");
|
|
exit(1);
|
|
}
|
|
|
|
bool do_verification = atoi(argv[1]);
|
|
std::size_t nrepeat = atoi(argv[2]);
|
|
|
|
if(do_verification)
|
|
{
|
|
#if 0
|
|
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{1}, num_thread);
|
|
out_nkhw.GenerateTensorValue(GeneratorTensor_1{1}, num_thread);
|
|
#else
|
|
wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
|
out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
|
#endif
|
|
}
|
|
|
|
#if 0
|
|
device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
|
|
#elif 0
|
|
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
|
|
#elif 0
|
|
device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw
|
|
#elif 1
|
|
device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk
|
|
#endif
|
|
(in_nchw_desc,
|
|
in_nchw_device,
|
|
wei_kcyx_desc,
|
|
wei_kcyx,
|
|
out_nkhw_desc,
|
|
out_nkhw,
|
|
ConvStrides{},
|
|
ConvDilations{},
|
|
LeftPads{},
|
|
RightPads{},
|
|
nrepeat);
|
|
|
|
if(do_verification)
|
|
{
|
|
host_direct_convolution_backward_data(in_nchw_host,
|
|
wei_kcyx,
|
|
out_nkhw,
|
|
ConvStrides{},
|
|
ConvDilations{},
|
|
LeftPads{},
|
|
RightPads{});
|
|
|
|
check_error(in_nchw_host, in_nchw_device);
|
|
|
|
#if 0
|
|
LogRange(std::cout << "out_nkhw : ", out_nkhw.mData, ",") << std::endl;
|
|
LogRange(std::cout << "wei_kcyx : ", wei_kcyx.mData, ",") << std::endl;
|
|
LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl;
|
|
LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl;
|
|
#endif
|
|
}
|
|
}
|