mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 18:17:44 +00:00
* create files for xdlops
* working on blockwise_gemm_xdlops
* add KReduction
* add m/n repeats
* add 2x2 pipeline
* added 128x128 wavegemm
* use StaticBuffer of vector_type
* break vector type to blk_size
* add kpack into xldops_gemm and blockwise_gemm
* abroadcast only
* add fp32 mfma instructions
* adding fp16 mfma
* pack half4_t
* rename kperwave to kpack
* add 32x32x8fp16
* add fp16 mfma
* clean code
* clean code
* V4r4 xdlops kpack (#35)
* add kpack with incorrect results
* bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2
* add 1x1 kernel
* add gridwise_gemm_v2 - single_buffer
* enabled dwordx4 for fp16
Co-authored-by: Chao Liu <chao.liu2@amd.com>
* refactor fwd-v4r4-xdlops
* add v4r4-nhwc-xdlop
* improve some perf of nhwc and nchw by tuning parameters, and change scheuduling in gridwise-gemm loop
* tweak scheduling in gridwise gemm
* add v4r3 with a single output copy
* init commit: output with slice win
* adding sliceWin
* add multiple repeats pattern
* starting adding bwd-v4r1-xdlops
* use tuple as SrcBuffer
* adding bwd-data v4r1 nhwc xdlops
* fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2()
* fix bug in host bwd-data conv
* initial implementation of bwd-data v4r1 nhwc xdlops
* add launch bound flags
* enable launch bound
* add m/nrepeat=4
* tweak bwd-data v4r1 nhwc xdlops
* added bwd-data v4r1 nhwc xlops with output A and weight B
* add fwd-v4r4 nhwc xdlops, A input, B weight, C output
Co-authored-by: Chao Liu <chao.liu2@amd.com>
[ROCm/composable_kernel commit: 3835318cc3]
300 lines
9.0 KiB
C++
300 lines
9.0 KiB
C++
#include <iostream>
|
|
#include <numeric>
|
|
#include <initializer_list>
|
|
#include <cstdlib>
|
|
#include <stdlib.h>
|
|
#include "config.hpp"
|
|
#include "print.hpp"
|
|
#include "device.hpp"
|
|
#include "host_tensor_generator.hpp"
|
|
#include "device_tensor.hpp"
|
|
#include "conv_common.hpp"
|
|
#include "host_conv_bwd_data.hpp"
|
|
#include "device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp"
|
|
#include "device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp"
|
|
#include "device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
|
|
#include "device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk.hpp"
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
using namespace launcher;
|
|
|
|
#if 1
|
|
// 1x1 filter, 14x14 image
|
|
constexpr index_t N = 1;
|
|
constexpr index_t C = 256;
|
|
constexpr index_t HI = 1;
|
|
constexpr index_t WI = 128;
|
|
constexpr index_t K = 16;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 0
|
|
constexpr index_t N = 64;
|
|
constexpr index_t C = 256;
|
|
constexpr index_t HI = 56;
|
|
constexpr index_t WI = 56;
|
|
constexpr index_t K = 256;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 0
|
|
// 3x3, 34x34
|
|
constexpr index_t N = 64;
|
|
constexpr index_t C = 256;
|
|
constexpr index_t HI = 34;
|
|
constexpr index_t WI = 34;
|
|
constexpr index_t K = 256;
|
|
constexpr index_t Y = 3;
|
|
constexpr index_t X = 3;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 0
|
|
// 3x3, 28x28
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 128;
|
|
constexpr index_t HI = 28;
|
|
constexpr index_t WI = 28;
|
|
constexpr index_t K = 128;
|
|
constexpr index_t Y = 3;
|
|
constexpr index_t X = 3;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<1, 1>;
|
|
using RightPads = Sequence<1, 1>;
|
|
#elif 0
|
|
// 1x1 filter, 8x8 image
|
|
constexpr index_t N = 256;
|
|
constexpr index_t C = 1024;
|
|
constexpr index_t HI = 8;
|
|
constexpr index_t WI = 8;
|
|
constexpr index_t K = 1024;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 0
|
|
// 1x1 filter, 7x7 image
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 1024;
|
|
constexpr index_t HI = 7;
|
|
constexpr index_t WI = 7;
|
|
constexpr index_t K = 1024;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 1
|
|
// 1x1 filter, 14x14 image
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 512;
|
|
constexpr index_t HI = 14;
|
|
constexpr index_t WI = 14;
|
|
constexpr index_t K = 128;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 0
|
|
// 1x1 filter, 28x28 image
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 128;
|
|
constexpr index_t HI = 28;
|
|
constexpr index_t WI = 28;
|
|
constexpr index_t K = 128;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 0
|
|
// 1x1 filter, 17x17 input
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 1024;
|
|
constexpr index_t HI = 17;
|
|
constexpr index_t WI = 17;
|
|
constexpr index_t K = 1024;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#elif 0
|
|
// 5x5 filter, 2x2 pad, 7x7 input
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 1024;
|
|
constexpr index_t HI = 7;
|
|
constexpr index_t WI = 7;
|
|
constexpr index_t K = 1024;
|
|
constexpr index_t Y = 5;
|
|
constexpr index_t X = 5;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<2, 2>;
|
|
using RightPads = Sequence<2, 2>;
|
|
#elif 0
|
|
// 1x7 filter, 0x3 pad, 17x17 input
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 128;
|
|
constexpr index_t HI = 17;
|
|
constexpr index_t WI = 17;
|
|
constexpr index_t K = 128;
|
|
constexpr index_t Y = 1;
|
|
constexpr index_t X = 7;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<0, 3>;
|
|
using RightPads = Sequence<0, 3>;
|
|
#elif 0
|
|
// 7x1 filter, 3x0 pad, 17x17 input
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 256;
|
|
constexpr index_t HI = 17;
|
|
constexpr index_t WI = 17;
|
|
constexpr index_t K = 1024;
|
|
constexpr index_t Y = 7;
|
|
constexpr index_t X = 1;
|
|
|
|
using ConvStrides = Sequence<1, 1>;
|
|
using ConvDilations = Sequence<1, 1>;
|
|
|
|
using LeftPads = Sequence<3, 0>;
|
|
using RightPads = Sequence<3, 0>;
|
|
#elif 1
|
|
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
|
|
constexpr index_t N = 128;
|
|
constexpr index_t C = 256;
|
|
constexpr index_t HI = 35;
|
|
constexpr index_t WI = 35;
|
|
constexpr index_t K = 1280;
|
|
constexpr index_t Y = 3;
|
|
constexpr index_t X = 3;
|
|
|
|
using ConvStrides = Sequence<2, 2>;
|
|
using ConvDilations = Sequence<2, 2>;
|
|
|
|
using LeftPads = Sequence<0, 0>;
|
|
using RightPads = Sequence<0, 0>;
|
|
#endif
|
|
|
|
constexpr auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
|
|
constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
|
|
constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
|
|
in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
|
|
|
|
ostream_tensor_descriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
|
|
ostream_tensor_descriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
|
|
ostream_tensor_descriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
|
|
print_array("LeftPads", LeftPads{});
|
|
print_array("LeftPads", LeftPads{});
|
|
print_array("RightPads", RightPads{});
|
|
print_array("ConvStrides", ConvStrides{});
|
|
print_array("ConvDilations", ConvDilations{});
|
|
|
|
Tensor<float> in_nchw_device(make_HostTensorDescriptor(in_nchw_desc));
|
|
Tensor<float> in_nchw_host(make_HostTensorDescriptor(in_nchw_desc));
|
|
Tensor<float> wei_kcyx(make_HostTensorDescriptor(wei_kcyx_desc));
|
|
Tensor<float> out_nkhw(make_HostTensorDescriptor(out_nkhw_desc));
|
|
|
|
std::size_t num_thread = std::thread::hardware_concurrency();
|
|
|
|
if(argc != 3)
|
|
{
|
|
printf("arg1: do_verification, arg2: nrepeat\n");
|
|
exit(1);
|
|
}
|
|
|
|
bool do_verification = atoi(argv[1]);
|
|
std::size_t nrepeat = atoi(argv[2]);
|
|
|
|
if(do_verification)
|
|
{
|
|
#if 0
|
|
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{1}, num_thread);
|
|
out_nkhw.GenerateTensorValue(GeneratorTensor_1{1}, num_thread);
|
|
#else
|
|
wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
|
out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
|
#endif
|
|
}
|
|
|
|
#if 0
|
|
device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
|
|
#elif 0
|
|
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
|
|
#elif 0
|
|
device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw
|
|
#elif 1
|
|
device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk
|
|
#endif
|
|
(in_nchw_desc,
|
|
in_nchw_device,
|
|
wei_kcyx_desc,
|
|
wei_kcyx,
|
|
out_nkhw_desc,
|
|
out_nkhw,
|
|
ConvStrides{},
|
|
ConvDilations{},
|
|
LeftPads{},
|
|
RightPads{},
|
|
nrepeat);
|
|
|
|
if(do_verification)
|
|
{
|
|
host_direct_convolution_backward_data(in_nchw_host,
|
|
wei_kcyx,
|
|
out_nkhw,
|
|
ConvStrides{},
|
|
ConvDilations{},
|
|
LeftPads{},
|
|
RightPads{});
|
|
|
|
check_error(in_nchw_host, in_nchw_device);
|
|
|
|
#if 0
|
|
LogRange(std::cout << "out_nkhw : ", out_nkhw.mData, ",") << std::endl;
|
|
LogRange(std::cout << "wei_kcyx : ", wei_kcyx.mData, ",") << std::endl;
|
|
LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl;
|
|
LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl;
|
|
#endif
|
|
}
|
|
}
|