xdlops_v4r4_fwd fp32/fp16 (#34)

* create files for xdlops

* working on blockwise_gemm_xdlops

* add KReduction

* add m/n repeats

* add 2x2 pipeline

* added 128x128 wavegemm

* use StaticBuffer of vector_type

* break vector type to blk_size

* add kpack into xdlops_gemm and blockwise_gemm

* abroadcast only

* add fp32 mfma instructions

* adding fp16 mfma

* pack half4_t

* rename kperwave to kpack

* add 32x32x8fp16

* add fp16 mfma

* clean code

* clean code

* V4r4 xdlops kpack (#35)

* add kpack with incorrect results

* bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2

* add 1x1 kernel

* add gridwise_gemm_v2 - single_buffer

* enabled dwordx4 for fp16

Co-authored-by: Chao Liu <chao.liu2@amd.com>

* refactor fwd-v4r4-xdlops

* add v4r4-nhwc-xdlops

* improve some perf of nhwc and nchw by tuning parameters, and change scheduling in gridwise-gemm loop

* tweak scheduling in gridwise gemm

* add v4r3 with a single output copy

* init commit: output with slice win

* adding sliceWin

* add multiple repeats pattern

* start adding bwd-v4r1-xdlops

* use tuple as SrcBuffer

* adding bwd-data v4r1 nhwc xdlops

* fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2()

* fix bug in host bwd-data conv

* initial implementation of bwd-data v4r1 nhwc xdlops

* add launch bound flags

* enable launch bound

* add m/nrepeat=4

* tweak bwd-data v4r1 nhwc xdlops

* added bwd-data v4r1 nhwc xdlops with output A and weight B

* add fwd-v4r4 nhwc xdlops, A input, B weight, C output

Co-authored-by: Chao Liu <chao.liu2@amd.com>

[ROCm/composable_kernel commit: 3835318cc3]
This commit is contained in:
zjing14
2021-07-01 14:33:00 -05:00
committed by GitHub
parent 817b2a47c6
commit 67dcc552b6
54 changed files with 9813 additions and 245 deletions

View File

@@ -26,18 +26,32 @@ int main(int argc, char* argv[])
}
const bool do_verification = atoi(argv[1]);
const int init_method = atoi(argv[2]);
const bool do_log = atoi(argv[3]);
const bool do_log = atoi(argv[2]);
const int init_method = atoi(argv[3]);
const int nrepeat = atoi(argv[4]);
#if 0
constexpr index_t N = 8;
constexpr index_t C = 8;
constexpr index_t Hi = 4;
constexpr index_t Wi = 8;
constexpr index_t N = 256;
constexpr index_t C = 256;
constexpr index_t HI = 16;
constexpr index_t WI = 16;
constexpr index_t K = 256;
constexpr index_t Y = 3;
constexpr index_t X = 3;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using InLeftPads = Sequence<0, 0>;
using InRightPads = Sequence<0, 0>;
#elif 0
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t HI = 1080;
constexpr index_t WI = 1920;
constexpr index_t K = 16;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
@@ -162,9 +176,9 @@ int main(int argc, char* argv[])
// 3x3, 71x71
constexpr index_t N = 128;
constexpr index_t C = 192;
constexpr index_t Hi = 71;
constexpr index_t Wi = 71;
constexpr index_t K = 128;
constexpr index_t HI = 71;
constexpr index_t WI = 71;
constexpr index_t K = 256;
constexpr index_t Y = 3;
constexpr index_t X = 3;
@@ -430,7 +444,7 @@ int main(int argc, char* argv[])
using InRightPads = Sequence<0, 0>;
#elif 0
// 1x1, 14x14, stride 2
constexpr index_t N = 128;
constexpr index_t N = 256;
constexpr index_t C = 1024;
constexpr index_t Hi = 14;
constexpr index_t Wi = 14;
@@ -445,7 +459,7 @@ int main(int argc, char* argv[])
using InRightPads = Sequence<0, 0>;
#elif 0
// 1x1, 14x14
constexpr index_t N = 128;
constexpr index_t N = 256;
constexpr index_t C = 1024;
constexpr index_t Hi = 14;
constexpr index_t Wi = 14;
@@ -636,6 +650,11 @@ int main(int argc, char* argv[])
using in_data_t = typename vector_type<float, in_vector_size>::type;
using acc_data_t = float;
using out_data_t = float;
#elif 1
using in_data_t = half_t;
constexpr index_t in_vector_size = 1;
using acc_data_t = float;
using out_data_t = half_t;
#elif 0
constexpr index_t in_vector_size = 1;
using in_data_t = typename vector_type<float, in_vector_size>::type;