mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-20 21:09:08 +00:00
Navi21 gemm (#197)
* start adding navi21 GEMM
* navi_gemm_km_kn_mn_fp32 compiles and passes one test.
* rename variables and functions in gridwise_gemm_dlops_v1r3
* add other 3 layouts; format instance
* adding more tuning parameters
add tuning parameters for other 3 layouts
* add gemm_dlops_f16
* tmp
* add dependence of DeviceGemm::IsSupportedArg() on arch
* minor changes
* minor changes
* minor changes
* minor changes
* minor changes
* minor changes
* minor changes
* push gemm_dlops into profiler
* minor changes
* if using xdl or dlops is moved into profiler_gemm_impl
* minor changes
* minor changes
* remove is_xdl from profile_gemm_impl
* make IsSupportedArg dependent on arch for other device_gemm
* minor changes
* minor changes
* fix a bug in f_generate_tensor_value
* add 64x64x64 for gemm_dlops_int8
* add 64x64x64 for gemm_dlops_int8
* comment out 3 layouts in gemm_dlops_int8; add 32x32x32 for gemm_dlops_int8; init A values to 1
* fix
* start fixing tuning parameters
* monir
* minor changes
* minor changes
* minor changes
* fixing
* adding example
* adding example
* adding example
* add gemm fp32 example
* clean up
* use 128x128x16 as MNK tile in navi21 gemm example
* bug fix
* fix test
* use new block c tile
* clean
* fix build
Co-authored-by: Chao Liu <chao.liu2@amd.com>
Co-authored-by: shaojiewang <wsjmessi@163.com>
[ROCm/composable_kernel commit: 40b59a63cc]
This commit is contained in:
@@ -1,6 +1,4 @@
|
||||
#ifndef CK_THREADWISE_CONTRACTION_DLOPS_HPP
|
||||
#define CK_THREADWISE_CONTRACTION_DLOPS_HPP
|
||||
|
||||
#pragma once
|
||||
#include "common_header.hpp"
|
||||
#include "math.hpp"
|
||||
|
||||
@@ -25,9 +23,9 @@ template <typename FloatA,
|
||||
BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
|
||||
CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(),
|
||||
bool>::type = false>
|
||||
struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1
|
||||
struct ThreadwiseGemmDl_km0m1_kn0n1_m0m1n0n1
|
||||
{
|
||||
__device__ constexpr ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1()
|
||||
__device__ constexpr ThreadwiseGemmDl_km0m1_kn0n1_m0m1n0n1()
|
||||
{
|
||||
static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() &&
|
||||
BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
|
||||
@@ -124,9 +122,9 @@ template <typename FloatA,
|
||||
BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
|
||||
CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(),
|
||||
bool>::type = false>
|
||||
struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
|
||||
struct ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
|
||||
{
|
||||
__device__ constexpr ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1()
|
||||
__device__ constexpr ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1()
|
||||
{
|
||||
static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() &&
|
||||
BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
|
||||
@@ -220,4 +218,3 @@ struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
@@ -1,5 +1,4 @@
|
||||
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
|
||||
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
|
||||
#pragma once
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "tensor_descriptor.hpp"
|
||||
@@ -609,4 +608,3 @@ struct ThreadwiseTensorSliceTransfer_v5r1
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user