mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-23 06:16:12 +00:00
Split GEMM instance library & enable pipeline v2 optimization (#783)
* Move source file into sub-directories
* Add missing include directive
* Split DeviceGemmXdl<> fp16 instances
* Fix format
* Remove unnecessary CMakeLists.txt
* Add macros to toggle new features
* Remove debug message
* Turn off GEMM v2 pipeline optimization by default
* Fix format
* Extract duplicated string as list
* Enlarge indent in CMakeLists.txt
[ROCm/composable_kernel commit: 850144a0d3]
This commit is contained in:
@@ -27,6 +27,21 @@
|
||||
#define CK_WAVELET_MIN_BLOCK_PER_CU 2
|
||||
#endif
|
||||
|
||||
// kernel attribute: amdgpu_waves_per_eu()
|
||||
#ifdef CK_USE_WAVES_PER_EU
|
||||
// for 1-wave kernels, control arguments of amdgpu_waves_per_eu() attribute
|
||||
#ifndef CK_MIN_WAVES_PER_EU
|
||||
#define CK_MIN_WAVES_PER_EU 0
|
||||
#endif
|
||||
|
||||
#ifndef CK_MAX_WAVES_PER_EU
|
||||
#define CK_MAX_WAVES_PER_EU 0
|
||||
#endif
|
||||
|
||||
#else
|
||||
#define CK_USE_WAVES_PER_EU 0
|
||||
#endif
|
||||
|
||||
// buffer resource
|
||||
#ifndef __HIP_DEVICE_COMPILE__ // for host code
|
||||
#define CK_BUFFER_RESOURCE_3RD_DWORD -1
|
||||
@@ -148,6 +163,10 @@
|
||||
#define CK_EXPERIMENTAL_INTER_WAVE_INSTANCES 1
|
||||
// experimental feature: add instances using pipeline v2
|
||||
#define CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES 1
|
||||
// experimental feature: optimize pipeline v2 by IGLP strategy (value=ID of strategy)
|
||||
#ifndef CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT
|
||||
#define CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT 0
|
||||
#endif
|
||||
|
||||
// hack: have underlying assumption that need to be satsified, otherwise it's a bug
|
||||
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
|
||||
|
||||
|
||||
@@ -79,6 +79,10 @@ struct GridwiseGemmPipeline_v2
|
||||
|
||||
do
|
||||
{
|
||||
#if CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT
|
||||
__builtin_amdgcn_iglp_opt(CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT);
|
||||
#endif
|
||||
|
||||
block_sync_lds();
|
||||
|
||||
// GEMM i
|
||||
|
||||
@@ -27,6 +27,9 @@ template <typename GridwiseGemm,
|
||||
__global__ void
|
||||
#if CK_USE_LAUNCH_BOUNDS
|
||||
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
|
||||
#endif
|
||||
#if CK_USE_WAVES_PER_EU
|
||||
__attribute__((amdgpu_waves_per_eu(CK_MIN_WAVES_PER_EU, CK_MAX_WAVES_PER_EU)))
|
||||
#endif
|
||||
kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid,
|
||||
const FloatAB* __restrict__ p_b_grid,
|
||||
@@ -60,6 +63,9 @@ template <typename GridwiseGemm, bool HasMainKBlockLoop>
|
||||
__global__ void
|
||||
#if CK_USE_LAUNCH_BOUNDS
|
||||
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
|
||||
#endif
|
||||
#if CK_USE_WAVES_PER_EU
|
||||
__attribute__((amdgpu_waves_per_eu(CK_MIN_WAVES_PER_EU, CK_MAX_WAVES_PER_EU)))
|
||||
#endif
|
||||
kernel_gemm_xdlops_v2r3(const typename GridwiseGemm::Argument karg)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user