mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-05 06:01:23 +00:00
Split GEMM instance library & enable pipeline v2 optimization (#783)
* Move source file into sub-directories
* Add missing include directive
* Split DeviceGemmXdl<> fp16 instances
* Fix format
* Remove unnecessary CMakeLists.txt
* Add macros to toggle new features
* Remove debug message
* Turn off GEMM v2 pipeline optimization by default
* Fix format
* Extract duplicated string as list
* Enlarge indent in CMakeLists.txt
This commit is contained in:
@@ -3,6 +3,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
|
||||
|
||||
|
||||
@@ -79,6 +79,10 @@ struct GridwiseGemmPipeline_v2
|
||||
|
||||
do
|
||||
{
|
||||
#if CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT
|
||||
__builtin_amdgcn_iglp_opt(CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT);
|
||||
#endif
|
||||
|
||||
block_sync_lds();
|
||||
|
||||
// GEMM i
|
||||
|
||||
@@ -27,6 +27,9 @@ template <typename GridwiseGemm,
|
||||
__global__ void
|
||||
#if CK_USE_LAUNCH_BOUNDS
|
||||
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
|
||||
#endif
|
||||
#if CK_USE_WAVES_PER_EU
|
||||
__attribute__((amdgpu_waves_per_eu(CK_MIN_WAVES_PER_EU, CK_MAX_WAVES_PER_EU)))
|
||||
#endif
|
||||
kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid,
|
||||
const FloatAB* __restrict__ p_b_grid,
|
||||
@@ -60,6 +63,9 @@ template <typename GridwiseGemm, bool HasMainKBlockLoop>
|
||||
__global__ void
|
||||
#if CK_USE_LAUNCH_BOUNDS
|
||||
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
|
||||
#endif
|
||||
#if CK_USE_WAVES_PER_EU
|
||||
__attribute__((amdgpu_waves_per_eu(CK_MIN_WAVES_PER_EU, CK_MAX_WAVES_PER_EU)))
|
||||
#endif
|
||||
kernel_gemm_xdlops_v2r3(const typename GridwiseGemm::Argument karg)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user