mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-03 21:21:22 +00:00
This reverts commit a4f72a314a.
This commit is contained in:
@@ -271,8 +271,7 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt
|
||||
{
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr index_t
|
||||
CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const
|
||||
__host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const
|
||||
{
|
||||
const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock);
|
||||
const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock);
|
||||
@@ -625,35 +624,23 @@ struct OffsettedBlockToCTileMap
|
||||
index_t block_start_;
|
||||
};
|
||||
|
||||
///
|
||||
/// @brief Simple tile mapping which creates a 3D grid of thread blocks.
|
||||
///
|
||||
/// @paragraph Description
|
||||
/// This Block-to-C-tile-map creates a 3D grid (n_blocks, m_blocks, z_blocks) of thread
|
||||
/// blocks. The first 2D are regular 2D tiles created by division of output GEMM
|
||||
/// dimensions by corresponding tile size. The third dimension (Z) is a k-split
|
||||
/// dimension, which denotes the number of blocks we use to divide work on GEMM K
|
||||
/// dimension onto.
|
||||
///
|
||||
/// @tparam MPerBlock Output block tile size in M dimension.
|
||||
/// @tparam NPerBlock Output block tile size in N dimension.
|
||||
///
|
||||
/**
|
||||
* @brief Simple tile mapping which creates a 3D grid of thread blocks.
|
||||
*
|
||||
* @paragraph Description
|
||||
* This Block-to-C-tile-map creates a 3D grid (n_blocks, m_blocks, z_blocks) of thread
|
||||
* blocks. The first 2D are regular 2D tiles created by division of output GEMM
|
||||
* dimensions by corresponding tile size. The third dimension (Z) is a k-split dimension,
|
||||
* which denotes the number of blocks we use to divide work on GEMM K dimension onto.
|
||||
*
|
||||
* @tparam MPerBlock Output block tile size in M dimension.
|
||||
* @tparam NPerBlock Output block tile size in N dimension.
|
||||
*/
|
||||
template <index_t MPerBlock, index_t NPerBlock>
|
||||
struct BlockToCTileMap_3DGrid_KSplit
|
||||
{
|
||||
__host__ __device__ BlockToCTileMap_3DGrid_KSplit() = default;
|
||||
|
||||
///
|
||||
/// @brief Constructs a new instance.
|
||||
///
|
||||
/// @param[in] top_idx Swallow blockIdx.
|
||||
///
|
||||
/// @tparam TopIdx The type of block index.
|
||||
///
|
||||
template <typename TopIdx>
|
||||
__host__ __device__ BlockToCTileMap_3DGrid_KSplit([[maybe_unused]] TopIdx top_idx)
|
||||
{
|
||||
}
|
||||
__host__ __device__ BlockToCTileMap_3DGrid_KSplit() = default;
|
||||
|
||||
__host__ __device__ constexpr auto
|
||||
CalculateGridSize(index_t M, index_t N, index_t k_split) const
|
||||
@@ -665,7 +652,8 @@ struct BlockToCTileMap_3DGrid_KSplit
|
||||
return std::make_tuple(N0, M0, k_split);
|
||||
}
|
||||
|
||||
__device__ constexpr auto CalculateBottomIndex() const
|
||||
template <typename TopIdx>
|
||||
__device__ constexpr auto CalculateBottomIndex(const TopIdx&) const
|
||||
{
|
||||
return make_tuple(blockIdx.z, blockIdx.y, blockIdx.x);
|
||||
}
|
||||
@@ -684,53 +672,6 @@ struct BlockToCTileMap_3DGrid_KSplit
|
||||
}
|
||||
};
|
||||
|
||||
///
|
||||
/// @brief Block to CTile Map which fosters an external mechanism for setting up the local block id.
|
||||
///
|
||||
/// For example, this type can be easily used to implement a tile-looping work distribution
|
||||
/// scheme.
|
||||
///
|
||||
/// @tparam UnderlyingBlockToCTileMap The type of the local tile map.
|
||||
///
|
||||
template <typename UnderlyingBlockToCTileMap>
|
||||
struct LocalBlockToCTileMap
|
||||
{
|
||||
using underlying_type = UnderlyingBlockToCTileMap;
|
||||
|
||||
__host__ __device__ LocalBlockToCTileMap(UnderlyingBlockToCTileMap block_to_ctile_map,
|
||||
index_t local_id)
|
||||
: block_to_ctile_map_{block_to_ctile_map}, local_block_id_{local_id}
|
||||
{
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr auto CalculateBottomIndex() const
|
||||
{
|
||||
return block_to_ctile_map_.CalculateBottomIndex(make_multi_index(local_block_id_));
|
||||
}
|
||||
|
||||
template <typename CTileIdx, typename CTileDim>
|
||||
__host__ __device__ bool ValidCTileIndex(const CTileIdx& c_tile_idx,
|
||||
const CTileDim& c_tile_dim) const
|
||||
{
|
||||
return block_to_ctile_map_.ValidCTileIndex(c_tile_idx, c_tile_dim);
|
||||
}
|
||||
|
||||
template <typename CGridDesc_M_N>
|
||||
__host__ bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
|
||||
{
|
||||
return block_to_ctile_map_.CheckValidity(c_grid_desc_m_n);
|
||||
}
|
||||
|
||||
template <typename CGridDesc_M_N>
|
||||
__host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const
|
||||
{
|
||||
return block_to_ctile_map_.CalculateGridSize(c_grid_desc_m_n);
|
||||
}
|
||||
|
||||
UnderlyingBlockToCTileMap block_to_ctile_map_;
|
||||
index_t local_block_id_;
|
||||
};
|
||||
|
||||
enum StreamKReductionStrategy
|
||||
{
|
||||
Atomic = 0, // sk block use atomic to do reduction
|
||||
|
||||
@@ -4,8 +4,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
|
||||
@@ -44,20 +42,4 @@ constexpr auto GridwiseGemmPipeline_Selector()
|
||||
}
|
||||
}
|
||||
|
||||
inline std::string getPipelineVersionString(const PipelineVersion& pv)
|
||||
{
|
||||
switch(pv)
|
||||
{
|
||||
case PipelineVersion::v1: return "PipelineVersion::v1";
|
||||
case PipelineVersion::v2: return "PipelineVersion::v2";
|
||||
default: return "Unrecognized pipeline version!";
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion pv)
|
||||
{
|
||||
os << ck::getPipelineVersionString(pv);
|
||||
return os;
|
||||
}
|
||||
|
||||
@@ -27,7 +27,8 @@ __global__ void
|
||||
#if CK_USE_LAUNCH_BOUNDS
|
||||
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
|
||||
#endif
|
||||
kernel_gemm_xdlops_v2r4r2_simplified(typename GridwiseGemm::Argument karg)
|
||||
kernel_gemm_xdlops_v2r4r2_simplified(typename GridwiseGemm::Argument karg,
|
||||
const Block2CTileMap& b2c_map)
|
||||
{
|
||||
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
|
||||
defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
|
||||
@@ -35,12 +36,11 @@ __global__ void
|
||||
|
||||
__shared__ uint8_t p_shared[shared_size];
|
||||
|
||||
Block2CTileMap b2c_map{get_block_1d_id()};
|
||||
|
||||
GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(
|
||||
karg, static_cast<void*>(p_shared), b2c_map);
|
||||
#else
|
||||
ignore = karg;
|
||||
ignore = b2c_map;
|
||||
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
|
||||
}
|
||||
|
||||
@@ -541,6 +541,15 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
|
||||
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
|
||||
}
|
||||
|
||||
// return block_id to C matrix tile idx (m0, n0) mapping
|
||||
template <typename CGridDesc>
|
||||
__host__ __device__ static constexpr auto MakeCBlockClusterAdaptor(
|
||||
const CGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
|
||||
{
|
||||
return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc>(
|
||||
c_m_n_grid_desc, 8, KBatch);
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr auto
|
||||
GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
|
||||
{
|
||||
@@ -566,28 +575,18 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
|
||||
template <bool HasMainKBlockLoop,
|
||||
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
|
||||
typename Block2CTileMap>
|
||||
__device__ static void Run(const FloatA* p_a_grid,
|
||||
const FloatB* p_b_grid,
|
||||
FloatC* p_c_grid,
|
||||
index_t M,
|
||||
index_t N,
|
||||
index_t K,
|
||||
index_t StrideA,
|
||||
index_t StrideB,
|
||||
index_t StrideC,
|
||||
index_t MPadded,
|
||||
index_t NPadded,
|
||||
index_t KPadded,
|
||||
index_t K0,
|
||||
index_t k_batch,
|
||||
__device__ static void Run(const Argument& karg,
|
||||
void* __restrict__ p_shared_block,
|
||||
const Block2CTileMap& block_2_ctile_map)
|
||||
{
|
||||
const auto a_b_k0_m_k1_grid_desc =
|
||||
MakeAGridDescriptor_KBatch_K0_M_K1(M, MPadded, K, StrideA, k_batch, K0, KPadded);
|
||||
const auto b_b_k0_n_k1_grid_desc =
|
||||
MakeBGridDescriptor_KBatch_K0_N_K1(K, NPadded, N, StrideB, k_batch, K0, KPadded);
|
||||
const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(M, N, StrideC);
|
||||
const FloatA* p_a_grid = karg.p_a_grid;
|
||||
const FloatB* p_b_grid = karg.p_b_grid;
|
||||
FloatC* p_c_grid = karg.p_c_grid;
|
||||
const auto a_b_k0_m_k1_grid_desc = MakeAGridDescriptor_KBatch_K0_M_K1(
|
||||
karg.M, karg.MPadded, karg.K, karg.StrideA, karg.k_batch, karg.K0, karg.KPadded);
|
||||
const auto b_b_k0_n_k1_grid_desc = MakeBGridDescriptor_KBatch_K0_N_K1(
|
||||
karg.K, karg.NPadded, karg.N, karg.StrideB, karg.k_batch, karg.K0, karg.KPadded);
|
||||
const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC);
|
||||
|
||||
const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
|
||||
MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n);
|
||||
@@ -603,7 +602,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
|
||||
p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
|
||||
|
||||
// divide block work by [KBatch, M, N]
|
||||
const auto block_work_idx = block_2_ctile_map.CalculateBottomIndex();
|
||||
const auto block_work_idx =
|
||||
block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
|
||||
|
||||
if(!block_2_ctile_map.ValidCTileIndex(
|
||||
block_work_idx,
|
||||
@@ -1010,34 +1010,6 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
|
||||
}
|
||||
}
|
||||
|
||||
template <bool HasMainKBlockLoop,
|
||||
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
|
||||
typename Block2CTileMap>
|
||||
__device__ static void Run(const Argument& karg,
|
||||
void* __restrict__ p_shared_block,
|
||||
const Block2CTileMap& block_2_ctile_map)
|
||||
{
|
||||
Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, Block2CTileMap>(karg.p_a_grid,
|
||||
karg.p_b_grid,
|
||||
karg.p_c_grid,
|
||||
karg.M,
|
||||
karg.N,
|
||||
karg.K,
|
||||
karg.StrideA,
|
||||
karg.StrideB,
|
||||
karg.StrideC,
|
||||
karg.MPadded,
|
||||
karg.NPadded,
|
||||
karg.KPadded,
|
||||
karg.K0,
|
||||
karg.k_batch,
|
||||
p_shared_block,
|
||||
block_2_ctile_map);
|
||||
}
|
||||
|
||||
static constexpr auto GetMPerBlock() { return MPerBlock; }
|
||||
static constexpr auto GetNPerBlock() { return NPerBlock; }
|
||||
|
||||
static std::string GetTypeString()
|
||||
{
|
||||
auto str = std::stringstream();
|
||||
|
||||
Reference in New Issue
Block a user