mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-20 12:59:49 +00:00
This commit is contained in:
@@ -0,0 +1,30 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/// @brief Identifies a convolution specialization.
///
/// The enumerators are numbered consecutively from zero in declaration order;
/// getConvSpecializationString() converts a value to its enumerator name.
enum class ConvolutionSpecialization
{
    Default              = 0,
    Filter1x1Pad0        = 1,
    Filter1x1Stride1Pad0 = 2,
    Filter3x3            = 3,
};
|
||||
|
||||
/// @brief Returns the enumerator name of a ConvolutionSpecialization as text.
///
/// @param s Specialization to describe.
///
/// @return The name of the matching enumerator, or "Unrecognized specialization!"
///         when the value matches none of them.
CK_TILE_HOST std::string getConvSpecializationString(const ConvolutionSpecialization& s)
{
    if(s == ConvolutionSpecialization::Default)
    {
        return "Default";
    }
    if(s == ConvolutionSpecialization::Filter1x1Pad0)
    {
        return "Filter1x1Pad0";
    }
    if(s == ConvolutionSpecialization::Filter1x1Stride1Pad0)
    {
        return "Filter1x1Stride1Pad0";
    }
    if(s == ConvolutionSpecialization::Filter3x3)
    {
        return "Filter3x3";
    }

    // Reached only for values outside the declared enumerators.
    return "Unrecognized specialization!";
}
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,261 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <utility>
#include <vector>

#include "ck_tile/core.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/// @brief Direction of a grouped convolution.
///
/// Used as the template argument that selects a GemmLayouts specialization
/// inside GroupedConvTraits.
enum class GroupedConvDirection
{
    FORWARD         = 0,
    BACKWARD_DATA   = 1,
    BACKWARD_WEIGHT = 2,
};
|
||||
|
||||
/// @brief The Grouped Conv kernel host arguments.
|
||||
///
|
||||
/// @par Overview
|
||||
/// This structure is passed to Grouped Convolution Kernels when creating kernel
|
||||
/// arguments object. It contain all necessary information required to
|
||||
/// build proper kernel argument and launch kernel on GPU.
|
||||
template <typename InPtr, typename WeiPtr, typename OutPtr, typename CDElementwise>
|
||||
struct GroupedConvHostArgs : public conv::ConvParam
|
||||
{
|
||||
CK_TILE_HOST GroupedConvHostArgs() = delete;
|
||||
CK_TILE_HOST GroupedConvHostArgs(ConvParam conv_param,
|
||||
InPtr in_ptr_,
|
||||
WeiPtr wei_ptr_,
|
||||
const std::vector<const void*> ds_ptr_,
|
||||
OutPtr out_ptr_,
|
||||
index_t k_batch_,
|
||||
CDElementwise elfunc_ = CDElementwise{})
|
||||
: conv::ConvParam(conv_param),
|
||||
in_ptr(in_ptr_),
|
||||
wei_ptr(wei_ptr_),
|
||||
ds_ptr(ds_ptr_),
|
||||
out_ptr(out_ptr_),
|
||||
k_batch(k_batch_),
|
||||
elfunc(elfunc_)
|
||||
{
|
||||
}
|
||||
|
||||
InPtr in_ptr;
|
||||
WeiPtr wei_ptr;
|
||||
const std::vector<const void*> ds_ptr;
|
||||
OutPtr out_ptr;
|
||||
index_t k_batch;
|
||||
const CDElementwise elfunc;
|
||||
};
|
||||
|
||||
using PassThrough = ck_tile::element_wise::PassThrough;
|
||||
|
||||
template <typename CDElementwise = PassThrough>
|
||||
using GroupedConvFwdHostArgs = GroupedConvHostArgs<const void*, const void*, void*, CDElementwise>;
|
||||
using GroupedConvBwdWeightHostArgs =
|
||||
GroupedConvHostArgs<const void*, void*, const void*, PassThrough>;
|
||||
using GroupedConvBwdDataHostArgs =
|
||||
GroupedConvHostArgs<void*, const void*, const void*, PassThrough>;
|
||||
|
||||
template <index_t NDimSpatial_,
|
||||
ConvolutionSpecialization ConvSpecialization_,
|
||||
typename InLayout_,
|
||||
typename WeiLayout_,
|
||||
typename DsLayout_,
|
||||
typename OutLayout_,
|
||||
index_t VectorSizeA_ = 1,
|
||||
index_t VectorSizeB_ = 1,
|
||||
index_t VectorSizeC_ = 1,
|
||||
index_t NumGroupsToMerge_ = 1,
|
||||
bool EnableSplitImage_ = false,
|
||||
bool ExplicitGemm_ = false>
|
||||
struct GroupedConvTraits
|
||||
{
|
||||
private:
|
||||
static constexpr auto generate_implicit_gemm_layout()
|
||||
{
|
||||
return generate_tuple([](auto) { return ck_tile::tensor_layout::gemm::RowMajor{}; },
|
||||
number<DsLayout_::size()>{});
|
||||
}
|
||||
|
||||
public:
|
||||
// Fixed values for Implicit GEMM
|
||||
struct FixedGemmParams
|
||||
{
|
||||
static constexpr ck_tile::index_t TilePartitionerGroupNum = 8;
|
||||
static constexpr ck_tile::index_t TilePartitionerM01 = 4;
|
||||
static constexpr bool kPadM = true;
|
||||
static constexpr bool kPadN = true;
|
||||
static constexpr bool kPadK = true;
|
||||
static constexpr bool TransposeC = false;
|
||||
static constexpr bool FixedVectorSize = true;
|
||||
static constexpr bool UseStructuredSparsity = false;
|
||||
static constexpr bool Persistent = false;
|
||||
using ELayout = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
};
|
||||
// Compile time parameters
|
||||
static constexpr index_t NumGroupsToMerge = NumGroupsToMerge_;
|
||||
static constexpr bool EnableSplitImage = EnableSplitImage_;
|
||||
static constexpr bool ExplicitGemm = ExplicitGemm_;
|
||||
static constexpr index_t NDimSpatial = NDimSpatial_;
|
||||
static constexpr ConvolutionSpecialization ConvSpecialization = ConvSpecialization_;
|
||||
using InLayout = InLayout_;
|
||||
using WeiLayout = WeiLayout_;
|
||||
using DsLayout = DsLayout_;
|
||||
using OutLayout = OutLayout_;
|
||||
|
||||
// Forward Gemm Layouts
|
||||
using AsLayoutFwd = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
using BsLayoutFwd = ck_tile::tensor_layout::gemm::ColumnMajor;
|
||||
using CLayoutFwd = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
// Backward Data Gemm Layouts
|
||||
using AsLayoutBwdData = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
using BsLayoutBwdData = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
using CLayoutBwdData = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
// Backward Weight Gemm Layouts
|
||||
using AsLayoutBwdWeight = ck_tile::tensor_layout::gemm::ColumnMajor;
|
||||
using BsLayoutBwdWeight = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
using CLayoutBwdWeight = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
|
||||
template <GroupedConvDirection Direction>
|
||||
struct GemmLayouts
|
||||
{
|
||||
static_assert(false, "Unsupported direction.");
|
||||
};
|
||||
|
||||
template <>
|
||||
struct GemmLayouts<GroupedConvDirection::FORWARD>
|
||||
{
|
||||
using AsLayout = AsLayoutFwd;
|
||||
using BsLayout = BsLayoutFwd;
|
||||
using CLayout = CLayoutFwd;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct GemmLayouts<GroupedConvDirection::BACKWARD_DATA>
|
||||
{
|
||||
using AsLayout = AsLayoutBwdData;
|
||||
using BsLayout = BsLayoutBwdData;
|
||||
using CLayout = CLayoutBwdData;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct GemmLayouts<GroupedConvDirection::BACKWARD_WEIGHT>
|
||||
{
|
||||
using AsLayout = AsLayoutBwdWeight;
|
||||
using BsLayout = BsLayoutBwdWeight;
|
||||
using CLayout = CLayoutBwdWeight;
|
||||
};
|
||||
|
||||
template <ck_tile::index_t NumWaveGroups = 1>
|
||||
using GroupedConvImplicitGemmTraitsFwd =
|
||||
TileGemmTraits<true, true, true, AsLayoutFwd, BsLayoutFwd, CLayoutFwd, NumWaveGroups>;
|
||||
template <ck_tile::index_t NumWaveGroups = 1>
|
||||
using GroupedConvImplicitGemmTraitsBwdData = TileGemmTraits<true,
|
||||
true,
|
||||
true,
|
||||
AsLayoutBwdData,
|
||||
BsLayoutBwdData,
|
||||
CLayoutBwdData,
|
||||
NumWaveGroups>;
|
||||
template <ck_tile::index_t NumWaveGroups = 1>
|
||||
using GroupedConvImplicitGemmTraitsBwdWeight = TileGemmTraits<true,
|
||||
true,
|
||||
true,
|
||||
AsLayoutBwdWeight,
|
||||
BsLayoutBwdWeight,
|
||||
CLayoutBwdWeight,
|
||||
NumWaveGroups>;
|
||||
static constexpr ck_tile::index_t VectorSizeA = VectorSizeA_;
|
||||
static constexpr ck_tile::index_t VectorSizeB = VectorSizeB_;
|
||||
static constexpr ck_tile::index_t VectorSizeC = VectorSizeC_;
|
||||
static constexpr ck_tile::index_t NumDTensor = DsLayout::size();
|
||||
using ImplicitGemmDsLayout = decltype(generate_implicit_gemm_layout());
|
||||
};
|
||||
|
||||
/// @brief Helper struct for split-image piece information
|
||||
///
|
||||
/// @par Overview
|
||||
/// Stores metadata for a single spatial piece in split-image convolution.
|
||||
/// Used to track block ranges and spatial coordinates for each piece.
|
||||
struct SplitImagePieceInfo
|
||||
{
|
||||
ck_tile::index_t block_start, block_end; ///< GPU block range for this piece
|
||||
ck_tile::index_t d_start, h_start, w_start; ///< Spatial start coordinates (output space)
|
||||
ck_tile::index_t d_size, h_size, w_size; ///< Spatial dimensions of this piece
|
||||
};
|
||||
|
||||
/// @brief Calculate piece information for split-image convolution
|
||||
///
|
||||
/// @par Overview
|
||||
/// Computes spatial coordinates, dimensions, and GPU block range for a single
|
||||
/// piece in split-image convolution. Handles edge pieces that may have different
|
||||
/// sizes due to non-uniform division.
|
||||
///
|
||||
/// @tparam TilePartitioner Type providing MPerBlock and NPerBlock constants
|
||||
///
|
||||
/// @param piece_idx Index of the piece to calculate (0-based)
|
||||
/// @param num_d_pieces Number of pieces in D dimension
|
||||
/// @param num_h_pieces Number of pieces in H dimension
|
||||
/// @param num_w_pieces Number of pieces in W dimension
|
||||
/// @param base_piece_d Base size of each D piece (may differ for last piece)
|
||||
/// @param base_piece_h Base size of each H piece (may differ for last piece)
|
||||
/// @param base_piece_w Base size of each W piece (may differ for last piece)
|
||||
/// @param total_d Total D dimension size (output space)
|
||||
/// @param total_h Total H dimension size (output space)
|
||||
/// @param total_w Total W dimension size (output space)
|
||||
/// @param N Batch size
|
||||
/// @param K Output channels
|
||||
/// @param total_blocks Accumulated block count from previous pieces
|
||||
///
|
||||
/// @return SplitImagePieceInfo containing all metadata for this piece
|
||||
template <typename TilePartitioner>
|
||||
CK_TILE_HOST SplitImagePieceInfo calculate_spatial_piece(ck_tile::index_t piece_idx,
|
||||
ck_tile::index_t num_d_pieces,
|
||||
ck_tile::index_t num_h_pieces,
|
||||
ck_tile::index_t num_w_pieces,
|
||||
ck_tile::index_t base_piece_d,
|
||||
ck_tile::index_t base_piece_h,
|
||||
ck_tile::index_t base_piece_w,
|
||||
ck_tile::index_t total_d,
|
||||
ck_tile::index_t total_h,
|
||||
ck_tile::index_t total_w,
|
||||
ck_tile::index_t N,
|
||||
ck_tile::index_t K,
|
||||
ck_tile::index_t total_blocks)
|
||||
{
|
||||
// Unflatten piece index into 3D coordinates (W-major, then H, then D)
|
||||
const ck_tile::index_t w_idx = piece_idx % num_w_pieces;
|
||||
const ck_tile::index_t h_idx = (piece_idx / num_w_pieces) % num_h_pieces;
|
||||
const ck_tile::index_t d_idx = piece_idx / (num_w_pieces * num_h_pieces);
|
||||
|
||||
// Calculate spatial start positions
|
||||
const ck_tile::index_t w_start = w_idx * base_piece_w;
|
||||
const ck_tile::index_t h_start = h_idx * base_piece_h;
|
||||
const ck_tile::index_t d_start = d_idx * base_piece_d;
|
||||
|
||||
// Calculate piece sizes (last piece may be larger to cover remainder)
|
||||
const ck_tile::index_t w_size =
|
||||
(w_idx == num_w_pieces - 1) ? (total_w - w_start) : base_piece_w;
|
||||
const ck_tile::index_t h_size =
|
||||
(h_idx == num_h_pieces - 1) ? (total_h - h_start) : base_piece_h;
|
||||
const ck_tile::index_t d_size =
|
||||
(d_idx == num_d_pieces - 1) ? (total_d - d_start) : base_piece_d;
|
||||
|
||||
// Calculate GEMM dimensions for this piece
|
||||
const ck_tile::index_t piece_gemm_m = N * d_size * h_size * w_size;
|
||||
const ck_tile::index_t piece_gemm_n = K;
|
||||
|
||||
// Calculate GPU grid size for this piece
|
||||
const ck_tile::index_t piece_grid =
|
||||
((piece_gemm_m + TilePartitioner::MPerBlock - 1) / TilePartitioner::MPerBlock) *
|
||||
((piece_gemm_n + TilePartitioner::NPerBlock - 1) / TilePartitioner::NPerBlock);
|
||||
|
||||
return {
|
||||
total_blocks, total_blocks + piece_grid, d_start, h_start, w_start, d_size, h_size, w_size};
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,81 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
#include <numeric>
|
||||
|
||||
#include "ck_tile/core/utility/env.hpp"
|
||||
#include "ck_tile/core/numeric/integer.hpp"
|
||||
#include "ck_tile/host/device_prop.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <index_t BlockSize, typename KernelArgs, typename KernelImpl>
|
||||
CK_TILE_HOST index_t get_max_occupancy_for_kernel()
|
||||
{
|
||||
constexpr int dynamic_smem_size = 0;
|
||||
constexpr int min_blocks_per_cu = 1;
|
||||
|
||||
const auto kernel_ptr = kentry<min_blocks_per_cu, KernelImpl, KernelArgs>;
|
||||
|
||||
int max_occupancy = 0;
|
||||
hip_check_error(hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_occupancy, kernel_ptr, BlockSize, dynamic_smem_size));
|
||||
|
||||
return static_cast<index_t>(max_occupancy);
|
||||
}
|
||||
|
||||
/// @brief Derives a split-K factor from the device capacity and the output grid size.
///
/// The capacity is max_occupancy multiplied by the value of get_num_cus() (queried
/// once and cached). The result is capacity / grid_size rounded down, but never
/// less than 1.
///
/// @param max_occupancy Maximum number of active thread blocks per CU.
/// @param grid_size     Number of thread blocks of the output grid.
///
/// @return The split-K value; 1 when the grid already fills the capacity or when
///         grid_size is not positive.
CK_TILE_HOST index_t get_best_occupancy_k_batch_value(index_t max_occupancy, index_t grid_size)
{
    static const index_t num_cus = get_num_cus();
    const index_t max_capacity   = max_occupancy * num_cus;

    index_t k_batch = 1;
    // Integer division gives the same quotient as the floating-point floor for
    // positive operands. The guard avoids dividing by zero, which previously
    // produced inf/NaN that was then converted to index_t (undefined behavior).
    if(grid_size > 0)
    {
        const index_t optimal_split = max_capacity / grid_size;
        if(optimal_split > 1)
        {
            k_batch = optimal_split;
        }
    }

    if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
    {
        std::cout << "[SPLIT-K AUTODEDUCE] Max active thread blocks per CU for GEMM kernel: "
                  << max_occupancy << std::endl;
        std::cout << "[SPLIT-K AUTODEDUCE] Output grid size: " << grid_size << std::endl;
        std::cout << "[SPLIT-K AUTODEDUCE] Optimal split-k value " << k_batch << std::endl;
    }
    return k_batch;
}
|
||||
|
||||
template <index_t BlockSize, typename KernelArgs, typename KernelImpl>
|
||||
struct ActiveWorkgroupsPerCU
|
||||
{
|
||||
CK_TILE_HOST ActiveWorkgroupsPerCU()
|
||||
{
|
||||
max_occupancy_ = get_max_occupancy_for_kernel<BlockSize, KernelArgs, KernelImpl>();
|
||||
}
|
||||
index_t max_occupancy_{1};
|
||||
};
|
||||
|
||||
template <index_t BlockSize, typename KernelImpl, typename TilePartitioner, typename KernelArgs>
|
||||
CK_TILE_HOST index_t calculate_optimal_k_batch(const KernelArgs& kargs)
|
||||
{
|
||||
static ActiveWorkgroupsPerCU<BlockSize, KernelArgs, KernelImpl> active_workgroups_per_cu;
|
||||
|
||||
const auto grid_size = TilePartitioner::GridSize(kargs.GemmM, kargs.GemmN) * kargs.GemmBatch;
|
||||
auto optimal_k_batch =
|
||||
get_best_occupancy_k_batch_value(active_workgroups_per_cu.max_occupancy_, grid_size);
|
||||
|
||||
const auto max_allowed_k_batch = kargs.GemmK;
|
||||
optimal_k_batch = std::min(optimal_k_batch, max_allowed_k_batch);
|
||||
|
||||
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
|
||||
{
|
||||
std::cout << "[SPLIT-K AUTODEDUCE] Final k_batch value: " << optimal_k_batch << std::endl;
|
||||
}
|
||||
|
||||
return optimal_k_batch;
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user