mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-19 20:40:07 +00:00
Merge commit 'f5b0af22722b130f03cac590ca9b8729b1b84991' into develop
This commit is contained in:
@@ -2,6 +2,15 @@
|
||||
|
||||
Documentation for Composable Kernel available at [https://rocm.docs.amd.com/projects/composable_kernel/en/latest/](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/).
|
||||
|
||||
## (Unreleased) Composable Kernel 1.3.0
|
||||
|
||||
### Added
|
||||
* Added support for explicit GEMM in CK_TILE grouped convolution forward and backward weight.
|
||||
|
||||
### Changed
|
||||
|
||||
### Upcoming changes
|
||||
|
||||
## Composable Kernel 1.2.0 for ROCm 7.2.0
|
||||
|
||||
### Added
|
||||
|
||||
@@ -4,20 +4,20 @@
|
||||
#pragma once
|
||||
|
||||
#include <concepts>
|
||||
#include <ck_tile/builder/conv_builder.hpp>
|
||||
#include <ck_tile/builder/conv_signature_concepts.hpp>
|
||||
#include <ck_tile/builder/reflect/instance_traits.hpp>
|
||||
#include <ck_tile/builder/reflect/instance_traits_util.hpp>
|
||||
#include <ck_tile/builder/types.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/tensor_layout.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp>
|
||||
#include <ck/utility/loop_scheduler.hpp>
|
||||
#include <ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp>
|
||||
#include <ck_tile/ops/gemm.hpp>
|
||||
#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/utility/pipeline_enum.hpp"
|
||||
#include "ck/utility/scheduler_enum.hpp"
|
||||
#include "ck_tile/builder/conv_builder.hpp"
|
||||
#include "ck_tile/builder/conv_signature_concepts.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits_util.hpp"
|
||||
#include "ck_tile/builder/types.hpp"
|
||||
#include "ck_tile/ops/epilogue.hpp"
|
||||
#include <ck_tile/ops/grouped_convolution.hpp>
|
||||
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
|
||||
#include "ck_tile/ops/grouped_convolution.hpp"
|
||||
|
||||
namespace ck_tile::reflect::conv {
|
||||
|
||||
|
||||
@@ -8,28 +8,30 @@
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <string>
|
||||
#include <concepts>
|
||||
#include <string_view>
|
||||
#include <sstream>
|
||||
#include <type_traits>
|
||||
#include <limits.h>
|
||||
#include <cmath>
|
||||
#include <ostream>
|
||||
#include <concepts>
|
||||
#include <iostream>
|
||||
#include <ck/utility/data_type.hpp>
|
||||
#include <ck/utility/sequence.hpp>
|
||||
#include <ck/utility/blkgemmpipe_scheduler.hpp>
|
||||
#include <ck/utility/loop_scheduler.hpp>
|
||||
#include <ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/tensor_layout.hpp>
|
||||
#include <ck_tile/ops/common/tensor_layout.hpp>
|
||||
#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
|
||||
#include <ck_tile/ops/gemm.hpp>
|
||||
#include "ck_tile/ops/epilogue.hpp"
|
||||
#include <limits.h>
|
||||
#include <ostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <type_traits>
|
||||
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/pipeline_enum.hpp"
|
||||
#include "ck/utility/scheduler_enum.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck_tile/core/container/tuple.hpp"
|
||||
#include "ck_tile/core/numeric/bfloat16.hpp"
|
||||
#include "ck_tile/core/numeric/float8.hpp"
|
||||
#include "ck_tile/core/numeric/half.hpp"
|
||||
#include "ck_tile/ops/common/tensor_layout.hpp"
|
||||
#include "ck_tile/ops/gemm.hpp"
|
||||
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
|
||||
#include "ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp"
|
||||
#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
|
||||
|
||||
|
||||
@@ -2,9 +2,10 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <ck/ck.hpp>
|
||||
#include <ck_tile/builder/reflect/instance_traits.hpp>
|
||||
#include <ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_data.hpp>
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_data.hpp"
|
||||
#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
|
||||
@@ -2,10 +2,12 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <ck/ck.hpp>
|
||||
#include <ck_tile/builder/reflect/instance_traits.hpp>
|
||||
#include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp>
|
||||
#include <ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_weight.hpp>
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_weight.hpp"
|
||||
#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
|
||||
@@ -1,17 +1,19 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include <ck/ck.hpp>
|
||||
#include <ck/utility/reduction_operator.hpp>
|
||||
#include <ck_tile/builder/reflect/instance_traits.hpp>
|
||||
#include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp>
|
||||
#include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp>
|
||||
#include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp>
|
||||
#include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp>
|
||||
#include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp>
|
||||
#include <ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp>
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/utility/reduction_operator.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp"
|
||||
#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include <ck_tile/builder/reflect/instance_traits_util.hpp>
|
||||
#include <ck/utility/data_type.hpp>
|
||||
#include <ck/utility/sequence.hpp>
|
||||
#include <ck/utility/blkgemmpipe_scheduler.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/tensor_layout.hpp>
|
||||
#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/scheduler_enum.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck_tile/builder/reflect/instance_traits_util.hpp"
|
||||
|
||||
namespace ck_tile::reflect::detail {
|
||||
namespace {
|
||||
|
||||
@@ -5,24 +5,16 @@
|
||||
|
||||
#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
|
||||
#include <iostream>
|
||||
#include <ostream>
|
||||
#endif
|
||||
|
||||
#include "ck/utility/pipeline_enum.hpp"
|
||||
#include "ck/utility/loop_scheduler.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v4_direct_load.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
enum struct PipelineVersion
|
||||
{
|
||||
v1,
|
||||
v2,
|
||||
// v3 is only used in the Stream-K implementation.
|
||||
v4,
|
||||
weight_only,
|
||||
};
|
||||
|
||||
template <PipelineVersion PipelineVer,
|
||||
index_t NumPrefetch = 1,
|
||||
LoopScheduler LoopSched = LoopScheduler::Default,
|
||||
@@ -62,18 +54,3 @@ constexpr auto GridwiseGemmPipeline_Selector()
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
|
||||
inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p)
|
||||
{
|
||||
switch(p)
|
||||
{
|
||||
case ck::PipelineVersion::v1: os << "PipelineVersion::v1"; break;
|
||||
case ck::PipelineVersion::v2: os << "PipelineVersion::v2"; break;
|
||||
case ck::PipelineVersion::v4: os << "PipelineVersion::v4"; break;
|
||||
case ck::PipelineVersion::weight_only: os << "PipelineVersion::weight_only"; break;
|
||||
default: os << "";
|
||||
}
|
||||
return os;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -3,52 +3,12 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_description/tensor_adaptor.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/utility/scheduler_enum.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
enum struct BlockGemmPipelineVersion
|
||||
{
|
||||
// For GEMM
|
||||
v1, // Naive
|
||||
v2, // Mem
|
||||
v3, // Comp
|
||||
v4, // Comp, double lds buffer
|
||||
v5, // Comp, double global prefetch register buffer
|
||||
|
||||
// For GEMM with preshuffled weight
|
||||
// v1, single lds buffer
|
||||
// v2, double lds buffer
|
||||
};
|
||||
enum struct BlockGemmPipelineScheduler
|
||||
{
|
||||
Intrawave,
|
||||
Interwave,
|
||||
};
|
||||
|
||||
enum struct TailNumber
|
||||
{
|
||||
// Single / Double buffer pipeline
|
||||
Odd,
|
||||
Even,
|
||||
|
||||
// Long prefetch pipeline, up to 8
|
||||
One,
|
||||
Two,
|
||||
Three,
|
||||
Four,
|
||||
Five,
|
||||
Six,
|
||||
Seven,
|
||||
|
||||
// Unroll stages > Prefetch stages, number of loop is multiple of unroll stages
|
||||
Empty,
|
||||
// Unroll stages <= Prefetch stages, number of loop is multiple of unroll stages add
|
||||
// prefetchstages
|
||||
Full,
|
||||
};
|
||||
|
||||
enum SchedulerGroup : uint32_t
|
||||
{
|
||||
SCHED_GROUP_MFMA = 0x008, // Matrix FMA instructions
|
||||
|
||||
@@ -3,40 +3,20 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
|
||||
#include <ostream>
|
||||
#endif
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/utility/scheduler_enum.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
enum struct LoopScheduler
|
||||
{
|
||||
Default,
|
||||
Interwave,
|
||||
};
|
||||
|
||||
/// @brief Helper function to get default loop scheduler
|
||||
/// @details Returns the default loop scheduler based on compile-time configuration.
|
||||
constexpr LoopScheduler make_default_loop_scheduler()
|
||||
{
|
||||
#if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
|
||||
return LoopScheduler::Interwave;
|
||||
#else
|
||||
return LoopScheduler::Default;
|
||||
#endif // if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
|
||||
inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s)
|
||||
{
|
||||
switch(s)
|
||||
{
|
||||
case ck::LoopScheduler::Default: os << "Default"; break;
|
||||
case ck::LoopScheduler::Interwave: os << "Interwave"; break;
|
||||
default: os << "";
|
||||
}
|
||||
return os;
|
||||
}
|
||||
#endif
|
||||
|
||||
40
include/ck/utility/pipeline_enum.hpp
Normal file
40
include/ck/utility/pipeline_enum.hpp
Normal file
@@ -0,0 +1,40 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
|
||||
#include <ostream>
|
||||
#endif
|
||||
|
||||
namespace ck {
|
||||
|
||||
/// @brief Pipeline version enumeration for GEMM kernels
|
||||
/// @details Defines different pipeline strategies for data movement and computation overlap
|
||||
/// in GEMM kernels. This is a lightweight header containing only the enum definition,
|
||||
/// extracted from gridwise_gemm_pipeline_selector.hpp to minimize dependencies.
|
||||
enum struct PipelineVersion
|
||||
{
|
||||
v1, ///< Version 1 pipeline
|
||||
v2, ///< Version 2 pipeline
|
||||
// v3 is only used in the Stream-K implementation.
|
||||
v4, ///< Version 4 pipeline
|
||||
weight_only, ///< Weight-only specialized pipeline
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
|
||||
inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p)
|
||||
{
|
||||
switch(p)
|
||||
{
|
||||
case ck::PipelineVersion::v1: os << "PipelineVersion::v1"; break;
|
||||
case ck::PipelineVersion::v2: os << "PipelineVersion::v2"; break;
|
||||
case ck::PipelineVersion::v4: os << "PipelineVersion::v4"; break;
|
||||
case ck::PipelineVersion::weight_only: os << "PipelineVersion::weight_only"; break;
|
||||
default: os << "";
|
||||
}
|
||||
return os;
|
||||
}
|
||||
#endif
|
||||
83
include/ck/utility/scheduler_enum.hpp
Normal file
83
include/ck/utility/scheduler_enum.hpp
Normal file
@@ -0,0 +1,83 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
|
||||
#include <ostream>
|
||||
#endif
|
||||
|
||||
namespace ck {
|
||||
|
||||
/// @brief Block GEMM pipeline version enumeration
|
||||
/// @details Defines different block GEMM pipeline strategies.
|
||||
/// This is a lightweight header containing only enum definitions,
|
||||
/// extracted from blkgemmpipe_scheduler.hpp to minimize dependencies.
|
||||
enum struct BlockGemmPipelineVersion
|
||||
{
|
||||
// For GEMM
|
||||
v1, ///< Naive pipeline
|
||||
v2, ///< Memory-optimized pipeline
|
||||
v3, ///< Compute-optimized pipeline
|
||||
v4, ///< Compute-optimized with double LDS buffer
|
||||
v5, ///< Compute-optimized with double global prefetch register buffer
|
||||
|
||||
// For GEMM with preshuffled weight
|
||||
// v1, single lds buffer
|
||||
// v2, double lds buffer
|
||||
};
|
||||
|
||||
/// @brief Block GEMM pipeline scheduler enumeration
|
||||
/// @details Defines scheduling strategies for block GEMM pipelines.
|
||||
enum struct BlockGemmPipelineScheduler
|
||||
{
|
||||
Intrawave, ///< Schedule within a single wavefront
|
||||
Interwave, ///< Schedule across multiple wavefronts
|
||||
};
|
||||
|
||||
/// @brief Loop scheduler enumeration
|
||||
/// @details Defines scheduling strategies for computational loops.
|
||||
enum struct LoopScheduler
|
||||
{
|
||||
Default, ///< Default scheduling strategy
|
||||
Interwave, ///< Cross-wavefront scheduling
|
||||
};
|
||||
|
||||
/// @brief Tail number enumeration for pipeline buffering
|
||||
/// @details Defines the number of tail iterations in pipelined loops.
|
||||
enum struct TailNumber
|
||||
{
|
||||
// Single / Double buffer pipeline
|
||||
Odd, ///< Odd number of iterations
|
||||
Even, ///< Even number of iterations
|
||||
|
||||
// Long prefetch pipeline, up to 8
|
||||
One, ///< One tail iteration
|
||||
Two, ///< Two tail iterations
|
||||
Three, ///< Three tail iterations
|
||||
Four, ///< Four tail iterations
|
||||
Five, ///< Five tail iterations
|
||||
Six, ///< Six tail iterations
|
||||
Seven, ///< Seven tail iterations
|
||||
|
||||
// Unroll stages > Prefetch stages, number of loop is multiple of unroll stages
|
||||
Empty, ///< No tail iterations
|
||||
// Unroll stages <= Prefetch stages, number of loop is multiple of unroll stages add
|
||||
// prefetchstages
|
||||
Full, ///< Full tail iterations
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
|
||||
inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s)
|
||||
{
|
||||
switch(s)
|
||||
{
|
||||
case ck::LoopScheduler::Default: os << "Default"; break;
|
||||
case ck::LoopScheduler::Interwave: os << "Interwave"; break;
|
||||
default: os << "";
|
||||
}
|
||||
return os;
|
||||
}
|
||||
#endif
|
||||
Reference in New Issue
Block a user