mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-30 03:37:38 +00:00
tmp save between remotes
This commit is contained in:
@@ -741,7 +741,7 @@ if (NOT MIOPEN_REQ_LIBS_ONLY)
|
||||
endif()
|
||||
|
||||
if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
|
||||
add_subdirectory(codegen)
|
||||
#add_subdirectory(codegen)
|
||||
endif()
|
||||
|
||||
#Create an interface target for the include only files and call it "composablekernels"
|
||||
|
||||
@@ -190,7 +190,7 @@ check_err(const Range& out,
|
||||
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
if(err_count < 5)
|
||||
if(err_count < 40)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
@@ -244,7 +244,7 @@ check_err(const Range& out,
|
||||
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
if(err_count < 5)
|
||||
if(err_count < 40)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
@@ -298,7 +298,7 @@ check_err(const Range& out,
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
if(err_count < 40)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
@@ -350,7 +350,7 @@ check_err(const Range& out,
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
if(err_count < 40)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
@@ -409,7 +409,7 @@ check_err(const Range& out,
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
if(err_count < 40)
|
||||
{
|
||||
std::cerr << msg << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r
|
||||
<< std::endl;
|
||||
@@ -462,7 +462,7 @@ check_err(const Range& out,
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
if(err_count < 40)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
@@ -511,7 +511,7 @@ check_err(const Range& out,
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
if(err_count < 40)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
@@ -560,7 +560,7 @@ check_err(const Range& out,
|
||||
{
|
||||
max_err = err > max_err ? err : max_err;
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
if(err_count < 40)
|
||||
{
|
||||
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
|
||||
@@ -10,8 +10,8 @@ struct StreamConfig
|
||||
hipStream_t stream_id_ = nullptr;
|
||||
bool time_kernel_ = false;
|
||||
int log_level_ = 0;
|
||||
int cold_niters_ = 5;
|
||||
int nrepeat_ = 50;
|
||||
int cold_niters_ = 0;
|
||||
int nrepeat_ = 1;
|
||||
|
||||
bool flush_cache = false;
|
||||
int rotating_count = 1;
|
||||
|
||||
@@ -187,6 +187,7 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
|
||||
CThreadBuffer& c_thread_buf,
|
||||
index_t num_loop) const
|
||||
{
|
||||
// if(threadIdx.x == 0) printf("v1 intra\n");
|
||||
auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
|
||||
a_thread_desc_.GetElementSpaceSize());
|
||||
auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataTypeBuf>(
|
||||
@@ -212,6 +213,7 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
|
||||
index_t i = 0;
|
||||
do
|
||||
{
|
||||
// if(threadIdx.x == 0) printf("has Main loop %d\n", i);
|
||||
// -------------------------------------------------------------------------------------------
|
||||
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
|
||||
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
|
||||
@@ -280,6 +282,7 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
|
||||
// tail
|
||||
if constexpr(TailNum == TailNumber::Full)
|
||||
{
|
||||
// if(threadIdx.x == 0) printf("Tail full\n");
|
||||
block_sync_lds();
|
||||
static_for<0, KRepeat, 1>{}([&](auto k) {
|
||||
static_for<0, MRepeat, 1>{}([&](auto m0) {
|
||||
@@ -313,6 +316,14 @@ struct BlockwiseGemmXdlops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
|
||||
b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
|
||||
b_thread_buf[Number<b_thread_desc_.CalculateOffset(
|
||||
make_tuple(n0, I0, k0, ik))>{}];
|
||||
|
||||
// if(threadIdx.x == 0) {
|
||||
// printf("a: %f b: %f\n",
|
||||
// static_cast<float>(a_thread_buf[Number<a_thread_desc_.CalculateOffset(
|
||||
// make_tuple(m0, I0, k0, ik))>{}]),
|
||||
// static_cast<float>(b_thread_buf[Number<b_thread_desc_.CalculateOffset(
|
||||
// make_tuple(n0, I0, k0, ik))>{}]));
|
||||
// }
|
||||
});
|
||||
|
||||
using mfma_input_type =
|
||||
|
||||
@@ -506,7 +506,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffleV3
|
||||
decltype(GridwiseGemmCTranspose::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
|
||||
DsGridDesc_M_N{}, 1, 1));
|
||||
using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
|
||||
decltype(GridwiseGemmCTranspose::MakeDEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
|
||||
decltype(GridwiseGemmCTranspose::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
|
||||
EGridDesc_M_N{}, 1, 1));
|
||||
|
||||
using Block2ETileMap = typename GridwiseGemmCTranspose::Block2CTileMap;
|
||||
|
||||
@@ -1186,8 +1186,10 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
|
||||
const auto clear_workspace = [&]() {
|
||||
if(arg.bwd_needs_zero_out && gemm_set_id == 0)
|
||||
{
|
||||
printf("pre memset\n");
|
||||
hip_check_error(hipMemsetAsync(
|
||||
p_e_grid, 0, arg.e_space_size_bytes, stream_config.stream_id_));
|
||||
printf("post memset\n");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -612,7 +612,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
|
||||
}
|
||||
|
||||
template <typename DsGridDesc>
|
||||
__device__ static constexpr auto MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
|
||||
__host__ __device__ static constexpr auto MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
|
||||
const DsGridDesc& ds_grid_desc_m_n, index_t MBlock, index_t NBlock)
|
||||
{
|
||||
return generate_tuple(
|
||||
@@ -1402,7 +1402,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
|
||||
}
|
||||
|
||||
template <typename CGridDesc>
|
||||
__device__ static constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
|
||||
__device__ __host__ static constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
|
||||
const CGridDesc& c_grid_desc_m_n, index_t MBlock, index_t NBlock)
|
||||
{
|
||||
const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
|
||||
@@ -1509,7 +1509,9 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
|
||||
const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1,
|
||||
const BGridDesc_BK0_N_K1& b_grid_desc_bk0_n_bk1,
|
||||
const DsGridDesc_M_N& ds_grid_desc_m_n,
|
||||
const CGridDesc_M_N& c_grid_desc_m_n)
|
||||
const CGridDesc_M_N& c_grid_desc_m_n,
|
||||
const index_t k_batch = 1,
|
||||
const index_t k_idx = 0)
|
||||
{
|
||||
|
||||
const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
@@ -1538,6 +1540,13 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
|
||||
const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]);
|
||||
const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
|
||||
|
||||
//const index_t n_block_data_idx_on_grid =__builtin_amdgcn_readfirstlane(k_id * KPerBlock);
|
||||
|
||||
const index_t num_ak0_per_block =
|
||||
__builtin_amdgcn_readfirstlane(a_grid_desc_ak0_m_ak1.GetLength(I0) / k_batch);
|
||||
const index_t num_bk0_per_block =
|
||||
__builtin_amdgcn_readfirstlane(b_grid_desc_bk0_n_bk1.GetLength(I0) / k_batch);
|
||||
|
||||
// HACK: this force m/n_block_data_idx_on_grid into SGPR
|
||||
const index_t m_block_data_idx_on_grid =
|
||||
__builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);
|
||||
@@ -1571,7 +1580,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
|
||||
2,
|
||||
ABlockTransferSrcScalarPerVector>(
|
||||
a_grid_desc_ak0_m_ak1,
|
||||
make_multi_index(0, m_block_data_idx_on_grid, 0),
|
||||
make_multi_index(num_ak0_per_block * k_idx, m_block_data_idx_on_grid, 0),
|
||||
a_block_desc_ak0_m_ak1,
|
||||
make_multi_index(0, 0, 0));
|
||||
}
|
||||
@@ -1601,7 +1610,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
|
||||
true,
|
||||
BlockwiseGemmPipe::GlobalBufferNum>(
|
||||
a_grid_desc_ak0_m_ak1,
|
||||
make_multi_index(0, m_block_data_idx_on_grid, 0),
|
||||
make_multi_index(num_ak0_per_block * k_idx, m_block_data_idx_on_grid, 0),
|
||||
a_element_op,
|
||||
a_block_desc_ak0_m_ak1,
|
||||
make_multi_index(0, 0, 0),
|
||||
@@ -1627,7 +1636,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
|
||||
2,
|
||||
BBlockTransferSrcScalarPerVector>(
|
||||
b_grid_desc_bk0_n_bk1,
|
||||
make_multi_index(0, n_block_data_idx_on_grid, 0),
|
||||
make_multi_index(num_bk0_per_block * k_idx, n_block_data_idx_on_grid, 0),
|
||||
b_block_desc_bk0_n_bk1,
|
||||
make_multi_index(0, 0, 0));
|
||||
}
|
||||
@@ -1657,7 +1666,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
|
||||
true,
|
||||
BlockwiseGemmPipe::GlobalBufferNum>(
|
||||
b_grid_desc_bk0_n_bk1,
|
||||
make_multi_index(0, n_block_data_idx_on_grid, 0),
|
||||
make_multi_index(num_bk0_per_block * k_idx, n_block_data_idx_on_grid, 0),
|
||||
b_element_op,
|
||||
b_block_desc_bk0_n_bk1,
|
||||
make_multi_index(0, 0, 0),
|
||||
@@ -1691,7 +1700,11 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
|
||||
|
||||
const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
|
||||
(a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
|
||||
KPerBlock);
|
||||
(KPerBlock * k_batch));
|
||||
|
||||
// if(threadIdx.x == 0) {
|
||||
// printf("num_k block main loop: %d\n m_block_data_idx_on_grid: %d\n n_block_data_idx_on_grid: %d\n", num_k_block_main_loop, m_block_data_idx_on_grid, n_block_data_idx_on_grid);
|
||||
// }
|
||||
|
||||
blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(a_grid_desc_ak0_m_ak1,
|
||||
a_block_desc_ak0_m_ak1,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle_v3.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v3.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
@@ -65,7 +65,7 @@ using device_grouped_conv_bwd_data_xdl_v3_f16_instances = std::tuple<
|
||||
// ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| |
|
||||
// ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
// generic instance
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffleV3<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 64, 64, 64, 32, 8, 8, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, S<1,1,1>>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffleV3<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 16, 1, 4>, S<2,2,2>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F16, F16, 1, 1, true>
|
||||
|
||||
// DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffleV3<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 64, 64, 32, 64, 8, 8, 16, 16, 4, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 4>, S<8,8,8>>,
|
||||
// DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffleV3<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 64, 64, 64, 32, 8, 8, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 4>, S<8,8,8>>,
|
||||
|
||||
@@ -79,7 +79,7 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
|
||||
is_same_v<ComputeTypeB, F16>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_instances(op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP32
|
||||
@@ -87,7 +87,7 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F32> && is_same_v<ComputeTypeA, F32> &&
|
||||
is_same_v<ComputeTypeB, F32>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f32_instances(op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f32_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_BF16
|
||||
@@ -95,8 +95,8 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, BF16> && is_same_v<ComputeTypeA, BF16> &&
|
||||
is_same_v<ComputeTypeB, BF16>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_bf16_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_bf16_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -109,11 +109,11 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<ComputeTypeB, F16>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_xdl_v3_nhwgk_gkyxc_nhwgc_f16_instances(op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances(op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_optimized_loads_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances(op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_optimized_loads_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
if constexpr(is_same_v<InDataType, F32> && is_same_v<WeiDataType, F32> &&
|
||||
@@ -124,23 +124,23 @@ struct DeviceOperationInstanceFactory<
|
||||
#ifdef CK_ENABLE_TF32
|
||||
if constexpr(is_same_v<ComputeTypeA, TF32>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_tf32_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_tf32_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_tf32_optimized_loads_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_tf32_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_tf32_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_tf32_optimized_loads_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP32
|
||||
if constexpr(is_same_v<ComputeTypeA, F32>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_optimized_loads_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_optimized_loads_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -149,12 +149,12 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, BF16> && is_same_v<ComputeTypeA, BF16> &&
|
||||
is_same_v<ComputeTypeB, BF16>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_optimized_loads_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_optimized_loads_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -166,7 +166,7 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
|
||||
is_same_v<ComputeTypeB, F16>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_f16_instances(op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_f16_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP32
|
||||
@@ -174,7 +174,7 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F32> && is_same_v<ComputeTypeA, F32> &&
|
||||
is_same_v<ComputeTypeB, F32>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_f32_instances(op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_f32_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_BF16
|
||||
@@ -182,8 +182,8 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, BF16> && is_same_v<ComputeTypeA, BF16> &&
|
||||
is_same_v<ComputeTypeB, BF16>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_bf16_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_bf16_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -195,11 +195,11 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
|
||||
is_same_v<ComputeTypeB, F16>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_instances(op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_vec_transpose_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_instances(op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_vec_transpose_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP32
|
||||
@@ -207,11 +207,11 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F32> && is_same_v<ComputeTypeA, F32> &&
|
||||
is_same_v<ComputeTypeB, F32>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_instances(op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_vec_transpose_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_instances(op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_vec_transpose_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_BF16
|
||||
@@ -219,12 +219,12 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, BF16> && is_same_v<ComputeTypeA, BF16> &&
|
||||
is_same_v<ComputeTypeB, BF16>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_vec_transpose_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_vec_transpose_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -239,8 +239,8 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
|
||||
is_same_v<ComputeTypeB, F16>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f16_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f16_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP32
|
||||
@@ -248,8 +248,8 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F32> && is_same_v<ComputeTypeA, F32> &&
|
||||
is_same_v<ComputeTypeB, F32>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f32_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f32_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_BF16
|
||||
@@ -257,8 +257,8 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, BF16> && is_same_v<ComputeTypeA, BF16> &&
|
||||
is_same_v<ComputeTypeB, BF16>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_bf16_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_bf16_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -270,12 +270,12 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
|
||||
is_same_v<ComputeTypeB, F16>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_optimized_loads_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_optimized_loads_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
|
||||
@@ -283,8 +283,8 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, bf8_t> &&
|
||||
is_same_v<ComputeTypeB, f8_t>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_input_f16_comp_bf8f8_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_input_f16_comp_bf8f8_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
if constexpr(is_same_v<InDataType, F32> && is_same_v<WeiDataType, F32> &&
|
||||
@@ -295,23 +295,23 @@ struct DeviceOperationInstanceFactory<
|
||||
#ifdef CK_ENABLE_FP32
|
||||
if constexpr(is_same_v<ComputeTypeA, F32>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_optimized_loads_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_optimized_loads_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_TF32
|
||||
if constexpr(is_same_v<ComputeTypeA, TF32>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_tf32_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_tf32_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_tf32_optimized_loads_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_tf32_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_tf32_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_tf32_optimized_loads_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -320,12 +320,12 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, BF16> && is_same_v<ComputeTypeA, BF16> &&
|
||||
is_same_v<ComputeTypeB, BF16>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_optimized_loads_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_optimized_loads_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -337,8 +337,8 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
|
||||
is_same_v<ComputeTypeB, F16>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_f16_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_f16_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP32
|
||||
@@ -346,8 +346,8 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F32> && is_same_v<ComputeTypeA, F32> &&
|
||||
is_same_v<ComputeTypeB, F32>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_f32_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_f32_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_BF16
|
||||
@@ -355,8 +355,8 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, BF16> && is_same_v<ComputeTypeA, BF16> &&
|
||||
is_same_v<ComputeTypeB, BF16>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_bf16_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_bf16_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -368,12 +368,12 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
|
||||
is_same_v<ComputeTypeB, F16>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_vec_transpose_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_vec_transpose_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP32
|
||||
@@ -381,12 +381,12 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F32> && is_same_v<ComputeTypeA, F32> &&
|
||||
is_same_v<ComputeTypeB, F32>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_vec_transpose_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_vec_transpose_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_BF16
|
||||
@@ -394,12 +394,12 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, BF16> && is_same_v<ComputeTypeA, BF16> &&
|
||||
is_same_v<ComputeTypeB, BF16>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_vec_transpose_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_vec_transpose_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -417,10 +417,10 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
|
||||
is_same_v<ComputeTypeB, F16>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_f16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_f16_1x1s1p0_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_f16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_f16_1x1s1p0_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -429,9 +429,9 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, int8_t> && is_same_v<ComputeTypeA, int8_t> &&
|
||||
is_same_v<ComputeTypeB, int8_t>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_i8_instances(op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_i8_1x1s1p0_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_i8_instances(op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_i8_1x1s1p0_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -443,14 +443,14 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
|
||||
is_same_v<ComputeTypeB, F16>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_wmma_v3_nhwgk_gkyxc_nhwgc_f16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_wmma_v3_nhwgk_gkyxc_nhwgc_f16_16_16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_f16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_f16_1x1s1p0_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_wmma_v3_nhwgk_gkyxc_nhwgc_f16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_wmma_v3_nhwgk_gkyxc_nhwgc_f16_16_16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_f16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_f16_1x1s1p0_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_BF16
|
||||
@@ -458,10 +458,10 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, BF16> && is_same_v<ComputeTypeA, BF16> &&
|
||||
is_same_v<ComputeTypeB, BF16>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_wmma_v3_nhwgk_gkyxc_nhwgc_bf16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_wmma_v3_nhwgk_gkyxc_nhwgc_bf16_16_16_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_wmma_v3_nhwgk_gkyxc_nhwgc_bf16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_wmma_v3_nhwgk_gkyxc_nhwgc_bf16_16_16_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_INT8
|
||||
@@ -469,9 +469,9 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, int8_t> && is_same_v<ComputeTypeA, int8_t> &&
|
||||
is_same_v<ComputeTypeB, int8_t>)
|
||||
{
|
||||
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_i8_instances(op_ptrs);
|
||||
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_i8_1x1s1p0_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_i8_instances(op_ptrs);
|
||||
// add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_i8_1x1s1p0_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -486,10 +486,10 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
|
||||
is_same_v<ComputeTypeB, F16>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_f16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_f16_1x1s1p0_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_f16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_f16_1x1s1p0_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -498,10 +498,10 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, int8_t> && is_same_v<ComputeTypeA, int8_t> &&
|
||||
is_same_v<ComputeTypeB, int8_t>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_i8_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_i8_1x1s1p0_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_i8_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_i8_1x1s1p0_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -513,14 +513,14 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
|
||||
is_same_v<ComputeTypeB, F16>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_f16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_f16_1x1s1p0_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_wmma_v3_ndhwgk_gkzyxc_ndhwgc_f16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_wmma_v3_ndhwgk_gkzyxc_ndhwgc_f16_16_16_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_f16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_f16_1x1s1p0_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_wmma_v3_ndhwgk_gkzyxc_ndhwgc_f16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_wmma_v3_ndhwgk_gkzyxc_ndhwgc_f16_16_16_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_BF16
|
||||
@@ -528,10 +528,10 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, BF16> && is_same_v<ComputeTypeA, BF16> &&
|
||||
is_same_v<ComputeTypeB, BF16>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_wmma_v3_ndhwgk_gkzyxc_ndhwgc_bf16_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_wmma_v3_ndhwgk_gkzyxc_ndhwgc_bf16_16_16_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_wmma_v3_ndhwgk_gkzyxc_ndhwgc_bf16_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_wmma_v3_ndhwgk_gkzyxc_ndhwgc_bf16_16_16_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_INT8
|
||||
@@ -539,10 +539,10 @@ struct DeviceOperationInstanceFactory<
|
||||
is_same_v<OutDataType, int8_t> && is_same_v<ComputeTypeA, int8_t> &&
|
||||
is_same_v<ComputeTypeB, int8_t>)
|
||||
{
|
||||
add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_i8_instances(
|
||||
op_ptrs);
|
||||
add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_i8_1x1s1p0_instances(
|
||||
op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_i8_instances(
|
||||
// op_ptrs);
|
||||
// add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_i8_1x1s1p0_instances(
|
||||
// op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -32,6 +32,7 @@ add_instance_library(
|
||||
xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_vec_transpose_instance.cpp
|
||||
xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_vec_transpose_instance.cpp
|
||||
xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_vec_transpose_instance.cpp
|
||||
xdl/device_grouped_conv2d_bwd_data_xdl_v3_nhwgc_gkyxc_nhwgk_f16_instance.cpp
|
||||
|
||||
wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp
|
||||
wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp
|
||||
|
||||
@@ -210,6 +210,7 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
|
||||
// workspace_sz will be equal to 0 for other layout than NGCHW
|
||||
const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
DeviceMem workspace_dev(workspace_sz);
|
||||
// printf("run impl\n");
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
@@ -224,8 +225,10 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
// printf("prerun\n");
|
||||
float avg_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
// printf("post run\n");
|
||||
|
||||
std::size_t flop = conv_param.GetFlops();
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
|
||||
|
||||
@@ -14,118 +14,118 @@ message(STATUS "CK_PROFILER_OP_FILTER: ${CK_PROFILER_OP_FILTER}")
|
||||
message(STATUS "CK_PROFILER_INSTANCE_FILTER: ${CK_PROFILER_INSTANCE_FILTER}")
|
||||
|
||||
set(PROFILER_OPS
|
||||
profile_gemm.cpp
|
||||
profile_reduce.cpp
|
||||
profile_groupnorm_bwd_data.cpp
|
||||
profile_groupnorm_fwd.cpp
|
||||
profile_layernorm_bwd_data.cpp
|
||||
profile_layernorm_bwd_gamma_beta.cpp
|
||||
profile_groupnorm_bwd_gamma_beta.cpp
|
||||
profile_layernorm_fwd.cpp
|
||||
profile_max_pool2d_fwd.cpp
|
||||
profile_pool3d_fwd.cpp
|
||||
profile_avg_pool3d_bwd.cpp
|
||||
profile_max_pool3d_bwd.cpp
|
||||
profile_avg_pool2d_bwd.cpp
|
||||
profile_max_pool2d_bwd.cpp
|
||||
profile_softmax.cpp
|
||||
profile_batchnorm_fwd.cpp
|
||||
profile_batchnorm_bwd.cpp
|
||||
profile_batchnorm_infer.cpp
|
||||
profile_conv_tensor_rearrange.cpp
|
||||
profile_transpose.cpp
|
||||
profile_permute_scale.cpp
|
||||
profile_gemm_quantization.cpp
|
||||
# profile_gemm.cpp
|
||||
# profile_reduce.cpp
|
||||
# profile_groupnorm_bwd_data.cpp
|
||||
# profile_groupnorm_fwd.cpp
|
||||
# profile_layernorm_bwd_data.cpp
|
||||
# profile_layernorm_bwd_gamma_beta.cpp
|
||||
# profile_groupnorm_bwd_gamma_beta.cpp
|
||||
# profile_layernorm_fwd.cpp
|
||||
# profile_max_pool2d_fwd.cpp
|
||||
# profile_pool3d_fwd.cpp
|
||||
# profile_avg_pool3d_bwd.cpp
|
||||
# profile_max_pool3d_bwd.cpp
|
||||
# profile_avg_pool2d_bwd.cpp
|
||||
# profile_max_pool2d_bwd.cpp
|
||||
# profile_softmax.cpp
|
||||
# profile_batchnorm_fwd.cpp
|
||||
# profile_batchnorm_bwd.cpp
|
||||
# profile_batchnorm_infer.cpp
|
||||
# profile_conv_tensor_rearrange.cpp
|
||||
# profile_transpose.cpp
|
||||
# profile_permute_scale.cpp
|
||||
# profile_gemm_quantization.cpp
|
||||
)
|
||||
|
||||
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
|
||||
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
|
||||
list(APPEND PROFILER_OPS profile_contraction_bilinear.cpp)
|
||||
list(APPEND PROFILER_OPS profile_contraction_scale.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_contraction_bilinear.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_contraction_scale.cpp)
|
||||
endif()
|
||||
if(CK_EXPERIMENTAL_BUILDER)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_fwd_tile.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_conv_fwd_tile.cpp)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
|
||||
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
|
||||
list(APPEND PROFILER_OPS profile_gemm_reduce.cpp)
|
||||
list(APPEND PROFILER_OPS profile_batched_gemm_add_relu_gemm_add.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_add.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_gemm.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_streamk.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_add_relu.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_add_relu_add_layernorm.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_gemm_fixed_nk.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_gemm_fastgelu.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_gemm_tile_loop.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_gemm_multiply_tile_loop.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_reduce.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_batched_gemm_add_relu_gemm_add.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_add.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_gemm.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_streamk.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_add_relu.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_add_relu_add_layernorm.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_gemm_fixed_nk.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_gemm_fastgelu.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_gemm_tile_loop.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_gemm_multiply_tile_loop.cpp)
|
||||
endif()
|
||||
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx12")
|
||||
list(APPEND PROFILER_OPS profile_gemm_multiply_multiply_wp.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_ab_scale.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_blockscale_wp.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_universal_preshuffle.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_multiply_multiply_wp.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_ab_scale.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_blockscale_wp.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_universal_preshuffle.cpp)
|
||||
endif()
|
||||
if(SUPPORTED_GPU_TARGETS MATCHES "gfx95")
|
||||
list(APPEND PROFILER_OPS profile_gemm_mx.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_mx.cpp)
|
||||
endif()
|
||||
list(APPEND PROFILER_OPS profile_batched_gemm_reduce.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_add.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_bias_add_reduce.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_splitk.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_universal_batched.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_universal_streamk.cpp)
|
||||
list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu.cpp)
|
||||
list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu_add.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_batched_gemm_reduce.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_add.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_bias_add_reduce.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_splitk.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_universal_batched.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_universal_streamk.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu_add.cpp)
|
||||
list(APPEND PROFILER_OPS profile_conv_bwd_data.cpp)
|
||||
list(APPEND PROFILER_OPS profile_conv_fwd.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_conv_fwd.cpp)
|
||||
endif()
|
||||
|
||||
if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR
|
||||
(SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]"))
|
||||
list(APPEND PROFILER_OPS profile_gemm_bilinear.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_bilinear.cpp)
|
||||
endif()
|
||||
if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])")
|
||||
list(APPEND PROFILER_OPS profile_gemm_multiply_multiply.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_multiply_mkultiply.cpp)
|
||||
endif()
|
||||
|
||||
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
|
||||
list(APPEND PROFILER_OPS profile_gemm_universal.cpp)
|
||||
list(APPEND PROFILER_OPS profile_batched_gemm.cpp)
|
||||
list(APPEND PROFILER_OPS profile_batched_gemm_b_scale.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_b_scale.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_universal_reduce.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_fwd.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bias_clamp.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bias_bnorm_clamp.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_fwd_clamp.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_universal.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_batched_gemm.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_batched_gemm_b_scale.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_b_scale.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_universal_reduce.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_conv_fwd.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bias_clamp.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bias_bnorm_clamp.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_conv_fwd_clamp.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_bwd_data.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bilinear.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_fwd_outelementop.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_multi_abd.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bilinear.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_conv_fwd_outelementop.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_multi_abd.cpp)
|
||||
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
|
||||
list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_multiply_add.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_add_silu.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_fastgelu.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_add_fastgelu.cpp)
|
||||
list(APPEND PROFILER_OPS profile_gemm_add_add_fastgelu.cpp)
|
||||
list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_multiply_add.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_add_silu.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_fastgelu.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_add_fastgelu.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_add_add_fastgelu.cpp)
|
||||
# list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
|
||||
endif()
|
||||
list(APPEND PROFILER_OPS profile_batched_gemm_gemm.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_batched_gemm_gemm.cpp)
|
||||
endif()
|
||||
|
||||
if(DL_KERNELS)
|
||||
list(APPEND PROFILER_OPS profile_batched_gemm_multi_d.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_batched_gemm_multi_d.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
|
||||
endif()
|
||||
|
||||
if(CK_ENABLE_INT8)
|
||||
list(APPEND PROFILER_OPS profile_gemm_quantization.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_quantization.cpp)
|
||||
endif()
|
||||
|
||||
set(PROFILER_SOURCES profiler.cpp)
|
||||
@@ -152,131 +152,131 @@ endif()
|
||||
|
||||
|
||||
set(DEVICE_INSTANCES "")
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_normalization_fwd_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_normalization_bwd_data_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_normalization_bwd_gamma_beta_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_softmax_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_reduce_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_batchnorm_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_pool2d_fwd_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_pool3d_fwd_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_avg_pool2d_bwd_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_avg_pool3d_bwd_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_max_pool_bwd_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_image_to_column_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_column_to_image_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_transpose_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_permute_scale_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_normalization_fwd_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_normalization_bwd_data_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_normalization_bwd_gamma_beta_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_softmax_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_reduce_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_batchnorm_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_pool2d_fwd_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_pool3d_fwd_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_avg_pool2d_bwd_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_avg_pool3d_bwd_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_max_pool_bwd_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_image_to_column_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_column_to_image_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_transpose_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_permute_scale_instance)
|
||||
|
||||
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
|
||||
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
|
||||
list(APPEND DEVICE_INSTANCES device_contraction_bilinear_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_contraction_scale_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_contraction_bilinear_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_contraction_scale_instance)
|
||||
endif()
|
||||
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_batched_gemm_add_relu_gemm_add_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_gemm_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_streamk_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_add_relu_add_layernorm_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_gemm_fixed_nk_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_gemm_fastgelu_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_gemm_tile_loop_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_batched_gemm_add_relu_gemm_add_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_gemm_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_streamk_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_add_relu_add_layernorm_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_gemm_fixed_nk_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_gemm_fastgelu_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_gemm_tile_loop_instance)
|
||||
endif()
|
||||
list(APPEND DEVICE_INSTANCES device_batched_gemm_reduce_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_batched_gemm_reduce_instance)
|
||||
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx12")
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_wp_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_universal_preshuffle_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_wp_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_universal_preshuffle_instance)
|
||||
endif()
|
||||
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx1[12]")
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_ab_scale_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_blockscale_wp_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_ab_scale_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_blockscale_wp_instance)
|
||||
endif()
|
||||
if(SUPPORTED_GPU_TARGETS MATCHES "gfx95")
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_mx_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_mx_instance)
|
||||
endif()
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_splitk_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_universal_batched_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_universal_streamk_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_reduce_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_bias_add_reduce_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_conv2d_fwd_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_conv2d_fwd_bias_relu_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_conv2d_fwd_bias_relu_add_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_splitk_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_universal_batched_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_universal_streamk_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_reduce_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_bias_add_reduce_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_conv2d_fwd_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_conv2d_fwd_bias_relu_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_conv2d_fwd_bias_relu_add_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_conv1d_bwd_data_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_conv3d_bwd_data_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_conv2d_bwd_data_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convscale_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convinvscale_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convscale_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convinvscale_instance)
|
||||
endif()
|
||||
|
||||
if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR
|
||||
(SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]" ))
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_bilinear_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_bilinear_instance)
|
||||
endif()
|
||||
if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])")
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_instance)
|
||||
endif()
|
||||
|
||||
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_universal_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_batched_gemm_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_b_scale_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_universal_reduce_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_batched_gemm_b_scale_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_universal_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_batched_gemm_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_b_scale_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_universal_reduce_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_batched_gemm_b_scale_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_data_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_data_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv1d_fwd_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_clamp_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_clamp_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_scale_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_bias_clamp_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bias_clamp_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_bias_bnorm_clamp_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bias_bnorm_clamp_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bilinear_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_multi_abd_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv1d_fwd_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_clamp_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_clamp_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_scale_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_bias_clamp_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bias_clamp_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_bias_bnorm_clamp_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bias_bnorm_clamp_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bilinear_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_multi_abd_instance)
|
||||
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_multiply_add_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_add_silu_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_add_fastgelu_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_multiply_add_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_add_silu_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_add_fastgelu_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance)
|
||||
endif()
|
||||
list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_convnd_bwd_weight_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_convnd_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
|
||||
endif()
|
||||
|
||||
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
|
||||
if(CK_EXPERIMENTAL_BUILDER)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv_fwd_tile_instances)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv_fwd_tile_instances)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(DL_KERNELS)
|
||||
list(APPEND DEVICE_INSTANCES device_batched_gemm_multi_d_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_batched_gemm_multi_d_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
|
||||
endif()
|
||||
|
||||
if(CK_ENABLE_INT8)
|
||||
list(APPEND DEVICE_INSTANCES device_quantization_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_quantization_instance)
|
||||
endif()
|
||||
|
||||
set(PROFILER_LIBS utility getopt::getopt)
|
||||
|
||||
Reference in New Issue
Block a user