mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 11:30:02 +00:00
@@ -142,12 +142,11 @@ enable_clang_tidy(
|
|||||||
-cppcoreguidelines-prefer-member-initializer
|
-cppcoreguidelines-prefer-member-initializer
|
||||||
|
|
||||||
${MIOPEN_TIDY_CHECKS}
|
${MIOPEN_TIDY_CHECKS}
|
||||||
${MIOPEN_TIDY_ERRORS}
|
${MIOPEN_TIDY_ERRORS}
|
||||||
HEADER_FILTER
|
HEADER_FILTER
|
||||||
"\.hpp$"
|
"\.hpp$"
|
||||||
EXTRA_ARGS
|
EXTRA_ARGS
|
||||||
-DMIOPEN_USE_CLANG_TIDY
|
-DMIOPEN_USE_CLANG_TIDY
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|
||||||
include(CppCheck)
|
include(CppCheck)
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
|
|||||||
static constexpr index_t N0 = N / N1;
|
static constexpr index_t N0 = N / N1;
|
||||||
|
|
||||||
__host__ __device__ static constexpr auto
|
__host__ __device__ static constexpr auto
|
||||||
MakeAKM0M1BlockDescriptor(const AKMBlockDesc& a_k_m_block_desc)
|
MakeAKM0M1BlockDescriptor(const AKMBlockDesc& /* a_k_m_block_desc */)
|
||||||
{
|
{
|
||||||
const auto a_k_m0_m1_block_desc = transform_dynamic_tensor_descriptor(
|
const auto a_k_m0_m1_block_desc = transform_dynamic_tensor_descriptor(
|
||||||
AKMBlockDesc{},
|
AKMBlockDesc{},
|
||||||
@@ -84,7 +84,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
|
|||||||
}
|
}
|
||||||
|
|
||||||
__host__ __device__ static constexpr auto
|
__host__ __device__ static constexpr auto
|
||||||
MakeBKN0N1BlockDescriptor(const BKNBlockDesc& b_k_n_block_desc)
|
MakeBKN0N1BlockDescriptor(const BKNBlockDesc& /* b_k_n_block_desc */)
|
||||||
{
|
{
|
||||||
const auto b_k_n0_n1_block_desc = transform_dynamic_tensor_descriptor(
|
const auto b_k_n0_n1_block_desc = transform_dynamic_tensor_descriptor(
|
||||||
BKNBlockDesc{},
|
BKNBlockDesc{},
|
||||||
@@ -194,7 +194,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
|
|||||||
typename ABlockBuffer,
|
typename ABlockBuffer,
|
||||||
typename BBlockBuffer,
|
typename BBlockBuffer,
|
||||||
typename CThreadBuffer>
|
typename CThreadBuffer>
|
||||||
__device__ void Run(const CM0M1N0N1ThreadDesc& c_m0_m1_n0_n1_thread_desc,
|
__device__ void Run(const CM0M1N0N1ThreadDesc& /* c_m0_m1_n0_n1_thread_desc */,
|
||||||
const ABlockBuffer& a_block_buf,
|
const ABlockBuffer& a_block_buf,
|
||||||
const BBlockBuffer& b_block_buf,
|
const BBlockBuffer& b_block_buf,
|
||||||
CThreadBuffer& c_thread_buf) const
|
CThreadBuffer& c_thread_buf) const
|
||||||
|
|||||||
@@ -120,9 +120,6 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
|
|||||||
"wrong! inconsistent type");
|
"wrong! inconsistent type");
|
||||||
|
|
||||||
constexpr auto I0 = Number<0>{};
|
constexpr auto I0 = Number<0>{};
|
||||||
constexpr auto I1 = Number<1>{};
|
|
||||||
constexpr auto I2 = Number<2>{};
|
|
||||||
constexpr auto I3 = Number<3>{};
|
|
||||||
|
|
||||||
constexpr auto a_block_mtx = BlockMatrixA{};
|
constexpr auto a_block_mtx = BlockMatrixA{};
|
||||||
|
|
||||||
|
|||||||
@@ -270,7 +270,6 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline
|
|||||||
const index_t waveId = thread_id / WaveSize;
|
const index_t waveId = thread_id / WaveSize;
|
||||||
const index_t laneId = thread_id % WaveSize;
|
const index_t laneId = thread_id % WaveSize;
|
||||||
const index_t waveId_m = waveId / NWaves;
|
const index_t waveId_m = waveId / NWaves;
|
||||||
const index_t waveId_n = waveId % NWaves;
|
|
||||||
|
|
||||||
if constexpr(xdlops_gemm.IsKReduction)
|
if constexpr(xdlops_gemm.IsKReduction)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -619,17 +619,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2
|
|||||||
|
|
||||||
// output: register to global memory
|
// output: register to global memory
|
||||||
{
|
{
|
||||||
constexpr index_t M11 =
|
|
||||||
M1PerThreadM111 * M11N11ThreadClusterM1100 * M11N11ThreadClusterM1101;
|
|
||||||
constexpr index_t N11 =
|
|
||||||
N1PerThreadN111 * M11N11ThreadClusterN1100 * M11N11ThreadClusterN1101;
|
|
||||||
|
|
||||||
constexpr index_t M10 = MPerBlockM1 / M11;
|
|
||||||
constexpr index_t N10 = NPerBlockN1 / N11;
|
|
||||||
|
|
||||||
constexpr index_t M111 = M1PerThreadM111;
|
|
||||||
constexpr index_t N111 = N1PerThreadN111;
|
|
||||||
|
|
||||||
constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc =
|
constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc =
|
||||||
make_dynamic_naive_tensor_descriptor_packed_v2(
|
make_dynamic_naive_tensor_descriptor_packed_v2(
|
||||||
make_tuple(I1,
|
make_tuple(I1,
|
||||||
|
|||||||
@@ -191,12 +191,12 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
|
|||||||
const auto M = a_k0_m_k1_grid_desc.GetLength(I1);
|
const auto M = a_k0_m_k1_grid_desc.GetLength(I1);
|
||||||
const auto N = b_k0_n_k1_grid_desc.GetLength(I1);
|
const auto N = b_k0_n_k1_grid_desc.GetLength(I1);
|
||||||
const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0);
|
const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0);
|
||||||
const auto K1 = a_k0_m_k1_grid_desc.GetLength(I2);
|
|
||||||
|
|
||||||
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
|
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
|
||||||
|
|
||||||
return (M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) &&
|
return (M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) &&
|
||||||
K0 == b_k0_n_k1_grid_desc.GetLength(I0) &&
|
K0 == b_k0_n_k1_grid_desc.GetLength(I0) &&
|
||||||
|
K1 == a_k0_m_k1_grid_desc.GetLength(I2) &&
|
||||||
K1 == b_k0_n_k1_grid_desc.GetLength(I2)) &&
|
K1 == b_k0_n_k1_grid_desc.GetLength(I2)) &&
|
||||||
(M % MPerBlockM1 == 0 && N % NPerBlockN1 == 0 && K0 % KPerBlock == 0);
|
(M % MPerBlockM1 == 0 && N % NPerBlockN1 == 0 && K0 % KPerBlock == 0);
|
||||||
}
|
}
|
||||||
@@ -608,19 +608,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
|
|||||||
|
|
||||||
// output: register to global memory
|
// output: register to global memory
|
||||||
{
|
{
|
||||||
constexpr auto M11 =
|
|
||||||
Number<container_reduce(M11N11ThreadClusterM110Xs{}, math::multiplies_v2{}, I1) *
|
|
||||||
M1PerThreadM111>{};
|
|
||||||
constexpr auto N11 =
|
|
||||||
Number<container_reduce(M11N11ThreadClusterN110Xs{}, math::multiplies_v2{}, I1) *
|
|
||||||
N1PerThreadN111>{};
|
|
||||||
|
|
||||||
constexpr index_t M10 = MPerBlockM1 / M11;
|
|
||||||
constexpr index_t N10 = NPerBlockN1 / N11;
|
|
||||||
|
|
||||||
constexpr index_t M111 = M1PerThreadM111;
|
|
||||||
constexpr index_t N111 = N1PerThreadN111;
|
|
||||||
|
|
||||||
constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc =
|
constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc =
|
||||||
make_dynamic_naive_tensor_descriptor_packed_v2(
|
make_dynamic_naive_tensor_descriptor_packed_v2(
|
||||||
make_tuple(I1,
|
make_tuple(I1,
|
||||||
|
|||||||
@@ -102,7 +102,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3
|
|||||||
|
|
||||||
// divide block work by [M, N]
|
// divide block work by [M, N]
|
||||||
#if 0
|
#if 0
|
||||||
const auto k_block_work_num = K / Number<KPerBlock>{};
|
|
||||||
const auto ho_block_work_num = Ho / Number<HoPerBlock>{};
|
const auto ho_block_work_num = Ho / Number<HoPerBlock>{};
|
||||||
const auto wo_block_work_num = Wo / Number<WoPerBlock>{};
|
const auto wo_block_work_num = Wo / Number<WoPerBlock>{};
|
||||||
const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
|
const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
|
||||||
@@ -114,7 +113,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3
|
|||||||
const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
|
const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
|
||||||
#else
|
#else
|
||||||
// Hack: this force result into SGPR
|
// Hack: this force result into SGPR
|
||||||
const index_t k_block_work_num = __builtin_amdgcn_readfirstlane(K / KPerBlock);
|
|
||||||
const index_t ho_block_work_num = __builtin_amdgcn_readfirstlane(Ho / HoPerBlock);
|
const index_t ho_block_work_num = __builtin_amdgcn_readfirstlane(Ho / HoPerBlock);
|
||||||
const index_t wo_block_work_num = __builtin_amdgcn_readfirstlane(Wo / WoPerBlock);
|
const index_t wo_block_work_num = __builtin_amdgcn_readfirstlane(Wo / WoPerBlock);
|
||||||
const index_t hwo_block_work_num = ho_block_work_num * wo_block_work_num;
|
const index_t hwo_block_work_num = ho_block_work_num * wo_block_work_num;
|
||||||
|
|||||||
@@ -269,11 +269,6 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
|
|||||||
const CM0M1M2NGridDesc& c_m0_m1_m2_n_grid_desc,
|
const CM0M1M2NGridDesc& c_m0_m1_m2_n_grid_desc,
|
||||||
const CBlockClusterAdaptor& c_block_cluster_adaptor)
|
const CBlockClusterAdaptor& c_block_cluster_adaptor)
|
||||||
{
|
{
|
||||||
constexpr auto I0 = Number<0>{};
|
|
||||||
constexpr auto I1 = Number<1>{};
|
|
||||||
constexpr auto I2 = Number<2>{};
|
|
||||||
constexpr auto I3 = Number<3>{};
|
|
||||||
|
|
||||||
const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
|
const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
|
||||||
p_a_grid, a_k0_m_k1_grid_desc.GetElementSpaceSize());
|
p_a_grid, a_k0_m_k1_grid_desc.GetElementSpaceSize());
|
||||||
const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
|
const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
|
||||||
|
|||||||
@@ -57,8 +57,6 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
|
|||||||
|
|
||||||
constexpr auto I0 = Number<0>{};
|
constexpr auto I0 = Number<0>{};
|
||||||
constexpr auto I1 = Number<1>{};
|
constexpr auto I1 = Number<1>{};
|
||||||
constexpr auto I2 = Number<2>{};
|
|
||||||
constexpr auto I3 = Number<3>{};
|
|
||||||
|
|
||||||
constexpr auto E = ADesc{}.GetLength(I0);
|
constexpr auto E = ADesc{}.GetLength(I0);
|
||||||
constexpr auto K = ADesc{}.GetLength(I1);
|
constexpr auto K = ADesc{}.GetLength(I1);
|
||||||
|
|||||||
@@ -34,12 +34,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
|
|||||||
constexpr auto I0 = Number<0>{};
|
constexpr auto I0 = Number<0>{};
|
||||||
constexpr auto I1 = Number<1>{};
|
constexpr auto I1 = Number<1>{};
|
||||||
constexpr auto I2 = Number<2>{};
|
constexpr auto I2 = Number<2>{};
|
||||||
constexpr auto I3 = Number<3>{};
|
|
||||||
constexpr auto I4 = Number<4>{};
|
|
||||||
constexpr auto I5 = Number<5>{};
|
|
||||||
constexpr auto I6 = Number<6>{};
|
|
||||||
constexpr auto I7 = Number<7>{};
|
|
||||||
constexpr auto I8 = Number<8>{};
|
|
||||||
|
|
||||||
DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
|
DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
|
||||||
DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
|
DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
|
||||||
@@ -198,8 +192,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
|
|||||||
in_gemmk_gemmn0_gemmn1_grid_move_slice_window_iterator_hacks,
|
in_gemmk_gemmn0_gemmn1_grid_move_slice_window_iterator_hacks,
|
||||||
nrepeat);
|
nrepeat);
|
||||||
|
|
||||||
float perf = (float)calculate_convolution_flops(
|
float perf = static_cast<float>(calculate_convolution_flops(
|
||||||
in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc) /
|
in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc)) /
|
||||||
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
||||||
|
|
||||||
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
|
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
|
||||||
|
|||||||
@@ -35,11 +35,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhw
|
|||||||
constexpr auto I1 = Number<1>{};
|
constexpr auto I1 = Number<1>{};
|
||||||
constexpr auto I2 = Number<2>{};
|
constexpr auto I2 = Number<2>{};
|
||||||
constexpr auto I3 = Number<3>{};
|
constexpr auto I3 = Number<3>{};
|
||||||
constexpr auto I4 = Number<4>{};
|
|
||||||
constexpr auto I5 = Number<5>{};
|
|
||||||
constexpr auto I6 = Number<6>{};
|
|
||||||
constexpr auto I7 = Number<7>{};
|
|
||||||
constexpr auto I8 = Number<8>{};
|
|
||||||
|
|
||||||
DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
|
DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
|
||||||
DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
|
DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
|
||||||
@@ -271,7 +266,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhw
|
|||||||
const auto Y = wei_k_y_x_c_lengths[I1];
|
const auto Y = wei_k_y_x_c_lengths[I1];
|
||||||
const auto X = wei_k_y_x_c_lengths[I2];
|
const auto X = wei_k_y_x_c_lengths[I2];
|
||||||
|
|
||||||
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
|
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
|
||||||
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
||||||
|
|
||||||
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
|
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
|
||||||
|
|||||||
@@ -34,12 +34,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
|
|||||||
constexpr auto I0 = Number<0>{};
|
constexpr auto I0 = Number<0>{};
|
||||||
constexpr auto I1 = Number<1>{};
|
constexpr auto I1 = Number<1>{};
|
||||||
constexpr auto I2 = Number<2>{};
|
constexpr auto I2 = Number<2>{};
|
||||||
constexpr auto I3 = Number<3>{};
|
|
||||||
constexpr auto I4 = Number<4>{};
|
|
||||||
constexpr auto I5 = Number<5>{};
|
|
||||||
constexpr auto I6 = Number<6>{};
|
|
||||||
constexpr auto I7 = Number<7>{};
|
|
||||||
constexpr auto I8 = Number<8>{};
|
|
||||||
|
|
||||||
DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
|
DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
|
||||||
DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
|
DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
|
||||||
@@ -194,8 +188,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
|
|||||||
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks,
|
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks,
|
||||||
nrepeat);
|
nrepeat);
|
||||||
|
|
||||||
float perf = (float)calculate_convolution_flops(
|
float perf = static_cast<float>(calculate_convolution_flops(
|
||||||
in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc) /
|
in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc)) /
|
||||||
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
||||||
|
|
||||||
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
|
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
|
||||||
|
|||||||
@@ -35,11 +35,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh
|
|||||||
constexpr auto I1 = Number<1>{};
|
constexpr auto I1 = Number<1>{};
|
||||||
constexpr auto I2 = Number<2>{};
|
constexpr auto I2 = Number<2>{};
|
||||||
constexpr auto I3 = Number<3>{};
|
constexpr auto I3 = Number<3>{};
|
||||||
constexpr auto I4 = Number<4>{};
|
|
||||||
constexpr auto I5 = Number<5>{};
|
|
||||||
constexpr auto I6 = Number<6>{};
|
|
||||||
constexpr auto I7 = Number<7>{};
|
|
||||||
constexpr auto I8 = Number<8>{};
|
|
||||||
|
|
||||||
DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
|
DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
|
||||||
DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
|
DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
|
||||||
@@ -352,7 +347,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh
|
|||||||
const auto Y = wei_k_y_x_c_lengths[I1];
|
const auto Y = wei_k_y_x_c_lengths[I1];
|
||||||
const auto X = wei_k_y_x_c_lengths[I2];
|
const auto X = wei_k_y_x_c_lengths[I2];
|
||||||
|
|
||||||
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
|
float perf = static_cast<float>((std::size_t(2) * N * K * Ho * Wo * C * Y * X)) /
|
||||||
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
||||||
|
|
||||||
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
|
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
|
|||||||
const Tensor<TInWei>& in_n_c_hi_wi,
|
const Tensor<TInWei>& in_n_c_hi_wi,
|
||||||
const Tensor<TInWei>& wei_k_c_y_x,
|
const Tensor<TInWei>& wei_k_c_y_x,
|
||||||
Tensor<TOut>& out_n_k_ho_wo,
|
Tensor<TOut>& out_n_k_ho_wo,
|
||||||
ck::index_t nrepeat)
|
ck::index_t /* nrepeat */)
|
||||||
{
|
{
|
||||||
using namespace ck;
|
using namespace ck;
|
||||||
|
|
||||||
|
|||||||
@@ -232,8 +232,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
|
|||||||
in_grid_move_slice_window_iterator_hacks,
|
in_grid_move_slice_window_iterator_hacks,
|
||||||
nrepeat);
|
nrepeat);
|
||||||
|
|
||||||
float perf = (float)calculate_convolution_flops(
|
float perf = static_cast<float>(calculate_convolution_flops(
|
||||||
in_desc_n_c_hi_wi, wei_desc_k_c_y_x, out_desc_n_k_ho_wo) /
|
in_desc_n_c_hi_wi, wei_desc_k_c_y_x, out_desc_n_k_ho_wo)) /
|
||||||
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
||||||
|
|
||||||
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
|
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
|
||||||
|
|||||||
@@ -338,10 +338,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
|
|||||||
|
|
||||||
float ave_time = timer.GetElapsedTime() / nrepeat;
|
float ave_time = timer.GetElapsedTime() / nrepeat;
|
||||||
|
|
||||||
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
|
float perf =
|
||||||
wei_k_c_y_x_global_desc,
|
static_cast<float>(calculate_convolution_flops(in_n_c_hi_wi_global_desc,
|
||||||
out_n_k0_ho_wo_k1_global_desc) /
|
wei_k_c_y_x_global_desc,
|
||||||
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
out_n_k0_ho_wo_k1_global_desc)) /
|
||||||
|
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
||||||
|
|
||||||
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
|
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
|
|||||||
@@ -354,10 +354,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
|
|||||||
|
|
||||||
float ave_time = timer.GetElapsedTime() / nrepeat;
|
float ave_time = timer.GetElapsedTime() / nrepeat;
|
||||||
|
|
||||||
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
|
float perf =
|
||||||
wei_k_c_y_x_global_desc,
|
static_cast<float>(calculate_convolution_flops(in_n_c_hi_wi_global_desc,
|
||||||
out_n_k0_ho_wo_k1_global_desc) /
|
wei_k_c_y_x_global_desc,
|
||||||
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
out_n_k0_ho_wo_k1_global_desc)) /
|
||||||
|
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
||||||
|
|
||||||
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
|
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
|
|||||||
@@ -128,10 +128,8 @@ int main(int argc, char* argv[])
|
|||||||
|
|
||||||
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
|
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
|
||||||
|
|
||||||
switch(layout)
|
if(layout == ConvTensorLayout::NCHW)
|
||||||
{
|
{
|
||||||
case ConvTensorLayout::NCHW:
|
|
||||||
// NCHW
|
|
||||||
in_lengths_host[0] = static_cast<std::size_t>(N);
|
in_lengths_host[0] = static_cast<std::size_t>(N);
|
||||||
in_lengths_host[1] = static_cast<std::size_t>(C);
|
in_lengths_host[1] = static_cast<std::size_t>(C);
|
||||||
in_lengths_host[2] = static_cast<std::size_t>(Hi);
|
in_lengths_host[2] = static_cast<std::size_t>(Hi);
|
||||||
@@ -144,9 +142,9 @@ int main(int argc, char* argv[])
|
|||||||
out_lengths_host[1] = static_cast<std::size_t>(K);
|
out_lengths_host[1] = static_cast<std::size_t>(K);
|
||||||
out_lengths_host[2] = static_cast<std::size_t>(Ho);
|
out_lengths_host[2] = static_cast<std::size_t>(Ho);
|
||||||
out_lengths_host[3] = static_cast<std::size_t>(Wo);
|
out_lengths_host[3] = static_cast<std::size_t>(Wo);
|
||||||
break;
|
}
|
||||||
case ConvTensorLayout::NHWC:
|
else if(layout == ConvTensorLayout::NHWC)
|
||||||
// NHWC
|
{
|
||||||
in_lengths_host[0] = static_cast<std::size_t>(N);
|
in_lengths_host[0] = static_cast<std::size_t>(N);
|
||||||
in_lengths_host[1] = static_cast<std::size_t>(Hi);
|
in_lengths_host[1] = static_cast<std::size_t>(Hi);
|
||||||
in_lengths_host[2] = static_cast<std::size_t>(Wi);
|
in_lengths_host[2] = static_cast<std::size_t>(Wi);
|
||||||
@@ -159,8 +157,10 @@ int main(int argc, char* argv[])
|
|||||||
out_lengths_host[1] = static_cast<std::size_t>(Ho);
|
out_lengths_host[1] = static_cast<std::size_t>(Ho);
|
||||||
out_lengths_host[2] = static_cast<std::size_t>(Wo);
|
out_lengths_host[2] = static_cast<std::size_t>(Wo);
|
||||||
out_lengths_host[3] = static_cast<std::size_t>(K);
|
out_lengths_host[3] = static_cast<std::size_t>(K);
|
||||||
break;
|
}
|
||||||
default: throw std::runtime_error("wrong! not implemented");
|
else
|
||||||
|
{
|
||||||
|
throw std::runtime_error("wrong! not implemented");
|
||||||
}
|
}
|
||||||
|
|
||||||
Tensor<in_data_t> in_host(in_lengths_host);
|
Tensor<in_data_t> in_host(in_lengths_host);
|
||||||
|
|||||||
@@ -467,7 +467,6 @@ int main(int argc, char* argv[])
|
|||||||
|
|
||||||
check_error(out_host, out_device);
|
check_error(out_host, out_device);
|
||||||
|
|
||||||
#if 0
|
|
||||||
if(do_log)
|
if(do_log)
|
||||||
{
|
{
|
||||||
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
|
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
|
||||||
@@ -475,6 +474,5 @@ int main(int argc, char* argv[])
|
|||||||
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
|
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
|
||||||
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
|
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(
|
|||||||
|
|
||||||
template <class InDesc, class WeiDesc, class OutDesc>
|
template <class InDesc, class WeiDesc, class OutDesc>
|
||||||
constexpr std::size_t
|
constexpr std::size_t
|
||||||
calculate_convolution_flops(const InDesc& in_desc, const WeiDesc& wei_desc, const OutDesc& out_desc)
|
calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDesc& out_desc)
|
||||||
{
|
{
|
||||||
using namespace ck;
|
using namespace ck;
|
||||||
|
|
||||||
|
|||||||
@@ -14,15 +14,13 @@ void host_direct_convolution(const Tensor<TIn>& in,
|
|||||||
const ConvStrides& conv_strides,
|
const ConvStrides& conv_strides,
|
||||||
const ConvDilations& conv_dilations,
|
const ConvDilations& conv_dilations,
|
||||||
const InLeftPads& in_left_pads,
|
const InLeftPads& in_left_pads,
|
||||||
const InRightPads& in_right_pads,
|
const InRightPads&,
|
||||||
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
|
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
|
||||||
{
|
{
|
||||||
using namespace ck;
|
using namespace ck;
|
||||||
|
|
||||||
constexpr auto I0 = Number<0>{};
|
constexpr auto I0 = Number<0>{};
|
||||||
constexpr auto I1 = Number<1>{};
|
constexpr auto I1 = Number<1>{};
|
||||||
constexpr auto I2 = Number<2>{};
|
|
||||||
constexpr auto I3 = Number<3>{};
|
|
||||||
|
|
||||||
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
|
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
|
||||||
double v = 0;
|
double v = 0;
|
||||||
@@ -68,23 +66,25 @@ void host_direct_convolution(const Tensor<TIn>& in,
|
|||||||
out(n, ho, wo, k) = v;
|
out(n, ho, wo, k) = v;
|
||||||
};
|
};
|
||||||
|
|
||||||
switch(layout)
|
if(layout == ConvTensorLayout::NCHW)
|
||||||
{
|
{
|
||||||
case ConvTensorLayout::NCHW:
|
|
||||||
make_ParallelTensorFunctor(f_nchw,
|
make_ParallelTensorFunctor(f_nchw,
|
||||||
out.mDesc.GetLengths()[0],
|
out.mDesc.GetLengths()[0],
|
||||||
out.mDesc.GetLengths()[1],
|
out.mDesc.GetLengths()[1],
|
||||||
out.mDesc.GetLengths()[2],
|
out.mDesc.GetLengths()[2],
|
||||||
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
||||||
break;
|
}
|
||||||
case ConvTensorLayout::NHWC:
|
else if(layout == ConvTensorLayout::NHWC)
|
||||||
|
{
|
||||||
make_ParallelTensorFunctor(f_nhwc,
|
make_ParallelTensorFunctor(f_nhwc,
|
||||||
out.mDesc.GetLengths()[0],
|
out.mDesc.GetLengths()[0],
|
||||||
out.mDesc.GetLengths()[1],
|
out.mDesc.GetLengths()[1],
|
||||||
out.mDesc.GetLengths()[2],
|
out.mDesc.GetLengths()[2],
|
||||||
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
||||||
break;
|
}
|
||||||
default: throw std::runtime_error("wrong! not supported layout");
|
else
|
||||||
|
{
|
||||||
|
throw std::runtime_error("wrong! not supported layout");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -100,17 +100,15 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
|
|||||||
constexpr std::size_t HoPerTile = 2;
|
constexpr std::size_t HoPerTile = 2;
|
||||||
constexpr std::size_t WoPerTile = 2;
|
constexpr std::size_t WoPerTile = 2;
|
||||||
|
|
||||||
std::size_t N = in_nchw.mDesc.GetLengths()[0];
|
std::size_t N = in_nchw.mDesc.GetLengths()[0];
|
||||||
std::size_t C = in_nchw.mDesc.GetLengths()[1];
|
std::size_t C = in_nchw.mDesc.GetLengths()[1];
|
||||||
std::size_t HI = in_nchw.mDesc.GetLengths()[2];
|
|
||||||
std::size_t WI = in_nchw.mDesc.GetLengths()[3];
|
|
||||||
|
|
||||||
std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
|
std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
|
||||||
std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
|
std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
|
||||||
std::size_t X = wei_kcyx.mDesc.GetLengths()[3];
|
std::size_t X = wei_kcyx.mDesc.GetLengths()[3];
|
||||||
|
|
||||||
std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
|
std::size_t Ho = out_nkhw.mDesc.GetLengths()[2];
|
||||||
std::size_t WO = out_nkhw.mDesc.GetLengths()[3];
|
std::size_t Wo = out_nkhw.mDesc.GetLengths()[3];
|
||||||
|
|
||||||
index_t h_pad_low = InLeftPads{}.Get(Number<0>{});
|
index_t h_pad_low = InLeftPads{}.Get(Number<0>{});
|
||||||
index_t w_pad_low = InLeftPads{}.Get(Number<1>{});
|
index_t w_pad_low = InLeftPads{}.Get(Number<1>{});
|
||||||
@@ -118,8 +116,8 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
|
|||||||
std::size_t HiPerTile = HoPerTile + Y - 1;
|
std::size_t HiPerTile = HoPerTile + Y - 1;
|
||||||
std::size_t WiPerTile = WoPerTile + X - 1;
|
std::size_t WiPerTile = WoPerTile + X - 1;
|
||||||
|
|
||||||
std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile;
|
std::size_t HTile = (Ho + HoPerTile - 1) / HoPerTile;
|
||||||
std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile;
|
std::size_t WTile = (Wo + WoPerTile - 1) / WoPerTile;
|
||||||
|
|
||||||
Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
|
Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
|
||||||
Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
|
Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ struct GeneratorTensor_1
|
|||||||
int value = 1;
|
int value = 1;
|
||||||
|
|
||||||
template <typename... Is>
|
template <typename... Is>
|
||||||
float operator()(Is... is)
|
float operator()(Is...)
|
||||||
{
|
{
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -99,40 +99,48 @@ struct CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
|
|||||||
// clang-format on
|
// clang-format on
|
||||||
}
|
}
|
||||||
|
|
||||||
ck::DataTypeEnum_t ABDataTypeEnum;
|
ck::DataTypeEnum_t ABDataTypeEnum = ck::DataTypeEnum_t::Unknown;
|
||||||
ck::DataTypeEnum_t AccDataTypeEnum;
|
ck::DataTypeEnum_t AccDataTypeEnum = ck::DataTypeEnum_t::Unknown;
|
||||||
ck::DataTypeEnum_t CDataTypeEnum;
|
ck::DataTypeEnum_t CDataTypeEnum = ck::DataTypeEnum_t::Unknown;
|
||||||
|
|
||||||
int BlockSize;
|
int BlockSize = 1;
|
||||||
|
|
||||||
int GN0;
|
int GN0 = -1;
|
||||||
int GK1;
|
int GK1 = -1;
|
||||||
|
|
||||||
int GM1PerBlockGM11;
|
int GM1PerBlockGM11 = -1;
|
||||||
int GN1PerBlockGN11;
|
int GN1PerBlockGN11 = -1;
|
||||||
int GK0PerBlock;
|
int GK0PerBlock = -1;
|
||||||
|
|
||||||
int BM1PerThreadBM11;
|
int BM1PerThreadBM11 = -1;
|
||||||
int BN1PerThreadBN11;
|
int BN1PerThreadBN11 = -1;
|
||||||
int BK0PerThread;
|
int BK0PerThread = -1;
|
||||||
|
|
||||||
std::array<int, 2> BM10BN10ThreadClusterBM10Xs;
|
std::array<int, 2> BM10BN10ThreadClusterBM10Xs = {-1, -1};
|
||||||
std::array<int, 2> BM10BN10ThreadClusterBN10Xs;
|
std::array<int, 2> BM10BN10ThreadClusterBN10Xs = {-1, -1};
|
||||||
|
|
||||||
std::array<int, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
|
std::array<int, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = {
|
||||||
std::array<int, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
|
-1, -1, -1, -1, -1};
|
||||||
std::array<int, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
|
std::array<int, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = {
|
||||||
std::array<int, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
|
-1, -1, -1, -1, -1};
|
||||||
|
std::array<int, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
|
||||||
|
-1, -1, -1, -1, -1};
|
||||||
|
std::array<int, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
|
||||||
|
-1, -1, -1, -1, -1};
|
||||||
|
|
||||||
std::array<int, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
|
std::array<int, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = {
|
||||||
std::array<int, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
|
-1, -1, -1, -1, -1};
|
||||||
std::array<int, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
|
std::array<int, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = {
|
||||||
std::array<int, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
|
-1, -1, -1, -1, -1};
|
||||||
|
std::array<int, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
|
||||||
|
-1, -1, -1, -1, -1};
|
||||||
|
std::array<int, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
|
||||||
|
-1, -1, -1, -1, -1};
|
||||||
|
|
||||||
int CThreadTransferDstScalarPerVector;
|
int CThreadTransferDstScalarPerVector = -1;
|
||||||
|
|
||||||
bool HasMainKBlockLoop;
|
bool HasMainKBlockLoop = false;
|
||||||
bool HasDoubleTailKBlockLoop;
|
bool HasDoubleTailKBlockLoop = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw
|
struct TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw
|
||||||
|
|||||||
Reference in New Issue
Block a user