[CK] Fix example_grouped_conv_bwd_data_xdl_fp16 with ksplit = 2 (#2943)

Root cause: AK1 and BK1 may differ in the class template, so when ksplit is not 1 the K0-per-block offset must be calculated separately for A (from AK0) and for B (from BK0).
This commit is contained in:
linqunAMD
2025-09-29 22:56:33 +08:00
committed by GitHub
parent 5477811670
commit 769c58f133
2 changed files with 13 additions and 9 deletions

View File

@@ -1671,7 +1671,10 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
valid = false;
}
}
else
}
else
{
if constexpr(NXdlPerWave32 > 0)
{
if(!GridwiseGemmCTranspose32::CheckValidity(
arg.a_grid_desc_m_k_container_[i],
@@ -1686,10 +1689,10 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
valid = false;
}
}
if(!valid)
{
return false;
}
}
if(!valid)
{
return false;
}
}

View File

@@ -561,9 +561,10 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
return;
}
const index_t num_k_per_block =
const index_t num_ak0_per_block =
__builtin_amdgcn_readfirstlane(a_grid_desc_ak0_m_ak1.GetLength(I0) / k_batch);
const index_t num_bk0_per_block =
__builtin_amdgcn_readfirstlane(b_grid_desc_bk0_n_bk1.GetLength(I0) / k_batch);
// HACK: this force m/n_block_data_idx_on_grid into SGPR
const index_t m_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
@@ -605,7 +606,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
true,
NumGemmKPrefetchStage>(
a_grid_desc_ak0_m_ak1,
make_multi_index(num_k_per_block * k_idx, m_block_data_idx_on_grid, 0),
make_multi_index(num_ak0_per_block * k_idx, m_block_data_idx_on_grid, 0),
a_element_op,
a_block_desc_ak0_m_ak1,
make_multi_index(0, 0, 0),
@@ -636,7 +637,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
true,
NumGemmKPrefetchStage>(
b_grid_desc_bk0_n_bk1,
make_multi_index(num_k_per_block * k_idx, n_block_data_idx_on_grid, 0),
make_multi_index(num_bk0_per_block * k_idx, n_block_data_idx_on_grid, 0),
b_element_op,
b_block_desc_bk0_n_bk1,
make_multi_index(0, 0, 0),