Add new instances for merging multiple fwd conv groups into a single GEMM batch. Allow group merging for C > 1 when vector load/store size is 1 for the output tensor. (#3639)

Co-authored-by: Ville Pietilä <>
This commit is contained in:
Ville Pietilä
2026-01-25 14:42:23 +02:00
committed by GitHub
parent f5c2f09036
commit 7ac3794284
2 changed files with 8 additions and 4 deletions

View File

@@ -1513,7 +1513,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
if constexpr(NumGroupsToMerge > 1)
{
-if(!(C == 1))
+if(!(C == 1) && CDEBlockTransferScalarPerVector_NPerBlock > 1)
{
return false;
}