From 06e560d98a5fe8acb975db2c4c26817b6c90acb1 Mon Sep 17 00:00:00 2001 From: dePaul Miller Date: Mon, 10 Mar 2025 11:36:11 -0700 Subject: [PATCH] Blockwise/Groupwise kernel improvement and programmatic dependent launch enablement (#2161) Co-authored-by: dePaul Miller <23461061+depaulmillz@users.noreply.github.com> --- CMakeLists.txt | 16 +++++++++++++++- include/cutlass/arch/grid_dependency_control.h | 5 +++++ ...100_mma_warpspecialized_blockwise_scaling.hpp | 6 ++++-- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b68b4350..65821237c 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -382,7 +382,21 @@ endif() if (CUTLASS_ENABLE_GDC_FOR_SM90) message(STATUS "Grid Dependency Control (GDC) is enabled for SM90 kernels (required for programmatic dependent launches).") - list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM90=1) + list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM90=1) +endif() + +if (NOT DEFINED CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT) + set(CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT ON) +endif() + +set(CUTLASS_ENABLE_GDC_FOR_SM100 + ${CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT} + CACHE BOOL + "Enables Grid Dependency Control (GDC) for SM100 kernels (required for PDL).") + +if (CUTLASS_ENABLE_GDC_FOR_SM100) + message(STATUS "Grid Dependency Control (GDC) is enabled for SM100 kernels (required for programmatic dependent launches).") + list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM100=1) +endif() set(CUTLASS_ENABLE_SYNCLOG OFF CACHE BOOL "Enable synchronization event logging for race condition debugging. 
WARNING: This redefines __syncthreads() and __syncwarp() in all downstream code!") diff --git a/include/cutlass/arch/grid_dependency_control.h b/include/cutlass/arch/grid_dependency_control.h index 104214063..ae66de279 100644 --- a/include/cutlass/arch/grid_dependency_control.h +++ b/include/cutlass/arch/grid_dependency_control.h @@ -46,6 +46,11 @@ defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL)) #define CUTLASS_GDC_ENABLED #endif + #if (defined(CUTLASS_ENABLE_GDC_FOR_SM100) && \ + __CUDACC_VER_MAJOR__ >= 12 && \ + defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 1000 && defined(__CUDA_ARCH_FEAT_SM100_ALL)) + #define CUTLASS_GDC_ENABLED + #endif #endif namespace cutlass { diff --git a/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp index 8fc9331cc..cb621a5f8 100644 --- a/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp +++ b/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp @@ -281,13 +281,15 @@ struct CollectiveMma< static constexpr int LeadingScalesPerTileSFA = size<0,1>(LayoutSFA{}.stride()) == 1 ? ScaleMsPerTile : ScaleKsPerTile; using ScaleCopyTypeA = cute::uint_byte_t(sizeof(ElementAccumulator)) * LeadingScalesPerTileSFA, 16)>; using SmemScalingCopyAtomA = Copy_Atom, ElementAccumulator>; + static constexpr int ElementsPerSFACopy = static_cast(sizeof(ScaleCopyTypeA) / sizeof(ElementAccumulator)); static constexpr int LeadingScalesPerTileSFB = size<0,1>(LayoutSFB{}.stride()) == 1 ? 
ScaleNsPerTile : ScaleKsPerTile; using ScaleCopyTypeB = cute::uint_byte_t(sizeof(ElementAccumulator)) * LeadingScalesPerTileSFB, 16)>; using SmemScalingCopyAtomB = Copy_Atom, ElementAccumulator>; + static constexpr int ElementsPerSFBCopy = static_cast(sizeof(ScaleCopyTypeB) / sizeof(ElementAccumulator)); - using TiledCopyScaleA = decltype(make_tiled_copy(SmemScalingCopyAtomA{}, Layout>{}, Layout>>{})); - using TiledCopyScaleB = decltype(make_tiled_copy(SmemScalingCopyAtomB{}, Layout>{}, Layout>>{})); + using TiledCopyScaleA = decltype(make_tiled_copy(SmemScalingCopyAtomA{}, Layout>{}, Layout>>{})); + using TiledCopyScaleB = decltype(make_tiled_copy(SmemScalingCopyAtomB{}, Layout>{}, Layout>>{})); struct SharedStorage { struct TensorStorage : cute::aligned_struct<128, _0> {