From 06e560d98a5fe8acb975db2c4c26817b6c90acb1 Mon Sep 17 00:00:00 2001 From: dePaul Miller Date: Mon, 10 Mar 2025 11:36:11 -0700 Subject: [PATCH] Blockwise/Groupwise kernel improvement and programmatic dependent launch enablement (#2161) Co-authored-by: dePaul Miller <23461061+depaulmillz@users.noreply.github.com> --- CMakeLists.txt | 16 +++++++++++++++- include/cutlass/arch/grid_dependency_control.h | 5 +++++ ...100_mma_warpspecialized_blockwise_scaling.hpp | 6 ++++-- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b68b4350..65821237c 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -382,7 +382,21 @@ endif() if (CUTLASS_ENABLE_GDC_FOR_SM90) message(STATUS "Grid Dependency Control (GDC) is enabled for SM90 kernels (required for programmatic dependent launches).") - list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM90=1) + list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM90=1) +endif() + +if (NOT DEFINED CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT) + set(CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT ON) +endif() + +set(CUTLASS_ENABLE_GDC_FOR_SM100 + ${CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT} + CACHE BOOL + "Enables Grid Dependency Control (GDC) for SM100 kernels (required for PDL).") + +if (CUTLASS_ENABLE_GDC_FOR_SM100) + message(STATUS "Grid Dependency Control (GDC) is enabled for SM100 kernels (required for programmatic dependent launches).") + list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM100=1) +endif() set(CUTLASS_ENABLE_SYNCLOG OFF CACHE BOOL "Enable synchronization event logging for race condition debugging. 
WARNING: This redefines __syncthreads() and __syncwarp() in all downstream code!") diff --git a/include/cutlass/arch/grid_dependency_control.h b/include/cutlass/arch/grid_dependency_control.h index 104214063..ae66de279 100644 --- a/include/cutlass/arch/grid_dependency_control.h +++ b/include/cutlass/arch/grid_dependency_control.h @@ -46,6 +46,11 @@ defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL)) #define CUTLASS_GDC_ENABLED #endif + #if (defined(CUTLASS_ENABLE_GDC_FOR_SM100) && \ + __CUDACC_VER_MAJOR__ >= 12 && \ + defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 1000 && defined(__CUDA_ARCH_FEAT_SM100_ALL)) + #define CUTLASS_GDC_ENABLED + #endif #endif namespace cutlass { diff --git a/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp index 8fc9331cc..cb621a5f8 100644 --- a/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp +++ b/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp @@ -281,13 +281,15 @@ struct CollectiveMma< static constexpr int LeadingScalesPerTileSFA = size<0,1>(LayoutSFA{}.stride()) == 1 ? ScaleMsPerTile : ScaleKsPerTile; using ScaleCopyTypeA = cute::uint_byte_t(sizeof(ElementAccumulator)) * LeadingScalesPerTileSFA, 16)>; using SmemScalingCopyAtomA = Copy_Atom, ElementAccumulator>; + static constexpr int ElementsPerSFACopy = static_cast(sizeof(ScaleCopyTypeA) / sizeof(ElementAccumulator)); static constexpr int LeadingScalesPerTileSFB = size<0,1>(LayoutSFB{}.stride()) == 1 ? 
ScaleNsPerTile : ScaleKsPerTile; using ScaleCopyTypeB = cute::uint_byte_t(sizeof(ElementAccumulator)) * LeadingScalesPerTileSFB, 16)>; using SmemScalingCopyAtomB = Copy_Atom, ElementAccumulator>; + static constexpr int ElementsPerSFBCopy = static_cast(sizeof(ScaleCopyTypeB) / sizeof(ElementAccumulator)); - using TiledCopyScaleA = decltype(make_tiled_copy(SmemScalingCopyAtomA{}, Layout>{}, Layout>>{})); - using TiledCopyScaleB = decltype(make_tiled_copy(SmemScalingCopyAtomB{}, Layout>{}, Layout>>{})); + using TiledCopyScaleA = decltype(make_tiled_copy(SmemScalingCopyAtomA{}, Layout>{}, Layout>>{})); + using TiledCopyScaleB = decltype(make_tiled_copy(SmemScalingCopyAtomB{}, Layout>{}, Layout>>{})); struct SharedStorage { struct TensorStorage : cute::aligned_struct<128, _0> {