3.6.0 update (#2005)

* 3.6.0 update
* doc and swap stuff

---------

Co-authored-by: yuzhai <yuzhai@nvidia.com>
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2026-04-20 14:59:01 +00:00 · 2024-12-24 22:34:40 -08:00
parent e1cd8c7866
commit 3d261a5974
258 changed files with 10863 additions and 3883 deletions
--- a/python/cutlass_library/sm90_utils.py
+++ b/python/cutlass_library/sm90_utils.py
@@ -492,6 +492,21 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types,
        if not (is_fp8 and is_sparse):
            schedules.append([KernelScheduleType.TmaWarpSpecialized, default_epilogue])
    stream_k_schedules = []
+    
+    if CudaToolkitVersionSatisfies(cuda_version, 12, 0):
+        if can_do_tma_epilogue:
+            assert not requires_transposed_epilogue
+            # Inconsistency: fp8 pingpong only gets stamped out with fast accum
+            if not is_fp8 or level >= 1:
+                schedules.append([
+                    KernelScheduleType.TmaWarpSpecializedPingpong,
+                    EpilogueScheduleType.TmaWarpSpecialized
+                ])
+            if can_do_fp8_fast_accum:
+                schedules.append([
+                    KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum,
+                    EpilogueScheduleType.TmaWarpSpecialized
+                ])

    if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
        # Pruning: don't stamp out fp8 ping-ponging kernel with non-tma epilogue
@@ -526,17 +541,6 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types,
        # persistent kernels with TMA epilogues
        if can_do_tma_epilogue:
            assert not requires_transposed_epilogue
-            # Inconsistency: fp8 pingpong only gets stamped out with fast accum
-            if not is_fp8 or level >= 1:
-                schedules.append([
-                    KernelScheduleType.TmaWarpSpecializedPingpong,
-                    EpilogueScheduleType.TmaWarpSpecialized
-                ])
-            if can_do_fp8_fast_accum:
-                schedules.append([
-                    KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum,
-                    EpilogueScheduleType.TmaWarpSpecialized
-                ])
            if can_do_cooperative:
                # Sparse kernels only support FastAccum FP8 mainloop
                if not (is_fp8 and is_sparse):