releaase 2.11 (#703)

2026-05-11 17:00:05 +00:00 · 2022-11-19 06:02:15 -08:00
parent 3c90f6aea6
commit c975e2ccbb
329 changed files with 47332 additions and 10607 deletions
--- a/examples/03_visualize_layout/CMakeLists.txt
+++ b/examples/03_visualize_layout/CMakeLists.txt
@@ -29,7 +29,6 @@


 set(TEST_COMMAND_00 RowMajor --extent=16,16)
-set(TEST_COMMAND_01 \"ColumnMajorInterleaved<4>\" --extent=32,8 --output-shape=16 --vectorize=4)

 cutlass_example_add_executable(
  03_visualize_layout
@@ -37,6 +36,5 @@ cutlass_example_add_executable(
  register_layout.cu
  TEST_COMMAND_OPTIONS
  TEST_COMMAND_00
-  TEST_COMMAND_01
  )

--- a/examples/03_visualize_layout/register_layout.cu
+++ b/examples/03_visualize_layout/register_layout.cu
@@ -64,15 +64,15 @@ void RegisterLayouts(std::map<std::string, std::unique_ptr<VisualizeLayoutBase>
      // All Ampere/Turing H/Integer matrix multiply tensor core kernels uses the same swizzling
      // layout implementation with different templates.
      //
-      // BMMA 88128  Interleaved-256
-      // BMMA 168256 Interleaved-256
+      // mma.sync.aligned.m8n8k128.s32.b1.b1.s32 Interleaved-256
+      // mma.sync.aligned.m16n8k256.s32.b1.b1.s32 Interleaved-256
      {"TensorOpMultiplicand<1,256>",
       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<1, 256>>},
-      // BMMA 88128  TN kblock512
-      // BMMA 168256 TN kblock512
+      // mma.sync.aligned.m8n8k128.s32.b1.b1.s32 TN kblock512
+      // mma.sync.aligned.m16n8k256.s32.b1.b1.s32 TN kblock512
      {"TensorOpMultiplicand<1,512>",
       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<1, 512>>},
-      // BMMA 168256 TN kblock1024
+      // mma.sync.aligned.m16n8k256.s32.b1.b1.s32 TN kblock1024
      {"TensorOpMultiplicand<1,1024>",
       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<1, 1024>>},
      // Integer matrix multiply.int4 8832  Interleaved-64