v4.1 release update v2. (#2481)

2026-05-11 17:00:05 +00:00 · 2025-07-22 10:03:55 +08:00
parent 9baa06dd57
commit fd6cfe1ed0
179 changed files with 7878 additions and 1286 deletions
--- a/examples/cute/tutorial/blackwell/01_mma_sm100.cu
+++ b/examples/cute/tutorial/blackwell/01_mma_sm100.cu
@@ -259,7 +259,7 @@ gemm_device(ATensor mA,                      // (Gemm_M, Gemm_K)

  // Step 2: The Mainloop.

-  // Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
+  // Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
  tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;

  // Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
@@ -394,7 +394,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
  // In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
  // Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
  //  and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
-  //  The MMA's partitioning then yeilds the CTA-local work.
+  //  The MMA's partitioning then yields the CTA-local work.

  if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
    std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
--- a/examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu
+++ b/examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu
@@ -295,7 +295,7 @@ gemm_device(ATensor mA,                      // (Gemm_M, Gemm_K)

  // Step 2: The Mainloop.

-  // Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
+  // Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
  tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;

  // Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
@@ -433,7 +433,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
  // In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
  // Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
  //  and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
-  //  The MMA's partitioning then yeilds the CTA-local work.
+  //  The MMA's partitioning then yields the CTA-local work.

  if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
    std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
--- a/examples/cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu
+++ b/examples/cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu
@@ -333,7 +333,7 @@ gemm_device(ATensor mA,                      // (Gemm_M, Gemm_K)

  // Step 2: The Mainloop.

-  // Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
+  // Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
  tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;

  // Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
@@ -471,7 +471,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
  // In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
  // Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
  //  and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
-  //  The MMA's partitioning then yeilds the CTA-local work.
+  //  The MMA's partitioning then yields the CTA-local work.

  if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
    std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
--- a/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu
+++ b/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu
@@ -328,7 +328,7 @@ gemm_device(ATensor mA,                      // (Gemm_M, Gemm_K)

  // Step 2: The Mainloop.

-  // Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
+  // Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
  tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;

  // Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
@@ -473,7 +473,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
  // In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
  // Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
  //  and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
-  //  The MMA's partitioning then yeilds the CTA-local work.
+  //  The MMA's partitioning then yields the CTA-local work.

  if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
    std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
--- a/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu
+++ b/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu
@@ -341,7 +341,7 @@ gemm_device(ATensor mA,                      // (Gemm_M, Gemm_K)

  // Step 2: The Mainloop.

-  // Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
+  // Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
  tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;

  // Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
@@ -527,7 +527,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
  // In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
  // Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
  //  and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
-  //  The MMA's partitioning then yeilds the CTA-local work.
+  //  The MMA's partitioning then yields the CTA-local work.

  if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
    std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
--- a/examples/cute/tutorial/tiled_copy.cu
+++ b/examples/cute/tutorial/tiled_copy.cu
@@ -200,7 +200,7 @@ int main(int argc, char** argv)

  // Construct tiled copy, a tiling of copy atoms.
  //
-  // Note, this assumes the vector and thread layouts are aligned with contigous data
+  // Note, this assumes the vector and thread layouts are aligned with contiguous data
  // in GMEM. Alternative thread layouts are possible but may result in uncoalesced
  // reads. Alternative value layouts are also possible, though incompatible layouts
  // will result in compile time errors.