mirror of
https://github.com/NVIDIA/cutlass.git
synced 2026-05-11 17:00:05 +00:00
v4.1 release update v2. (#2481)
This commit is contained in:
@@ -259,7 +259,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@@ -394,7 +394,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
@@ -295,7 +295,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@@ -433,7 +433,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
@@ -333,7 +333,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@@ -471,7 +471,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
@@ -328,7 +328,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@@ -473,7 +473,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
@@ -341,7 +341,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@@ -527,7 +527,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
@@ -200,7 +200,7 @@ int main(int argc, char** argv)
|
||||
|
||||
// Construct tiled copy, a tiling of copy atoms.
|
||||
//
|
||||
// Note, this assumes the vector and thread layouts are aligned with contigous data
|
||||
// Note, this assumes the vector and thread layouts are aligned with contiguous data
|
||||
// in GMEM. Alternative thread layouts are possible but may result in uncoalesced
|
||||
// reads. Alternative value layouts are also possible, though incompatible layouts
|
||||
// will result in compile time errors.
|
||||
|
||||
Reference in New Issue
Block a user