[CK-Tile] Add the API to load SGPR (#2878)

* Have a workable version for SGPR

* have a workable version for atomic add

* Revert "have a workable version for atomic add"

This reverts commit 792377a590c26cfff9c8f545d9a9e8484a7422eb.

* substitute with the new sgpr read api

* update the CHANGELOG

* have a workable version for atomic add

* Revert "have a workable version for atomic add"

This reverts commit 792377a590c26cfff9c8f545d9a9e8484a7422eb.

* change to static for logic

* have a workable version for atomic add

* Revert "have a workable version for atomic add"

This reverts commit 792377a590c26cfff9c8f545d9a9e8484a7422eb.
This commit is contained in:
Thomas Ning
2025-09-23 01:23:56 -07:00
committed by GitHub
parent b6e8994386
commit 2cbbf5dcb3
40 changed files with 273 additions and 167 deletions

View File

@@ -707,8 +707,8 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
long_index_t batch_offset_q = 0;
long_index_t batch_offset_bias = 0;

View File

@@ -690,7 +690,7 @@ struct FmhaBwdDQDKDVKernel
// divide problem
const auto [i_tile_n, i_nhead, i_batch] = GetTileIndex();
const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN0);
const index_t i_n0 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN0);
long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0;
@@ -1338,7 +1338,7 @@ struct FmhaBwdOGradDotOKernel
// divide problem
const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex();
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0);
long_index_t batch_offset_o = 0;
long_index_t batch_offset_do = 0;
@@ -1618,7 +1618,7 @@ struct FmhaBwdConvertQGradKernel
// divide problem
const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex();
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0);
long_index_t batch_offset_dq = 0;
long_index_t batch_offset_dq_acc = 0;

View File

@@ -262,8 +262,8 @@ struct FmhaFwdAppendKVKernel
// divide problem
const auto [i_tile, i_nhead, i_batch] = GetTileIndex(kargs);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kM0);
const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kN0);
const index_t i_m0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kM0);
const index_t i_n0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kN0);
const index_t i_cache_batch = [&, i_batch_ = i_batch] {
if constexpr(kIsPagedKV)

View File

@@ -1060,8 +1060,8 @@ struct FmhaFwdKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0;

View File

@@ -880,8 +880,8 @@ struct FmhaFwdPagedKVKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0;

View File

@@ -281,8 +281,8 @@ struct FmhaFwdSplitKVCombineKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
long_index_t batch_offset_lse_acc = 0;
long_index_t batch_offset_o_acc = 0;

View File

@@ -589,8 +589,8 @@ struct FmhaFwdSplitKVKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_split, i_nhead, i_batch] = GetTileIndex(kargs);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0; // unused for paged-kvcache

View File

@@ -361,8 +361,8 @@ struct FmhaFwdV3Kernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0;