mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-07-01 04:07:56 +00:00
update
This commit is contained in:
@@ -125,7 +125,7 @@ struct tile_scatter_gather
|
||||
|
||||
static constexpr auto get_space_filling_curve()
|
||||
{
|
||||
constexpr auto tile_dstr = TileDstr{};
|
||||
[[maybe_unused]] constexpr auto tile_dstr = TileDstr{};
|
||||
|
||||
constexpr auto thread_tensor_lengths_ys =
|
||||
to_sequence(tile_dstr.get_ys_to_d_descriptor().get_lengths());
|
||||
@@ -309,7 +309,7 @@ struct tile_scatter_gather
|
||||
CK_TILE_DEVICE auto load(number<i_access_unsupport_> = {},
|
||||
bool_constant<oob_conditional_check> = {}) const
|
||||
{
|
||||
constexpr auto tile_dstr = TileDstr{};
|
||||
[[maybe_unused]] constexpr auto tile_dstr = TileDstr{};
|
||||
auto dst_tensor = make_static_distributed_tensor<DataType>(tile_dstr);
|
||||
load(dst_tensor, number<i_access_unsupport_>{}, bool_constant<oob_conditional_check>{});
|
||||
return dst_tensor;
|
||||
@@ -326,7 +326,7 @@ struct tile_scatter_gather
|
||||
using vector_t = typename Traits::vector_t;
|
||||
using SFC_Ys = typename Traits::SFC_Ys;
|
||||
|
||||
constexpr auto tile_dstr = TileDstr{};
|
||||
[[maybe_unused]] constexpr auto tile_dstr = TileDstr{};
|
||||
|
||||
// loop over thread tensor space [y0, y1, ...]
|
||||
static_for<0, NumCoord, 1>{}([&](auto iCoord) {
|
||||
@@ -418,7 +418,7 @@ struct tile_scatter_gather
|
||||
using vector_t = typename Traits::vector_t;
|
||||
using SFC_Ys = typename Traits::SFC_Ys;
|
||||
|
||||
constexpr auto tile_dstr = TileDstr{};
|
||||
[[maybe_unused]] constexpr auto tile_dstr = TileDstr{};
|
||||
|
||||
// Precompute invariant values outside loops
|
||||
const auto window_origin = lds_tile.get_window_origin();
|
||||
@@ -614,7 +614,7 @@ struct tile_scatter_gather
|
||||
using vector_t = typename Traits::vector_t;
|
||||
using SFC_Ys = typename Traits::SFC_Ys;
|
||||
|
||||
constexpr auto tile_dstr = TileDstr{};
|
||||
[[maybe_unused]] constexpr auto tile_dstr = TileDstr{};
|
||||
|
||||
static_for<0, NumCoord, 1>{}([&](auto iCoord) {
|
||||
auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0];
|
||||
@@ -696,7 +696,7 @@ struct tile_scatter_gather
|
||||
using vector_t = typename Traits::vector_t;
|
||||
using SFC_Ys = typename Traits::SFC_Ys;
|
||||
|
||||
constexpr auto tile_dstr = TileDstr{};
|
||||
[[maybe_unused]] constexpr auto tile_dstr = TileDstr{};
|
||||
// printf("off %d\n", page_idx_[I0]);
|
||||
// loop over thread tensor space [y0, y1, ...]
|
||||
static_for<0, NumCoord, 1>{}([&](auto iCoord) {
|
||||
|
||||
@@ -444,7 +444,7 @@ struct F16xMXF4FlatmmPipelineAGmemBGmemCRegV1
|
||||
typename BFlatBlockWindowTmp,
|
||||
typename DequantBFlatWindow>
|
||||
CK_TILE_HOST_DEVICE auto operator()(ADramBlockWindowTmp a_copy_dram_window_,
|
||||
const AElementFunction& a_element_func,
|
||||
[[maybe_unused]] const AElementFunction& a_element_func,
|
||||
const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
|
||||
const DequantBFlatWindow& scale_b_flat_window,
|
||||
const index_t num_loop,
|
||||
@@ -606,7 +606,7 @@ struct F16xMXF4FlatmmPipelineAGmemBGmemCRegV1
|
||||
scale_b_warp_tensor_pong;
|
||||
|
||||
using ABlockTile = decltype(load_tile(a_copy_dram_window));
|
||||
ABlockTile a_block_tile;
|
||||
[[maybe_unused]] ABlockTile a_block_tile;
|
||||
|
||||
enum
|
||||
{
|
||||
@@ -621,7 +621,7 @@ struct F16xMXF4FlatmmPipelineAGmemBGmemCRegV1
|
||||
if constexpr(prefill_location & PrefillAfterGemm)
|
||||
async_load_tile(lds_tile_a, dram_tile_a);
|
||||
};
|
||||
auto prefill_lds_a_stage2 = [&](auto lds_tile_a) {
|
||||
auto prefill_lds_a_stage2 = [&]([[maybe_unused]] auto lds_tile_a) {
|
||||
// async_load_fence();
|
||||
// __builtin_amdgcn_s_waitcnt(0x03fc);
|
||||
// data has been stored in lds; no further operation is needed.
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
#define CKTILE_FLATMM_USE_BUFFER_LOAD_LDS_AS_POSSIBLE 0
|
||||
#define CKTILE_FLATMM_USE_BUFFER_LOAD_LDS_AS_POSSIBLE 1
|
||||
|
||||
#if defined(__gfx950__)
|
||||
#define CKTILE_FLATMM_ARCH_SUPPORT_BUFFER_LOAD_LDS_DWORDx4 1
|
||||
|
||||
Reference in New Issue
Block a user