This commit is contained in:
Max Podkorytov
2025-11-10 13:11:13 -06:00
parent 76c4c12f59
commit 1e2dac15a1
7 changed files with 365 additions and 5 deletions

View File

@@ -1326,8 +1326,21 @@ struct xor_t : public base_transform<2, 2>
idx_low(number<0>{}) = idx_up[number<0>{}];
// original (1.0 rate):
// idx_low(number<1>{}) =
// idx_up[number<1>{}] ^ (idx_up[number<0>{}] % up_lengths_[number<1>{}]);
// 0.5 rate
// idx_low(number<1>{}) =
// idx_up[number<1>{}] ^ (2 * idx_up[number<0>{}]) % up_lengths_[number<1>{}];
idx_low(number<1>{}) =
idx_up[number<1>{}] ^ (idx_up[number<0>{}] % up_lengths_[number<1>{}]);
idx_up[number<1>{}] ^ (2 * idx_up[number<0>{}] % up_lengths_[number<1>{}]);
// if (threadIdx.x < 64)
// {
// printf("lane: %u | idx_low: (%d, %d) | idx_up: (%d, %d)\n",
// __lane_id(),
// idx_low[number<0>{}], idx_low[number<1>{}],
// idx_up[number<0>{}], idx_up[number<1>{}]);
// }
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>

View File

@@ -0,0 +1,164 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core/config.hpp"
#include "ck_tile/core/numeric/integer.hpp"
#include "ck_tile/core/numeric/integral_constant.hpp"
#include "ck_tile/core/numeric/numeric.hpp"
#include "ck_tile/core/utility/type_traits.hpp"
#include "ck_tile/core/tensor/tensor_descriptor.hpp"
#include "ck_tile/core/tensor/tile_distribution.hpp"
#include "ck_tile/core/arch/arch.hpp"
namespace ck_tile
{
// A static distributed tensor backed by shared memory (LDS),
// with indexing defined by a naive packed tensor descriptor of the Y-dimensions
// of the provided StaticTileDistribution.
//
// Notes:
// - This does NOT allocate LDS. Callers must provide a __shared__-allocated buffer
// of size get_lds_buffer_size() elements of DataType.
// - Index mapping uses a naive packed descriptor derived from the Y lengths of the
// distribution to compute a linear offset into LDS.
// - Distributed indexing API mirrors static_distributed_tensor (operator[]/operator()).
//
// Example (inside __global__ kernel):
// using Dist = decltype(make_static_tile_distribution(my_encoding{}));
// using Tensor = ck_tile::static_distributed_lds_tensor<float, Dist>;
//
// // Compile-time LDS size in elements
// __shared__ float lds[Tensor::get_lds_buffer_size()];
//
// Tensor t{lds};
// // write using distributed indices (compile-time indices)
// t(ck_tile::detail::make_tile_distributed_index(ck_tile::sequence<0>{},
// ck_tile::sequence<0>{})) = 1.0f;
// ck_tile::block_sync_lds();
//
// // read back similarly
// auto v = t[ck_tile::detail::make_tile_distributed_index(ck_tile::sequence<0>{},
// ck_tile::sequence<0>{})];
//
template <typename DataType_, typename StaticTileDistribution_>
struct static_distributed_lds_tensor
{
using DataType = remove_cvref_t<DataType_>;
using StaticTileDistribution = remove_cvref_t<StaticTileDistribution_>;
static_assert(StaticTileDistribution::is_static(),
"StaticTileDistribution must be fully known at compile time");
// Build a naive packed Y descriptor based on the distribution's Y lengths.
// This mirrors the usage pattern from static_distributed_tensor where offsets are
// computed through the Ys-to-D descriptor, but here we explicitly construct a naive
// packed descriptor to satisfy the requirement.
using YsDescriptorOriginal =
remove_cvref_t<decltype(StaticTileDistribution{}.get_ys_to_d_descriptor())>;
using YsNaivePackedDescriptor =
remove_cvref_t<decltype(make_naive_tensor_descriptor_packed(YsDescriptorOriginal{}.get_lengths()))>;
using ThreadTensorDesc = YsNaivePackedDescriptor;
static constexpr index_t PackedSize =
ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;
static constexpr index_t kLdsElementSpaceSize = ThreadTensorDesc{}.get_element_space_size();
static_assert(kLdsElementSpaceSize > 0, "Make sure tile distribution is valid for LDS");
// Number of DataType elements required in LDS buffer (considering PackedSize for DataType)
CK_TILE_HOST_DEVICE static constexpr index_t get_lds_buffer_size()
{
return kLdsElementSpaceSize / PackedSize;
}
CK_TILE_HOST_DEVICE static constexpr auto get_num_of_dimension()
{
return StaticTileDistribution::get_num_of_dimension_x();
}
CK_TILE_HOST_DEVICE static constexpr auto get_lengths()
{
return StaticTileDistribution::get_lengths();
}
CK_TILE_HOST_DEVICE static constexpr auto get_tile_distribution()
{
return StaticTileDistribution{};
}
CK_TILE_HOST_DEVICE static constexpr auto get_distributed_spans()
{
return StaticTileDistribution::get_distributed_spans();
}
// Construct with an LDS pointer (caller-provided __shared__ storage).
CK_TILE_HOST_DEVICE explicit constexpr static_distributed_lds_tensor(DataType* lds_ptr)
: p_lds_{lds_ptr}
{
}
// Bind or rebind LDS pointer at runtime if needed.
CK_TILE_DEVICE void bind(DataType* lds_ptr) { p_lds_ = lds_ptr; }
CK_TILE_DEVICE constexpr const DataType* data() const { return p_lds_; }
CK_TILE_DEVICE constexpr DataType* data() { return p_lds_; }
// Distributed indexing: const access
template <typename TileDistributedIndices>
CK_TILE_HOST_DEVICE constexpr const DataType& operator[](TileDistributedIndices) const
{
static_assert(is_static_v<TileDistributedIndices>,
"TileDistributedIndices must be static");
constexpr auto y_idx = get_tile_distribution().get_y_indices_from_distributed_indices(
TileDistributedIndices{});
constexpr index_t linear = ThreadTensorDesc{}.calculate_offset(y_idx) / PackedSize;
return p_lds_[linear];
}
// Distributed indexing: mutable access
template <typename TileDistributedIndices>
CK_TILE_HOST_DEVICE constexpr DataType& operator()(TileDistributedIndices)
{
static_assert(is_static_v<TileDistributedIndices>,
"TileDistributedIndices must be static");
constexpr auto y_idx = get_tile_distribution().get_y_indices_from_distributed_indices(
TileDistributedIndices{});
constexpr index_t linear = ThreadTensorDesc{}.calculate_offset(y_idx) / PackedSize;
return p_lds_[linear];
}
// Returns the naive Y-descriptor (type-only, constexpr-friendly)
CK_TILE_HOST_DEVICE static constexpr auto get_y_naive_packed_descriptor()
{
return ThreadTensorDesc{};
}
private:
DataType* p_lds_ = nullptr; // shared (LDS) pointer provided by caller
};
// Helper: make function
template <typename DataType, typename StaticTileDistribution>
CK_TILE_HOST_DEVICE constexpr auto
make_static_distributed_lds_tensor(DataType* lds_ptr, const StaticTileDistribution&)
{
return static_distributed_lds_tensor<remove_cvref_t<DataType>,
remove_cvref_t<StaticTileDistribution>>{lds_ptr};
}
// Utility to query required LDS size (in elements of DataType) for a given distribution
template <typename DataType, typename StaticTileDistribution>
CK_TILE_HOST_DEVICE constexpr index_t
get_required_lds_size_elems(const StaticTileDistribution&)
{
using tensor_t = static_distributed_lds_tensor<remove_cvref_t<DataType>,
remove_cvref_t<StaticTileDistribution>>;
return tensor_t::get_lds_buffer_size();
}
} // namespace ck_tile

View File

@@ -59,6 +59,10 @@ struct GemmPipelineAgBgCrImplBase
const ElementFunction& element_func) const
{
const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile);
ignore = lds_tile_window;
ignore = src_block_tile;
ignore = element_func;
ignore = block_tile_tmp;
store_tile(lds_tile_window, block_tile_tmp);
}
@@ -66,6 +70,10 @@ struct GemmPipelineAgBgCrImplBase
CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window,
const SrcBlockTile& src_block_tile) const
{
ignore = lds_tile_window;
ignore = src_block_tile;
// CK_PRINT<typename remove_cvref_t<decltype(src_block_tile)>::StaticTileDistribution::DstrEncode>();
store_tile(lds_tile_window, src_block_tile);
}
@@ -74,10 +82,12 @@ struct GemmPipelineAgBgCrImplBase
const SrcTileWindow& lds_tile_window,
bool_constant<LoadTranspose> = {}) const
{
if constexpr(LoadTranspose)
dst_block_tile = load_tile_transpose(lds_tile_window);
else
load_tile(dst_block_tile, lds_tile_window);
// if constexpr(LoadTranspose)
// dst_block_tile = load_tile_transpose(lds_tile_window);
// else
// load_tile(dst_block_tile, lds_tile_window);
ignore = dst_block_tile;
ignore = lds_tile_window;
}
CK_TILE_DEVICE auto GetABLdsTensorViews(void* p_smem) const

View File

@@ -444,12 +444,16 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
constexpr auto b_lds_load_tile_distr =
make_static_tile_distribution(BlockGemm::MakeBBlockDistributionEncode());
// CK_PRINT<typename decltype(a_lds_load_tile_distr)::DstrEncode>();
// A DRAM tile window for load
// A LDS tile window for store
// A LDS tile for block GEMM
auto&& [a_copy_dram_window, a_copy_lds_window, a_lds_gemm_window] =
Base::GetAWindows(a_dram_block_window_tmp, a_lds_block, a_lds_load_tile_distr);
// CK_PRINT<typename remove_cvref_t<decltype(a_copy_dram_window[number<0>{}])>::Base::TileDstr::DstrEncode>();
// B DRAM tile window for load
// B LDS tile window for store
// B LDS tile for block GEMM
@@ -478,6 +482,10 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
auto elementwise_As_res =
load_tile_with_elementwise(a_copy_dram_window, a_element_func);
// CK_PRINT<typename decltype(elementwise_As_res)::StaticTileDistribution::DstrEncode>();
// CK_PRINT<typename remove_cvref_t<decltype(a_copy_dram_window[number<0>{}])>::Base::TileDstr::DstrEncode>();
// Move each A — the enhanced function move_tile_window is executed, which takes a tuple
// as input.
move_tile_window(a_copy_dram_window, a_dram_tile_window_step);

View File

@@ -545,6 +545,8 @@ struct UniversalGemmBasePolicy
VecLoadSize,
getATileAccessPattern(),
NumWaveGroups>;
CK_PRINT<TileEncodingPattern>();
CK_PRINT<typename remove_cvref_t<decltype(TileEncodingPattern::make_2d_static_tile_distribution())>::DstrEncode>();
return TileEncodingPattern::make_2d_static_tile_distribution();
}
// Tile: KPerBlock X MPerBlock

19
shared_banks/test.cpp Normal file
View File

@@ -0,0 +1,19 @@
#include <hip/hip_runtime.h>
__global__ void conflict_simulator()
{
// __shared__ int buf[64];
// buf[threadIdx.x] = threadIdx.x; // -> 0 conflicts
//
// __shared__ int buf[2 * 64];
// buf[2 * threadIdx.x] = threadIdx.x; // -> 1 conflict per access (2-way)
//
__shared__ int buf[4 * 64];
buf[4 * threadIdx.x] = threadIdx.x; // -> 3 conflicts per access (4-way)
}
int main()
{
conflict_simulator<<<1, 64, 0, 0>>>();
return 0;
}

View File

@@ -0,0 +1,144 @@
#include <hip/hip_runtime.h>
#include <ck_tile/core.hpp>
#include <ck_tile/core/utility/debug.hpp>
using ck_tile::number;
using ck_tile::make_naive_tensor_descriptor;
using ck_tile::make_tile_window;
using ck_tile::make_tensor_view;
using ck_tile::make_tuple;
using ck_tile::address_space_enum;
using ck_tile::fp16_t;
using ck_tile::index_t;
using ck_tile::tile_distribution_encoding;
using ck_tile::sequence;
using ck_tile::tuple;
using ck_tile::make_static_distributed_tensor;
using ck_tile::make_static_tile_distribution;
using ck_tile::get_n_lds_banks;
using ck_tile::get_n_words_per_128b;
using ck_tile::make_xor_transform;
using ck_tile::make_pass_through_transform;
using ck_tile::make_unmerge_transform;
constexpr index_t kBlockSize = 256;
template<typename DataType, index_t MPerBlock, index_t KPerBlock>
__device__ constexpr auto make_lds_tensor_descriptor()
{
constexpr auto DataTypeSize = sizeof(DataType);
constexpr index_t KPack = 16 / sizeof(DataType);
constexpr auto MLdsLayer =
ck_tile::max(1UL, get_n_lds_banks() * get_n_words_per_128b() / KPerBlock / DataTypeSize);
constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
make_tuple(number<KPerBlock / KPack * MLdsLayer>{},
number<MPerBlock / MLdsLayer>{},
number<KPack>{}),
make_tuple(number<KPack>{}, number<KPerBlock * MLdsLayer>{}, number<1>{}),
number<KPack>{},
number<1>{});
constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
a_lds_block_desc_0,
make_tuple(make_xor_transform(make_tuple(number<MPerBlock / MLdsLayer>{},
number<KPerBlock / KPack * MLdsLayer>{})),
make_pass_through_transform(number<KPack>{})),
make_tuple(sequence<1, 0>{}, sequence<2>{}),
make_tuple(sequence<1, 0>{}, sequence<2>{}));
constexpr auto a_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor(
a_lds_block_desc_permuted,
make_tuple(make_unmerge_transform(
make_tuple(number<MLdsLayer>{}, number<KPerBlock / KPack>{})),
make_pass_through_transform(number<MPerBlock / MLdsLayer>{}),
make_pass_through_transform(number<KPack>{})),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
constexpr auto a_lds_block_desc = transform_tensor_descriptor(
a_lds_block_desc_xk0_mnldslayer_mn_xk1,
make_tuple(make_merge_transform_v3_division_mod(
make_tuple(number<MPerBlock / MLdsLayer>{}, number<MLdsLayer>{})),
make_merge_transform_v3_division_mod(
make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
make_tuple(sequence<1, 0>{}, sequence<2, 3>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
return a_lds_block_desc;
}
template<typename DType, index_t MPerBlock, index_t KPerBlock>
__device__
constexpr auto make_register_distribution()
{
using ck_tile::tile_distribution_encoding_pattern_2d;
using ck_tile::tile_distribution_pattern;
constexpr auto VecLoadSize = 16 / sizeof(DType);
using TileEncodingPattern =
tile_distribution_encoding_pattern_2d<kBlockSize,
MPerBlock,
KPerBlock,
VecLoadSize,
tile_distribution_pattern::thread_raked>;
return TileEncodingPattern::make_2d_static_tile_distribution();
}
template<typename DType, index_t MTile, index_t KTile>
__global__ void lds_write_simulator()
{
__shared__ int buf[160000 / sizeof(int)];
// lds setup
auto lds_tensor_descriptor = make_lds_tensor_descriptor<DType, MTile, KTile>();
auto lds_tensor_view = make_tensor_view<address_space_enum::lds>(reinterpret_cast<DType*>(buf), lds_tensor_descriptor);
auto lds_write_window = make_tile_window(lds_tensor_view, make_tuple(number<MTile>{}, number<KTile>{}), {0, 0});
// register setup
auto reg_tile_dst = make_register_distribution<DType, MTile, KTile>();
auto reg_tensor = make_static_distributed_tensor<DType>(reg_tile_dst);
// writeout
store_tile(lds_write_window, reg_tensor);
ck_tile::block_sync_lds();
}
__device__
constexpr auto make_lds_read_distribution()
{
return make_static_tile_distribution(tile_distribution_encoding<
ck_tile::sequence<2>,
ck_tile::tuple<ck_tile::sequence<4, 2, 16>, ck_tile::sequence<4, 4, 4>>,
ck_tile::tuple<ck_tile::sequence<1, 0>, ck_tile::sequence<2, 1>>,
ck_tile::tuple<ck_tile::sequence<1, 0>, ck_tile::sequence<1, 2>>,
ck_tile::sequence<1, 2, 2>,
ck_tile::sequence<0, 0, 2>>{});
}
template<typename DType, index_t MTile, index_t KTile>
__global__ void lds_read_simulator()
{
__shared__ int buf[160000 / sizeof(int)];
auto lds_tensor_descriptor = make_lds_tensor_descriptor<DType, MTile, KTile>();
auto lds_tensor_view = make_tensor_view<address_space_enum::lds>(reinterpret_cast<DType*>(buf), lds_tensor_descriptor);
constexpr auto reg_tile_dst = make_lds_read_distribution();
auto lds_read_window = make_tile_window(lds_tensor_view, make_tuple(number<MTile>{}, number<KTile>{}), {0, 0}, reg_tile_dst);
[[maybe_unused]] auto reg_tile = load_tile(lds_read_window);
ck_tile::block_sync_lds();
}
int main()
{
constexpr auto kGrid = 1;
constexpr auto kBlockM = 128;
constexpr auto kBlockK = 64;
lds_write_simulator<fp16_t, kBlockM, kBlockK><<<kGrid, kBlockSize>>>();
lds_read_simulator<fp16_t, kBlockM, kBlockK><<<kGrid, kBlockSize>>>();
return 0;
}