mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-30 11:47:48 +00:00
save tmp
This commit is contained in:
@@ -1326,8 +1326,21 @@ struct xor_t : public base_transform<2, 2>
|
||||
|
||||
idx_low(number<0>{}) = idx_up[number<0>{}];
|
||||
|
||||
// original (1.0 rate):
|
||||
// idx_low(number<1>{}) =
|
||||
// idx_up[number<1>{}] ^ (idx_up[number<0>{}] % up_lengths_[number<1>{}]);
|
||||
// 0.5 rate
|
||||
// idx_low(number<1>{}) =
|
||||
// idx_up[number<1>{}] ^ (2 * idx_up[number<0>{}]) % up_lengths_[number<1>{}];
|
||||
idx_low(number<1>{}) =
|
||||
idx_up[number<1>{}] ^ (idx_up[number<0>{}] % up_lengths_[number<1>{}]);
|
||||
idx_up[number<1>{}] ^ (2 * idx_up[number<0>{}] % up_lengths_[number<1>{}]);
|
||||
// if (threadIdx.x < 64)
|
||||
// {
|
||||
// printf("lane: %u | idx_low: (%d, %d) | idx_up: (%d, %d)\n",
|
||||
// __lane_id(),
|
||||
// idx_low[number<0>{}], idx_low[number<1>{}],
|
||||
// idx_up[number<0>{}], idx_up[number<1>{}]);
|
||||
// }
|
||||
}
|
||||
|
||||
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
|
||||
|
||||
164
include/ck_tile/core/tensor/distributed_lds_tensor.hpp
Normal file
164
include/ck_tile/core/tensor/distributed_lds_tensor.hpp
Normal file
@@ -0,0 +1,164 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core/config.hpp"
|
||||
#include "ck_tile/core/numeric/integer.hpp"
|
||||
#include "ck_tile/core/numeric/integral_constant.hpp"
|
||||
#include "ck_tile/core/numeric/numeric.hpp"
|
||||
#include "ck_tile/core/utility/type_traits.hpp"
|
||||
#include "ck_tile/core/tensor/tensor_descriptor.hpp"
|
||||
#include "ck_tile/core/tensor/tile_distribution.hpp"
|
||||
#include "ck_tile/core/arch/arch.hpp"
|
||||
|
||||
namespace ck_tile
|
||||
{
|
||||
|
||||
// A static distributed tensor backed by shared memory (LDS),
|
||||
// with indexing defined by a naive packed tensor descriptor of the Y-dimensions
|
||||
// of the provided StaticTileDistribution.
|
||||
//
|
||||
// Notes:
|
||||
// - This does NOT allocate LDS. Callers must provide a __shared__-allocated buffer
|
||||
// of size get_lds_buffer_size() elements of DataType.
|
||||
// - Index mapping uses a naive packed descriptor derived from the Y lengths of the
|
||||
// distribution to compute a linear offset into LDS.
|
||||
// - Distributed indexing API mirrors static_distributed_tensor (operator[]/operator()).
|
||||
//
|
||||
// Example (inside __global__ kernel):
|
||||
// using Dist = decltype(make_static_tile_distribution(my_encoding{}));
|
||||
// using Tensor = ck_tile::static_distributed_lds_tensor<float, Dist>;
|
||||
//
|
||||
// // Compile-time LDS size in elements
|
||||
// __shared__ float lds[Tensor::get_lds_buffer_size()];
|
||||
//
|
||||
// Tensor t{lds};
|
||||
// // write using distributed indices (compile-time indices)
|
||||
// t(ck_tile::detail::make_tile_distributed_index(ck_tile::sequence<0>{},
|
||||
// ck_tile::sequence<0>{})) = 1.0f;
|
||||
// ck_tile::block_sync_lds();
|
||||
//
|
||||
// // read back similarly
|
||||
// auto v = t[ck_tile::detail::make_tile_distributed_index(ck_tile::sequence<0>{},
|
||||
// ck_tile::sequence<0>{})];
|
||||
//
|
||||
template <typename DataType_, typename StaticTileDistribution_>
|
||||
struct static_distributed_lds_tensor
|
||||
{
|
||||
using DataType = remove_cvref_t<DataType_>;
|
||||
using StaticTileDistribution = remove_cvref_t<StaticTileDistribution_>;
|
||||
|
||||
static_assert(StaticTileDistribution::is_static(),
|
||||
"StaticTileDistribution must be fully known at compile time");
|
||||
|
||||
// Build a naive packed Y descriptor based on the distribution's Y lengths.
|
||||
// This mirrors the usage pattern from static_distributed_tensor where offsets are
|
||||
// computed through the Ys-to-D descriptor, but here we explicitly construct a naive
|
||||
// packed descriptor to satisfy the requirement.
|
||||
using YsDescriptorOriginal =
|
||||
remove_cvref_t<decltype(StaticTileDistribution{}.get_ys_to_d_descriptor())>;
|
||||
|
||||
using YsNaivePackedDescriptor =
|
||||
remove_cvref_t<decltype(make_naive_tensor_descriptor_packed(YsDescriptorOriginal{}.get_lengths()))>;
|
||||
|
||||
using ThreadTensorDesc = YsNaivePackedDescriptor;
|
||||
|
||||
static constexpr index_t PackedSize =
|
||||
ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;
|
||||
|
||||
static constexpr index_t kLdsElementSpaceSize = ThreadTensorDesc{}.get_element_space_size();
|
||||
static_assert(kLdsElementSpaceSize > 0, "Make sure tile distribution is valid for LDS");
|
||||
|
||||
// Number of DataType elements required in LDS buffer (considering PackedSize for DataType)
|
||||
CK_TILE_HOST_DEVICE static constexpr index_t get_lds_buffer_size()
|
||||
{
|
||||
return kLdsElementSpaceSize / PackedSize;
|
||||
}
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr auto get_num_of_dimension()
|
||||
{
|
||||
return StaticTileDistribution::get_num_of_dimension_x();
|
||||
}
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr auto get_lengths()
|
||||
{
|
||||
return StaticTileDistribution::get_lengths();
|
||||
}
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr auto get_tile_distribution()
|
||||
{
|
||||
return StaticTileDistribution{};
|
||||
}
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr auto get_distributed_spans()
|
||||
{
|
||||
return StaticTileDistribution::get_distributed_spans();
|
||||
}
|
||||
|
||||
// Construct with an LDS pointer (caller-provided __shared__ storage).
|
||||
CK_TILE_HOST_DEVICE explicit constexpr static_distributed_lds_tensor(DataType* lds_ptr)
|
||||
: p_lds_{lds_ptr}
|
||||
{
|
||||
}
|
||||
|
||||
// Bind or rebind LDS pointer at runtime if needed.
|
||||
CK_TILE_DEVICE void bind(DataType* lds_ptr) { p_lds_ = lds_ptr; }
|
||||
|
||||
CK_TILE_DEVICE constexpr const DataType* data() const { return p_lds_; }
|
||||
CK_TILE_DEVICE constexpr DataType* data() { return p_lds_; }
|
||||
|
||||
// Distributed indexing: const access
|
||||
template <typename TileDistributedIndices>
|
||||
CK_TILE_HOST_DEVICE constexpr const DataType& operator[](TileDistributedIndices) const
|
||||
{
|
||||
static_assert(is_static_v<TileDistributedIndices>,
|
||||
"TileDistributedIndices must be static");
|
||||
constexpr auto y_idx = get_tile_distribution().get_y_indices_from_distributed_indices(
|
||||
TileDistributedIndices{});
|
||||
constexpr index_t linear = ThreadTensorDesc{}.calculate_offset(y_idx) / PackedSize;
|
||||
return p_lds_[linear];
|
||||
}
|
||||
|
||||
// Distributed indexing: mutable access
|
||||
template <typename TileDistributedIndices>
|
||||
CK_TILE_HOST_DEVICE constexpr DataType& operator()(TileDistributedIndices)
|
||||
{
|
||||
static_assert(is_static_v<TileDistributedIndices>,
|
||||
"TileDistributedIndices must be static");
|
||||
constexpr auto y_idx = get_tile_distribution().get_y_indices_from_distributed_indices(
|
||||
TileDistributedIndices{});
|
||||
constexpr index_t linear = ThreadTensorDesc{}.calculate_offset(y_idx) / PackedSize;
|
||||
return p_lds_[linear];
|
||||
}
|
||||
|
||||
// Returns the naive Y-descriptor (type-only, constexpr-friendly)
|
||||
CK_TILE_HOST_DEVICE static constexpr auto get_y_naive_packed_descriptor()
|
||||
{
|
||||
return ThreadTensorDesc{};
|
||||
}
|
||||
|
||||
private:
|
||||
DataType* p_lds_ = nullptr; // shared (LDS) pointer provided by caller
|
||||
};
|
||||
|
||||
// Helper: make function
|
||||
template <typename DataType, typename StaticTileDistribution>
|
||||
CK_TILE_HOST_DEVICE constexpr auto
|
||||
make_static_distributed_lds_tensor(DataType* lds_ptr, const StaticTileDistribution&)
|
||||
{
|
||||
return static_distributed_lds_tensor<remove_cvref_t<DataType>,
|
||||
remove_cvref_t<StaticTileDistribution>>{lds_ptr};
|
||||
}
|
||||
|
||||
// Utility to query required LDS size (in elements of DataType) for a given distribution
|
||||
template <typename DataType, typename StaticTileDistribution>
|
||||
CK_TILE_HOST_DEVICE constexpr index_t
|
||||
get_required_lds_size_elems(const StaticTileDistribution&)
|
||||
{
|
||||
using tensor_t = static_distributed_lds_tensor<remove_cvref_t<DataType>,
|
||||
remove_cvref_t<StaticTileDistribution>>;
|
||||
return tensor_t::get_lds_buffer_size();
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -59,6 +59,10 @@ struct GemmPipelineAgBgCrImplBase
|
||||
const ElementFunction& element_func) const
|
||||
{
|
||||
const auto block_tile_tmp = tile_elementwise_in(element_func, src_block_tile);
|
||||
ignore = lds_tile_window;
|
||||
ignore = src_block_tile;
|
||||
ignore = element_func;
|
||||
ignore = block_tile_tmp;
|
||||
store_tile(lds_tile_window, block_tile_tmp);
|
||||
}
|
||||
|
||||
@@ -66,6 +70,10 @@ struct GemmPipelineAgBgCrImplBase
|
||||
CK_TILE_DEVICE void LocalPrefill(DstTileWindow& lds_tile_window,
|
||||
const SrcBlockTile& src_block_tile) const
|
||||
{
|
||||
ignore = lds_tile_window;
|
||||
ignore = src_block_tile;
|
||||
// CK_PRINT<typename remove_cvref_t<decltype(src_block_tile)>::StaticTileDistribution::DstrEncode>();
|
||||
|
||||
store_tile(lds_tile_window, src_block_tile);
|
||||
}
|
||||
|
||||
@@ -74,10 +82,12 @@ struct GemmPipelineAgBgCrImplBase
|
||||
const SrcTileWindow& lds_tile_window,
|
||||
bool_constant<LoadTranspose> = {}) const
|
||||
{
|
||||
if constexpr(LoadTranspose)
|
||||
dst_block_tile = load_tile_transpose(lds_tile_window);
|
||||
else
|
||||
load_tile(dst_block_tile, lds_tile_window);
|
||||
// if constexpr(LoadTranspose)
|
||||
// dst_block_tile = load_tile_transpose(lds_tile_window);
|
||||
// else
|
||||
// load_tile(dst_block_tile, lds_tile_window);
|
||||
ignore = dst_block_tile;
|
||||
ignore = lds_tile_window;
|
||||
}
|
||||
|
||||
CK_TILE_DEVICE auto GetABLdsTensorViews(void* p_smem) const
|
||||
|
||||
@@ -444,12 +444,16 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
|
||||
constexpr auto b_lds_load_tile_distr =
|
||||
make_static_tile_distribution(BlockGemm::MakeBBlockDistributionEncode());
|
||||
|
||||
// CK_PRINT<typename decltype(a_lds_load_tile_distr)::DstrEncode>();
|
||||
|
||||
// A DRAM tile window for load
|
||||
// A LDS tile window for store
|
||||
// A LDS tile for block GEMM
|
||||
auto&& [a_copy_dram_window, a_copy_lds_window, a_lds_gemm_window] =
|
||||
Base::GetAWindows(a_dram_block_window_tmp, a_lds_block, a_lds_load_tile_distr);
|
||||
|
||||
// CK_PRINT<typename remove_cvref_t<decltype(a_copy_dram_window[number<0>{}])>::Base::TileDstr::DstrEncode>();
|
||||
|
||||
// B DRAM tile window for load
|
||||
// B LDS tile window for store
|
||||
// B LDS tile for block GEMM
|
||||
@@ -478,6 +482,10 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
|
||||
auto elementwise_As_res =
|
||||
load_tile_with_elementwise(a_copy_dram_window, a_element_func);
|
||||
|
||||
|
||||
// CK_PRINT<typename decltype(elementwise_As_res)::StaticTileDistribution::DstrEncode>();
|
||||
// CK_PRINT<typename remove_cvref_t<decltype(a_copy_dram_window[number<0>{}])>::Base::TileDstr::DstrEncode>();
|
||||
|
||||
// Move each A — the enhanced function move_tile_window is executed, which takes a tuple
|
||||
// as input.
|
||||
move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
|
||||
|
||||
@@ -545,6 +545,8 @@ struct UniversalGemmBasePolicy
|
||||
VecLoadSize,
|
||||
getATileAccessPattern(),
|
||||
NumWaveGroups>;
|
||||
CK_PRINT<TileEncodingPattern>();
|
||||
CK_PRINT<typename remove_cvref_t<decltype(TileEncodingPattern::make_2d_static_tile_distribution())>::DstrEncode>();
|
||||
return TileEncodingPattern::make_2d_static_tile_distribution();
|
||||
}
|
||||
// Tile: KPerBlock X MPerBlock
|
||||
|
||||
19
shared_banks/test.cpp
Normal file
19
shared_banks/test.cpp
Normal file
@@ -0,0 +1,19 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
__global__ void conflict_simulator()
|
||||
{
|
||||
// __shared__ int buf[64];
|
||||
// buf[threadIdx.x] = threadIdx.x; // -> 0 conflicts
|
||||
//
|
||||
// __shared__ int buf[2 * 64];
|
||||
// buf[2 * threadIdx.x] = threadIdx.x; // -> 1 conflict per access (2-way)
|
||||
//
|
||||
__shared__ int buf[4 * 64];
|
||||
buf[4 * threadIdx.x] = threadIdx.x; // -> 3 conflicts per access (4-way)
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
conflict_simulator<<<1, 64, 0, 0>>>();
|
||||
return 0;
|
||||
}
|
||||
144
shared_banks/test_with_elementwise.cpp
Normal file
144
shared_banks/test_with_elementwise.cpp
Normal file
@@ -0,0 +1,144 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <ck_tile/core.hpp>
|
||||
#include <ck_tile/core/utility/debug.hpp>
|
||||
|
||||
using ck_tile::number;
|
||||
using ck_tile::make_naive_tensor_descriptor;
|
||||
using ck_tile::make_tile_window;
|
||||
using ck_tile::make_tensor_view;
|
||||
using ck_tile::make_tuple;
|
||||
using ck_tile::address_space_enum;
|
||||
using ck_tile::fp16_t;
|
||||
using ck_tile::index_t;
|
||||
|
||||
using ck_tile::tile_distribution_encoding;
|
||||
using ck_tile::sequence;
|
||||
using ck_tile::tuple;
|
||||
using ck_tile::make_static_distributed_tensor;
|
||||
using ck_tile::make_static_tile_distribution;
|
||||
using ck_tile::get_n_lds_banks;
|
||||
using ck_tile::get_n_words_per_128b;
|
||||
using ck_tile::make_xor_transform;
|
||||
using ck_tile::make_pass_through_transform;
|
||||
using ck_tile::make_unmerge_transform;
|
||||
|
||||
constexpr index_t kBlockSize = 256;
|
||||
|
||||
template<typename DataType, index_t MPerBlock, index_t KPerBlock>
|
||||
__device__ constexpr auto make_lds_tensor_descriptor()
|
||||
{
|
||||
constexpr auto DataTypeSize = sizeof(DataType);
|
||||
constexpr index_t KPack = 16 / sizeof(DataType);
|
||||
|
||||
constexpr auto MLdsLayer =
|
||||
ck_tile::max(1UL, get_n_lds_banks() * get_n_words_per_128b() / KPerBlock / DataTypeSize);
|
||||
|
||||
constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
|
||||
make_tuple(number<KPerBlock / KPack * MLdsLayer>{},
|
||||
number<MPerBlock / MLdsLayer>{},
|
||||
number<KPack>{}),
|
||||
make_tuple(number<KPack>{}, number<KPerBlock * MLdsLayer>{}, number<1>{}),
|
||||
number<KPack>{},
|
||||
number<1>{});
|
||||
|
||||
constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
|
||||
a_lds_block_desc_0,
|
||||
make_tuple(make_xor_transform(make_tuple(number<MPerBlock / MLdsLayer>{},
|
||||
number<KPerBlock / KPack * MLdsLayer>{})),
|
||||
make_pass_through_transform(number<KPack>{})),
|
||||
make_tuple(sequence<1, 0>{}, sequence<2>{}),
|
||||
make_tuple(sequence<1, 0>{}, sequence<2>{}));
|
||||
|
||||
constexpr auto a_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor(
|
||||
a_lds_block_desc_permuted,
|
||||
make_tuple(make_unmerge_transform(
|
||||
make_tuple(number<MLdsLayer>{}, number<KPerBlock / KPack>{})),
|
||||
make_pass_through_transform(number<MPerBlock / MLdsLayer>{}),
|
||||
make_pass_through_transform(number<KPack>{})),
|
||||
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
|
||||
make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
|
||||
|
||||
constexpr auto a_lds_block_desc = transform_tensor_descriptor(
|
||||
a_lds_block_desc_xk0_mnldslayer_mn_xk1,
|
||||
make_tuple(make_merge_transform_v3_division_mod(
|
||||
make_tuple(number<MPerBlock / MLdsLayer>{}, number<MLdsLayer>{})),
|
||||
make_merge_transform_v3_division_mod(
|
||||
make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
|
||||
make_tuple(sequence<1, 0>{}, sequence<2, 3>{}),
|
||||
make_tuple(sequence<0>{}, sequence<1>{}));
|
||||
|
||||
return a_lds_block_desc;
|
||||
}
|
||||
|
||||
template<typename DType, index_t MPerBlock, index_t KPerBlock>
|
||||
__device__
|
||||
constexpr auto make_register_distribution()
|
||||
{
|
||||
using ck_tile::tile_distribution_encoding_pattern_2d;
|
||||
using ck_tile::tile_distribution_pattern;
|
||||
constexpr auto VecLoadSize = 16 / sizeof(DType);
|
||||
using TileEncodingPattern =
|
||||
tile_distribution_encoding_pattern_2d<kBlockSize,
|
||||
MPerBlock,
|
||||
KPerBlock,
|
||||
VecLoadSize,
|
||||
tile_distribution_pattern::thread_raked>;
|
||||
return TileEncodingPattern::make_2d_static_tile_distribution();
|
||||
}
|
||||
|
||||
template<typename DType, index_t MTile, index_t KTile>
|
||||
__global__ void lds_write_simulator()
|
||||
{
|
||||
__shared__ int buf[160000 / sizeof(int)];
|
||||
|
||||
// lds setup
|
||||
auto lds_tensor_descriptor = make_lds_tensor_descriptor<DType, MTile, KTile>();
|
||||
auto lds_tensor_view = make_tensor_view<address_space_enum::lds>(reinterpret_cast<DType*>(buf), lds_tensor_descriptor);
|
||||
auto lds_write_window = make_tile_window(lds_tensor_view, make_tuple(number<MTile>{}, number<KTile>{}), {0, 0});
|
||||
|
||||
// register setup
|
||||
auto reg_tile_dst = make_register_distribution<DType, MTile, KTile>();
|
||||
auto reg_tensor = make_static_distributed_tensor<DType>(reg_tile_dst);
|
||||
|
||||
// writeout
|
||||
store_tile(lds_write_window, reg_tensor);
|
||||
|
||||
ck_tile::block_sync_lds();
|
||||
}
|
||||
|
||||
__device__
|
||||
constexpr auto make_lds_read_distribution()
|
||||
{
|
||||
return make_static_tile_distribution(tile_distribution_encoding<
|
||||
ck_tile::sequence<2>,
|
||||
ck_tile::tuple<ck_tile::sequence<4, 2, 16>, ck_tile::sequence<4, 4, 4>>,
|
||||
ck_tile::tuple<ck_tile::sequence<1, 0>, ck_tile::sequence<2, 1>>,
|
||||
ck_tile::tuple<ck_tile::sequence<1, 0>, ck_tile::sequence<1, 2>>,
|
||||
ck_tile::sequence<1, 2, 2>,
|
||||
ck_tile::sequence<0, 0, 2>>{});
|
||||
}
|
||||
|
||||
template<typename DType, index_t MTile, index_t KTile>
|
||||
__global__ void lds_read_simulator()
|
||||
{
|
||||
__shared__ int buf[160000 / sizeof(int)];
|
||||
|
||||
auto lds_tensor_descriptor = make_lds_tensor_descriptor<DType, MTile, KTile>();
|
||||
auto lds_tensor_view = make_tensor_view<address_space_enum::lds>(reinterpret_cast<DType*>(buf), lds_tensor_descriptor);
|
||||
|
||||
constexpr auto reg_tile_dst = make_lds_read_distribution();
|
||||
auto lds_read_window = make_tile_window(lds_tensor_view, make_tuple(number<MTile>{}, number<KTile>{}), {0, 0}, reg_tile_dst);
|
||||
|
||||
[[maybe_unused]] auto reg_tile = load_tile(lds_read_window);
|
||||
ck_tile::block_sync_lds();
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
constexpr auto kGrid = 1;
|
||||
constexpr auto kBlockM = 128;
|
||||
constexpr auto kBlockK = 64;
|
||||
lds_write_simulator<fp16_t, kBlockM, kBlockK><<<kGrid, kBlockSize>>>();
|
||||
lds_read_simulator<fp16_t, kBlockM, kBlockK><<<kGrid, kBlockSize>>>();
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user