mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-03 21:21:22 +00:00
Add column to image kernel (#930)
* Add column to image kernel
* Minor fixes for dtypes and client examples
* Disable tests for disabled dtypes
* Disable add instances functions for disabled data types
* Minor stylistic fixes
* Revert "Disable add instances functions for disabled data types"
This reverts commit 728b869563.
* Instances reduction
* Add comments in device_column_to_image_impl
* Update changelog and Copyrights
* Improve changelog
This commit is contained in:
@@ -16,6 +16,36 @@
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <typename InputGridDesc,
|
||||
typename InputDataType,
|
||||
typename OutputGridDesc,
|
||||
typename OutputDataType,
|
||||
typename Block2ETileMap,
|
||||
typename GridwiseTensorRearrangeKernel>
|
||||
__global__ void
|
||||
#if CK_USE_LAUNCH_BOUNDS
|
||||
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
|
||||
#endif
|
||||
kernel_tensor_rearrange(const InputGridDesc in_grid_desc,
|
||||
const InputDataType* __restrict__ p_in_global,
|
||||
const OutputGridDesc out_grid_desc,
|
||||
OutputDataType* __restrict__ p_out_global,
|
||||
const Block2ETileMap block_2_tile_map)
|
||||
{
|
||||
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
|
||||
defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
|
||||
defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
|
||||
GridwiseTensorRearrangeKernel::Run(
|
||||
in_grid_desc, p_in_global, out_grid_desc, p_out_global, block_2_tile_map);
|
||||
#else
|
||||
ignore = in_grid_desc;
|
||||
ignore = p_in_global;
|
||||
ignore = out_grid_desc;
|
||||
ignore = p_out_global;
|
||||
ignore = block_2_tile_map;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename InputGridDesc,
|
||||
typename InputDataType,
|
||||
typename OutputGridDesc,
|
||||
@@ -25,8 +55,9 @@ template <typename InputGridDesc,
|
||||
index_t KPerBlock,
|
||||
typename ThreadClusterLengths,
|
||||
index_t ScalarPerVector,
|
||||
InMemoryDataOperationEnum DstInMemOp,
|
||||
typename Block2ETileMap>
|
||||
struct GridwiseImageToColumn
|
||||
struct GridwiseTensorRearrange
|
||||
{
|
||||
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
@@ -55,27 +86,27 @@ struct GridwiseImageToColumn
|
||||
auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_out_global, out_grid_desc.GetElementSpaceSize());
|
||||
|
||||
auto copy_global_to_global = ThreadGroupTensorSliceTransfer_v7<
|
||||
ThisThreadBlock,
|
||||
Tuple<InputDataType>,
|
||||
Tuple<OutputDataType>,
|
||||
decltype(tie(in_grid_desc)),
|
||||
decltype(tie(out_grid_desc)),
|
||||
tensor_operation::element_wise::PassThrough,
|
||||
Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
|
||||
Sequence<MPerBlock, KPerBlock>,
|
||||
ThreadClusterLengths,
|
||||
Sequence<0, 1>,
|
||||
Sequence<0, 1>,
|
||||
I1,
|
||||
ScalarPerVector,
|
||||
Sequence<true>,
|
||||
Sequence<true>>{
|
||||
in_grid_desc,
|
||||
make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
|
||||
out_grid_desc,
|
||||
make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
|
||||
tensor_operation::element_wise::PassThrough{}};
|
||||
auto copy_global_to_global =
|
||||
ThreadGroupTensorSliceTransfer_v7<ThisThreadBlock,
|
||||
Tuple<InputDataType>,
|
||||
Tuple<OutputDataType>,
|
||||
decltype(tie(in_grid_desc)),
|
||||
decltype(tie(out_grid_desc)),
|
||||
tensor_operation::element_wise::PassThrough,
|
||||
Sequence<static_cast<index_t>(DstInMemOp)>,
|
||||
Sequence<MPerBlock, KPerBlock>,
|
||||
ThreadClusterLengths,
|
||||
Sequence<0, 1>,
|
||||
Sequence<0, 1>,
|
||||
I1,
|
||||
ScalarPerVector,
|
||||
Sequence<true>,
|
||||
Sequence<true>>{
|
||||
in_grid_desc,
|
||||
make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
|
||||
out_grid_desc,
|
||||
make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
|
||||
tensor_operation::element_wise::PassThrough{}};
|
||||
|
||||
copy_global_to_global.Run(
|
||||
tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
|
||||
Reference in New Issue
Block a user